
Main class DataReader

src.data_readers.data_reader.DataReader

DataReader class for reading a pre-determined dataset and transforming the data for the UI.

Source code in src/data_readers/data_reader.py
class DataReader():
    """
    DataReader class for reading a pre-determined dataset and transforming the data for the UI.
    """

    def __init__(self, configs):
        """
        Data reader init class.
        Attributes
        ----------
        configs : dict
            configuration dict of the dataset
        name : str
            name of the dataset set in run_apps args
        query_col : str
            name of the query column
        sensitive_col : str
            name of the sensitive column (e.g. gender) used for applying fairness interventions (optional)
        data_path : str
            path to the dataset file
        output_file_path : str
            path to the output file where the transformed dataset will be saved
        """
        self.name = configs["name"]
        self.query_col = configs["query"]
        self.score_col = configs["score"]

        if 'group' in configs:
            self.sensitive_col = configs["group"]
        else:
            self.sensitive_col = None

        self.data_path = os.path.join(
            project_dir, 'dataset/' + self.name)
        self.output_file_path = os.path.join(self.data_path, 'format_data')
        if not os.path.exists(self.output_file_path):
            # save transformed data
            self.save_data()

    def read(self, split):
        """Read dataset file.

        Args:
            split (str): The split of the dataset to read ('test' or 'train').

        Returns:
            If split is 'test':
                tuple: A tuple containing the dataframes of document, query, and experiment lists.
            If split is 'train':
                tuple: A tuple containing the dataframes of document and query.

        Raises:
            FileNotFoundError: If the dataset file or query file is not found.

        """
        dataframe_data = pd.read_csv(os.path.join(self.output_file_path, split, 'data.csv'))
        dataframe_query = pd.read_csv(os.path.join(self.output_file_path, 'query.csv'))

        if split == 'test':
            experiments_files = [file for file in os.listdir(os.path.join(self.data_path, 'experiments')) if
                                    file.endswith('.json')]
            experiments_info = []
            for exp_file in experiments_files:
                with open(os.path.join(self.data_path, 'experiments', exp_file)) as f:
                    exp_info = json.load(f)
                    experiments_info.append(exp_info)

            return dataframe_data, dataframe_query, experiments_info
        else:
            return dataframe_data, dataframe_query

    def save_data(self):
        """Save the transformed data in splits.

        This method creates the necessary directories and saves the transformed data to CSV files.
        The data is saved in the following structure:
        - The main output directory is created at `self.output_file_path`.
        - Inside the main output directory, two subdirectories are created: 'test' and 'train'.
        - The transformed test data is saved as 'data.csv' inside the 'test' subdirectory.
            This will be displayed in the UI.
        - If there is transformed train data available, it is saved as 'data.csv' inside the 'train' subdirectory.
            This will be used for training the ranker or fairness intervention.
        - The dataset queries are saved as 'query.csv' inside the main output directory.

        Note: The method assumes that the necessary data has already been transformed and is available.

        Returns:
            None
        """

        # transform dataset into a pandas.DataFrame
        dataset_queries, data_train, data_test = self.transform_data()

        os.makedirs(self.output_file_path)
        os.makedirs(os.path.join(self.output_file_path, 'test'))
        os.makedirs(os.path.join(self.output_file_path, 'train'))
        data_test.to_csv(os.path.join(self.output_file_path, 'test', 'data.csv'), index=False)
        if data_train is not None:
            data_train.to_csv(os.path.join(self.output_file_path, 'train', 'data.csv'), index=False)
        dataset_queries.to_csv(os.path.join(self.output_file_path, 'query.csv'), index=False)

__init__(configs)

Initialize the data reader.

Attributes:

    configs (dict): configuration dict of the dataset
    name (str): name of the dataset, set in the run_apps args
    query_col (str): name of the query column
    score_col (str): name of the score column
    sensitive_col (str): name of the sensitive column (e.g. gender) used for applying fairness interventions (optional)
    data_path (str): path to the dataset directory
    output_file_path (str): path to the output directory where the transformed dataset is saved

Source code in src/data_readers/data_reader.py
def __init__(self, configs):
    """
    Data reader init class.
    Attributes
    ----------
    configs : dict
        configuration dict of the dataset
    name : str
        name of the dataset set in run_apps args
    query_col : str
        name of the query column
    sensitive_col : str
        name of the sensitive column (e.g. gender) used for applying fairness interventions (optional)
    data_path : str
        path to the dataset file
    output_file_path : str
        path to the output file where the transformed dataset will be saved
    """
    self.name = configs["name"]
    self.query_col = configs["query"]
    self.score_col = configs["score"]

    if 'group' in configs:
        self.sensitive_col = configs["group"]
    else:
        self.sensitive_col = None

    self.data_path = os.path.join(
        project_dir, 'dataset/' + self.name)
    self.output_file_path = os.path.join(self.data_path, 'format_data')
    if not os.path.exists(self.output_file_path):
        # save transformed data
        self.save_data()
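
For orientation, constructing a reader might look like the sketch below. The keys ("name", "query", "score", and the optional "group") are exactly what __init__ reads from configs; the values and the subclass name are hypothetical. DataReader itself does not implement transform_data, so in practice one of the subclasses shown further down is instantiated.

# A minimal sketch, assuming a dataset directory dataset/my_dataset exists.
# All values are hypothetical; only the keys come from __init__ above.
configs = {
    "name": "my_dataset",   # dataset directory under dataset/
    "query": "query",       # query column
    "score": "score",       # score column
    "group": "gender",      # optional sensitive column
}

reader = DataReaderMyDataset(configs)  # hypothetical DataReader subclass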

read(split)

Read dataset file.

Parameters:

    split (str): The split of the dataset to read ('test' or 'train'). Required.

Returns:

    If split is 'test':
        tuple: A tuple containing the dataframes of document, query, and the experiment list.
    If split is 'train':
        tuple: A tuple containing the dataframes of document and query.

Raises:

    FileNotFoundError: If the dataset file or query file is not found.

Source code in src/data_readers/data_reader.py
def read(self, split):
    """Read dataset file.

    Args:
        split (str): The split of the dataset to read ('test' or 'train').

    Returns:
        If split is 'test':
            tuple: A tuple containing the dataframes of document, query, and experiment lists.
        If split is 'train':
            tuple: A tuple containing the dataframes of document and query.

    Raises:
        FileNotFoundError: If the dataset file or query file is not found.

    """
    dataframe_data = pd.read_csv(os.path.join(self.output_file_path, split, 'data.csv'))
    dataframe_query = pd.read_csv(os.path.join(self.output_file_path, 'query.csv'))

    if split == 'test':
        experiments_files = [file for file in os.listdir(os.path.join(self.data_path, 'experiments')) if
                                file.endswith('.json')]
        experiments_info = []
        for exp_file in experiments_files:
            with open(os.path.join(self.data_path, 'experiments', exp_file)) as f:
                exp_info = json.load(f)
                experiments_info.append(exp_info)

        return dataframe_data, dataframe_query, experiments_info
    else:
        return dataframe_data, dataframe_query
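
A usage sketch, assuming a reader instance as above: the 'train' split returns two dataframes, while 'test' additionally returns the experiment configurations read from the dataset's experiments directory.

data_train, queries = reader.read('train')
data_test, queries, experiments_info = reader.read('test')
print(len(data_test), 'test rows,', len(experiments_info), 'experiment files')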

save_data()

Save the transformed data in splits.

This method creates the necessary directories and saves the transformed data to CSV files. The data is saved in the following structure:

- The main output directory is created at self.output_file_path.
- Inside the main output directory, two subdirectories are created: 'test' and 'train'.
- The transformed test data is saved as 'data.csv' inside the 'test' subdirectory. This is displayed in the UI.
- If transformed train data is available, it is saved as 'data.csv' inside the 'train' subdirectory. This is used for training the ranker or fairness intervention.
- The dataset queries are saved as 'query.csv' inside the main output directory.

Note: The method assumes that the necessary data has already been transformed and is available.

Returns:

    None

Source code in src/data_readers/data_reader.py
def save_data(self):
    """Save the transformed data in splits.

    This method creates the necessary directories and saves the transformed data to CSV files.
    The data is saved in the following structure:
    - The main output directory is created at `self.output_file_path`.
    - Inside the main output directory, two subdirectories are created: 'test' and 'train'.
    - The transformed test data is saved as 'data.csv' inside the 'test' subdirectory.
        This will be displayed in the UI.
    - If there is transformed train data available, it is saved as 'data.csv' inside the 'train' subdirectory.
        This will be used for training the ranker or fairness intervention.
    - The dataset queries are saved as 'query.csv' inside the main output directory.

    Note: The method assumes that the necessary data has already been transformed and is available.

    Returns:
        None
    """

    # transform dataset into a pandas.DataFrame
    dataset_queries, data_train, data_test = self.transform_data()

    os.makedirs(self.output_file_path)
    os.makedirs(os.path.join(self.output_file_path, 'test'))
    os.makedirs(os.path.join(self.output_file_path, 'train'))
    data_test.to_csv(os.path.join(self.output_file_path, 'test', 'data.csv'), index=False)
    if data_train is not None:
        data_train.to_csv(os.path.join(self.output_file_path, 'train', 'data.csv'), index=False)
    dataset_queries.to_csv(os.path.join(self.output_file_path, 'query.csv'), index=False)
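
Given how save_data writes the files, the resulting layout under the dataset directory is:

format_data/
├── query.csv          (dataset queries)
├── test/
│   └── data.csv       (displayed in the UI)
└── train/
    └── data.csv       (written only when data_train is not None)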

Extending DataReader Class

Here are a few examples of how to extend the DataReader class; a minimal skeleton is sketched first, followed by the concrete readers.
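
As a minimal sketch (the dataset name, file name, and split sizes are hypothetical): a new reader subclasses DataReader and implements transform_data, returning the query dataframe plus the train and test splits that save_data then writes to disk.

import os

import pandas as pd

from src.data_readers.data_reader import DataReader


class DataReaderMyDataset(DataReader):
    def __init__(self, configs):
        super().__init__(configs)

    def transform_data(self):
        """Return (dataframe_query, data_train, data_test)."""
        # Hypothetical file under dataset/my_dataset/data/.
        df = pd.read_csv(os.path.join(self.data_path, 'data', 'my_dataset.csv'))
        df = df.dropna(how='any', axis=0)

        # The concrete readers below build the query dataframe with 'title' and 'text' columns.
        query_df = pd.DataFrame(columns=['title', 'text'])
        query_df['title'] = df[self.query_col]
        query_df['text'] = df[self.query_col]

        # Any split logic works; head/tail mirrors the Amazon and Flickr readers below.
        data_train = df.head(100)
        data_test = df.tail(100)
        return query_df, data_train, data_test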

Data Reader for Amazon dataset

src.data_readers.data_reader_amazon.DataReaderAmazon

Bases: DataReader

Source code in src/data_readers/data_reader_amazon.py
class DataReaderAmazon(DataReader):
    def __init__(self, configs):
        super().__init__(configs)

    def transform_data(self):
        """Transform data into pandas.DataFrame and apply cleaning steps.

        Reads the 'amazon.csv' file from the specified data path, drops rows with missing values,
        and performs data transformations on the columns. Returns the transformed data.

        Returns:
            dataframe_query (pandas.DataFrame): A DataFrame containing the transformed query data.
            data_train (pandas.DataFrame): A DataFrame containing the transformed training data.
            data_test (pandas.DataFrame): A DataFrame containing the transformed testing data.
        """
        amazon_product_df = pd.read_csv(os.path.join(self.data_path, 'data', 'amazon.csv'))
        amazon_product_df = amazon_product_df.dropna(how='any', axis=0)

        # the query dataframe
        query_df = pd.DataFrame(columns=['title', 'text'])
        query_df['title'] = amazon_product_df["amazon_category_and_sub_category"]
        query_df['text'] = amazon_product_df["amazon_category_and_sub_category"].apply(lambda x: x.split(">")[-1])

        amazon_product_df["number_of_reviews_display"] = amazon_product_df["number_of_reviews"].apply(lambda x: str(x) + "reviews")

        # split data into train and test
        data_train = amazon_product_df.head(101)
        data_test = amazon_product_df.tail(410)

        dataframe_query = query_df

        return dataframe_query, data_train, data_test

transform_data()

Transform data into pandas.DataFrame and apply cleaning steps.

Reads the 'amazon.csv' file from the specified data path, drops rows with missing values, and performs data transformations on the columns. Returns the transformed data.

Returns:

    dataframe_query (pandas.DataFrame): A DataFrame containing the transformed query data.
    data_train (pandas.DataFrame): A DataFrame containing the transformed training data.
    data_test (pandas.DataFrame): A DataFrame containing the transformed testing data.

Source code in src/data_readers/data_reader_amazon.py
def transform_data(self):
    """Transform data into pandas.DataFrame and apply cleaning steps.

    Reads the 'amazon.csv' file from the specified data path, drops rows with missing values,
    and performs data transformations on the columns. Returns the transformed data.

    Returns:
        dataframe_query (pandas.DataFrame): A DataFrame containing the transformed query data.
        data_train (pandas.DataFrame): A DataFrame containing the transformed training data.
        data_test (pandas.DataFrame): A DataFrame containing the transformed testing data.
    """
    amazon_product_df = pd.read_csv(os.path.join(self.data_path, 'data', 'amazon.csv'))
    amazon_product_df = amazon_product_df.dropna(how='any', axis=0)

    # the query dataframe
    query_df = pd.DataFrame(columns=['title', 'text'])
    query_df['title'] = amazon_product_df["amazon_category_and_sub_category"]
    query_df['text'] = amazon_product_df["amazon_category_and_sub_category"].apply(lambda x: x.split(">")[-1])

    amazon_product_df["number_of_reviews_display"] = amazon_product_df["number_of_reviews"].apply(lambda x: str(x) + "reviews")

    # split data into train and test
    data_train = amazon_product_df.head(101)
    data_test = amazon_product_df.tail(410)

    dataframe_query = query_df

    return dataframe_query, data_train, data_test

Data Reader for CVs dataset

src.data_readers.data_reader_cvs.DataReaderCvs

Bases: DataReader

Source code in src/data_readers/data_reader_cvs.py
class DataReaderCvs(DataReader):

    def __init__(self, configs):
        super().__init__(configs)

    def transform_data(self):
        """
        Transform data into pandas.DataFrame and apply cleaning steps.

        This method reads and preprocesses data from multiple files and directories.
        It iterates over each occupation directory, reads the query description from a JSON file,
        formats the query as plain text, and appends it to the `dataframes_occupations` list.
        It then lists all JSON files in each occupation directory, reads the candidate data from each file,
        preprocesses the candidate data, and appends it to the `dataframes_candidates` list.
        Finally, it concatenates all the DataFrames into a single DataFrame and returns the result.

        Returns:
            dataframe_occupations (pandas.DataFrame): A DataFrame containing the preprocessed query data.
            data_train (pandas.DataFrame): A DataFrame containing the preprocessed candidate data for training.
            data_test (pandas.DataFrame): A DataFrame containing the preprocessed candidate data for testing.
        """
        occupation_dirs = [dir_name for dir_name in os.listdir(os.path.join(self.data_path, 'data')) if
                           dir_name != 'experiments' and dir_name != 'format_data' and dir_name != 'models']
        dataframes_occupations = []
        dataframes_candidates = []
        for dir_name in occupation_dirs:
            # Read query description and format the query as plain text
            with open(os.path.join(self.data_path, 'data', dir_name, 'description.json'), 'r') as json_file:
                query = json.load(json_file)
            query = pd.json_normalize(query)
            query['text'] = clean_text(dir_name, upper=True) + "\n" + query_to_text(query)
            query['title'] = dir_name
            dataframes_occupations.append(query)

            # List all files in the folder with a .json extension
            json_files = [file for file in os.listdir(os.path.join(self.data_path, 'data', dir_name)) if
                          file.endswith('.json') and file != 'description.json']

            # Iterate over each JSON file
            for json_file in json_files:
                file_path = os.path.join(self.data_path, 'data', dir_name, json_file)

                with open(file_path, 'r') as f:
                    candidate_data = json.load(f)

                candidate_data = pd.json_normalize(candidate_data)
                candidate_data = candidate_to_text(candidate_data)

                candidate_data['query'] = dir_name

                dataframes_candidates.append(candidate_data)

        # Concatenate all DataFrames into a single DataFrame
        data_test = pd.concat(dataframes_candidates, ignore_index=True)
        # Set data_train to be the same as data_test for testing the ranker and fairness intervention on this dataset
        data_train = data_test

        dataframe_occupations = pd.concat(dataframes_occupations, ignore_index=True)

        return dataframe_occupations, data_train, data_test

transform_data()

Transform data into pandas.DataFrame and apply cleaning steps.

This method reads and preprocesses data from multiple files and directories. It iterates over each occupation directory, reads the query description from a JSON file, formats the query as plain text, and appends it to the dataframes_occupations list. It then lists all JSON files in each occupation directory, reads the candidate data from each file, preprocesses the candidate data, and appends it to the dataframes_candidates list. Finally, it concatenates all the DataFrames into a single DataFrame and returns the result.

Returns:

    dataframe_occupations (pandas.DataFrame): A DataFrame containing the preprocessed query data.
    data_train (pandas.DataFrame): A DataFrame containing the preprocessed candidate data for training.
    data_test (pandas.DataFrame): A DataFrame containing the preprocessed candidate data for testing.

Source code in src/data_readers/data_reader_cvs.py
def transform_data(self):
    """
    Transform data into pandas.DataFrame and apply cleaning steps.

    This method reads and preprocesses data from multiple files and directories.
    It iterates over each occupation directory, reads the query description from a JSON file,
    formats the query as plain text, and appends it to the `dataframes_occupations` list.
    It then lists all JSON files in each occupation directory, reads the candidate data from each file,
    preprocesses the candidate data, and appends it to the `dataframes_candidates` list.
    Finally, it concatenates all the DataFrames into a single DataFrame and returns the result.

    Returns:
        dataframe_occupations (pandas.DataFrame): A DataFrame containing the preprocessed query data.
        data_train (pandas.DataFrame): A DataFrame containing the preprocessed candidate data for training.
        data_test (pandas.DataFrame): A DataFrame containing the preprocessed candidate data for testing.
    """
    occupation_dirs = [dir_name for dir_name in os.listdir(os.path.join(self.data_path, 'data')) if
                       dir_name != 'experiments' and dir_name != 'format_data' and dir_name != 'models']
    dataframes_occupations = []
    dataframes_candidates = []
    for dir_name in occupation_dirs:
        # Read query description and format the query as plain text
        with open(os.path.join(self.data_path, 'data', dir_name, 'description.json'), 'r') as json_file:
            query = json.load(json_file)
        query = pd.json_normalize(query)
        query['text'] = clean_text(dir_name, upper=True) + "\n" + query_to_text(query)
        query['title'] = dir_name
        dataframes_occupations.append(query)

        # List all files in the folder with a .json extension
        json_files = [file for file in os.listdir(os.path.join(self.data_path, 'data', dir_name)) if
                      file.endswith('.json') and file != 'description.json']

        # Iterate over each JSON file
        for json_file in json_files:
            file_path = os.path.join(self.data_path, 'data', dir_name, json_file)

            with open(file_path, 'r') as f:
                candidate_data = json.load(f)

            candidate_data = pd.json_normalize(candidate_data)
            candidate_data = candidate_to_text(candidate_data)

            candidate_data['query'] = dir_name

            dataframes_candidates.append(candidate_data)

    # Concatenate all DataFrames into a single DataFrame
    data_test = pd.concat(dataframes_candidates, ignore_index=True)
    # Set data_train to be the same as data_test for testing the ranker and fairness intervention on this dataset
    data_train = data_test

    dataframe_occupations = pd.concat(dataframes_occupations, ignore_index=True)

    return dataframe_occupations, data_train, data_test
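
Per the loops above, the CVs reader expects one directory per occupation under data/, each holding a description.json for the query and one JSON file per candidate. Schematically (all directory and file names below are hypothetical):

dataset/cvs/data/
├── auditor/
│   ├── description.json      (query description)
│   ├── candidate_001.json
│   └── candidate_002.json
└── nurse/
    ├── description.json
    └── candidate_003.json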

Data Reader for Flickr dataset

src.data_readers.data_reader_flickr.DataReaderFlickr

Bases: DataReader

Source code in src/data_readers/data_reader_flickr.py
class DataReaderFlickr(DataReader):

    def __init__(self, configs):
        super().__init__(configs)

    def transform_data(self):
        """
        Transform data into pandas.DataFrame and apply cleaning steps.

        Reads the flickr data from a CSV file, performs data preprocessing, and returns the transformed data.

        Returns:
            tuple: A tuple containing the transformed data.
                - dataframe_query (pandas.DataFrame): A DataFrame containing the transformed query data.
                - data_train (pandas.DataFrame): A DataFrame containing the transformed training data.
                - data_test (pandas.DataFrame): A DataFrame containing the transformed testing data.
        """
        flickr_df = pd.read_csv(os.path.join(self.data_path, 'data', 'flickr.csv'))
        flickr_df = flickr_df.dropna(how='any', axis=0)

        # the query dataframe
        query_df = pd.DataFrame(columns=['title', 'text'])
        query_df['title'] = flickr_df['image']
        query_df['text'] = flickr_df['base64']

        # split data into train and test
        data_train = flickr_df.head(10)
        data_test = flickr_df.tail(20)

        dataframe_query = query_df

        return dataframe_query, data_train, data_test

transform_data()

Transform data into pandas.DataFrame and apply cleaning steps.

Reads the flickr data from a CSV file, performs data preprocessing, and returns the transformed data.

Returns:

    tuple: A tuple containing the transformed data.
        - dataframe_query (pandas.DataFrame): A DataFrame containing the transformed query data.
        - data_train (pandas.DataFrame): A DataFrame containing the transformed training data.
        - data_test (pandas.DataFrame): A DataFrame containing the transformed testing data.

Source code in src/data_readers/data_reader_flickr.py
def transform_data(self):
    """
    Transform data into pandas.DataFrame and apply cleaning steps.

    Reads the flickr data from a CSV file, performs data preprocessing, and returns the transformed data.

    Returns:
        tuple: A tuple containing the transformed data.
            - dataframe_query (pandas.DataFrame): A DataFrame containing the transformed query data.
            - data_train (pandas.DataFrame): A DataFrame containing the transformed training data.
            - data_test (pandas.DataFrame): A DataFrame containing the transformed testing data.
    """
    flickr_df = pd.read_csv(os.path.join(self.data_path, 'data', 'flickr.csv'))
    flickr_df = flickr_df.dropna(how='any', axis=0)

    # the query dataframe
    query_df = pd.DataFrame(columns=['title', 'text'])
    query_df['title'] = flickr_df['image']
    query_df['text'] = flickr_df['base64']

    # split data into train and test
    data_train = flickr_df.head(10)
    data_test = flickr_df.tail(20)

    dataframe_query = query_df

    return dataframe_query, data_train, data_test

Data Reader for Xing dataset

src.data_readers.data_reader_xing.Candidate

Bases: object

Represents a candidate in a set that is passed to a search algorithm. A candidate consists of a qualification and a list of protected attributes (strings). If the list of protected attributes is empty/null, this is a candidate from a non-protected group. Natural ordering is established by the qualification.

Source code in src/data_readers/data_reader_xing.py
class Candidate(object):
    """
    represents a candidate in a set that is passed to a search algorithm
    a candidate consists of a qualification and a list of protected attributes (strings)
    if the list of protected attributes is empty/null, this is a candidate from a non-protected group
    natural ordering is established by the qualification
    """

    def __init__(self, work_experience, edu_experience, hits, qualification, protectedAttributes, member_since, degree):
        """
        @param qualification : describes how qualified the candidate is to match the search query
        @param protectedAttributes: list of strings that represent the protected attributes this
                                    candidate has (e.g. gender, race, etc)
                                    if the list is empty/null this is a candidate from a non-protected group
        """
        self.__qualification = qualification
        self.__protectedAttributes = protectedAttributes
        self.__work_experience = work_experience
        self.__edu_experience = edu_experience
        self.__member_since = member_since
        self.__hits = hits
        self.__degree = degree
        # keeps the candidate's initial qualification for evaluation purposes
        self.__originalQualification = qualification
        self.uuid = uuid.uuid4()

    @property
    def qualification(self):
        return self.__qualification

    @qualification.setter
    def qualification(self, value):
        self.__qualification = value

    @property
    def originalQualification(self):
        return self.__originalQualification

    @originalQualification.setter
    def originalQualification(self, value):
        self.__originalQualification = value

    @property
    def isProtected(self):
        '''
        true if the list of ProtectedAttribute elements actually contains anything
        false otherwise
        '''
        return not self.__protectedAttributes == []

isProtected property

True if the list of protected attributes actually contains anything, False otherwise.

__init__(work_experience, edu_experience, hits, qualification, protectedAttributes, member_since, degree)

@param qualification: describes how qualified the candidate is to match the search query

@param protectedAttributes: list of strings that represent the protected attributes this candidate has (e.g. gender, race, etc.); if the list is empty/null, this is a candidate from a non-protected group

Source code in src/data_readers/data_reader_xing.py
def __init__(self, work_experience, edu_experience, hits, qualification, protectedAttributes, member_since, degree):
    """
    @param qualification : describes how qualified the candidate is to match the search query
    @param protectedAttributes: list of strings that represent the protected attributes this
                                candidate has (e.g. gender, race, etc)
                                if the list is empty/null this is a candidate from a non-protected group
    """
    self.__qualification = qualification
    self.__protectedAttributes = protectedAttributes
    self.__work_experience = work_experience
    self.__edu_experience = edu_experience
    self.__member_since = member_since
    self.__hits = hits
    self.__degree = degree
    # keeps the candidate's initial qualification for evaluation purposes
    self.__originalQualification = qualification
    self.uuid = uuid.uuid4()
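
A small usage sketch with made-up values: the qualification follows the (work_experience + edu_experience) * hits score used in __readFileOfQuery below, and isProtected is simply whether the protected-attribute list is non-empty.

c1 = Candidate(work_experience=24, edu_experience=36, hits=5,
               qualification=(24 + 36) * 5, protectedAttributes=['f'],
               member_since='2010', degree='MSc')
c2 = Candidate(12, 48, 2, (12 + 48) * 2, [], '2012', 'BSc')

print(c1.isProtected)  # True
print(c2.isProtected)  # False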

src.data_readers.data_reader_xing.DataReaderXing

Bases: DataReader

Reads profiles collected from Xing on certain job description queries. Profiles are available in JSON format; they are read into a data frame indexed by the search queries we used to obtain candidate profiles.

The columns consist of arrays of Candidates: the protected ones, the non-protected ones, and one that contains all candidates in the same order as was collected from the Xing website.

                         |           PROTECTED           |            NON-PROTECTED            |        ORIGINAL ORDERING
Administrative Assistant | [protected1, protected2, ...] | [nonProtected1, nonProtected2, ...] | [nonProtected1, protected1, ...]
Auditor                  | [protected3, protected4, ...] | [nonProtected3, nonProtected3, ...] | [protected4, nonProtected3, ...]
...                      | ...                           | ...                                 | ...

The protected attribute of a candidate is their sex. A candidate's sex was manually determined from the profile name. Depending on the dominating sex of a search query result, the other one was set as the protected attribute (e.g. for administrative assistant the protected attribute is male, for auditor it's female).

Source code in src/data_readers/data_reader_xing.py
class DataReaderXing(DataReader):
    """
    reads profiles collected from Xing on certain job description queries
    profiles are available in JSON format
    they are read into a data frame indexed by the search queries we used to obtain candidate profiles

    the columns consist of arrays of Candidates: the protected ones, the non-protected ones and
    one that contains all candidates in the same order as was collected from the Xing website.

                             |          PROTECTED            |            NON-PROTECTED            |       ORIGINAL ORDERING
    Administrative Assistant | [protected1, protected2, ...] | [nonProtected1, nonProtected2, ...] | [nonProtected1, protected1, ...]
    Auditor                  | [protected3, protected4, ...] | [nonProtected3, nonProtected3, ...] | [protected4, nonProtected3, ...]
            ...              |            ...                |               ...                   |             ...


    the protected attribute of a candidate is their sex
    a candidate's sex was manually determined from the profile name
    depending on the dominating sex of a search query result, the other one was set as the protected
    attribute (e.g. for administrative assistant the protected attribute is male, for auditor it's female)
    """

    EDUCATION_OR_JOB_WITH_NO_DATES = 3  # months count if you had a job that has no associated dates
    EDUCATION_OR_JOB_WITH_SAME_YEAR = 6  # months count if you had a job that started and finished in the same year
    EDUCATION_OR_JOB_WITH_UNDEFINED_DATES = 1  # month given that the person entered the job

    def __init__(self, configs):
        super().__init__(configs)

    def transform_data(self):
        """
        Transform data into pandas.DataFrame and apply cleaning steps.

        Reads the XING data from a JSON file, performs data preprocessing, and returns the transformed data.

        Returns:
            tuple: A tuple containing the transformed data.
                - dataframe_query (pandas.DataFrame): A DataFrame containing the transformed query data.
                - data_train (pandas.DataFrame): A DataFrame containing the transformed training data.
                - data_test (pandas.DataFrame): A DataFrame containing the transformed testing data.
        """

        dataset = self.concat_data()

        dataset[self.query_col] = dataset[self.query_col].apply(lambda x: x.title())
        dataset["edu_experience_string"] = dataset["edu_experience"].apply(lambda x: str(x) + (" months"))
        dataset["work_experience_string"] = dataset["work_experience"].apply(lambda x: str(x) + (" months"))
        dataset["member_since_string"] = dataset["member_since"].apply(lambda x: "Member Since: " + str(x))
        dataset["degree_string"] = dataset["degree"].apply(lambda x: "Degree: " + str(x))

        # the query dataframe
        dataset_queries = pd.DataFrame(columns=['title', 'text'])
        dataset_queries['text'] = dataset[self.query_col].unique()
        dataset_queries['title'] = dataset[self.query_col].unique()

        data_train, data_test = self.create_train_test_split(dataset)
        return dataset_queries, data_train, data_test

    def create_train_test_split(self, dataset):
        data_train_list = []
        data_test_list = []
        for query, group in dataset.groupby([self.query_col]):
            data_train = []
            data_test = []
            for gender, gender_group in group.groupby(['gender']):
                if len(gender_group) >= 2:
                    data_train_gr, data_test_gr = train_test_split(gender_group, test_size=0.3)
                    data_train.append(data_train_gr)
                    data_test.append(data_test_gr)
            if len(data_train) == 2:
                data_train_list.append(pd.concat(data_train))
                data_test_list.append(pd.concat(data_test))
        data_train = pd.concat(data_train_list)
        data_test = pd.concat(data_test_list)

        return data_train, data_test

    def concat_data(self):
        entireDataSet = pd.DataFrame(columns=['protected', 'nonProtected', 'originalOrdering'])
        files = glob.glob(os.path.join(self.data_path, 'data', '*.json'))

        df_lists = []
        for filename in files:
            key, protected, nonProtected, origOrder = self.__readFileOfQuery(filename)
            entireDataSet.loc[key] = [protected, nonProtected, origOrder]
            df_temp = pd.DataFrame([o.__dict__ for o in origOrder])
            df_temp['title'] = key
            df_lists.append(df_temp)
        dataset = pd.concat(df_lists)
        new_cols = []
        for col in dataset.columns:
            new_col = col.split('__')[-1]
            if new_col == 'uuid':
                new_col = 'cid'
            if new_col == 'protectedAttributes':
                new_col = self.sensitive_col
            new_cols.append(new_col)
            dataset[new_col] = dataset[col].values
        dataset = dataset[new_cols]
        return dataset

    def dumpDataSet(self, pathToFile):
        with open(pathToFile, 'wb') as handle:
            pickle.dump(self.entireDataSet, handle, protocol=pickle.HIGHEST_PROTOCOL)

    def __readFileOfQuery(self, filename):
        """
        takes one .json file and reads all information, creates candidate objects from this
        information and sorts them into 3 arrays. One contains all protected candidates, one contains
        all non-protected candidates, one contains all candidates in the same order as they appear
        in the json-file

        @param filename: the json's filename

        @return:
            key: the search query string
            protected: array that contains all protected candidates
            nonProtected: array that contains all nonProtected candidates
            originalOrdering: array that contains all candidates in their original ordering

        """
        protected = []
        nonProtected = []
        originalOrdering = []

        currentfile = open(filename)
        data = json.load(currentfile)

        xingSearchQuery = data['category']
        # if the Xing search query results in a gender neutral list,
        # we take female as the protected attribute
        protectedAttribute = 'm' if data['dominantSexXing'] == 'f' else 'f'

        for r in data['profiles']:
            # determine Member since / Hits
            if 'memberSince_Hits' in r['profile'][0]:
                hits_string = r['profile'][0]['memberSince_Hits']
                hits = hits_string.split(' / ')[1]
                member_since = hits_string.split(' / ')[0]
            else:
                hits = 1
                member_since = "unknown"

            work_experience = self.__determineWorkMonths(r)
            edu_experience = self.__determineEduMonths(r)
            if "education" in r['profile'][0]:
                degree = r['profile'][0]['education']['degree']
            else:
                degree = "unknown"
            score = (work_experience + edu_experience) * int(hits)

            if self.__determineIfProtected(r, protectedAttribute):
                protected.append(
                    Candidate(work_experience, edu_experience, hits, score, [protectedAttribute], member_since, degree))
            else:
                nonProtected.append(Candidate(work_experience, edu_experience, hits, score, [], member_since, degree))

            sex = r['profile'][0]['sex']
            originalOrdering.append(Candidate(work_experience, edu_experience, hits, score, sex, member_since, degree))

        protected.sort(key=lambda candidate: candidate.qualification, reverse=True)
        nonProtected.sort(key=lambda candidate: candidate.qualification, reverse=True)

        self.__normalizeQualifications(protected + nonProtected)
        self.__normalizeQualifications(originalOrdering)

        currentfile.close()
        return xingSearchQuery, protected, nonProtected, originalOrdering

    def __normalizeQualifications(self, ranking):
        # find highest qualification of candidate
        qualifications = [ranking[i].qualification for i in range(len(ranking))]
        highest = max(qualifications)
        for candidate in ranking:
            candidate.qualification = candidate.qualification / highest
            candidate.originalQualification = candidate.originalQualification / highest

    def __determineIfProtected(self, r, protAttr):
        """
        takes a JSON profile and finds if the person belongs to the protected group

        Parameter:
        ---------
        r : JSON node
        a person description in JSON, everything below node "profile"

        """

        if 'sex' in r['profile'][0]:
            if r['profile'][0]['sex'] == protAttr:
                return True
            else:
                return False
        else:
            print('>>> undetermined\n')
            return False

    def __determineWorkMonths(self, r):
        """
        takes a person's profile as JSON node and computes the total amount of work months this
        person has

        Parameters:
        ----------
        r : JSON node
        """

        total_working_months = 0  # ..of that profile
        job_duration = 0

        if len(r['profile'][0]) >= 4:  # a job is on the profile
            list_of_Jobs = r['profile'][0]['jobs']
            # print('profile summary' + str(r['profile'][0]['jobs']))
            for count in range(0, len(list_of_Jobs)):
                if len(list_of_Jobs[count]) > 3:  # an exact duration is given at 5 nodes!
                    job_duration_string = list_of_Jobs[count]['jobDates']
                    if job_duration_string == 'bis heute':
                        # print('job with no dates found - will be count for ' + str(job_with_no_dates) + ' months.')
                        job_duration = self.EDUCATION_OR_JOB_WITH_NO_DATES

                    else:
                        job_start_string, job_end_string = job_duration_string.split(' - ')

                        if len(job_start_string) == 4:
                            job_start = datetime.datetime.strptime(job_start_string, "%Y")
                        elif len(job_start_string) == 7:
                            job_start = datetime.datetime.strptime(job_start_string, "%m/%Y")
                        else:
                            print("error reading start date")

                        if len(job_end_string) == 4:
                            job_end = datetime.datetime.strptime(job_end_string, "%Y")
                        elif len(job_end_string) == 7:
                            job_end = datetime.datetime.strptime(job_end_string, "%m/%Y")
                        else:
                            print("error reading end date")

                        if job_end == job_start:
                            # same start and end date: count a flat number of months
                            job_duration = self.EDUCATION_OR_JOB_WITH_SAME_YEAR
                        else:
                            delta = job_end - job_start
                            # convert the timedelta to months (~2629743.83 seconds each)
                            job_duration = math.ceil(delta.total_seconds() / 2629743.83)
                total_working_months += job_duration
        else:
            print('-no jobs on profile-')

        return total_working_months

    def __determineEduMonths(self, r):
        """
        takes a person's profile as JSON node and computes the total amount of education months this
        person has

        Parameters:
        ----------
        r : JSON node
        """

        total_education_months = 0  # ..of that profile
        edu_duration = 0

        if 'education' in r:  # education info is on the profile
            list_of_edu = r['education']  # edu child nodes {institution, url, degree, eduDuration}
            # print('education summary' + str(r['education']))
            for count in range(0, len(list_of_edu)):
                if 'eduDuration' in list_of_edu[count]:  # there are education dates

                    edu_duration_string = list_of_edu[count]['eduDuration']
                    if edu_duration_string in ('bis heute', None, ''):
                        edu_duration = self.EDUCATION_OR_JOB_WITH_NO_DATES
                    else:
                        edu_start_string, edu_end_string = edu_duration_string.split(' - ')

                        if len(edu_start_string) == 4:
                            edu_start = datetime.datetime.strptime(edu_start_string, "%Y")
                        elif len(edu_start_string) == 7:
                            edu_start = datetime.datetime.strptime(edu_start_string, "%m/%Y")
                        else:
                            print("error reading start date")

                        if len(edu_end_string) == 4:
                            edu_end = datetime.datetime.strptime(edu_end_string, "%Y")
                        elif len(edu_end_string) == 7:
                            edu_end = datetime.datetime.strptime(edu_end_string, "%m/%Y")
                        else:
                            print("error reading end date")

                        if edu_end == edu_start:
                            # same start and end date: count a flat number of months
                            edu_duration = self.EDUCATION_OR_JOB_WITH_SAME_YEAR
                        else:
                            delta = edu_end - edu_start
                            # convert the timedelta to months (~2629743.83 seconds each)
                            edu_duration = math.ceil(delta.total_seconds() / 2629743.83)

                        # print(job_duration_string)
                        # print('this job: ' + str(job_duration))

                else:
                    edu_duration = self.EDUCATION_OR_JOB_WITH_NO_DATES

                total_education_months += edu_duration
                # print('total jobs: ' + str(total_working_months))

            # print("studying: " + str(total_education_months))
        else:
            print('-no education on profile-')

        return total_education_months

__determineEduMonths(r)

takes a person's profile as JSON node and computes the total amount of education months this person has

Parameters:

r : JSON node

Source code in src/data_readers/data_reader_xing.py
def __determineEduMonths(self, r):
    """
    takes a person's profile as JSON node and computes the total amount of education months this
    person has

    Parameters:
    ----------
    r : JSON node
    """

    total_education_months = 0  # ..of that profile
    edu_duration = 0

    if 'education' in r:  # education info is on the profile
        list_of_edu = r['education']  # edu child nodes {institution, url, degree, eduDuration}
        # print('education summary' + str(r['education']))
        for count in range(0, len(list_of_edu)):
            if 'eduDuration' in list_of_edu[count]:  # there are education dates

                edu_duration_string = list_of_edu[count]['eduDuration']
                if edu_duration_string in ('bis heute', None, ''):
                    edu_duration = self.EDUCATION_OR_JOB_WITH_NO_DATES
                else:
                    edu_start_string, edu_end_string = edu_duration_string.split(' - ')

                    if len(edu_start_string) == 4:
                        edu_start = datetime.datetime.strptime(edu_start_string, "%Y")
                    elif len(edu_start_string) == 7:
                        edu_start = datetime.datetime.strptime(edu_start_string, "%m/%Y")
                    else:
                        print("error reading start date")

                    if len(edu_end_string) == 4:
                        edu_end = datetime.datetime.strptime(edu_end_string, "%Y")
                    elif len(edu_end_string) == 7:
                        edu_end = datetime.datetime.strptime(edu_end_string, "%m/%Y")
                    else:
                        print("error reading end date")

                    if edu_end == edu_start:
                        # same start and end date: count a flat number of months
                        edu_duration = self.EDUCATION_OR_JOB_WITH_SAME_YEAR
                    else:
                        delta = edu_end - edu_start
                        # convert the timedelta to months (~2629743.83 seconds each)
                        edu_duration = math.ceil(delta.total_seconds() / 2629743.83)

                    # print(job_duration_string)
                    # print('this job: ' + str(job_duration))

            else:
                edu_duration = self.EDUCATION_OR_JOB_WITH_NO_DATES

            total_education_months += edu_duration
            # print('total jobs: ' + str(total_working_months))

        # print("studying: " + str(total_education_months))
    else:
        print('-no education on profile-')

    return total_education_months

__determineIfProtected(r, protAttr)

takes a JSON profile and finds if the person belongs to the protected group

Parameters:

    r : JSON node
        a person description in JSON, everything below the "profile" node

Source code in src/data_readers/data_reader_xing.py
def __determineIfProtected(self, r, protAttr):
    """
    takes a JSON profile and finds if the person belongs to the protected group

    Parameter:
    ---------
    r : JSON node
    a person description in JSON, everything below node "profile"

    """

    if 'sex' in r['profile'][0]:
        if r['profile'][0]['sex'] == protAttr:
            return True
        else:
            return False
    else:
        print('>>> undetermined\n')
        return False

__determineWorkMonths(r)

takes a person's profile as JSON node and computes the total amount of work months this person has

Parameters:

r : JSON node

Source code in src/data_readers/data_reader_xing.py
def __determineWorkMonths(self, r):
    """
    takes a person's profile as JSON node and computes the total amount of work months this
    person has

    Parameters:
    ----------
    r : JSON node
    """

    total_working_months = 0  # ..of that profile
    job_duration = 0

    if len(r['profile'][0]) >= 4:  # a job is on the profile
        list_of_Jobs = r['profile'][0]['jobs']
        # print('profile summary' + str(r['profile'][0]['jobs']))
        for count in range(0, len(list_of_Jobs)):
            if len(list_of_Jobs[count]) > 3:  # an exact duration is given at 5 nodes!
                job_duration_string = list_of_Jobs[count]['jobDates']
                if job_duration_string == 'bis heute':
                    # print('job with no dates found - will be count for ' + str(job_with_no_dates) + ' months.')
                    job_duration = self.EDUCATION_OR_JOB_WITH_NO_DATES

                else:
                    job_start_string, job_end_string = job_duration_string.split(' - ')

                    if len(job_start_string) == 4:
                        job_start = datetime.datetime.strptime(job_start_string, "%Y")
                    elif len(job_start_string) == 7:
                        job_start = datetime.datetime.strptime(job_start_string, "%m/%Y")
                    else:
                        print("error reading start date")

                    if len(job_end_string) == 4:
                        job_end = datetime.datetime.strptime(job_end_string, "%Y")
                    elif len(job_end_string) == 7:
                        job_end = datetime.datetime.strptime(job_end_string, "%m/%Y")
                    else:
                        print("error reading end date")

                    if job_end == job_start:
                        # same start and end date: count a flat number of months
                        job_duration = self.EDUCATION_OR_JOB_WITH_SAME_YEAR
                    else:
                        delta = job_end - job_start
                        # convert the timedelta to months (~2629743.83 seconds each)
                        job_duration = math.ceil(delta.total_seconds() / 2629743.83)
            total_working_months += job_duration
    else:
        print('-no jobs on profile-')

    return total_working_months
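
The duration arithmetic above converts a parsed date range to months by dividing the timedelta's seconds by 2629743.83 (roughly one average month) and rounding up; a standalone sketch with made-up dates:

import datetime
import math

job_start = datetime.datetime.strptime('04/2012', '%m/%Y')
job_end = datetime.datetime.strptime('10/2014', '%m/%Y')
delta = job_end - job_start
months = math.ceil(delta.total_seconds() / 2629743.83)  # ~one average month in seconds
print(months)  # 30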

__readFileOfQuery(filename)

takes one .json file and reads all information, creates candidate objects from this information and sorts them into 3 arrays: one contains all protected candidates, one contains all non-protected candidates, one contains all candidates in the same order as they appear in the json-file.

@param filename: the json's filename

@return:
    key: the search query string
    protected: array that contains all protected candidates
    nonProtected: array that contains all nonProtected candidates
    originalOrdering: array that contains all candidates in their original ordering

Source code in src/data_readers/data_reader_xing.py
def __readFileOfQuery(self, filename):
    """
    takes one .json file and reads all information, creates candidate objects from this
    information and sorts them into 3 arrays. One contains all protected candidates, one contains
    all non-protected candidates, one contains all candidates in the same order as they appear
    in the json-file

    @param filename: the json's filename

    @return:
        key: the search query string
        protected: array that contains all protected candidates
        nonProtected: array that contains all nonProtected candidates
        originalOrdering: array that contains all candidates in their original ordering

    """
    protected = []
    nonProtected = []
    originalOrdering = []

    currentfile = open(filename)
    data = json.load(currentfile)

    xingSearchQuery = data['category']
    # if the Xing search query results in a gender neutral list,
    # we take female as the protected attribute
    protectedAttribute = 'm' if data['dominantSexXing'] == 'f' else 'f'

    for r in data['profiles']:
        # determine Member since / Hits
        if 'memberSince_Hits' in r['profile'][0]:
            hits_string = r['profile'][0]['memberSince_Hits']
            hits = hits_string.split(' / ')[1]
            member_since = hits_string.split(' / ')[0]
        else:
            hits = 1
            member_since = "unknown"

        work_experience = self.__determineWorkMonths(r)
        edu_experience = self.__determineEduMonths(r)
        if "education" in r['profile'][0]:
            degree = r['profile'][0]['education']['degree']
        else:
            degree = "unknown"
        score = (work_experience + edu_experience) * int(hits)

        if self.__determineIfProtected(r, protectedAttribute):
            protected.append(
                Candidate(work_experience, edu_experience, hits, score, [protectedAttribute], member_since, degree))
        else:
            nonProtected.append(Candidate(work_experience, edu_experience, hits, score, [], member_since, degree))

        sex = r['profile'][0]['sex']
        originalOrdering.append(Candidate(work_experience, edu_experience, hits, score, sex, member_since, degree))

    protected.sort(key=lambda candidate: candidate.qualification, reverse=True)
    nonProtected.sort(key=lambda candidate: candidate.qualification, reverse=True)

    self.__normalizeQualifications(protected + nonProtected)
    self.__normalizeQualifications(originalOrdering)

    currentfile.close()
    return xingSearchQuery, protected, nonProtected, originalOrdering

transform_data()

Transform data into pandas.DataFrame and apply cleaning steps.

Reads the XING data from a JSON file, performs data preprocessing, and returns the transformed data.

Returns:

    tuple: A tuple containing the transformed data.
        - dataframe_query (pandas.DataFrame): A DataFrame containing the transformed query data.
        - data_train (pandas.DataFrame): A DataFrame containing the transformed training data.
        - data_test (pandas.DataFrame): A DataFrame containing the transformed testing data.

Source code in src/data_readers/data_reader_xing.py
def transform_data(self):
    """
    Transform data into pandas.DataFrame and apply cleaning steps.

    Reads the XING data from a JSON file, performs data preprocessing, and returns the transformed data.

    Returns:
        tuple: A tuple containing the transformed data.
            - dataframe_query (pandas.DataFrame): A DataFrame containing the transformed query data.
            - data_train (pandas.DataFrame): A DataFrame containing the transformed training data.
            - data_test (pandas.DataFrame): A DataFrame containing the transformed testing data.
    """

    dataset = self.concat_data()

    dataset[self.query_col] = dataset[self.query_col].apply(lambda x: x.title())
    dataset["edu_experience_string"] = dataset["edu_experience"].apply(lambda x: str(x) + (" months"))
    dataset["work_experience_string"] = dataset["work_experience"].apply(lambda x: str(x) + (" months"))
    dataset["member_since_string"] = dataset["member_since"].apply(lambda x: "Member Since: " + str(x))
    dataset["degree_string"] = dataset["degree"].apply(lambda x: "Degree: " + str(x))

    # the query dataframe
    dataset_queries = pd.DataFrame(columns=['title', 'text'])
    dataset_queries['text'] = dataset[self.query_col].unique()
    dataset_queries['title'] = dataset[self.query_col].unique()

    data_train, data_test = self.create_train_test_split(dataset)
    return dataset_queries, data_train, data_test
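
Putting the pieces together, reading the Xing dataset end to end might look like the sketch below. The configs values are hypothetical; they must match the column names produced by concat_data ('title' for the query, 'qualification' for the score, 'gender' for the sensitive column) and assume a dataset/xing directory with data/*.json and experiments/*.json.

configs = {'name': 'xing', 'query': 'title', 'score': 'qualification', 'group': 'gender'}
reader = DataReaderXing(configs)  # transforms and caches the splits on first construction
data_test, queries, experiments_info = reader.read('test')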