
Main class DataReader

src.data_readers.data_reader.DataReader

DataReader class for reading a pre-determined dataset and transforming the data for the UI.

Source code in src/data_readers/data_reader.py
class DataReader():
    """
    DataReader class for reading a pre-determined dataset and transforming the data for the UI.
    """

    def __init__(self, configs):
        """
        Data reader init class.
        Attributes
        ----------
        configs : dict
            configuration dict of the dataset
        name : str
            name of the dataset set in run_apps args
        query_col : str
            name of the query column
        sensitive_col : str
            name of the sensitive column (e.g. gender) used for applying fairness interventions (optional)
        data_path : str
            path to the dataset file
        output_file_path : str
            path to the output file where the transformed dataset will be saved
        """
        self.name = configs["name"]
        self.query_col = configs["query"]
        self.score_col = configs["score"]

        if 'group' in configs:
            self.sensitive_col = configs["group"]
        else:
            self.sensitive_col = None

        self.data_path = os.path.join(
            project_dir, 'dataset/' + self.name)
        self.output_file_path = os.path.join(self.data_path, 'format_data')
        if not os.path.exists(self.output_file_path):
            # save transformed data
            self.save_data()

    def read(self, split):
        """Read dataset file.

        Args:
            split (str): The split of the dataset to read ('test' or 'train').

        Returns:
            If split is 'test':
                tuple: A tuple containing the dataframes of document, query, and experiment lists.
            If split is 'train':
                tuple: A tuple containing the dataframes of document and query.

        Raises:
            FileNotFoundError: If the dataset file or query file is not found.

        """
        dataframe_data = pd.read_csv(os.path.join(self.output_file_path, split, 'data.csv'))
        dataframe_query = pd.read_csv(os.path.join(self.output_file_path, 'query.csv'))

        if split == 'test':
            experiments_files = [file for file in os.listdir(os.path.join(self.data_path, 'experiments')) if
                                    file.endswith('.json')]
            experiments_info = []
            for exp_file in experiments_files:
                with open(os.path.join(self.data_path, 'experiments', exp_file)) as f:
                    exp_info = json.load(f)
                    experiments_info.append(exp_info)

            return dataframe_data, dataframe_query, experiments_info
        else:
            return dataframe_data, dataframe_query

    def save_data(self):
        """Save the transformed data in splits.

        This method creates the necessary directories and saves the transformed data to CSV files.
        The data is saved in the following structure:
        - The main output directory is created at `self.output_file_path`.
        - Inside the main output directory, two subdirectories are created: 'test' and 'train'.
        - The transformed test data is saved as 'data.csv' inside the 'test' subdirectory.
            This will be displayed in the UI.
        - If there is transformed train data available, it is saved as 'data.csv' inside the 'train' subdirectory.
            This will be used for training the ranker or fairness intervention.
        - The dataset queries are saved as 'query.csv' inside the main output directory.

        Note: The method assumes that the necessary data has already been transformed and is available.

        Returns:
            None
        """

        # transform dataset into a pandas.DataFrame
        dataset_queries, data_train, data_test = self.transform_data()

        os.makedirs(self.output_file_path)
        os.makedirs(os.path.join(self.output_file_path, 'test'))
        os.makedirs(os.path.join(self.output_file_path, 'train'))
        data_test.to_csv(os.path.join(self.output_file_path, 'test', 'data.csv'), index=False)
        if data_train is not None:
            data_train.to_csv(os.path.join(self.output_file_path, 'train', 'data.csv'), index=False)
        dataset_queries.to_csv(os.path.join(self.output_file_path, 'query.csv'), index=False)

__init__(configs)

Initialize the data reader.

Attributes:

    configs (dict): configuration dict of the dataset
    name (str): name of the dataset, set in the run_apps args
    query_col (str): name of the query column
    score_col (str): name of the score column
    sensitive_col (str): name of the sensitive column (e.g. gender) used for applying fairness interventions (optional)
    data_path (str): path to the dataset directory
    output_file_path (str): path to the output directory where the transformed dataset is saved

Source code in src/data_readers/data_reader.py
def __init__(self, configs):
    """
    Data reader init class.
    Attributes
    ----------
    configs : dict
        configuration dict of the dataset
    name : str
        name of the dataset set in run_apps args
    query_col : str
        name of the query column
    sensitive_col : str
        name of the sensitive column (e.g. gender) used for applying fairness interventions (optional)
    data_path : str
        path to the dataset file
    output_file_path : str
        path to the output file where the transformed dataset will be saved
    """
    self.name = configs["name"]
    self.query_col = configs["query"]
    self.score_col = configs["score"]

    if 'group' in configs:
        self.sensitive_col = configs["group"]
    else:
        self.sensitive_col = None

    self.data_path = os.path.join(
        project_dir, 'dataset/' + self.name)
    self.output_file_path = os.path.join(self.data_path, 'format_data')
    if not os.path.exists(self.output_file_path):
        # save transformed data
        self.save_data()
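
For orientation, constructing a reader might look like the sketch below. The keys ("name", "query", "score", and the optional "group") are exactly what __init__ reads from configs; the values and the subclass name are hypothetical. DataReader itself does not implement transform_data, so in practice one of the subclasses shown further down is instantiated.

# A minimal sketch, assuming a dataset directory dataset/my_dataset exists.
# All values are hypothetical; only the keys come from __init__ above.
configs = {
    "name": "my_dataset",   # dataset directory under dataset/
    "query": "query",       # query column
    "score": "score",       # score column
    "group": "gender",      # optional sensitive column
}

reader = DataReaderMyDataset(configs)  # hypothetical DataReader subclass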

read(split)

Read dataset file.

Parameters:

    split (str): The split of the dataset to read ('test' or 'train'). Required.

Returns:

    If split is 'test':
        tuple: A tuple containing the dataframes of document, query, and the experiment list.
    If split is 'train':
        tuple: A tuple containing the dataframes of document and query.

Raises:

    FileNotFoundError: If the dataset file or query file is not found.

Source code in src/data_readers/data_reader.py
def read(self, split):
    """Read dataset file.

    Args:
        split (str): The split of the dataset to read ('test' or 'train').

    Returns:
        If split is 'test':
            tuple: A tuple containing the dataframes of document, query, and experiment lists.
        If split is 'train':
            tuple: A tuple containing the dataframes of document and query.

    Raises:
        FileNotFoundError: If the dataset file or query file is not found.

    """
    dataframe_data = pd.read_csv(os.path.join(self.output_file_path, split, 'data.csv'))
    dataframe_query = pd.read_csv(os.path.join(self.output_file_path, 'query.csv'))

    if split == 'test':
        experiments_files = [file for file in os.listdir(os.path.join(self.data_path, 'experiments')) if
                                file.endswith('.json')]
        experiments_info = []
        for exp_file in experiments_files:
            with open(os.path.join(self.data_path, 'experiments', exp_file)) as f:
                exp_info = json.load(f)
                experiments_info.append(exp_info)

        return dataframe_data, dataframe_query, experiments_info
    else:
        return dataframe_data, dataframe_query
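
A usage sketch, assuming a reader instance as above: the 'train' split returns two dataframes, while 'test' additionally returns the experiment configurations read from the dataset's experiments directory.

data_train, queries = reader.read('train')
data_test, queries, experiments_info = reader.read('test')
print(len(data_test), 'test rows,', len(experiments_info), 'experiment files')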

save_data()

Save the transformed data in splits.

This method creates the necessary directories and saves the transformed data to CSV files. The data is saved in the following structure:

- The main output directory is created at self.output_file_path.
- Inside the main output directory, two subdirectories are created: 'test' and 'train'.
- The transformed test data is saved as 'data.csv' inside the 'test' subdirectory. This is displayed in the UI.
- If transformed train data is available, it is saved as 'data.csv' inside the 'train' subdirectory. This is used for training the ranker or fairness intervention.
- The dataset queries are saved as 'query.csv' inside the main output directory.

Note: The method assumes that the necessary data has already been transformed and is available.

Returns:

    None

Source code in src/data_readers/data_reader.py
def save_data(self):
    """Save the transformed data in splits.

    This method creates the necessary directories and saves the transformed data to CSV files.
    The data is saved in the following structure:
    - The main output directory is created at `self.output_file_path`.
    - Inside the main output directory, two subdirectories are created: 'test' and 'train'.
    - The transformed test data is saved as 'data.csv' inside the 'test' subdirectory.
        This will be displayed in the UI.
    - If there is transformed train data available, it is saved as 'data.csv' inside the 'train' subdirectory.
        This will be used for training the ranker or fairness intervention.
    - The dataset queries are saved as 'query.csv' inside the main output directory.

    Note: The method assumes that the necessary data has already been transformed and is available.

    Returns:
        None
    """

    # transform dataset into a pandas.DataFrame
    dataset_queries, data_train, data_test = self.transform_data()

    os.makedirs(self.output_file_path)
    os.makedirs(os.path.join(self.output_file_path, 'test'))
    os.makedirs(os.path.join(self.output_file_path, 'train'))
    data_test.to_csv(os.path.join(self.output_file_path, 'test', 'data.csv'), index=False)
    if data_train is not None:
        data_train.to_csv(os.path.join(self.output_file_path, 'train', 'data.csv'), index=False)
    dataset_queries.to_csv(os.path.join(self.output_file_path, 'query.csv'), index=False)
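
Given how save_data writes the files, the resulting layout under the dataset directory is:

format_data/
├── query.csv          (dataset queries)
├── test/
│   └── data.csv       (displayed in the UI)
└── train/
    └── data.csv       (written only when data_train is not None)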

Extending DataReader Class

Here are a few examples of how to extend the DataReader class; a minimal skeleton is sketched first, followed by the concrete readers.
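
As a minimal sketch (the dataset name, file name, and split sizes are hypothetical): a new reader subclasses DataReader and implements transform_data, returning the query dataframe plus the train and test splits that save_data then writes to disk.

import os

import pandas as pd

from src.data_readers.data_reader import DataReader


class DataReaderMyDataset(DataReader):
    def __init__(self, configs):
        super().__init__(configs)

    def transform_data(self):
        """Return (dataframe_query, data_train, data_test)."""
        # Hypothetical file under dataset/my_dataset/data/.
        df = pd.read_csv(os.path.join(self.data_path, 'data', 'my_dataset.csv'))
        df = df.dropna(how='any', axis=0)

        # The concrete readers below build the query dataframe with 'title' and 'text' columns.
        query_df = pd.DataFrame(columns=['title', 'text'])
        query_df['title'] = df[self.query_col]
        query_df['text'] = df[self.query_col]

        # Any split logic works; head/tail mirrors the Amazon and Flickr readers below.
        data_train = df.head(100)
        data_test = df.tail(100)
        return query_df, data_train, data_test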

Data Reader for Amazon dataset

src.data_readers.data_reader_amazon.DataReaderAmazon

Bases: DataReader

Source code in src/data_readers/data_reader_amazon.py
class DataReaderAmazon(DataReader):
    def __init__(self, configs):
        super().__init__(configs)

    def transform_data(self):
        """Transform data into pandas.DataFrame and apply cleaning steps.

        Reads the 'amazon.csv' file from the specified data path, drops rows with missing values,
        and performs data transformations on the columns. Returns the transformed data.

        Returns:
            dataframe_query (pandas.DataFrame): A DataFrame containing the transformed query data.
            data_train (pandas.DataFrame): A DataFrame containing the transformed training data.
            data_test (pandas.DataFrame): A DataFrame containing the transformed testing data.
        """
        amazon_product_df = pd.read_csv(os.path.join(self.data_path, 'data', 'amazon.csv'))
        amazon_product_df = amazon_product_df.dropna(how='any', axis=0)

        # the query dataframe
        query_df = pd.DataFrame(columns=['title', 'text'])
        query_df['title'] = amazon_product_df["amazon_category_and_sub_category"]
        query_df['text'] = amazon_product_df["amazon_category_and_sub_category"].apply(lambda x: x.split(">")[-1])

        amazon_product_df["number_of_reviews_display"] = amazon_product_df["number_of_reviews"].apply(lambda x: str(x) + "reviews")

        # split data into train and test
        data_train = amazon_product_df.head(101)
        data_test = amazon_product_df.tail(410)

        dataframe_query = query_df

        return dataframe_query, data_train, data_test

transform_data()

Transform data into pandas.DataFrame and apply cleaning steps.

Reads the 'amazon.csv' file from the specified data path, drops rows with missing values, and performs data transformations on the columns. Returns the transformed data.

Returns:

    dataframe_query (pandas.DataFrame): A DataFrame containing the transformed query data.
    data_train (pandas.DataFrame): A DataFrame containing the transformed training data.
    data_test (pandas.DataFrame): A DataFrame containing the transformed testing data.

Source code in src/data_readers/data_reader_amazon.py
def transform_data(self):
    """Transform data into pandas.DataFrame and apply cleaning steps.

    Reads the 'amazon.csv' file from the specified data path, drops rows with missing values,
    and performs data transformations on the columns. Returns the transformed data.

    Returns:
        dataframe_query (pandas.DataFrame): A DataFrame containing the transformed query data.
        data_train (pandas.DataFrame): A DataFrame containing the transformed training data.
        data_test (pandas.DataFrame): A DataFrame containing the transformed testing data.
    """
    amazon_product_df = pd.read_csv(os.path.join(self.data_path, 'data', 'amazon.csv'))
    amazon_product_df = amazon_product_df.dropna(how='any', axis=0)

    # the query dataframe
    query_df = pd.DataFrame(columns=['title', 'text'])
    query_df['title'] = amazon_product_df["amazon_category_and_sub_category"]
    query_df['text'] = amazon_product_df["amazon_category_and_sub_category"].apply(lambda x: x.split(">")[-1])

    amazon_product_df["number_of_reviews_display"] = amazon_product_df["number_of_reviews"].apply(lambda x: str(x) + "reviews")

    # split data into train and test
    data_train = amazon_product_df.head(101)
    data_test = amazon_product_df.tail(410)

    dataframe_query = query_df

    return dataframe_query, data_train, data_test

Data Reader for CVs dataset

src.data_readers.data_reader_cvs.DataReaderCvs

Bases: DataReader

Source code in src/data_readers/data_reader_cvs.py
class DataReaderCvs(DataReader):

    def __init__(self, configs):
        super().__init__(configs)

    def transform_data(self):
        """
        Transform data into pandas.DataFrame and apply cleaning steps.

        This method reads and preprocesses data from multiple files and directories.
        It iterates over each occupation directory, reads the query description from a JSON file,
        formats the query as plain text, and appends it to the `dataframes_occupations` list.
        It then lists all JSON files in each occupation directory, reads the candidate data from each file,
        preprocesses the candidate data, and appends it to the `dataframes_candidates` list.
        Finally, it concatenates all the DataFrames into a single DataFrame and returns the result.

        Returns:
            dataframe_occupations (pandas.DataFrame): A DataFrame containing the preprocessed query data.
            data_train (pandas.DataFrame): A DataFrame containing the preprocessed candidate data for training.
            data_test (pandas.DataFrame): A DataFrame containing the preprocessed candidate data for testing.
        """
        occupation_dirs = [dir_name for dir_name in os.listdir(os.path.join(self.data_path, 'data')) if
                           dir_name != 'experiments' and dir_name != 'format_data' and dir_name != 'models']
        dataframes_occupations = []
        dataframes_candidates = []
        for dir_name in occupation_dirs:
            # Read query description and format the query as plain text
            with open(os.path.join(self.data_path, 'data', dir_name, 'description.json'), 'r') as json_file:
                query = json.load(json_file)
            query = pd.json_normalize(query)
            query['text'] = clean_text(dir_name, upper=True) + "\n" + query_to_text(query)
            query['title'] = dir_name
            dataframes_occupations.append(query)

            # List all files in the folder with a .json extension
            json_files = [file for file in os.listdir(os.path.join(self.data_path, 'data', dir_name)) if
                          file.endswith('.json') and file != 'description.json']

            # Iterate over each JSON file
            for json_file in json_files:
                file_path = os.path.join(self.data_path, 'data', dir_name, json_file)

                with open(file_path, 'r') as f:
                    candidate_data = json.load(f)

                candidate_data = pd.json_normalize(candidate_data)
                candidate_data = candidate_to_text(candidate_data)

                candidate_data['query'] = dir_name

                dataframes_candidates.append(candidate_data)

        # Concatenate all DataFrames into a single DataFrame
        data_test = pd.concat(dataframes_candidates, ignore_index=True)
        # Set data_train to be the same as data_test for testing the ranker and fairness intervention on this dataset
        data_train = data_test

        dataframe_occupations = pd.concat(dataframes_occupations, ignore_index=True)

        return dataframe_occupations, data_train, data_test

transform_data()

Transform data into pandas.DataFrame and apply cleaning steps.

This method reads and preprocesses data from multiple files and directories. It iterates over each occupation directory, reads the query description from a JSON file, formats the query as plain text, and appends it to the dataframes_occupations list. It then lists all JSON files in each occupation directory, reads the candidate data from each file, preprocesses the candidate data, and appends it to the dataframes_candidates list. Finally, it concatenates all the DataFrames into a single DataFrame and returns the result.

Returns:

    dataframe_occupations (pandas.DataFrame): A DataFrame containing the preprocessed query data.
    data_train (pandas.DataFrame): A DataFrame containing the preprocessed candidate data for training.
    data_test (pandas.DataFrame): A DataFrame containing the preprocessed candidate data for testing.

Source code in src/data_readers/data_reader_cvs.py
def transform_data(self):
    """
    Transform data into pandas.DataFrame and apply cleaning steps.

    This method reads and preprocesses data from multiple files and directories.
    It iterates over each occupation directory, reads the query description from a JSON file,
    formats the query as plain text, and appends it to the `dataframes_occupations` list.
    It then lists all JSON files in each occupation directory, reads the candidate data from each file,
    preprocesses the candidate data, and appends it to the `dataframes_candidates` list.
    Finally, it concatenates all the DataFrames into a single DataFrame and returns the result.

    Returns:
        dataframe_occupations (pandas.DataFrame): A DataFrame containing the preprocessed query data.
        data_train (pandas.DataFrame): A DataFrame containing the preprocessed candidate data for training.
        data_test (pandas.DataFrame): A DataFrame containing the preprocessed candidate data for testing.
    """
    occupation_dirs = [dir_name for dir_name in os.listdir(os.path.join(self.data_path, 'data')) if
                       dir_name != 'experiments' and dir_name != 'format_data' and dir_name != 'models']
    dataframes_occupations = []
    dataframes_candidates = []
    for dir_name in occupation_dirs:
        # Read query description and format the query as plain text
        with open(os.path.join(self.data_path, 'data', dir_name, 'description.json'), 'r') as json_file:
            query = json.load(json_file)
        query = pd.json_normalize(query)
        query['text'] = clean_text(dir_name, upper=True) + "\n" + query_to_text(query)
        query['title'] = dir_name
        dataframes_occupations.append(query)

        # List all files in the folder with a .json extension
        json_files = [file for file in os.listdir(os.path.join(self.data_path, 'data', dir_name)) if
                      file.endswith('.json') and file != 'description.json']

        # Iterate over each JSON file
        for json_file in json_files:
            file_path = os.path.join(self.data_path, 'data', dir_name, json_file)

            with open(file_path, 'r') as f:
                candidate_data = json.load(f)

            candidate_data = pd.json_normalize(candidate_data)
            candidate_data = candidate_to_text(candidate_data)

            candidate_data['query'] = dir_name

            dataframes_candidates.append(candidate_data)

    # Concatenate all DataFrames into a single DataFrame
    data_test = pd.concat(dataframes_candidates, ignore_index=True)
    # Set data_train to be the same as data_test for testing the ranker and fairness intervention on this dataset
    data_train = data_test

    dataframe_occupations = pd.concat(dataframes_occupations, ignore_index=True)

    return dataframe_occupations, data_train, data_test
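
Per the loops above, the CVs reader expects one directory per occupation under data/, each holding a description.json for the query and one JSON file per candidate. Schematically (all directory and file names below are hypothetical):

dataset/cvs/data/
├── auditor/
│   ├── description.json      (query description)
│   ├── candidate_001.json
│   └── candidate_002.json
└── nurse/
    ├── description.json
    └── candidate_003.json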

Data Reader for Flickr dataset

src.data_readers.data_reader_flickr.DataReaderFlickr

Bases: DataReader

Source code in src/data_readers/data_reader_flickr.py
class DataReaderFlickr(DataReader):

    def __init__(self, configs):
        super().__init__(configs)

    def transform_data(self):
        """
        Transform data into pandas.DataFrame and apply cleaning steps.

        Reads the flickr data from a CSV file, performs data preprocessing, and returns the transformed data.

        Returns:
            tuple: A tuple containing the transformed data.
                - dataframe_query (pandas.DataFrame): A DataFrame containing the transformed query data.
                - data_train (pandas.DataFrame): A DataFrame containing the transformed training data.
                - data_test (pandas.DataFrame): A DataFrame containing the transformed testing data.
        """
        flickr_df = pd.read_csv(os.path.join(self.data_path, 'data', 'flickr.csv'))
        flickr_df = flickr_df.dropna(how='any', axis=0)

        # the query dataframe
        query_df = pd.DataFrame(columns=['title', 'text'])
        query_df['title'] = flickr_df['image']
        query_df['text'] = flickr_df['base64']

        # split data into train and test
        data_train = flickr_df.head(10)
        data_test = flickr_df.tail(20)

        dataframe_query = query_df

        return dataframe_query, data_train, data_test

transform_data()

Transform data into pandas.DataFrame and apply cleaning steps.

Reads the flickr data from a CSV file, performs data preprocessing, and returns the transformed data.

Returns:

    tuple: A tuple containing the transformed data.
        - dataframe_query (pandas.DataFrame): A DataFrame containing the transformed query data.
        - data_train (pandas.DataFrame): A DataFrame containing the transformed training data.
        - data_test (pandas.DataFrame): A DataFrame containing the transformed testing data.

Source code in src/data_readers/data_reader_flickr.py
def transform_data(self):
    """
    Transform data into pandas.DataFrame and apply cleaning steps.

    Reads the flickr data from a CSV file, performs data preprocessing, and returns the transformed data.

    Returns:
        tuple: A tuple containing the transformed data.
            - dataframe_query (pandas.DataFrame): A DataFrame containing the transformed query data.
            - data_train (pandas.DataFrame): A DataFrame containing the transformed training data.
            - data_test (pandas.DataFrame): A DataFrame containing the transformed testing data.
    """
    flickr_df = pd.read_csv(os.path.join(self.data_path, 'data', 'flickr.csv'))
    flickr_df = flickr_df.dropna(how='any', axis=0)

    # the query dataframe
    query_df = pd.DataFrame(columns=['title', 'text'])
    query_df['title'] = flickr_df['image']
    query_df['text'] = flickr_df['base64']

    # split data into train and test
    data_train = flickr_df.head(10)
    data_test = flickr_df.tail(20)

    dataframe_query = query_df

    return dataframe_query, data_train, data_test

Data Reader for Xing dataset

src.data_readers.data_reader_xing.Candidate

Bases: object

Represents a candidate in a set that is passed to a search algorithm. A candidate consists of a qualification and a list of protected attributes (strings). If the list of protected attributes is empty/null, this is a candidate from a non-protected group. Natural ordering is established by the qualification.

Source code in src/data_readers/data_reader_xing.py
class Candidate(object):
    """
    represents a candidate in a set that is passed to a search algorithm
    a candidate consists of a qualification and a list of protected attributes (strings)
    if the list of protected attributes is empty/null, this is a candidate from a non-protected group
    natural ordering is established by the qualification
    """

    def __init__(self, work_experience, edu_experience, hits, qualification, protectedAttributes, member_since, degree):
        """
        @param qualification : describes how qualified the candidate is to match the search query
        @param protectedAttributes: list of strings that represent the protected attributes this
                                    candidate has (e.g. gender, race, etc)
                                    if the list is empty/null this is a candidate from a non-protected group
        """
        self.__qualification = qualification
        self.__protectedAttributes = protectedAttributes
        self.__work_experience = work_experience
        self.__edu_experience = edu_experience
        self.__member_since = member_since
        self.__hits = hits
        self.__degree = degree
        # keeps the candidate's initial qualification for evaluation purposes
        self.__originalQualification = qualification
        self.uuid = uuid.uuid4()

    @property
    def qualification(self):
        return self.__qualification

    @qualification.setter
    def qualification(self, value):
        self.__qualification = value

    @property
    def originalQualification(self):
        return self.__originalQualification

    @originalQualification.setter
    def originalQualification(self, value):
        self.__originalQualification = value

    @property
    def isProtected(self):
        '''
        true if the list of ProtectedAttribute elements actually contains anything
        false otherwise
        '''
        return not self.__protectedAttributes == []

isProtected property

True if the list of protected attributes actually contains anything, False otherwise.

__init__(work_experience, edu_experience, hits, qualification, protectedAttributes, member_since, degree)

@param qualification: describes how qualified the candidate is to match the search query

@param protectedAttributes: list of strings that represent the protected attributes this candidate has (e.g. gender, race, etc.); if the list is empty/null, this is a candidate from a non-protected group

Source code in src/data_readers/data_reader_xing.py
def __init__(self, work_experience, edu_experience, hits, qualification, protectedAttributes, member_since, degree):
    """
    @param qualification : describes how qualified the candidate is to match the search query
    @param protectedAttributes: list of strings that represent the protected attributes this
                                candidate has (e.g. gender, race, etc)
                                if the list is empty/null this is a candidate from a non-protected group
    """
    self.__qualification = qualification
    self.__protectedAttributes = protectedAttributes
    self.__work_experience = work_experience
    self.__edu_experience = edu_experience
    self.__member_since = member_since
    self.__hits = hits
    self.__degree = degree
    # keeps the candidate's initial qualification for evaluation purposes
    self.__originalQualification = qualification
    self.uuid = uuid.uuid4()
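
A small usage sketch with made-up values: the qualification follows the (work_experience + edu_experience) * hits score used in __readFileOfQuery below, and isProtected is simply whether the protected-attribute list is non-empty.

c1 = Candidate(work_experience=24, edu_experience=36, hits=5,
               qualification=(24 + 36) * 5, protectedAttributes=['f'],
               member_since='2010', degree='MSc')
c2 = Candidate(12, 48, 2, (12 + 48) * 2, [], '2012', 'BSc')

print(c1.isProtected)  # True
print(c2.isProtected)  # False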

src.data_readers.data_reader_xing.DataReaderXing

Bases: DataReader

Reads profiles collected from Xing on certain job description queries. Profiles are available in JSON format; they are read into a data frame indexed by the search queries we used to obtain candidate profiles.

The columns consist of arrays of Candidates: the protected ones, the non-protected ones, and one that contains all candidates in the same order as was collected from the Xing website.

                         |           PROTECTED           |            NON-PROTECTED            |        ORIGINAL ORDERING
Administrative Assistant | [protected1, protected2, ...] | [nonProtected1, nonProtected2, ...] | [nonProtected1, protected1, ...]
Auditor                  | [protected3, protected4, ...] | [nonProtected3, nonProtected3, ...] | [protected4, nonProtected3, ...]
...                      | ...                           | ...                                 | ...

The protected attribute of a candidate is their sex. A candidate's sex was manually determined from the profile name. Depending on the dominating sex of a search query result, the other one was set as the protected attribute (e.g. for administrative assistant the protected attribute is male, for auditor it's female).

Source code in src/data_readers/data_reader_xing.py
class DataReaderXing(DataReader):
    """
    reads profiles collected from Xing on certain job description queries
    profiles are available in JSON format
    they are read into a data frame indexed by the search queries we used to obtain candidate profiles

    the columns consist of arrays of Candidates: the protected ones, the non-protected ones and
    one that contains all candidates in the same order as was collected from the Xing website.

                             |          PROTECTED            |            NON-PROTECTED            |       ORIGINAL ORDERING
    Administrative Assistant | [protected1, protected2, ...] | [nonProtected1, nonProtected2, ...] | [nonProtected1, protected1, ...]
    Auditor                  | [protected3, protected4, ...] | [nonProtected3, nonProtected3, ...] | [protected4, nonProtected3, ...]
            ...              |            ...                |               ...                   |             ...


    the protected attribute of a candidate is their sex
    a candidate's sex was manually determined from the profile name
    depending on the dominating sex of a search query result, the other one was set as the protected
    attribute (e.g. for administrative assistant the protected attribute is male, for auditor it's female)
    """

    EDUCATION_OR_JOB_WITH_NO_DATES = 3  # months count if you had a job that has no associated dates
    EDUCATION_OR_JOB_WITH_SAME_YEAR = 6  # months count if you had a job that started and finished in the same year
    EDUCATION_OR_JOB_WITH_UNDEFINED_DATES = 1  # month given that the person entered the job

    def __init__(self, configs):
        super().__init__(configs)

    def transform_data(self):
        """
        Transform data into pandas.DataFrame and apply cleaning steps.

        Reads the XING data from a JSON file, performs data preprocessing, and returns the transformed data.

        Returns:
            tuple: A tuple containing the transformed data.
                - dataframe_query (pandas.DataFrame): A DataFrame containing the transformed query data.
                - data_train (pandas.DataFrame): A DataFrame containing the transformed training data.
                - data_test (pandas.DataFrame): A DataFrame containing the transformed testing data.
        """

        dataset = self.concat_data()

        dataset[self.query_col] = dataset[self.query_col].apply(lambda x: x.title())
        dataset["edu_experience_string"] = dataset["edu_experience"].apply(lambda x: str(x) + (" months"))
        dataset["work_experience_string"] = dataset["work_experience"].apply(lambda x: str(x) + (" months"))
        dataset["member_since_string"] = dataset["member_since"].apply(lambda x: "Member Since: " + str(x))
        dataset["degree_string"] = dataset["degree"].apply(lambda x: "Degree: " + str(x))

        # the query dataframe
        dataset_queries = pd.DataFrame(columns=['title', 'text'])
        dataset_queries['text'] = dataset[self.query_col].unique()
        dataset_queries['title'] = dataset[self.query_col].unique()

        data_train, data_test = self.create_train_test_split(dataset)
        return dataset_queries, data_train, data_test

    def create_train_test_split(self, dataset):
        data_train_list = []
        data_test_list = []
        for query, group in dataset.groupby([self.query_col]):
            data_train = []
            data_test = []
            for gender, gender_group in group.groupby(['gender']):
                if len(gender_group) >= 2:
                    data_train_gr, data_test_gr = train_test_split(gender_group, test_size=0.3)
                    data_train.append(data_train_gr)
                    data_test.append(data_test_gr)
            if len(data_train) == 2:
                data_train_list.append(pd.concat(data_train))
                data_test_list.append(pd.concat(data_test))
        data_train = pd.concat(data_train_list)
        data_test = pd.concat(data_test_list)

        return data_train, data_test

    def concat_data(self):
        entireDataSet = pd.DataFrame(columns=['protected', 'nonProtected', 'originalOrdering'])
        files = glob.glob(os.path.join(self.data_path, 'data', '*.json'))

        df_lists = []
        for filename in files:
            key, protected, nonProtected, origOrder = self.__readFileOfQuery(filename)
            entireDataSet.loc[key] = [protected, nonProtected, origOrder]
            df_temp = pd.DataFrame([o.__dict__ for o in origOrder])
            df_temp['title'] = key
            df_lists.append(df_temp)
        dataset = pd.concat(df_lists)
        new_cols = []
        for col in dataset.columns:
            new_col = col.split('__')[-1]
            if new_col == 'uuid':
                new_col = 'cid'
            if new_col == 'protectedAttributes':
                new_col = self.sensitive_col
            new_cols.append(new_col)
            dataset[new_col] = dataset[col].values
        dataset = dataset[new_cols]
        return dataset

    def dumpDataSet(self, pathToFile):
        with open(pathToFile, 'wb') as handle:
            pickle.dump(self.entireDataSet, handle, protocol=pickle.HIGHEST_PROTOCOL)

    def __readFileOfQuery(self, filename):
        """
        takes one .json file and reads all information, creates candidate objects from this
        information and sorts them into 3 arrays. One contains all protected candidates, one contains
        all non-protected candidates, one contains all candidates in the same order as they appear
        in the json-file

        @param filename: the json's filename

        @return:
            key: the search query string
            protected: array that contains all protected candidates
            nonProtected: array that contains all nonProtected candidates
            originalOrdering: array that contains all candidates in their original ordering

        """
        protected = []
        nonProtected = []
        originalOrdering = []

        currentfile = open(filename)
        data = json.load(currentfile)

        xingSearchQuery = data['category']
        # if the Xing search query results in a gender neutral list,
        # we take female as the protected attribute
        protectedAttribute = 'm' if data['dominantSexXing'] == 'f' else 'f'

        for r in data['profiles']:
            # determine Member since / Hits
            if 'memberSince_Hits' in r['profile'][0]:
                hits_string = r['profile'][0]['memberSince_Hits']
                hits = hits_string.split(' / ')[1]
                member_since = hits_string.split(' / ')[0]
            else:
                hits = 1
                member_since = "unknown"

            work_experience = self.__determineWorkMonths(r)
            edu_experience = self.__determineEduMonths(r)
            if "education" in r['profile'][0]:
                degree = r['profile'][0]['education']['degree']
            else:
                degree = "unknown"
            score = (work_experience + edu_experience) * int(hits)

            if self.__determineIfProtected(r, protectedAttribute):
                protected.append(
                    Candidate(work_experience, edu_experience, hits, score, [protectedAttribute], member_since, degree))
            else:
                nonProtected.append(Candidate(work_experience, edu_experience, hits, score, [], member_since, degree))

            sex = r['profile'][0]['sex']
            originalOrdering.append(Candidate(work_experience, edu_experience, hits, score, sex, member_since, degree))

        protected.sort(key=lambda candidate: candidate.qualification, reverse=True)
        nonProtected.sort(key=lambda candidate: candidate.qualification, reverse=True)

        self.__normalizeQualifications(protected + nonProtected)
        self.__normalizeQualifications(originalOrdering)

        currentfile.close()
        return xingSearchQuery, protected, nonProtected, originalOrdering

    def __normalizeQualifications(self, ranking):
        # find highest qualification of candidate
        qualifications = [ranking[i].qualification for i in range(len(ranking))]
        highest = max(qualifications)
        for candidate in ranking:
            candidate.qualification = candidate.qualification / highest
            candidate.originalQualification = candidate.originalQualification / highest

    def __determineIfProtected(self, r, protAttr):
        """
        takes a JSON profile and finds if the person belongs to the protected group

        Parameter:
        ---------
        r : JSON node
        a person description in JSON, everything below node "profile"

        """

        if 'sex' in r['profile'][0]:
            if r['profile'][0]['sex'] == protAttr:
                return True
            else:
                return False
        else:
            print('>>> undetermined\n')
            return False

    def __determineWorkMonths(self, r):
        """
        takes a person's profile as JSON node and computes the total amount of work months this
        person has

        Parameters:
        ----------
        r : JSON node
        """

        total_working_months = 0  # ..of that profile
        job_duration = 0

        if len(r['profile'][0]) >= 4:  # a job is on the profile
            list_of_Jobs = r['profile'][0]['jobs']
            # print('profile summary' + str(r['profile'][0]['jobs']))
            for count in range(0, len(list_of_Jobs)):
                if len(list_of_Jobs[count]) > 3:  # an exact duration is given at 5 nodes!
                    job_duration_string = list_of_Jobs[count]['jobDates']
                    if job_duration_string == 'bis heute':
                        # print('job with no dates found - will be count for ' + str(job_with_no_dates) + ' months.')
                        job_duration = self.EDUCATION_OR_JOB_WITH_NO_DATES

                    else:
                        job_start_string, job_end_string = job_duration_string.split(' - ')

                        if len(job_start_string) == 4:
                            job_start = datetime.datetime.strptime(job_start_string, "%Y")
                        elif len(job_start_string) == 7:
                            job_start = datetime.datetime.strptime(job_start_string, "%m/%Y")
                        else:
                            print("error reading start date")

                        if len(job_end_string) == 4:
                            job_end = datetime.datetime.strptime(job_end_string, "%Y")
                        elif len(job_end_string) == 7:
                            job_end = datetime.datetime.strptime(job_end_string, "%m/%Y")
                        else:
                            print("error reading end date")

                        if job_end == job_start:
                            # same start and end date: count a flat number of months
                            job_duration = self.EDUCATION_OR_JOB_WITH_SAME_YEAR
                        else:
                            delta = job_end - job_start
                            # convert the timedelta to months (~2629743.83 seconds each)
                            job_duration = math.ceil(delta.total_seconds() / 2629743.83)
                total_working_months += job_duration
        else:
            print('-no jobs on profile-')

        return total_working_months

    def __determineEduMonths(self, r):
        """
        takes a person's profile as JSON node and computes the total amount of education months this
        person has

        Parameters:
        ----------
        r : JSON node
        """

        total_education_months = 0  # ..of that profile
        edu_duration = 0

        if 'education' in r:  # education info is on the profile
            list_of_edu = r['education']  # edu child nodes {institution, url, degree, eduDuration}
            # print('education summary' + str(r['education']))
            for count in range(0, len(list_of_edu)):
                if 'eduDuration' in list_of_edu[count]:  # there are education dates

                    edu_duration_string = list_of_edu[count]['eduDuration']
                    if edu_duration_string in ('bis heute', None, ''):
                        edu_duration = self.EDUCATION_OR_JOB_WITH_NO_DATES
                    else:
                        edu_start_string, edu_end_string = edu_duration_string.split(' - ')

                        if len(edu_start_string) == 4:
                            edu_start = datetime.datetime.strptime(edu_start_string, "%Y")
                        elif len(edu_start_string) == 7:
                            edu_start = datetime.datetime.strptime(edu_start_string, "%m/%Y")
                        else:
                            print("error reading start date")

                        if len(edu_end_string) == 4:
                            edu_end = datetime.datetime.strptime(edu_end_string, "%Y")
                        elif len(edu_end_string) == 7:
                            edu_end = datetime.datetime.strptime(edu_end_string, "%m/%Y")
                        else:
                            print("error reading end date")

                        if edu_end == edu_start:
                            # same start and end date: count a flat number of months
                            edu_duration = self.EDUCATION_OR_JOB_WITH_SAME_YEAR
                        else:
                            delta = edu_end - edu_start
                            # convert the timedelta to months (~2629743.83 seconds each)
                            edu_duration = math.ceil(delta.total_seconds() / 2629743.83)

                        # print(job_duration_string)
                        # print('this job: ' + str(job_duration))

                else:
                    edu_duration = self.EDUCATION_OR_JOB_WITH_NO_DATES

                total_education_months += edu_duration
                # print('total jobs: ' + str(total_working_months))

            # print("studying: " + str(total_education_months))
        else:
            print('-no education on profile-')

        return total_education_months

__determineEduMonths(r)

takes a person's profile as JSON node and computes the total amount of education months this person has

Parameters:

r : JSON node

Source code in src/data_readers/data_reader_xing.py
def __determineEduMonths(self, r):
    """
    takes a person's profile as JSON node and computes the total amount of education months this
    person has

    Parameters:
    ----------
    r : JSON node
    """

    total_education_months = 0  # ..of that profile
    edu_duration = 0

    if 'education' in r:  # education info is on the profile
        list_of_edu = r['education']  # edu child nodes {institution, url, degree, eduDuration}
        # print('education summary' + str(r['education']))
        for count in range(0, len(list_of_edu)):
            if 'eduDuration' in list_of_edu[count]:  # there are education dates

                edu_duration_string = list_of_edu[count]['eduDuration']
                if edu_duration_string in ('bis heute', None, ''):
                    edu_duration = self.EDUCATION_OR_JOB_WITH_NO_DATES
                else:
                    edu_start_string, edu_end_string = edu_duration_string.split(' - ')

                    if len(edu_start_string) == 4:
                        edu_start = datetime.datetime.strptime(edu_start_string, "%Y")
                    elif len(edu_start_string) == 7:
                        edu_start = datetime.datetime.strptime(edu_start_string, "%m/%Y")
                    else:
                        print("error reading start date")

                    if len(edu_end_string) == 4:
                        edu_end = datetime.datetime.strptime(edu_end_string, "%Y")
                    elif len(edu_end_string) == 7:
                        edu_end = datetime.datetime.strptime(edu_end_string, "%m/%Y")
                    else:
                        print("error reading end date")

                    if edu_end == edu_start:
                        # same start and end date: count a flat number of months
                        edu_duration = self.EDUCATION_OR_JOB_WITH_SAME_YEAR
                    else:
                        delta = edu_end - edu_start
                        # convert the timedelta to months (~2629743.83 seconds each)
                        edu_duration = math.ceil(delta.total_seconds() / 2629743.83)

                    # print(job_duration_string)
                    # print('this job: ' + str(job_duration))

            else:
                edu_duration = self.EDUCATION_OR_JOB_WITH_NO_DATES

            total_education_months += edu_duration
            # print('total jobs: ' + str(total_working_months))

        # print("studying: " + str(total_education_months))
    else:
        print('-no education on profile-')

    return total_education_months

__determineIfProtected(r, protAttr)

takes a JSON profile and finds if the person belongs to the protected group

Parameters:

    r : JSON node
        a person description in JSON, everything below the "profile" node

Source code in src/data_readers/data_reader_xing.py
def __determineIfProtected(self, r, protAttr):
    """
    takes a JSON profile and finds if the person belongs to the protected group

    Parameter:
    ---------
    r : JSON node
    a person description in JSON, everything below node "profile"

    """

    if 'sex' in r['profile'][0]:
        if r['profile'][0]['sex'] == protAttr:
            return True
        else:
            return False
    else:
        print('>>> undetermined\n')
        return False

__determineWorkMonths(r)

takes a person's profile as JSON node and computes the total amount of work months this person has

Parameters:

r : JSON node

Source code in src/data_readers/data_reader_xing.py
def __determineWorkMonths(self, r):
    """
    takes a person's profile as JSON node and computes the total amount of work months this
    person has

    Parameters:
    ----------
    r : JSON node
    """

    total_working_months = 0  # ..of that profile
    job_duration = 0

    if len(r['profile'][0]) >= 4:  # a job is on the profile
        list_of_Jobs = r['profile'][0]['jobs']
        # print('profile summary' + str(r['profile'][0]['jobs']))
        for count in range(0, len(list_of_Jobs)):
            if len(list_of_Jobs[count]) > 3:  # an exact duration is given at 5 nodes!
                job_duration_string = list_of_Jobs[count]['jobDates']
                if job_duration_string == 'bis heute':
                    # print('job with no dates found - will be count for ' + str(job_with_no_dates) + ' months.')
                    job_duration = self.EDUCATION_OR_JOB_WITH_NO_DATES

                else:
                    job_start_string, job_end_string = job_duration_string.split(' - ')

                    if len(job_start_string) == 4:
                        job_start = datetime.datetime.strptime(job_start_string, "%Y")
                    elif len(job_start_string) == 7:
                        job_start = datetime.datetime.strptime(job_start_string, "%m/%Y")
                    else:
                        print("error reading start date")

                    if len(job_end_string) == 4:
                        job_end = datetime.datetime.strptime(job_end_string, "%Y")
                    elif len(job_end_string) == 7:
                        job_end = datetime.datetime.strptime(job_end_string, "%m/%Y")
                    else:
                        print("error reading end date")

                    if job_end == job_start:
                        # same start and end date: count a flat number of months
                        job_duration = self.EDUCATION_OR_JOB_WITH_SAME_YEAR
                    else:
                        delta = job_end - job_start
                        # convert the timedelta to months (~2629743.83 seconds each)
                        job_duration = math.ceil(delta.total_seconds() / 2629743.83)
            total_working_months += job_duration
    else:
        print('-no jobs on profile-')

    return total_working_months
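
The duration arithmetic above converts a parsed date range to months by dividing the timedelta's seconds by 2629743.83 (roughly one average month) and rounding up; a standalone sketch with made-up dates:

import datetime
import math

job_start = datetime.datetime.strptime('04/2012', '%m/%Y')
job_end = datetime.datetime.strptime('10/2014', '%m/%Y')
delta = job_end - job_start
months = math.ceil(delta.total_seconds() / 2629743.83)  # ~one average month in seconds
print(months)  # 30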

__readFileOfQuery(filename)

takes one .json file and reads all information, creates candidate objects from this information and sorts them into 3 arrays: one contains all protected candidates, one contains all non-protected candidates, one contains all candidates in the same order as they appear in the json-file.

@param filename: the json's filename

@return:
    key: the search query string
    protected: array that contains all protected candidates
    nonProtected: array that contains all nonProtected candidates
    originalOrdering: array that contains all candidates in their original ordering

Source code in src/data_readers/data_reader_xing.py
def __readFileOfQuery(self, filename):
    """
    takes one .json file and reads all information, creates candidate objects from this
    information and sorts them into 3 arrays. One contains all protected candidates, one contains
    all non-protected candidates, one contains all candidates in the same order as they appear
    in the json-file

    @param filename: the json's filename

    @return:
        key: the search query string
        protected: array that contains all protected candidates
        nonProtected: array that contains all nonProtected candidates
        originalOrdering: array that contains all candidates in their original ordering

    """
    protected = []
    nonProtected = []
    originalOrdering = []

    currentfile = open(filename)
    data = json.load(currentfile)

    xingSearchQuery = data['category']
    # if the Xing search query results in a gender neutral list,
    # we take female as the protected attribute
    protectedAttribute = 'm' if data['dominantSexXing'] == 'f' else 'f'

    for r in data['profiles']:
        # determine Member since / Hits
        if 'memberSince_Hits' in r['profile'][0]:
            hits_string = r['profile'][0]['memberSince_Hits']
            hits = hits_string.split(' / ')[1]
            member_since = hits_string.split(' / ')[0]
        else:
            hits = 1
            member_since = "unknown"

        work_experience = self.__determineWorkMonths(r)
        edu_experience = self.__determineEduMonths(r)
        if "education" in r['profile'][0]:
            degree = r['profile'][0]['education']['degree']
        else:
            degree = "unknown"
        score = (work_experience + edu_experience) * int(hits)

        if self.__determineIfProtected(r, protectedAttribute):
            protected.append(
                Candidate(work_experience, edu_experience, hits, score, [protectedAttribute], member_since, degree))
        else:
            nonProtected.append(Candidate(work_experience, edu_experience, hits, score, [], member_since, degree))

        sex = r['profile'][0]['sex']
        originalOrdering.append(Candidate(work_experience, edu_experience, hits, score, sex, member_since, degree))

    protected.sort(key=lambda candidate: candidate.qualification, reverse=True)
    nonProtected.sort(key=lambda candidate: candidate.qualification, reverse=True)

    self.__normalizeQualifications(protected + nonProtected)
    self.__normalizeQualifications(originalOrdering)

    currentfile.close()
    return xingSearchQuery, protected, nonProtected, originalOrdering

transform_data()

Transform data into pandas.DataFrame and apply cleaning steps.

Reads the XING data from a JSON file, performs data preprocessing, and returns the transformed data.

Returns:

    tuple: A tuple containing the transformed data.
        - dataframe_query (pandas.DataFrame): A DataFrame containing the transformed query data.
        - data_train (pandas.DataFrame): A DataFrame containing the transformed training data.
        - data_test (pandas.DataFrame): A DataFrame containing the transformed testing data.

Source code in src/data_readers/data_reader_xing.py
def transform_data(self):
    """
    Transform data into pandas.DataFrame and apply cleaning steps.

    Reads the XING data from a JSON file, performs data preprocessing, and returns the transformed data.

    Returns:
        tuple: A tuple containing the transformed data.
            - dataframe_query (pandas.DataFrame): A DataFrame containing the transformed query data.
            - data_train (pandas.DataFrame): A DataFrame containing the transformed training data.
            - data_test (pandas.DataFrame): A DataFrame containing the transformed testing data.
    """

    dataset = self.concat_data()

    dataset[self.query_col] = dataset[self.query_col].apply(lambda x: x.title())
    dataset["edu_experience_string"] = dataset["edu_experience"].apply(lambda x: str(x) + (" months"))
    dataset["work_experience_string"] = dataset["work_experience"].apply(lambda x: str(x) + (" months"))
    dataset["member_since_string"] = dataset["member_since"].apply(lambda x: "Member Since: " + str(x))
    dataset["degree_string"] = dataset["degree"].apply(lambda x: "Degree: " + str(x))

    # the query dataframe
    dataset_queries = pd.DataFrame(columns=['title', 'text'])
    dataset_queries['text'] = dataset[self.query_col].unique()
    dataset_queries['title'] = dataset[self.query_col].unique()

    data_train, data_test = self.create_train_test_split(dataset)
    return dataset_queries, data_train, data_test
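
Putting the pieces together, reading the Xing dataset end to end might look like the sketch below. The configs values are hypothetical; they must match the column names produced by concat_data ('title' for the query, 'qualification' for the score, 'gender' for the sensitive column) and assume a dataset/xing directory with data/*.json and experiments/*.json.

configs = {'name': 'xing', 'query': 'title', 'score': 'qualification', 'group': 'gender'}
reader = DataReaderXing(configs)  # transforms and caches the splits on first construction
data_test, queries, experiments_info = reader.read('test')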