# social-clusters/src/main/df_loader.py

from datetime import date

import numpy as np
import pandas as pd
from pandas import DataFrame

from src.main.constants import Constants as const
from src.main.geocache import Geocache
from src.main.utils import Utils


class DfLoader:
    """Load a JSON dump of user profiles and derive the columns used for clustering.

    Construction reads the file into a DataFrame and then adds, in order:

    * ``age`` - taken from ``bdate`` or, when that yields the empty-age marker,
      estimated from the ``universities`` / ``schools`` records;
    * ``is_university``, ``is_work``, ``is_student``, ``is_schoolboy`` - 0/1 flags;
    * ``location``, ``location-la``, ``location-lo`` - resolved through ``Geocache``.
    """

    def __init__(self, json_file: str) -> None:
        """Read ``json_file`` via ``pandas.read_json`` and compute all derived columns.

        :param json_file: source passed straight to ``pandas.read_json``.
        """
        self.__geocache: Geocache = Geocache()
        print(f'Try to load data from the {json_file} file')
        self.__df: DataFrame = pd.read_json(json_file)
        # Order matters: the status flags are computed from the 'age' column.
        self.__prepare_dataset_age()
        self.__prepare_dataset_status()
        self.__prepare_dataset_location()
        print('Data is successfully loaded')

    @staticmethod
    def get_age_from_education(education: list, value: str, additional_value: int) -> int:
        """Estimate an age from the first education record that carries a usable year.

        :param education: education records; each one is looked up by ``value``
            (presumably dicts decoded from JSON - confirm against the data).
        :param value: key of the year field inside a record, e.g. ``'graduation'``.
        :param additional_value: number of years added to the result of
            ``Utils.get_years(year, <current year>)``.
        :return: the estimated age, or ``const.empty_age()`` when ``education`` is
            empty or no record holds a usable year.
        """
        if Utils.is_empty_collection(education):
            return const.empty_age()
        for item in education:
            # A record may lack the field or hold an empty value; move on to the
            # next record instead of giving up after the first one.
            if value not in item:
                continue
            year: int = item[value]
            if Utils.is_empty_number(year):
                continue
            return Utils.get_years(year, date.today().year) + additional_value
        # No record was usable: always return the marker rather than None.
        return const.empty_age()

    def __prepare_dataset_age(self) -> None:
        """Fill the ``age`` column from ``bdate``, then from education records where still empty."""
        self.__df['age'] = self.__df['bdate'].apply(Utils.get_age)

        # Fallbacks are tried in this order; each one only touches rows whose age
        # is still the empty marker after the previous step.
        fallbacks = (
            ('universities', 'graduation', const.university_gr_age()),
            ('schools', 'year_graduated', const.school_gr_age()),
            ('schools', 'year_from', const.school_st_age()),
        )
        for column, key, offset in fallbacks:
            # The mask is recomputed every pass because the previous pass changed 'age'.
            missing = (self.__df['age'] == const.empty_age()) & (self.__df[column].str.len() > 0)
            self.__df.loc[missing, 'age'] = self.__df.loc[missing, column].apply(
                lambda records, key=key, offset=offset: self.get_age_from_education(records, key, offset))

    def __prepare_dataset_status(self) -> None:
        """Add the 0/1 columns ``is_university``, ``is_work``, ``is_student`` and ``is_schoolboy``."""
        df = self.__df
        age = df['age']
        occupation = df['occupation_type']
        age_unknown = age == const.empty_age()
        school_gr = const.school_gr_age()
        university_gr = const.university_gr_age()

        # University: age at/after university graduation (or unknown) and any
        # university evidence in the profile.
        is_univer_mask = ((age >= university_gr) | age_unknown) & \
                         ((df['universities'].str.len() > 0) | (occupation == 'university'))
        df['is_university'] = np.where(is_univer_mask, 1, 0)

        # Work: '&' binds tighter than '|', so the age-above-university-graduation
        # clause is an independent alternative; the parentheses make that explicit.
        is_work_mask = (((age > school_gr) | age_unknown) &
                        ((df['is_university'] == 1) | (occupation == 'work'))) | \
                       (age > university_gr)
        df['is_work'] = np.where(is_work_mask, 1, 0)

        # Student: university occupation with age between school and university graduation (inclusive).
        is_student_mask = (occupation == 'university') & ((age >= school_gr) & (age <= university_gr))
        df['is_student'] = np.where(is_student_mask, 1, 0)

        # Schoolboy: known age below school graduation, or unknown age with a school occupation.
        is_schoolboy_mask = ((age < school_gr) & ~age_unknown) | (age_unknown & (occupation == 'school'))
        df['is_schoolboy'] = np.where(is_schoolboy_mask, 1, 0)

    def __prepare_dataset_location(self) -> None:
        """Resolve ``city`` through the geocache and split the result into two numeric columns."""
        cities = self.__df['city']
        self.__geocache.update_geo_cache(cities.unique().tolist())
        # Empty city names are not looked up; they get '' as their location.
        self.__df['location'] = cities.apply(
            lambda val: '' if Utils.is_empty_str(val) else self.__geocache.get_location(val))
        # Elements 0 and 1 of the location are stored separately (presumably
        # latitude and longitude - confirm against Geocache); empty locations become 0.
        self.__df['location-la'] = self.__df['location'].apply(
            lambda val: 0 if Utils.is_empty_collection(val) else val[0])
        self.__df['location-lo'] = self.__df['location'].apply(
            lambda val: 0 if Utils.is_empty_collection(val) else val[1])

    def get_clustering_data(self) -> DataFrame:
        """Return the loaded data restricted to the feature columns used for clustering."""
        columns: list = ['location-la', 'location-lo',
                         'sex', 'age', 'is_university', 'is_work', 'is_student', 'is_schoolboy']
        return self.__df[columns]