from datetime import date

import numpy as np
import pandas as pd
from pandas import DataFrame

from src.main.constants import Constants as const
from src.main.geocache import Geocache
from src.main.utils import Utils


class DfLoader:
    """Load a JSON dataset into a DataFrame and derive clustering features.

    On construction the file is read with ``pandas.read_json`` and three
    preparation passes add derived columns: ``age``, the status flags
    (``is_university``, ``is_work``, ``is_student``, ``is_schoolboy``) and the
    location columns (``location``, ``location-la``, ``location-lo``).
    """

    def __init__(self, json_file: str) -> None:
        """Read ``json_file`` and run all dataset preparation steps.

        :param json_file: path (or anything ``pandas.read_json`` accepts)
            of the JSON file to load.
        """
        self.__geocache: Geocache = Geocache()

        print(f'Try to load data from the {json_file} file')
        self.__df: DataFrame = pd.read_json(json_file)
        # Order matters: the status flags are computed from the 'age' column.
        self.__prepare_dataset_age()
        self.__prepare_dataset_status()
        self.__prepare_dataset_location()
        print('Data is successfully loaded')

    @staticmethod
    def get_age_from_education(education: list, value: str, additional_value: int) -> int:
        """Estimate an age from a list of education records.

        The first record that carries a usable year under the ``value`` key is
        used; records where the key is missing, ``None`` or rejected by
        ``Utils.is_empty_number`` are skipped instead of aborting the search.

        :param education: education records, each indexable by ``value``
            (presumably dicts from the source JSON - confirm against the data).
        :param value: key of the year field to read from each record.
        :param additional_value: offset added to the result of
            ``Utils.get_years(year, current_year)``.
        :return: the estimated age, or ``const.empty_age()`` when
            ``education`` is empty or no record has a usable year.
        """
        if Utils.is_empty_collection(education):
            return const.empty_age()

        current_year = date.today().year
        for item in education:
            try:
                graduation = item[value]
            except KeyError:
                # This record does not have the requested field; try the next.
                continue
            if graduation is None or Utils.is_empty_number(graduation):
                continue
            return Utils.get_years(graduation, current_year) + additional_value

        # No record provided a usable year.
        return const.empty_age()

    def __fill_missing_age(self, column: str, year_key: str, age_offset: int) -> None:
        """Fill still-unknown ages from the education records in ``column``.

        Only rows whose age equals ``const.empty_age()`` and whose ``column``
        value has a length greater than zero are updated.
        """
        df = self.__df
        mask = (df['age'] == const.empty_age()) & (df[column].str.len() > 0)
        df.loc[mask, 'age'] = df.loc[mask, column].apply(
            lambda val: self.get_age_from_education(val, year_key, age_offset))

    def __prepare_dataset_age(self) -> None:
        """Create the ``age`` column from ``bdate``, then from education data.

        Each fallback recomputes its mask, so it only touches rows that every
        earlier step left at ``const.empty_age()``.
        """
        self.__df['age'] = self.__df.loc[:, 'bdate'].apply(Utils.get_age)

        self.__fill_missing_age('universities', 'graduation', const.university_gr_age())
        self.__fill_missing_age('schools', 'year_graduated', const.school_gr_age())
        self.__fill_missing_age('schools', 'year_from', const.school_st_age())

    def __prepare_dataset_status(self) -> None:
        """Derive the 0/1 flag columns from ``age`` and ``occupation_type``."""
        df = self.__df
        age = df['age']
        occupation = df['occupation_type']
        age_unknown = age == const.empty_age()

        is_univer_mask = ((age >= const.university_gr_age()) | age_unknown) & \
                         ((df['universities'].str.len() > 0) | (occupation == 'university'))
        df['is_university'] = np.where(is_univer_mask, 1, 0)

        # '&' binds tighter than '|': the last age comparison is an independent
        # alternative; the parentheses below only make that grouping explicit.
        is_work_mask = (((age > const.school_gr_age()) | age_unknown) &
                        ((df['is_university'] == 1) | (occupation == 'work'))) | \
                       (age > const.university_gr_age())
        df['is_work'] = np.where(is_work_mask, 1, 0)

        is_student_mask = (occupation == 'university') & \
                          ((age >= const.school_gr_age()) & (age <= const.university_gr_age()))
        df['is_student'] = np.where(is_student_mask, 1, 0)

        is_schoolboy_mask = ((age < const.school_gr_age()) & (age != const.empty_age())) | \
                            (age_unknown & (occupation == 'school'))
        df['is_schoolboy'] = np.where(is_schoolboy_mask, 1, 0)

    def __prepare_dataset_location(self) -> None:
        """Resolve ``city`` through the geocache and split it into two columns.

        ``location`` holds whatever ``Geocache.get_location`` returns ('' for
        cities rejected by ``Utils.is_empty_str``); ``location-la`` and
        ``location-lo`` take its elements 0 and 1, or 0 when the location is
        empty (presumably latitude and longitude - confirm against Geocache).
        """
        self.__geocache.update_geo_cache(self.__df['city'].unique().tolist())

        self.__df['location'] = self.__df['city'] \
            .apply(lambda val: '' if Utils.is_empty_str(val) else self.__geocache.get_location(val))
        self.__df['location-la'] = self.__df.loc[:, 'location'] \
            .apply(lambda val: 0 if Utils.is_empty_collection(val) else val[0])
        self.__df['location-lo'] = self.__df.loc[:, 'location'] \
            .apply(lambda val: 0 if Utils.is_empty_collection(val) else val[1])

    def get_clustering_data(self) -> DataFrame:
        """Return the columns used for clustering as a DataFrame."""
        columns: list = ['location-la', 'location-lo', 'sex', 'age',
                         'is_university', 'is_work', 'is_student', 'is_schoolboy']
        return self.__df[columns]