81 lines
4.1 KiB
Python
81 lines
4.1 KiB
Python
from datetime import date
|
|
|
|
import numpy as np
|
|
import pandas as pd
|
|
from pandas import DataFrame
|
|
|
|
from src.main.constants import Constants as const
|
|
from src.main.geocache import Geocache
|
|
from src.main.utils import Utils
|
|
|
|
|
|
class DfLoader:
    """Load a JSON data set into a DataFrame and derive clustering features.

    Construction reads the file and appends derived columns to the frame:
    ``age``, the ``is_university`` / ``is_work`` / ``is_student`` /
    ``is_schoolboy`` flags (stored as 1/0), and ``location`` together with
    its ``location-la`` / ``location-lo`` components.

    The code reads the input columns ``bdate``, ``universities``,
    ``schools``, ``occupation_type``, ``city`` and ``sex``.
    """

    # Columns returned by get_clustering_data(), in output order.
    _CLUSTERING_COLUMNS = (
        'location-la', 'location-lo',
        'sex', 'age', 'is_university', 'is_work', 'is_student', 'is_schoolboy',
    )

    def __init__(self, json_file: str) -> None:
        """Read *json_file* and compute every derived column.

        The preparation steps run in this order because the status flags
        are computed from the ``age`` column.

        :param json_file: input handed unchanged to ``pandas.read_json``.
        """
        self.__geocache: Geocache = Geocache()
        print(f'Try to load data from the {json_file} file')
        self.__df: DataFrame = pd.read_json(json_file)
        self.__prepare_dataset_age()
        self.__prepare_dataset_status()
        self.__prepare_dataset_location()
        print('Data is successfully loaded')

    @staticmethod
    def get_age_from_education(education: list, value: str, additional_value: int) -> int:
        """Estimate an age from the first entry of an education history.

        Only the first entry is consulted; later entries are ignored.

        :param education: education entries (mappings) of one record.
        :param value: key of the year field to read, e.g. ``'graduation'``.
        :param additional_value: number of years added to the time elapsed
            since that year.
        :return: the estimated age, or ``const.empty_age()`` when
            *education* is empty or its first entry has no usable year.
        """
        if Utils.is_empty_collection(education):
            return const.empty_age()

        first_entry = next(iter(education), None)
        # An entry without the requested key is treated like one whose year
        # is empty, so a single incomplete record cannot abort the load.
        if first_entry is None or value not in first_entry:
            return const.empty_age()

        year = first_entry[value]
        if Utils.is_empty_number(year):
            return const.empty_age()
        return Utils.get_years(year, date.today().year) + additional_value

    def __fill_missing_age(self, column: str, year_key: str, offset: int) -> None:
        """Estimate ``age`` from *column* for rows whose age is still empty.

        :param column: name of the column holding the education entries.
        :param year_key: key of the year field inside each entry.
        :param offset: years added on top of the time elapsed since that year.
        """
        frame = self.__df
        # Only rows with no age yet and at least one education entry qualify.
        missing = (frame['age'] == const.empty_age()) & (frame[column].str.len() > 0)
        frame.loc[missing, 'age'] = frame.loc[missing, column].apply(
            lambda entries: self.get_age_from_education(entries, year_key, offset))

    def __prepare_dataset_age(self) -> None:
        """Fill ``age`` from ``bdate``, then fall back to education data.

        Rows whose age is still ``const.empty_age()`` are estimated, in
        order, from the university ``graduation`` year, the school
        ``year_graduated`` and the school ``year_from``.  Each pass only
        touches rows the previous passes left empty.
        """
        frame = self.__df
        frame['age'] = frame.loc[:, 'bdate'].apply(Utils.get_age)

        self.__fill_missing_age('universities', 'graduation', const.university_gr_age())
        self.__fill_missing_age('schools', 'year_graduated', const.school_gr_age())
        self.__fill_missing_age('schools', 'year_from', const.school_st_age())

    def __prepare_dataset_status(self) -> None:
        """Derive the 1/0 status flags from ``age`` and occupation data.

        Adds the ``is_university``, ``is_work``, ``is_student`` and
        ``is_schoolboy`` columns.  Must run after the ``age`` column exists.
        """
        frame = self.__df
        age = frame['age']
        occupation = frame['occupation_type']
        age_unknown = age == const.empty_age()

        # University: old enough to have graduated (or age unknown) AND
        # either lists a university or is currently occupied by one.
        has_university = ((age >= const.university_gr_age()) | age_unknown) & \
                         ((frame['universities'].str.len() > 0) | (occupation == 'university'))
        frame['is_university'] = np.where(has_university, 1, 0)

        # Work: (past school age or unknown) AND (university flag or a work
        # occupation), OR simply older than the university graduation age.
        works = (((age > const.school_gr_age()) | age_unknown) &
                 (has_university | (occupation == 'work'))) | \
                (age > const.university_gr_age())
        frame['is_work'] = np.where(works, 1, 0)

        # Student: occupied by a university while between the school and
        # university graduation ages, both bounds inclusive.
        studies = (occupation == 'university') & \
                  (age >= const.school_gr_age()) & \
                  (age <= const.university_gr_age())
        frame['is_student'] = np.where(studies, 1, 0)

        # Schoolboy: a known age below the school graduation age, or an
        # unknown age combined with a school occupation.
        at_school = ((age < const.school_gr_age()) & ~age_unknown) | \
                    (age_unknown & (occupation == 'school'))
        frame['is_schoolboy'] = np.where(at_school, 1, 0)

    def __prepare_dataset_location(self) -> None:
        """Resolve ``city`` into ``location`` and split it into two columns.

        Every distinct city value is first passed to the geocache; rows with
        an empty city get ``''`` as location and ``0`` for both components.
        ``location-la`` / ``location-lo`` hold elements 0 and 1 of the
        location (presumably latitude and longitude; confirm against
        ``Geocache.get_location``).
        """
        frame = self.__df
        self.__geocache.update_geo_cache(frame['city'].unique().tolist())

        frame['location'] = frame['city'].apply(
            lambda city: '' if Utils.is_empty_str(city) else self.__geocache.get_location(city))
        frame['location-la'] = frame.loc[:, 'location'].apply(
            lambda location: 0 if Utils.is_empty_collection(location) else location[0])
        frame['location-lo'] = frame.loc[:, 'location'].apply(
            lambda location: 0 if Utils.is_empty_collection(location) else location[1])

    def get_clustering_data(self) -> DataFrame:
        """Return the loaded frame restricted to the clustering columns.

        :return: a DataFrame with the columns ``location-la``,
            ``location-lo``, ``sex``, ``age``, ``is_university``,
            ``is_work``, ``is_student`` and ``is_schoolboy``, in that order.
        """
        # A tuple would be looked up as a single key, so index with a list.
        return self.__df[list(self._CLUSTERING_COLUMNS)]