# Source: social-clusters/src/main/df_loader.py
# (listing metadata reported with the file: 81 lines, 4.1 KiB, Python)

from datetime import date
import numpy as np
import pandas as pd
from pandas import DataFrame
from src.main.constants import Constants as const
from src.main.geocache import Geocache
from src.main.utils import Utils
class DfLoader:
    """Load a JSON data set into a DataFrame and derive clustering features.

    Construction reads the file with ``pandas.read_json`` and then adds, in
    order, an ``age`` column, four ``is_*`` status flags and the
    ``location`` / ``location-la`` / ``location-lo`` columns.  The input is
    expected to provide at least the ``bdate``, ``universities``, ``schools``,
    ``occupation_type``, ``city`` and ``sex`` columns, since those are read
    by the methods below.
    """

    def __init__(self, json_file: str) -> None:
        """Read ``json_file`` and run every preparation step.

        Args:
            json_file: Passed unchanged to ``pandas.read_json``.
        """
        self.__geocache: Geocache = Geocache()
        print(f'Try to load data from the {json_file} file')
        self.__df: DataFrame = pd.read_json(json_file)
        # Order matters: the status flags are computed from the 'age' column.
        self.__prepare_dataset_age()
        self.__prepare_dataset_status()
        self.__prepare_dataset_location()
        print('Data is successfully loaded')

    @staticmethod
    def get_age_from_education(education: list, value: str, additional_value: int) -> int:
        """Estimate an age from the first entry of an education list.

        Args:
            education: Collection of mapping-like education entries.
            value: Key of the year field to read from the entry
                (the callers in this class use 'graduation',
                'year_graduated' and 'year_from').
            additional_value: Number added to the result of
                ``Utils.get_years(year, <current year>)``.

        Returns:
            ``const.empty_age()`` when the collection is empty, when the first
            entry lacks ``value`` or when ``Utils.is_empty_number`` rejects the
            year; otherwise the computed estimate.
        """
        if Utils.is_empty_collection(education):
            return const.empty_age()
        # Only the first entry is consulted; spelling that out avoids a loop
        # that can never reach its second iteration.
        first = next(iter(education), None)
        if first is None:
            return const.empty_age()
        try:
            graduation = first[value]
        except KeyError:
            # An entry without the requested field means "unknown", not a crash.
            return const.empty_age()
        if Utils.is_empty_number(graduation):
            return const.empty_age()
        return Utils.get_years(graduation, date.today().year) + additional_value

    def __fill_missing_age(self, column: str, year_key: str, offset: int) -> None:
        """Fill still-unknown ages from the education lists stored in ``column``.

        Only rows whose age equals ``const.empty_age()`` and whose ``column``
        value has a positive length are touched.
        """
        mask = (self.__df['age'] == const.empty_age()) & (self.__df[column].str.len() > 0)
        self.__df.loc[mask, 'age'] = self.__df.loc[mask, column].apply(
            lambda val: self.get_age_from_education(val, year_key, offset))

    def __prepare_dataset_age(self) -> None:
        """Create the 'age' column from 'bdate', then fall back to education data."""
        self.__df['age'] = self.__df.loc[:, 'bdate'].apply(Utils.get_age)
        # Each fallback re-evaluates which ages are still unknown, so a later
        # step never overwrites a value produced by an earlier one.
        self.__fill_missing_age('universities', 'graduation', const.university_gr_age())
        self.__fill_missing_age('schools', 'year_graduated', const.school_gr_age())
        self.__fill_missing_age('schools', 'year_from', const.school_st_age())

    def __prepare_dataset_status(self) -> None:
        """Add the 0/1 columns is_university, is_work, is_student, is_schoolboy."""
        age = self.__df['age']
        occupation = self.__df['occupation_type']
        age_unknown = age == const.empty_age()
        school_gr_age = const.school_gr_age()
        university_gr_age = const.university_gr_age()

        is_univer_mask = ((age >= university_gr_age) | age_unknown) & \
                         ((self.__df['universities'].str.len() > 0) | (occupation == 'university'))
        self.__df['is_university'] = np.where(is_univer_mask, 1, 0)

        # '&' binds tighter than '|': the age-only clause is an independent
        # alternative.  The parentheses make the original grouping explicit.
        is_work_mask = (((age > school_gr_age) | age_unknown) &
                        ((self.__df['is_university'] == 1) | (occupation == 'work'))) | \
                       (age > university_gr_age)
        self.__df['is_work'] = np.where(is_work_mask, 1, 0)

        is_student_mask = ((occupation == 'university') &
                           ((age >= school_gr_age) & (age <= university_gr_age)))
        self.__df['is_student'] = np.where(is_student_mask, 1, 0)

        is_schoolboy_mask = ((age < school_gr_age) & (age != const.empty_age())) | \
                            (age_unknown & (occupation == 'school'))
        self.__df['is_schoolboy'] = np.where(is_schoolboy_mask, 1, 0)

    def __prepare_dataset_location(self) -> None:
        """Resolve 'city' through the geocache and split the result in two columns.

        'location' holds '' for an empty city, otherwise whatever
        ``Geocache.get_location`` returns; 'location-la' and 'location-lo' are
        elements 0 and 1 of that value, or 0 when it is empty.
        """
        self.__geocache.update_geo_cache(self.__df['city'].unique().tolist())
        self.__df['location'] = self.__df['city'] \
            .apply(lambda val: '' if Utils.is_empty_str(val) else self.__geocache.get_location(val))
        self.__df['location-la'] = self.__df.loc[:, 'location'] \
            .apply(lambda val: 0 if Utils.is_empty_collection(val) else val[0])
        self.__df['location-lo'] = self.__df.loc[:, 'location'] \
            .apply(lambda val: 0 if Utils.is_empty_collection(val) else val[1])

    def get_clustering_data(self) -> DataFrame:
        """Return only the columns used for clustering.

        Returns:
            A DataFrame with 'location-la', 'location-lo', 'sex', 'age',
            'is_university', 'is_work', 'is_student' and 'is_schoolboy',
            in that order.
        """
        columns: list = ['location-la', 'location-lo',
                         'sex', 'age', 'is_university', 'is_work', 'is_student', 'is_schoolboy']
        return self.__df[columns]