Change bool status flags to int, split location into separate location-la / location-lo columns

master
Aleksey Filippov 1 year ago
parent 5089eb4b10
commit 0eefc9fde0

@@ -2,7 +2,6 @@ from datetime import date
import numpy as np import numpy as np
import pandas as pd import pandas as pd
from numpy import ndarray
from pandas import DataFrame from pandas import DataFrame
from src.main.constants import Constants as const from src.main.constants import Constants as const
@@ -49,27 +48,33 @@ class DfLoader:
def __prepare_dataset_status(self) -> None: def __prepare_dataset_status(self) -> None:
is_univer_mask = ((self.__df['age'] >= const.university_gr_age()) | (self.__df['age'] == const.empty_age())) & \ is_univer_mask = ((self.__df['age'] >= const.university_gr_age()) | (self.__df['age'] == const.empty_age())) & \
((self.__df['universities'].str.len() > 0) | (self.__df['occupation_type'] == 'university')) ((self.__df['universities'].str.len() > 0) | (self.__df['occupation_type'] == 'university'))
self.__df['is_university'] = np.where(is_univer_mask, True, False) self.__df['is_university'] = np.where(is_univer_mask, 1, 0)
is_work_mask = ((self.__df['age'] > const.school_gr_age()) | (self.__df['age'] == const.empty_age())) & \ is_work_mask = ((self.__df['age'] > const.school_gr_age()) | (self.__df['age'] == const.empty_age())) & \
((self.__df['is_university']) | (self.__df['occupation_type'] == 'work')) | \ ((self.__df['is_university'] == 1) | (self.__df['occupation_type'] == 'work')) | \
(self.__df['age'] > const.university_gr_age()) (self.__df['age'] > const.university_gr_age())
self.__df['is_work'] = np.where(is_work_mask, True, False) self.__df['is_work'] = np.where(is_work_mask, 1, 0)
is_student_mask = ((self.__df['occupation_type'] == 'university') & is_student_mask = ((self.__df['occupation_type'] == 'university') &
((self.__df['age'] >= const.school_gr_age()) & ((self.__df['age'] >= const.school_gr_age()) &
(self.__df['age'] <= const.university_gr_age()))) (self.__df['age'] <= const.university_gr_age())))
self.__df['is_student'] = np.where(is_student_mask, True, False) self.__df['is_student'] = np.where(is_student_mask, 1, 0)
is_schoolboy_mask = ((self.__df['age'] < const.school_gr_age()) & (self.__df['age'] != const.empty_age())) | \ is_schoolboy_mask = ((self.__df['age'] < const.school_gr_age()) & (self.__df['age'] != const.empty_age())) | \
((self.__df['age'] == const.empty_age()) & (self.__df['occupation_type'] == 'school')) ((self.__df['age'] == const.empty_age()) & (self.__df['occupation_type'] == 'school'))
self.__df['is_schoolboy'] = np.where(is_schoolboy_mask, True, False) self.__df['is_schoolboy'] = np.where(is_schoolboy_mask, 1, 0)
def __prepare_dataset_location(self) -> None: def __prepare_dataset_location(self) -> None:
self.__geocache.update_geo_cache(self.__df['city'].unique().tolist()) self.__geocache.update_geo_cache(self.__df['city'].unique().tolist())
self.__df['location'] = self.__df['city'] \ self.__df['location'] = self.__df['city'] \
.apply(lambda val: '' if Utils.is_empty_str(val) else self.__geocache.get_location(val)) .apply(lambda val: '' if Utils.is_empty_str(val) else self.__geocache.get_location(val))
self.__df['location-la'] = self.__df.loc[:, 'location'] \
def get_clustering_data(self) -> ndarray: .apply(lambda val: 0 if Utils.is_empty_collection(val) else val[0])
columns: [] = ['location', 'sex', 'age', 'is_university', 'is_work', 'is_student', 'is_schoolboy'] self.__df['location-lo'] = self.__df.loc[:, 'location'] \
return self.__df[columns].to_numpy() .apply(lambda val: 0 if Utils.is_empty_collection(val) else val[1])
def get_clustering_data(self) -> DataFrame:
columns: [] = ['location-la', 'location-lo',
'sex', 'age', 'is_university', 'is_work', 'is_student', 'is_schoolboy']
df = self.__df
return df[columns]

Loading…
Cancel
Save