Compare commits

...

2 Commits

Author SHA1 Message Date
a9dd12af39 Add initial version of analyser 2023-05-29 22:56:53 +04:00
e16131b436 Unified dataset empty data values 2023-05-29 22:56:34 +04:00
7 changed files with 1539 additions and 10 deletions

1358
geocache.json Normal file

File diff suppressed because it is too large Load Diff

161
main.py
View File

@ -1,12 +1,171 @@
#!/usr/bin/env python3 #!/usr/bin/env python3
import json
import os import os
import sys import sys
from datetime import date, datetime
import numpy as np
import pandas as pd import pandas as pd
from geopy.extra.rate_limiter import RateLimiter
from geopy.geocoders import Nominatim
# Sentinel stored in the 'age' column when no age could be determined.
EMPTY_AGE = 0
# Offset added to the years elapsed since a university 'graduation' year.
UNIVERSITY_AGE = 21
# Offset added to the years elapsed since a school 'year_from' year.
SCHOOL_BEGIN_AGE = 7
# Offset added to the years elapsed since a school 'year_graduated' year.
SCHOOL_GRADUATED_AGE = 17
# Nominatim geocoder client shared by the whole module.
geolocator = Nominatim(user_agent="MyApp")
# Rate-limited wrapper around geolocator.geocode (min_delay_seconds=1).
geocode = RateLimiter(geolocator.geocode, min_delay_seconds=1)
# In-memory cache keyed by city; filled by load_geo_cache / update_geo_cache.
geo_cache = {}
def is_empty_str(value):
    """Return True when *value* is None or its string form is blank.

    Any non-None value is converted with ``str`` and stripped of
    surrounding whitespace before the emptiness check.
    """
    return value is None or not str(value).strip()
def is_empty_number(value):
    """Return True unless *value* reads as an optionally negative integer.

    None and blank values count as empty. Otherwise the string form of
    *value* (not stripped) must satisfy ``str.isnumeric`` once a single
    leading minus sign has been removed.
    """
    if value is None:
        return True
    text = str(value)
    if not text.strip():
        return True
    # Only one leading '-' is dropped, so '--5' is still rejected.
    digits = text[1:] if text.startswith('-') else text
    return not digits.isnumeric()
def is_empty_collection(collection):
    """Return True unless *collection* is a ``list`` with at least one item.

    None, blank values and every non-list object (tuples, dicts, strings)
    are all reported as empty.
    """
    if collection is None or not str(collection).strip():
        return True
    return not (isinstance(collection, list) and len(collection) > 0)
def get_age(date_str):
    """Return the age in full years for a 'DD.MM.YYYY' birth date string.

    Returns EMPTY_AGE when *date_str* is empty. A string that does not match
    the '%d.%m.%Y' format makes ``datetime.strptime`` raise ValueError.
    """
    if is_empty_str(date_str):
        return EMPTY_AGE

    born = datetime.strptime(date_str, '%d.%m.%Y')
    now = date.today()
    years = now.year - born.year
    # The birthday has not happened yet this year: one full year less.
    if (now.month, now.day) < (born.month, born.day):
        years -= 1
    return years
def get_years(year1, year2):
    """Return the absolute difference between two years.

    The argument order does not matter. Using ``abs`` instead of two
    overlapping comparisons also removes the implicit ``None`` result that
    occurred when neither comparison held (for example with NaN).
    """
    return abs(year1 - year2)
def get_age_from_education(education, value, additional_value):
    """Estimate an age from the first usable year in education records.

    Args:
        education: list of records, each indexable by *value*; anything that
            is_empty_collection reports as empty yields EMPTY_AGE.
        value: key of the year field to read from each record.
        additional_value: offset added to the years elapsed since that year.

    Returns:
        Years between the first non-empty year and the current year plus
        *additional_value*, or EMPTY_AGE when no record has a usable year.
    """
    if is_empty_collection(education):
        return EMPTY_AGE

    current_year = date.today().year
    for item in education:
        year = item[value]
        # Keep looking: an empty year in one record must not hide a valid
        # year stored in a later record.
        if is_empty_number(year):
            continue
        # is_empty_number also accepts numeric strings, so normalise to int
        # before doing arithmetic.
        return get_years(int(year), current_year) + additional_value
    return EMPTY_AGE
def prepare_dataset_age(df):
    """Add an 'age' column derived from 'bdate', with education fallbacks.

    Rows whose age is still EMPTY_AGE are filled, in order, from the
    'universities' graduation year, then the 'schools' graduation year,
    then the 'schools' starting year. Returns the same DataFrame.
    """
    df['age'] = df.loc[:, 'bdate'].apply(get_age)

    # (source column, year field inside each record, age offset), applied in
    # this order; every step only touches rows that are still unresolved.
    fallbacks = (
        ('universities', 'graduation', UNIVERSITY_AGE),
        ('schools', 'year_graduated', SCHOOL_GRADUATED_AGE),
        ('schools', 'year_from', SCHOOL_BEGIN_AGE),
    )
    for column, year_field, offset in fallbacks:
        unresolved = (df['age'] == EMPTY_AGE) & (df[column].str.len() > 0)
        estimated = df.loc[unresolved, column].apply(
            lambda val: get_age_from_education(val, year_field, offset))
        df.loc[unresolved, 'age'] = estimated
    return df
def prepare_dataset_status(df):
    """Add boolean status columns derived from age and occupation.

    Creates 'is_university', 'is_work', 'is_student' and 'is_schoolboy'
    (in that order) from the 'age', 'universities' and 'occupation_type'
    columns and returns the same DataFrame.
    """
    age = df['age']
    occupation = df['occupation_type']
    age_unknown = age == EMPTY_AGE

    has_university = (df['universities'].str.len() > 0) | (occupation == 'university')
    university_mask = ((age >= UNIVERSITY_AGE) | age_unknown) & has_university
    df['is_university'] = np.where(university_mask, True, False)

    # Anyone older than UNIVERSITY_AGE counts as working regardless of the
    # other conditions (the trailing "|" binds last).
    past_school_or_unknown = (age > SCHOOL_GRADUATED_AGE) | age_unknown
    university_or_job = df['is_university'] | (occupation == 'work')
    work_mask = (past_school_or_unknown & university_or_job) | (age > UNIVERSITY_AGE)
    df['is_work'] = np.where(work_mask, True, False)

    student_age = (age >= SCHOOL_GRADUATED_AGE) & (age <= UNIVERSITY_AGE)
    student_mask = (occupation == 'university') & student_age
    df['is_student'] = np.where(student_mask, True, False)

    known_school_age = (age < SCHOOL_GRADUATED_AGE) & ~age_unknown
    unknown_but_school = age_unknown & (occupation == 'school')
    schoolboy_mask = known_school_age | unknown_but_school
    df['is_schoolboy'] = np.where(schoolboy_mask, True, False)

    return df
def load_geo_cache(json_file):
    """Merge the geocode cache stored in *json_file* into geo_cache.

    The cache is optional: when the file does not exist yet (for example on
    a first run) the in-memory cache is left untouched instead of aborting
    the whole pipeline. Malformed JSON still raises.
    """
    try:
        with open(json_file, 'r') as rf:
            cached = json.load(rf)
    except FileNotFoundError:
        print(f'Geocache file {json_file} not found, starting with an empty cache')
        return
    geo_cache.update(cached)
def save_geo_cache(json_file):
    """Write the in-memory geo_cache to *json_file* as JSON.

    The file is overwritten; a confirmation line is printed afterwards.
    """
    with open(json_file, 'w') as cache_file:
        json.dump(geo_cache, fp=cache_file)
    print('Geocache saved')
def update_geo_cache(cities, json_file):
    """Geocode every city missing from geo_cache and persist the cache.

    Args:
        cities: iterable of city names; empty names are skipped.
        json_file: path the cache is saved to, every time the cache size
            reaches a multiple of 50 and once more at the end if it changed.

    A city the geocoder cannot resolve is cached as '' (the dataset's empty
    value), so it is neither retried on each run nor missing from the cache
    for later lookups.
    """
    is_changed = False
    for city in cities:
        if is_empty_str(city) or geo_cache.get(city) is not None:
            continue

        print(f'{len(geo_cache.keys())}/{len(cities)} - Try to load geocode for {city}')
        location = geocode(city)
        if location is None:
            # geocode() yields None when nothing matches (and the rate
            # limiter may also return None after a swallowed error);
            # reading .latitude on it would raise AttributeError.
            print(f'Geocode for {city} not found')
            geo_cache[city] = ''
        else:
            geo_cache[city] = (location.latitude, location.longitude)
        is_changed = True

        # Periodic checkpoint so a long run does not lose everything.
        if len(geo_cache) % 50 == 0:
            save_geo_cache(json_file)

    if is_changed:
        save_geo_cache(json_file)
def prepare_dataset_location(df):
    """Add a 'location' column with the cached geocode of each row's city.

    Loads 'geocache.json', geocodes the cities not cached yet, then maps
    every non-empty city to its cache entry; empty cities map to ''.
    Returns the same DataFrame.
    """
    json_file = 'geocache.json'
    load_geo_cache(json_file)
    unique_cities = df['city'].unique().tolist()
    update_geo_cache(unique_cities, json_file)

    def to_location(city):
        # Empty city names keep the dataset's empty marker.
        if is_empty_str(city):
            return ''
        return geo_cache[city]

    df['location'] = df['city'].apply(to_location)
    return df
def prepare_dataset(json_file):
    """Read *json_file* into a DataFrame and run all preparation steps.

    The steps run in a fixed order (age, status, location) because the
    status step reads the 'age' column created by the age step.
    """
    df = pd.read_json(json_file)
    steps = (
        prepare_dataset_age,
        prepare_dataset_status,
        prepare_dataset_location,
    )
    for step in steps:
        df = step(df)
    return df
def __main(json_file): def __main(json_file):
df = pd.read_json(json_file) df = prepare_dataset(json_file)
print('done') print('done')

View File

@ -32,6 +32,8 @@ def __main(json_file_name):
current_col.append(person[key]) current_col.append(person[key])
df[key] = pd.Series(current_col) df[key] = pd.Series(current_col)
df = df.drop(columns=['is_closed', 'deactivated'])
pathname, extension = os.path.splitext(json_file_name) pathname, extension = os.path.splitext(json_file_name)
filename = pathname.split('/')[-1] filename = pathname.split('/')[-1]

View File

@ -1 +1,2 @@
pandas==2.0.1 pandas==2.0.1
geopy==2.3.0

View File

@ -23,28 +23,37 @@ class RawData:
@staticmethod @staticmethod
def get_int_st(value, attr): def get_int_st(value, attr):
if value is None: if value is None:
return -1 return ''
result = value[attr] result = value[attr]
if result is None: if result is None:
return -1 return ''
if not str(result).isnumeric(): if not str(result).isnumeric():
print(f'The value {result} is not a number') print(f'The value {result} is not a number')
return -1 return ''
return result return result
@staticmethod
def str_to_date(value, str_format):
return datetime.strptime(value, str_format).date().strftime('%d.%m.%Y')
@staticmethod @staticmethod
def get_date_st(value): def get_date_st(value):
if value is None: if value is None:
return '' return ''
try: try:
return datetime.strptime(value, '%d.%m.%Y').date() return RawData.str_to_date(value, '%d.%m.%Y')
except ValueError: except ValueError:
try: try:
return datetime.strptime(value, '%d.%m.%y').date() return RawData.str_to_date(value, '%d.%m.%y')
except ValueError: except ValueError:
print(f'Invalid date {value}') print(f'Invalid date {value}')
return '' return ''
@staticmethod @staticmethod
def get_collection_st(collection, function): def get_collection_st(collection, function):
return list(map(lambda item: function(item), [] if collection is None else collection)) if collection is None:
return ''
result_list = list(map(lambda item: function(item), collection))
if len(result_list) == 0:
return ''
return result_list

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long