Add initial version of analyser

This commit is contained in:
Aleksey Filippov 2023-05-29 22:56:53 +04:00
parent e16131b436
commit a9dd12af39
3 changed files with 1520 additions and 2 deletions

1358
geocache.json Normal file

File diff suppressed because it is too large Load Diff

161
main.py
View File

@ -1,12 +1,171 @@
#!/usr/bin/env python3 #!/usr/bin/env python3
import json
import os import os
import sys import sys
from datetime import date, datetime
import numpy as np
import pandas as pd import pandas as pd
from geopy.extra.rate_limiter import RateLimiter
from geopy.geocoders import Nominatim
# Sentinel age meaning "unknown"; the age helpers return it when no age can
# be derived from the available data.
EMPTY_AGE = 0
# Age offsets added to the number of years elapsed since an education year:
# UNIVERSITY_AGE pairs with a university 'graduation' year,
# SCHOOL_BEGIN_AGE with a school 'year_from' and SCHOOL_GRADUATED_AGE with a
# school 'year_graduated'.  They also serve as thresholds for status flags.
UNIVERSITY_AGE = 21
SCHOOL_BEGIN_AGE = 7
SCHOOL_GRADUATED_AGE = 17
# Nominatim geocoding client, created once at import time.
geolocator = Nominatim(user_agent="MyApp")
# Rate-limited geocoding callable: at least one second between requests
# (presumably to respect the public Nominatim usage policy - confirm).
geocode = RateLimiter(geolocator.geocode, min_delay_seconds=1)
# In-memory geocode cache: city name -> (latitude, longitude).
geo_cache = {}
def is_empty_str(value):
    """Return True when ``value`` carries no text.

    A value counts as empty when it is ``None``, a float NaN (the marker
    pandas uses for a missing cell), or when its string form consists of
    whitespace only.

    Args:
        value: Any object; non-strings are converted with ``str()``.

    Returns:
        bool: True if the value is missing or blank, False otherwise.
    """
    if value is None:
        return True
    # NaN is the only float that differs from itself.  Without this check a
    # missing cell would be stringified to 'nan' and reported as non-empty.
    if isinstance(value, float) and value != value:
        return True
    return not str(value).strip()
def is_empty_number(value):
    """Return True when ``value`` does not look like a whole number.

    The value is converted to text, one leading minus sign is ignored, and
    the remainder must consist solely of numeric characters.  ``None`` and
    blank values are reported as empty.

    Args:
        value: Any object; it is inspected through its ``str()`` form.

    Returns:
        bool: True if the value is missing, blank or non-numeric.
    """
    if value is None:
        return True
    text = str(value)
    if not text.strip():
        return True
    # Drop a single leading sign so negative integers are accepted too.
    unsigned = text[1:] if text.startswith('-') else text
    return not unsigned.isnumeric()
def is_empty_collection(collection):
    """Return True unless ``collection`` is a list with at least one item.

    Anything that is not a ``list`` (``None``, strings, NaN, tuples, dicts)
    is treated as empty, as is a list of length zero.

    Args:
        collection: The value to inspect.

    Returns:
        bool: False only for a non-empty list, True otherwise.
    """
    # The type check alone rejects None and strings, so the collection does
    # not need to be stringified first just to test it for blankness.
    return not (isinstance(collection, list) and len(collection) > 0)
def get_age(date_str):
    """Return the age in completed years for a ``DD.MM.YYYY`` birth date.

    Args:
        date_str: Birth date text in ``%d.%m.%Y`` format.

    Returns:
        int: Full years between the birth date and today, or ``EMPTY_AGE``
        when the value is empty or is not a complete date in that format
        (for example a day-and-month value without a year).
    """
    if is_empty_str(date_str):
        return EMPTY_AGE
    try:
        birthdate = datetime.strptime(str(date_str).strip(), '%d.%m.%Y')
    except ValueError:
        # One unparseable date must not abort the whole dataset; report the
        # age as unknown so other sources can still be used for this row.
        return EMPTY_AGE
    today = date.today()
    # Take one year off when this year's birthday has not happened yet.
    birthday_pending = (today.month, today.day) < (birthdate.month, birthdate.day)
    return today.year - birthdate.year - birthday_pending
def get_years(year1, year2):
    """Return the absolute number of years between two years.

    Args:
        year1: First year.
        year2: Second year.

    Returns:
        The non-negative difference ``|year1 - year2|``; the argument order
        does not matter.
    """
    # abs() replaces two overlapping comparisons and never falls through to
    # an implicit None for operands that compare as unordered.
    return abs(year1 - year2)
def get_age_from_education(education, value, additional_value):
    """Estimate an age from the first usable year in education records.

    Args:
        education: List of education records (dicts).
        value: Key of the year field to read from each record, for example
            ``'graduation'``.
        additional_value: Number of years added to the distance between the
            recorded year and the current year.

    Returns:
        ``EMPTY_AGE`` when ``education`` is empty or no record holds a usable
        year; otherwise the years between the first usable year and the
        current year plus ``additional_value``.
    """
    if is_empty_collection(education):
        return EMPTY_AGE
    current_year = date.today().year
    for item in education:
        # A record may omit the field entirely; treat that as an empty value
        # instead of raising KeyError.
        year = item.get(value) if isinstance(item, dict) else None
        if is_empty_number(year):
            # Keep looking: a later record may still carry a usable year.
            continue
        try:
            # Numeric strings pass is_empty_number but cannot be subtracted.
            year = int(year)
        except (TypeError, ValueError):
            continue
        # NOTE(review): get_years is an absolute distance, so a year in the
        # future gives the same estimate as one equally far in the past -
        # confirm this is intended.
        return get_years(year, current_year) + additional_value
    return EMPTY_AGE
def prepare_dataset_age(df):
    """Add an ``age`` column, falling back to education data when needed.

    The age is first computed from ``bdate``.  Rows still holding
    ``EMPTY_AGE`` are then filled, in order, from the university graduation
    year, the school graduation year and the school start year.

    Args:
        df: DataFrame with ``bdate``, ``universities`` and ``schools`` columns.

    Returns:
        The same DataFrame, modified in place.
    """
    df['age'] = df.loc[:, 'bdate'].apply(get_age)

    # (source column, year key inside each record, age offset for that year)
    fallbacks = (
        ('universities', 'graduation', UNIVERSITY_AGE),
        ('schools', 'year_graduated', SCHOOL_GRADUATED_AGE),
        ('schools', 'year_from', SCHOOL_BEGIN_AGE),
    )
    for column, year_key, age_offset in fallbacks:
        # Recomputed on every pass so rows resolved earlier are skipped.
        unresolved = (df['age'] == EMPTY_AGE) & (df[column].str.len() > 0)
        df.loc[unresolved, 'age'] = df.loc[unresolved, column].apply(
            get_age_from_education, args=(year_key, age_offset))
    return df
def prepare_dataset_status(df):
    """Add boolean status columns derived from age and occupation.

    Adds ``is_university``, ``is_work``, ``is_student`` and ``is_schoolboy``
    based on the ``age``, ``universities`` and ``occupation_type`` columns.

    Args:
        df: DataFrame that already holds an ``age`` column.

    Returns:
        The same DataFrame, modified in place.
    """
    age = df['age']
    occupation = df['occupation_type']
    age_unknown = age == EMPTY_AGE
    has_university_records = df['universities'].str.len() > 0

    # University background: old enough (or age unknown) and either listed
    # universities or a university occupation.
    df['is_university'] = (
        ((age >= UNIVERSITY_AGE) | age_unknown)
        & (has_university_records | (occupation == 'university'))
    )

    # Working: past school age (or unknown) with a university background or
    # a work occupation - or simply older than the university age.
    past_school = (age > SCHOOL_GRADUATED_AGE) | age_unknown
    df['is_work'] = (
        (past_school & (df['is_university'] | (occupation == 'work')))
        | (age > UNIVERSITY_AGE)
    )

    # Student: university occupation within the school-to-university range.
    in_student_range = (age >= SCHOOL_GRADUATED_AGE) & (age <= UNIVERSITY_AGE)
    df['is_student'] = (occupation == 'university') & in_student_range

    # Schoolchild: known age below graduation age, or unknown age with a
    # school occupation.
    df['is_schoolboy'] = (
        ((age < SCHOOL_GRADUATED_AGE) & ~age_unknown)
        | (age_unknown & (occupation == 'school'))
    )
    return df
def load_geo_cache(json_file):
    """Merge the geocode cache stored in ``json_file`` into ``geo_cache``.

    A missing file is treated as an empty cache, so the first run works
    without a pre-existing cache file.  Coordinate pairs are converted back
    to tuples because JSON stores them as lists; cached entries therefore
    have the same (hashable) type as freshly geocoded ones.

    Args:
        json_file: Path of the JSON cache file, a mapping of city name to
            coordinates.
    """
    try:
        with open(json_file, 'r', encoding='utf-8') as rf:
            stored = json.load(rf)
    except FileNotFoundError:
        # Nothing cached yet; the file is created by the first save.
        return
    geo_cache.update({
        city: tuple(coords) if isinstance(coords, list) else coords
        for city, coords in stored.items()
    })
def save_geo_cache(json_file):
    """Write the in-memory ``geo_cache`` to ``json_file`` as JSON.

    Prints a confirmation line once the file has been written.

    Args:
        json_file: Path of the JSON cache file; it is overwritten.
    """
    with open(json_file, 'w') as cache_file:
        cache_file.write(json.dumps(geo_cache))
    print('Geocache saved')
def update_geo_cache(cities, json_file):
    """Geocode every city missing from ``geo_cache`` and persist the cache.

    Empty names and cities that already have cached coordinates are skipped.
    The cache is written to disk after every 50 lookups and once more at the
    end if anything is still unsaved.

    Args:
        cities: Sized collection of city names.
        json_file: Path of the JSON cache file to write.
    """
    unsaved = 0
    for city in cities:
        if is_empty_str(city):
            continue
        if geo_cache.get(city) is not None:
            continue
        print(f'{len(geo_cache)}/{len(cities)} - Try to load geocode for {city}')
        location = geocode(city)
        if location is None:
            # The geocoder found nothing (or the request failed).  Record the
            # city with no coordinates so lookups still succeed; the entry
            # is retried on a later run because it holds None.
            geo_cache[city] = None
        else:
            geo_cache[city] = (location.latitude, location.longitude)
        unsaved += 1
        # Persist periodically so a long run can be interrupted cheaply.
        if unsaved >= 50:
            save_geo_cache(json_file)
            unsaved = 0
    if unsaved:
        save_geo_cache(json_file)
def prepare_dataset_location(df, json_file='geocache.json'):
    """Add a ``location`` column with the cached coordinates of each city.

    The geocode cache is loaded from ``json_file``, extended with any city
    in ``df['city']`` that is not cached yet, and then used to fill the new
    column.  Rows with an empty city get an empty string.

    Args:
        df: DataFrame with a ``city`` column.
        json_file: Path of the JSON geocode cache; defaults to
            ``'geocache.json'`` in the current working directory.

    Returns:
        The same DataFrame, modified in place.
    """
    load_geo_cache(json_file)
    update_geo_cache(df['city'].unique().tolist(), json_file)
    df['location'] = df['city'].apply(
        lambda city: '' if is_empty_str(city) else geo_cache[city])
    return df
def prepare_dataset(json_file):
    """Load the JSON dataset and run every preparation stage over it.

    Args:
        json_file: Path of the JSON file to read with ``pd.read_json``.

    Returns:
        The DataFrame after the age, status and location stages, in that
        order.
    """
    stages = (
        prepare_dataset_age,
        prepare_dataset_status,
        prepare_dataset_location,
    )
    df = pd.read_json(json_file)
    for stage in stages:
        df = stage(df)
    return df
def __main(json_file):
    """Prepare the dataset stored in ``json_file`` and report completion."""
    prepare_dataset(json_file)
    print('done')

View File

@ -1 +1,2 @@
pandas==2.0.1 pandas==2.0.1
geopy==2.3.0