Compare commits
2 Commits
bf376c66c9
...
a9dd12af39
Author | SHA1 | Date | |
---|---|---|---|
a9dd12af39 | |||
e16131b436 |
1358
geocache.json
Normal file
1358
geocache.json
Normal file
File diff suppressed because it is too large
Load Diff
161
main.py
161
main.py
@ -1,12 +1,171 @@
|
||||
#!/usr/bin/env python3
|
||||
import json
|
||||
import os
|
||||
import sys
|
||||
from datetime import date, datetime
|
||||
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
from geopy.extra.rate_limiter import RateLimiter
|
||||
from geopy.geocoders import Nominatim
|
||||
|
||||
# Sentinel stored in the 'age' column when no age could be determined.
EMPTY_AGE = 0

# Offsets added to the year distance taken from education records:
# 'graduation' of a university, 'year_from' and 'year_graduated' of a school.
UNIVERSITY_AGE = 21
SCHOOL_BEGIN_AGE = 7
SCHOOL_GRADUATED_AGE = 17
|
||||
|
||||
# Nominatim client used to resolve city names to coordinates.
# NOTE(review): "MyApp" is a generic user agent; confirm it satisfies the
# service's usage policy.
geolocator = Nominatim(user_agent="MyApp")

# Rate-limited wrapper around geolocator.geocode (min_delay_seconds=1).
geocode = RateLimiter(geolocator.geocode, min_delay_seconds=1)

# Module-level cache: city name -> (latitude, longitude).
geo_cache = {}
|
||||
|
||||
|
||||
def is_empty_str(value):
    """Return True when *value* is None or its ``str()`` form is blank.

    Non-string values are judged by their textual representation, so ``0``
    and ``[]`` are *not* considered empty.
    """
    return value is None or str(value).strip() == ''
|
||||
|
||||
|
||||
def is_empty_number(value):
    """Return True when *value* is blank or is not a whole number.

    A single leading minus sign is tolerated; the remaining text must pass
    ``str.isnumeric()``, so values such as ``'3.5'`` count as empty.
    """
    if is_empty_str(value):
        return True
    text = str(value)
    # Drop one leading '-' so negative integers are recognised as numbers.
    digits = text[1:] if text.startswith('-') else text
    return not digits.isnumeric()
|
||||
|
||||
|
||||
def is_empty_collection(collection):
    """Return True unless *collection* is a non-empty ``list``.

    None, blank strings, non-list values and empty lists are all treated as
    empty.
    """
    return (
        is_empty_str(collection)
        or not isinstance(collection, list)
        or len(collection) == 0
    )
|
||||
|
||||
|
||||
def get_age(date_str):
    """Return the age in full years for a ``DD.MM.YYYY`` birth date.

    Returns ``EMPTY_AGE`` when *date_str* is blank. Raises ``ValueError``
    (from ``strptime``) when the text does not match the expected format.
    """
    if is_empty_str(date_str):
        return EMPTY_AGE

    born = datetime.strptime(date_str, '%d.%m.%Y')
    now = date.today()

    years = now.year - born.year
    # The birthday has not happened yet this year: one year less.
    if (now.month, now.day) < (born.month, born.day):
        years -= 1
    return years
|
||||
|
||||
|
||||
def get_years(year1, year2):
    """Return the non-negative distance between two years.

    The order of the arguments does not matter. Using ``abs`` replaces the
    two overlapping comparisons of the previous version, which could fall
    through and implicitly return ``None`` for unordered inputs.
    """
    return abs(year1 - year2)
|
||||
|
||||
|
||||
def get_age_from_education(education, value, additional_value):
    """Estimate an age from a list of education records.

    Parameters:
        education: list of mapping-like records (anything else yields
            ``EMPTY_AGE``).
        value: key of the year field to read from each record.
        additional_value: offset added to the distance between that year
            and the current year.

    Returns the estimate from the first record whose year field holds a
    usable number, or ``EMPTY_AGE`` when no record has one.
    """
    if is_empty_collection(education):
        return EMPTY_AGE

    current_year = date.today().year
    for item in education:
        graduation = item[value]
        # Previously the loop gave up on the first record; keep looking so a
        # later record with a valid year can still be used.
        if is_empty_number(graduation):
            continue
        # NOTE(review): get_years is an absolute distance, so a year in the
        # future increases the estimate instead of lowering it - confirm.
        return get_years(graduation, current_year) + additional_value

    return EMPTY_AGE
|
||||
|
||||
|
||||
def prepare_dataset_age(df):
    """Add an 'age' column to *df* and return the same frame.

    The age is first derived from 'bdate'. Rows still at ``EMPTY_AGE`` are
    then filled, in order, from the 'graduation' year of 'universities', the
    'year_graduated' of 'schools' and finally the 'year_from' of 'schools'.
    """
    df['age'] = df.loc[:, 'bdate'].apply(get_age)

    # (source column, year key inside each record, offset added to the distance)
    fallbacks = (
        ('universities', 'graduation', UNIVERSITY_AGE),
        ('schools', 'year_graduated', SCHOOL_GRADUATED_AGE),
        ('schools', 'year_from', SCHOOL_BEGIN_AGE),
    )
    for column, year_key, offset in fallbacks:
        # Recomputed on every pass so rows resolved earlier are left alone.
        unresolved = (df['age'] == EMPTY_AGE) & (df[column].str.len() > 0)
        df.loc[unresolved, 'age'] = df.loc[unresolved, column].apply(
            lambda val, key=year_key, extra=offset: get_age_from_education(val, key, extra)
        )

    return df
|
||||
|
||||
|
||||
def prepare_dataset_status(df):
    """Add boolean status columns to *df* and return the same frame.

    Columns written: 'is_university', 'is_work', 'is_student' and
    'is_schoolboy', derived from 'age', 'universities' and
    'occupation_type'.
    """
    age = df['age']
    occupation = df['occupation_type']
    age_unknown = age == EMPTY_AGE

    # University background: old enough (or unknown age) and some evidence.
    has_university_data = (df['universities'].str.len() > 0) | (occupation == 'university')
    university = ((age >= UNIVERSITY_AGE) | age_unknown) & has_university_data
    df['is_university'] = np.where(university, True, False)

    # Working: past school age with evidence, or simply older than UNIVERSITY_AGE.
    past_school = (age > SCHOOL_GRADUATED_AGE) | age_unknown
    work_evidence = df['is_university'] | (occupation == 'work')
    working = (past_school & work_evidence) | (age > UNIVERSITY_AGE)
    df['is_work'] = np.where(working, True, False)

    # Student: university occupation within the inclusive age window.
    in_student_years = (age >= SCHOOL_GRADUATED_AGE) & (age <= UNIVERSITY_AGE)
    student = (occupation == 'university') & in_student_years
    df['is_student'] = np.where(student, True, False)

    # Schoolboy: known age below graduation, or unknown age with school occupation.
    known_young = (age < SCHOOL_GRADUATED_AGE) & (age != EMPTY_AGE)
    schoolboy = known_young | (age_unknown & (occupation == 'school'))
    df['is_schoolboy'] = np.where(schoolboy, True, False)

    return df
|
||||
|
||||
|
||||
def load_geo_cache(json_file):
    """Merge the geocode cache stored in *json_file* into ``geo_cache``.

    A missing file is not an error: it simply means nothing has been cached
    yet, so ``geo_cache`` is left unchanged. Malformed JSON still raises.
    """
    try:
        with open(json_file, 'r', encoding='utf-8') as rf:
            cached = json.load(rf)
    except FileNotFoundError:
        # First run without a cache file; entries will be created on demand.
        return
    geo_cache.update(cached)
|
||||
|
||||
|
||||
def save_geo_cache(json_file):
    """Write the whole ``geo_cache`` to *json_file* as JSON and report it."""
    with open(json_file, 'w') as out:
        json.dump(geo_cache, out)
    print('Geocache saved')
|
||||
|
||||
|
||||
def update_geo_cache(cities, json_file):
    """Geocode every city not yet in ``geo_cache`` and persist the cache.

    Parameters:
        cities: iterable of city names; blank entries are skipped.
        json_file: path the cache is saved to.

    Cities the geocoder cannot resolve are recorded as ``None`` instead of
    crashing the run; because the skip test ignores ``None`` entries, they
    are tried again on the next call. The cache is saved every time its size
    reaches a multiple of 50 and once more at the end if anything changed.
    """
    is_changed = False
    total = len(cities)

    for city in cities:
        if is_empty_str(city):
            continue
        if geo_cache.get(city) is not None:
            continue

        print(f'{len(geo_cache)}/{total} - Try to load geocode for {city}')
        location = geocode(city)
        if location is None:
            # No match (or a swallowed geocoder error): keep going.
            print(f'No geocode found for {city}')
            geo_cache[city] = None
        else:
            geo_cache[city] = (location.latitude, location.longitude)
        is_changed = True

        # Periodic checkpoint so a long run does not lose all progress.
        if len(geo_cache) % 50 == 0:
            save_geo_cache(json_file)

    if is_changed:
        save_geo_cache(json_file)
|
||||
|
||||
|
||||
def prepare_dataset_location(df, json_file='geocache.json'):
    """Add a 'location' column with the cached coordinates of each 'city'.

    Parameters:
        df: frame with a 'city' column.
        json_file: path of the geocode cache file; defaults to the
            previously hard-coded 'geocache.json'.

    Rows with a blank city get ``''``. Returns the same frame.
    """
    load_geo_cache(json_file)

    # Resolve each distinct city once before mapping rows.
    update_geo_cache(df['city'].unique().tolist(), json_file)

    df['location'] = df['city'].apply(
        lambda val: '' if is_empty_str(val) else geo_cache[val]
    )

    return df
|
||||
|
||||
|
||||
def prepare_dataset(json_file):
    """Load *json_file* into a DataFrame and run the age, status and
    location preparation steps on it, in that order."""
    return (
        pd.read_json(json_file)
        .pipe(prepare_dataset_age)
        .pipe(prepare_dataset_status)
        .pipe(prepare_dataset_location)
    )
|
||||
|
||||
|
||||
def __main(json_file):
    """Run the dataset preparation for *json_file* and report completion."""
    # prepare_dataset reads the file itself; the extra pd.read_json call that
    # used to precede it parsed the same file twice and was discarded.
    prepare_dataset(json_file)
    print('done')
|
||||
|
||||
|
||||
|
@ -32,6 +32,8 @@ def __main(json_file_name):
|
||||
current_col.append(person[key])
|
||||
df[key] = pd.Series(current_col)
|
||||
|
||||
df = df.drop(columns=['is_closed', 'deactivated'])
|
||||
|
||||
pathname, extension = os.path.splitext(json_file_name)
|
||||
filename = pathname.split('/')[-1]
|
||||
|
||||
|
@ -1 +1,2 @@
|
||||
pandas==2.0.1
|
||||
pandas==2.0.1
|
||||
geopy==2.3.0
|
@ -23,28 +23,37 @@ class RawData:
|
||||
@staticmethod
|
||||
def get_int_st(value, attr):
    """Return ``value[attr]`` when it is a non-negative whole number.

    Returns ``''`` when *value* is None, when the attribute is None, or when
    its text fails ``str.isnumeric()`` (a message is printed in that case).
    The empty string is used rather than ``-1`` because a ``-1`` would be
    accepted as a real number by later numeric checks.
    """
    if value is None:
        return ''
    result = value[attr]
    if result is None:
        return ''
    if not str(result).isnumeric():
        print(f'The value {result} is not a number')
        return ''
    return result
|
||||
|
||||
@staticmethod
|
||||
def str_to_date(value, str_format):
    """Parse *value* with *str_format* and return it as ``DD.MM.YYYY`` text.

    Raises ``ValueError`` when *value* does not match *str_format*.
    """
    parsed = datetime.strptime(value, str_format)
    return parsed.date().strftime('%d.%m.%Y')
|
||||
|
||||
@staticmethod
|
||||
def get_date_st(value):
    """Normalise a date string to ``DD.MM.YYYY`` text.

    Accepts a four-digit year (``%d.%m.%Y``) first and a two-digit year
    (``%d.%m.%y``) second. Returns ``''`` for None or for text matching
    neither format (a message is printed in the latter case).
    """
    if value is None:
        return ''
    for date_format in ('%d.%m.%Y', '%d.%m.%y'):
        try:
            return RawData.str_to_date(value, date_format)
        except ValueError:
            continue
    print(f'Invalid date {value}')
    return ''
|
||||
|
||||
@staticmethod
|
||||
def get_collection_st(collection, function):
    """Apply *function* to every item of *collection*.

    Returns the list of results, or ``''`` when *collection* is None or
    produces no items.
    """
    if collection is None:
        return ''
    result_list = [function(item) for item in collection]
    if len(result_list) == 0:
        return ''
    return result_list
|
||||
|
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
Loading…
Reference in New Issue
Block a user