Compare commits
No commits in common. "a9dd12af39f9b0e4658f733196afae50578e3421" and "bf376c66c9f01384c510bbfbc53ba534b3b3947f" have entirely different histories.
a9dd12af39
...
bf376c66c9
1358
geocache.json
1358
geocache.json
File diff suppressed because it is too large
Load Diff
161
main.py
161
main.py
@ -1,171 +1,12 @@
|
|||||||
#!/usr/bin/env python3
|
#!/usr/bin/env python3
|
||||||
import json
|
|
||||||
import os
|
import os
|
||||||
import sys
|
import sys
|
||||||
from datetime import date, datetime
|
|
||||||
|
|
||||||
import numpy as np
|
|
||||||
import pandas as pd
|
import pandas as pd
|
||||||
from geopy.extra.rate_limiter import RateLimiter
|
|
||||||
from geopy.geocoders import Nominatim
|
|
||||||
|
|
||||||
# Sentinel meaning "age unknown"; get_age returns it for an empty birth date
# and the dataset masks compare df['age'] against it.
EMPTY_AGE = 0
|
|
||||||
# Offset added to the years elapsed since a university 'graduation' year.
UNIVERSITY_AGE = 21
|
|
||||||
# Offset added to the years elapsed since a school 'year_from' year.
SCHOOL_BEGIN_AGE = 7
|
|
||||||
# Offset added to the years elapsed since a school 'year_graduated' year.
SCHOOL_GRADUATED_AGE = 17
|
|
||||||
|
|
||||||
# Nominatim geocoder client, created once at import time.
geolocator = Nominatim(user_agent="MyApp")
|
|
||||||
# Rate-limited wrapper around geolocator.geocode: at least 1 second between calls.
geocode = RateLimiter(geolocator.geocode, min_delay_seconds=1)
|
|
||||||
|
|
||||||
# In-memory cache keyed by city name; values are (latitude, longitude) results.
# Filled from the JSON cache file and by update_geo_cache.
geo_cache = {}
|
|
||||||
|
|
||||||
|
|
||||||
def is_empty_str(value):
    """Return True when *value* is None or its string form is blank.

    The value is converted with ``str()`` first, so non-string inputs such
    as numbers or lists are judged by their textual representation.
    """
    return value is None or not str(value).strip()
|
|
||||||
|
|
||||||
|
|
||||||
def is_empty_number(value):
    """Return True when *value* does not look like a (possibly negative) integer.

    Blank values count as empty. Otherwise one leading minus sign is
    ignored and the rest of the text must satisfy ``str.isnumeric()``.
    """
    if is_empty_str(value):
        return True
    digits = str(value)
    if digits.startswith('-'):
        # Drop only the leading sign; any further '-' makes the text non-numeric.
        digits = digits[1:]
    return not digits.isnumeric()
|
|
||||||
|
|
||||||
|
|
||||||
def is_empty_collection(collection):
    """Return True unless *collection* is a list with at least one element.

    Blank values and anything that is not a ``list`` are reported as empty.
    """
    return (
        is_empty_str(collection)
        or not isinstance(collection, list)
        or len(collection) == 0
    )
|
|
||||||
|
|
||||||
|
|
||||||
def get_age(date_str):
    """Return the age in full years for a birth date in ``DD.MM.YYYY`` form.

    Returns EMPTY_AGE when *date_str* is blank. Raises ValueError (from
    ``datetime.strptime``) when the text does not match the expected format.
    """
    if is_empty_str(date_str):
        return EMPTY_AGE

    born = datetime.strptime(date_str, '%d.%m.%Y')
    now = date.today()

    years = now.year - born.year
    # The birthday has not happened yet this year: one year less.
    if (now.month, now.day) < (born.month, born.day):
        years -= 1
    return years
|
|
||||||
|
|
||||||
|
|
||||||
def get_years(year1, year2):
    """Return the absolute number of years between *year1* and *year2*.

    The order of the arguments does not matter; the result is never
    negative. Uses ``abs()`` instead of two overlapping comparisons, which
    also removes the implicit ``None`` fall-through of the previous version.
    """
    return abs(year1 - year2)
|
|
||||||
|
|
||||||
|
|
||||||
def get_age_from_education(education, value, additional_value):
    """Estimate an age from the years stored in a list of education entries.

    Parameters:
        education: list of dict-like entries (e.g. universities or schools).
        value: key of the year field to read from each entry.
        additional_value: typical age at the event that year describes; it
            is added to the number of years elapsed since then.

    Returns:
        The estimated age taken from the first entry with a usable year, or
        EMPTY_AGE when *education* is empty or no entry yields a usable year.

    Entries are tried in order. An entry is skipped when the key is missing,
    the value is not an integer, the year is not positive, or the resulting
    age would not be positive. The difference is signed (current year minus
    the stored year), so a year in the future lowers the estimate instead
    of raising it.
    """
    if is_empty_collection(education):
        return EMPTY_AGE

    current_year = date.today().year
    for item in education:
        # A missing key is treated like an empty value, not an error.
        raw_year = item.get(value) if isinstance(item, dict) else None
        if is_empty_number(raw_year):
            continue
        try:
            # Numeric strings pass is_empty_number; convert before arithmetic.
            year = int(raw_year)
        except (TypeError, ValueError):
            continue
        if year <= 0:
            # Not a valid calendar year; treat it as a missing placeholder.
            continue
        age = current_year - year + additional_value
        if age > 0:
            return age

    return EMPTY_AGE
|
|
||||||
|
|
||||||
|
|
||||||
def prepare_dataset_age(df):
    """Add an ``age`` column to *df* and return the same frame.

    The age is first computed from ``bdate``. Rows still at EMPTY_AGE are
    then filled from education data, trying in order: university
    graduation year, school graduation year, school start year. Each step
    only touches rows that are still unresolved after the previous one.
    """
    df['age'] = df.loc[:, 'bdate'].apply(get_age)

    # (source column, year key inside each entry, age offset), in priority order.
    fallbacks = (
        ('universities', 'graduation', UNIVERSITY_AGE),
        ('schools', 'year_graduated', SCHOOL_GRADUATED_AGE),
        ('schools', 'year_from', SCHOOL_BEGIN_AGE),
    )
    for column, year_key, offset in fallbacks:
        # Recomputed on every pass so rows resolved earlier are left alone.
        unresolved = (df['age'] == EMPTY_AGE) & (df[column].str.len() > 0)
        df.loc[unresolved, 'age'] = df.loc[unresolved, column].apply(
            lambda val, key=year_key, extra=offset: get_age_from_education(val, key, extra)
        )

    return df
|
|
||||||
|
|
||||||
|
|
||||||
def prepare_dataset_status(df):
    """Add the boolean status columns to *df* and return the same frame.

    Columns written: ``is_university``, ``is_work``, ``is_student`` and
    ``is_schoolboy``. They are derived from ``age``, ``universities`` and
    ``occupation_type``; ``is_work`` also reads the freshly written
    ``is_university`` column.
    """
    age = df['age']
    occupation = df['occupation_type']
    age_unknown = age == EMPTY_AGE
    has_university = df['universities'].str.len() > 0

    # Old enough (or age unknown) and some university evidence.
    university_age_ok = (age >= UNIVERSITY_AGE) | age_unknown
    university_evidence = has_university | (occupation == 'university')
    is_university_mask = university_age_ok & university_evidence
    df['is_university'] = np.where(is_university_mask, True, False)

    # Past school age (or unknown) with university/work evidence,
    # or simply older than the university age.
    work_age_ok = (age > SCHOOL_GRADUATED_AGE) | age_unknown
    work_evidence = (df['is_university']) | (occupation == 'work')
    is_work_mask = (work_age_ok & work_evidence) | (age > UNIVERSITY_AGE)
    df['is_work'] = np.where(is_work_mask, True, False)

    # Currently at a university and within the typical student age range.
    in_student_range = (age >= SCHOOL_GRADUATED_AGE) & (age <= UNIVERSITY_AGE)
    is_student_mask = (occupation == 'university') & in_student_range
    df['is_student'] = np.where(is_student_mask, True, False)

    # Known age below school graduation, or unknown age but occupation 'school'.
    known_school_age = (age < SCHOOL_GRADUATED_AGE) & (age != EMPTY_AGE)
    unknown_but_school = age_unknown & (occupation == 'school')
    is_schoolboy_mask = known_school_age | unknown_but_school
    df['is_schoolboy'] = np.where(is_schoolboy_mask, True, False)

    return df
|
|
||||||
|
|
||||||
|
|
||||||
def load_geo_cache(json_file):
    """Merge the geocode cache stored in *json_file* into ``geo_cache``.

    A missing file is treated as an empty cache, so the first run works
    without a pre-existing cache file. The file is read as UTF-8 so the
    result does not depend on the platform's default encoding.

    Raises:
        json.JSONDecodeError: if the file exists but is not valid JSON.
    """
    try:
        with open(json_file, 'r', encoding='utf-8') as rf:
            geo_cache.update(json.load(rf))
    except FileNotFoundError:
        print(f'Geocache file {json_file} not found, starting with an empty cache')
|
|
||||||
|
|
||||||
|
|
||||||
def save_geo_cache(json_file):
    """Write the whole ``geo_cache`` dict to *json_file* as JSON.

    The file is overwritten; a confirmation line is printed afterwards.
    """
    with open(json_file, 'w') as wf:
        wf.write(json.dumps(geo_cache))
    print('Geocache saved')
|
|
||||||
|
|
||||||
|
|
||||||
def update_geo_cache(cities, json_file):
    """Geocode every city missing from ``geo_cache`` and persist the cache.

    Parameters:
        cities: iterable of city names; blank entries are skipped.
        json_file: path of the JSON cache file to write.

    Cities that already have a cached result are not looked up again. The
    cache is saved every time its size reaches a multiple of 50, and once
    more at the end if anything changed.

    A lookup that yields no result is stored as ``None`` instead of raising
    ``AttributeError``. The city then still has an entry for later
    ``geo_cache[city]`` lookups, and is retried on the next run because
    ``None`` does not count as a cached result.
    """
    is_changed = False
    for city in cities:
        if is_empty_str(city):
            continue
        if geo_cache.get(city) is not None:
            continue

        print(f'{len(geo_cache)}/{len(cities)} - Try to load geocode for {city}')
        location = geocode(city)
        if location is None:
            # geopy returns None when nothing matches the query.
            print(f'No geocode found for {city}')
            if city not in geo_cache:
                geo_cache[city] = None
                is_changed = True
            continue

        geo_cache[city] = (location.latitude, location.longitude)
        is_changed = True
        # Periodic save so a long run does not lose all progress on failure.
        if len(geo_cache) % 50 == 0:
            save_geo_cache(json_file)

    if is_changed:
        save_geo_cache(json_file)
|
|
||||||
|
|
||||||
|
|
||||||
def prepare_dataset_location(df):
    """Add a ``location`` column to *df* from the ``city`` column and return it.

    The geocode cache in ``geocache.json`` is loaded, extended with every
    distinct city in the frame, and then used to map each city to its cached
    value. Rows with a blank city get an empty string.
    """
    json_file = 'geocache.json'

    load_geo_cache(json_file)
    distinct_cities = df['city'].unique().tolist()
    update_geo_cache(distinct_cities, json_file)

    def lookup(city):
        # Blank cities have no cache entry; they map to ''.
        if is_empty_str(city):
            return ''
        return geo_cache[city]

    df['location'] = df['city'].apply(lookup)
    return df
|
|
||||||
|
|
||||||
|
|
||||||
def prepare_dataset(json_file):
    """Load *json_file* into a DataFrame and run all preparation stages.

    The stages run in order: age, status, location. Each stage receives the
    frame returned by the previous one.
    """
    frame = pd.read_json(json_file)
    stages = (
        prepare_dataset_age,
        prepare_dataset_status,
        prepare_dataset_location,
    )
    for stage in stages:
        frame = stage(frame)
    return frame
|
|
||||||
|
|
||||||
|
|
||||||
def __main(json_file):
|
def __main(json_file):
|
||||||
df = prepare_dataset(json_file)
|
df = pd.read_json(json_file)
|
||||||
print('done')
|
print('done')
|
||||||
|
|
||||||
|
|
||||||
|
@ -32,8 +32,6 @@ def __main(json_file_name):
|
|||||||
current_col.append(person[key])
|
current_col.append(person[key])
|
||||||
df[key] = pd.Series(current_col)
|
df[key] = pd.Series(current_col)
|
||||||
|
|
||||||
df = df.drop(columns=['is_closed', 'deactivated'])
|
|
||||||
|
|
||||||
pathname, extension = os.path.splitext(json_file_name)
|
pathname, extension = os.path.splitext(json_file_name)
|
||||||
filename = pathname.split('/')[-1]
|
filename = pathname.split('/')[-1]
|
||||||
|
|
||||||
|
@ -1,2 +1 @@
|
|||||||
pandas==2.0.1
|
pandas==2.0.1
|
||||||
geopy==2.3.0
|
|
@ -23,37 +23,28 @@ class RawData:
|
|||||||
@staticmethod
|
@staticmethod
|
||||||
def get_int_st(value, attr):
|
def get_int_st(value, attr):
|
||||||
if value is None:
|
if value is None:
|
||||||
return ''
|
return -1
|
||||||
result = value[attr]
|
result = value[attr]
|
||||||
if result is None:
|
if result is None:
|
||||||
return ''
|
return -1
|
||||||
if not str(result).isnumeric():
|
if not str(result).isnumeric():
|
||||||
print(f'The value {result} is not a number')
|
print(f'The value {result} is not a number')
|
||||||
return ''
|
return -1
|
||||||
return result
|
return result
|
||||||
|
|
||||||
@staticmethod
|
|
||||||
def str_to_date(value, str_format):
|
|
||||||
return datetime.strptime(value, str_format).date().strftime('%d.%m.%Y')
|
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def get_date_st(value):
|
def get_date_st(value):
|
||||||
if value is None:
|
if value is None:
|
||||||
return ''
|
return ''
|
||||||
try:
|
try:
|
||||||
return RawData.str_to_date(value, '%d.%m.%Y')
|
return datetime.strptime(value, '%d.%m.%Y').date()
|
||||||
except ValueError:
|
except ValueError:
|
||||||
try:
|
try:
|
||||||
return RawData.str_to_date(value, '%d.%m.%y')
|
return datetime.strptime(value, '%d.%m.%y').date()
|
||||||
except ValueError:
|
except ValueError:
|
||||||
print(f'Invalid date {value}')
|
print(f'Invalid date {value}')
|
||||||
return ''
|
return ''
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def get_collection_st(collection, function):
|
def get_collection_st(collection, function):
|
||||||
if collection is None:
|
return list(map(lambda item: function(item), [] if collection is None else collection))
|
||||||
return ''
|
|
||||||
result_list = list(map(lambda item: function(item), collection))
|
|
||||||
if len(result_list) == 0:
|
|
||||||
return ''
|
|
||||||
return result_list
|
|
||||||
|
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
Loading…
x
Reference in New Issue
Block a user