#!/usr/bin/env python3
"""Prepare a raw profile dataset loaded from a JSON file.

The pipeline adds an ``age`` column (from the birth date, falling back to
education records), boolean status columns (``is_university``, ``is_work``,
``is_student``, ``is_schoolboy``) and a ``location`` column holding the
geocoded coordinates of each row's ``city``.
"""
import json
import os
import sys
from datetime import date, datetime

import numpy as np
import pandas as pd
from geopy.extra.rate_limiter import RateLimiter
from geopy.geocoders import Nominatim

# Sentinel stored in the ``age`` column when the age could not be determined.
EMPTY_AGE = 0
# Ages added to "years since <event>" when estimating age from education data.
UNIVERSITY_AGE = 21
SCHOOL_BEGIN_AGE = 7
SCHOOL_GRADUATED_AGE = 17

geolocator = Nominatim(user_agent="MyApp")
# Rate-limited wrapper: at least one second between geocoding requests.
geocode = RateLimiter(geolocator.geocode, min_delay_seconds=1)
# In-memory cache: city name -> (latitude, longitude); persisted as JSON.
geo_cache = {}


def is_empty_str(value):
    """Return True if *value* is None, NaN, or blank once converted to str."""
    if value is None:
        return True
    # pandas marks missing values with float NaN, whose str() is 'nan' and
    # would otherwise be treated as real text. NaN is the only float that is
    # not equal to itself.
    if isinstance(value, float) and value != value:
        return True
    return not str(value).strip()


def is_empty_number(value):
    """Return True unless str(value) is numeric, optionally with one leading '-'.

    Only ``str.isnumeric`` text is accepted, so values such as ``2010.0``
    count as empty.
    """
    if is_empty_str(value):
        return True
    text = str(value)
    if text.startswith('-'):
        text = text[1:]
    return not text.isnumeric()


def is_empty_collection(collection):
    """Return True if *collection* is not a non-empty ``list``."""
    if is_empty_str(collection):
        return True
    return not isinstance(collection, list) or not collection


def get_age(date_str):
    """Return the age in full years for a ``DD.MM.YYYY`` birth date.

    Returns EMPTY_AGE when the value is empty or does not match the format
    (for example a birth date given without a year).
    """
    if is_empty_str(date_str):
        return EMPTY_AGE
    try:
        birthdate = datetime.strptime(str(date_str), '%d.%m.%Y')
    except ValueError:
        return EMPTY_AGE
    today = date.today()
    # Subtract one year if this year's birthday has not happened yet.
    birthday_pending = (today.month, today.day) < (birthdate.month, birthdate.day)
    return today.year - birthdate.year - birthday_pending


def get_years(year1, year2):
    """Return the absolute difference between two years."""
    return abs(year1 - year2)


def get_age_from_education(education, value, additional_value):
    """Estimate an age from a list of education records.

    Uses the first record whose *value* field holds a usable number and
    returns the distance between that year and the current year plus
    *additional_value*. Returns EMPTY_AGE when *education* is not a non-empty
    list or no record has a usable year.

    NOTE(review): the distance is absolute, so a year in the future raises the
    estimate instead of lowering it - confirm this is intended.
    NOTE(review): a year of 0 passes ``is_empty_number`` and produces a very
    large age - confirm whether 0 should be treated as "unknown".
    """
    if is_empty_collection(education):
        return EMPTY_AGE
    current_year = date.today().year
    for item in education:
        year = item.get(value) if isinstance(item, dict) else None
        if is_empty_number(year):
            continue
        try:
            # The field may arrive as a numeric string; normalise to int so the
            # subtraction below cannot fail on str - int.
            year = int(year)
        except ValueError:
            # str.isnumeric accepts characters (e.g. fractions) that int() rejects.
            continue
        return get_years(year, current_year) + additional_value
    return EMPTY_AGE


def _fill_missing_age(df, column, key, offset):
    """Fill still-unknown ages in place from the education lists in *column*."""
    mask = (df['age'] == EMPTY_AGE) & (df[column].str.len() > 0)
    df.loc[mask, 'age'] = df.loc[mask, column].apply(
        lambda val: get_age_from_education(val, key, offset))


def prepare_dataset_age(df):
    """Add an ``age`` column to *df* and return it.

    The age comes from ``bdate``; rows left at EMPTY_AGE are then filled, in
    order, from university graduation year, school graduation year and school
    start year.
    """
    df['age'] = df.loc[:, 'bdate'].apply(get_age)
    # Order matters: each step only touches rows the previous ones left empty.
    _fill_missing_age(df, 'universities', 'graduation', UNIVERSITY_AGE)
    _fill_missing_age(df, 'schools', 'year_graduated', SCHOOL_GRADUATED_AGE)
    _fill_missing_age(df, 'schools', 'year_from', SCHOOL_BEGIN_AGE)
    return df


def prepare_dataset_status(df):
    """Add the boolean status columns derived from age and occupation."""
    age = df['age']
    occupation = df['occupation_type']
    age_unknown = age == EMPTY_AGE

    is_university_mask = ((age >= UNIVERSITY_AGE) | age_unknown) & \
                         ((df['universities'].str.len() > 0) | (occupation == 'university'))
    df['is_university'] = np.where(is_university_mask, True, False)

    # `&` binds tighter than `|`: anyone older than UNIVERSITY_AGE is counted
    # as working regardless of the other conditions.
    is_work_mask = (((age > SCHOOL_GRADUATED_AGE) | age_unknown) &
                    (df['is_university'] | (occupation == 'work'))) | \
                   (age > UNIVERSITY_AGE)
    df['is_work'] = np.where(is_work_mask, True, False)

    is_student_mask = (occupation == 'university') & \
                      (age >= SCHOOL_GRADUATED_AGE) & (age <= UNIVERSITY_AGE)
    df['is_student'] = np.where(is_student_mask, True, False)

    is_schoolboy_mask = ((age < SCHOOL_GRADUATED_AGE) & ~age_unknown) | \
                        (age_unknown & (occupation == 'school'))
    df['is_schoolboy'] = np.where(is_schoolboy_mask, True, False)
    return df


def load_geo_cache(json_file):
    """Merge the cache stored in *json_file* into ``geo_cache``.

    A missing file is not an error: on the first run there is nothing to load.
    """
    if not os.path.isfile(json_file):
        return
    with open(json_file, 'r', encoding='utf-8') as rf:
        geo_cache.update(json.load(rf))


def save_geo_cache(json_file):
    """Write ``geo_cache`` to *json_file* as JSON."""
    with open(json_file, 'w', encoding='utf-8') as wf:
        json.dump(geo_cache, wf)
    print('Geocache saved')


def update_geo_cache(cities, json_file):
    """Geocode every city not yet in ``geo_cache`` and persist the cache.

    Empty names and cities the geocoder cannot resolve are skipped. The cache
    is checkpointed whenever its size reaches a multiple of 50 and once more
    at the end if there are unsaved entries.
    """
    dirty = False
    for city in cities:
        if is_empty_str(city) or geo_cache.get(city) is not None:
            continue
        print(f'{len(geo_cache.keys())}/{len(cities)} - Try to load geocode for {city}')
        location = geocode(city)
        if location is None:
            # The geocoder returns None when nothing is found; leave the city
            # out of the cache so it can be retried on a later run.
            continue
        geo_cache[city] = (location.latitude, location.longitude)
        dirty = True
        if len(geo_cache) % 50 == 0:
            save_geo_cache(json_file)
            dirty = False
    if dirty:
        save_geo_cache(json_file)


def prepare_dataset_location(df):
    """Add a ``location`` column with the cached coordinates of each city.

    Rows with an empty city, or a city that could not be geocoded, get ``''``.
    """
    json_file = 'geocache.json'
    load_geo_cache(json_file)
    update_geo_cache(df['city'].unique().tolist(), json_file)
    df['location'] = df['city'].apply(
        lambda val: '' if is_empty_str(val) else geo_cache.get(val, ''))
    return df


def prepare_dataset(json_file):
    """Load *json_file* into a DataFrame and run all preparation steps."""
    df = pd.read_json(json_file)
    df = prepare_dataset_age(df)
    df = prepare_dataset_status(df)
    df = prepare_dataset_location(df)
    return df


def __main(json_file):
    """Script entry point: prepare the dataset and report completion."""
    prepare_dataset(json_file)
    print('done')


if __name__ == '__main__':
    if len(sys.argv) != 2:
        print('You must specify the raw_dataset json file')
        sys.exit(1)
    if not os.path.isfile(sys.argv[1]):
        print(f'File {sys.argv[1]} is not exists')
        # Stop here; previously execution fell through and crashed on read.
        sys.exit(1)
    __main(sys.argv[1])