social-clusters/main.py

#!/usr/bin/env python3
import json
import os
import sys
from datetime import date, datetime

import numpy as np
import pandas as pd
from geopy.extra.rate_limiter import RateLimiter
from geopy.geocoders import Nominatim

# Sentinel age meaning "unknown"; returned whenever no age can be derived.
EMPTY_AGE = 0
# Age added to the years elapsed since a university 'graduation' year; also
# used as an age threshold when deriving the status flags.
UNIVERSITY_AGE = 21
# Age added to the years elapsed since a school 'year_from' year.
SCHOOL_BEGIN_AGE = 7
# Age added to the years elapsed since a school 'year_graduated' year; also
# used as an age threshold when deriving the status flags.
SCHOOL_GRADUATED_AGE = 17

# Nominatim geocoder client.
# NOTE(review): "MyApp" is a generic user agent; presumably it should identify
# this application - confirm against the Nominatim usage policy.
geolocator = Nominatim(user_agent="MyApp")
# Rate-limited wrapper around geolocator.geocode (min_delay_seconds=1).
geocode = RateLimiter(geolocator.geocode, min_delay_seconds=1)

# In-memory geocoding cache keyed by city, holding (latitude, longitude)
# results; it is filled from and written to a JSON file by the cache helpers.
geo_cache = {}


def is_empty_str(value):
    """Return True when *value* carries no usable text.

    A value counts as empty when it is ``None``, a float NaN (commonly used
    by pandas to mark a missing cell), or when its string form is blank after
    stripping surrounding whitespace.

    Args:
        value: Any object; it is converted with ``str()`` before the check.

    Returns:
        bool: True if the value is missing or blank, False otherwise.
    """
    if value is None:
        return True
    # NaN is the only float that is not equal to itself. Without this check
    # str(nan) == 'nan' would be mistaken for real text.
    if isinstance(value, float) and value != value:
        return True
    return not str(value).strip()


def is_empty_number(value):
    """Tell whether *value* lacks an integer-like numeric representation.

    The value is stringified; one optional leading minus sign is dropped and
    the remainder must satisfy ``str.isnumeric``.

    Args:
        value: Any object, typically an int or a numeric string.

    Returns:
        bool: True when the value is empty or not numeric, False otherwise.
    """
    if is_empty_str(value):
        return True
    text = str(value)
    # Only a single leading '-' is tolerated as a sign.
    unsigned = text[1:] if text.startswith('-') else text
    return not unsigned.isnumeric()


def is_empty_collection(collection):
    """Tell whether *collection* is anything other than a non-empty list.

    Args:
        collection: Any object.

    Returns:
        bool: False only for a ``list`` with at least one element; True for
        empty values, non-list values and empty lists.
    """
    return (
        is_empty_str(collection)
        or not isinstance(collection, list)
        or not collection
    )


def get_age(date_str):
    """Return the age in completed years for a ``DD.MM.YYYY`` birth date.

    Args:
        date_str: Birth date text in ``%d.%m.%Y`` format; may be empty.

    Returns:
        int: Completed years as of today, or ``EMPTY_AGE`` when the date is
        missing or cannot be parsed (for example a birthday given without a
        year).
    """
    if is_empty_str(date_str):
        return EMPTY_AGE
    try:
        birthdate = datetime.strptime(str(date_str).strip(), '%d.%m.%Y')
    except ValueError:
        # A partial or malformed date has no usable year. Report the age as
        # unknown instead of aborting the whole column computation.
        return EMPTY_AGE
    today = date.today()
    # One year less if this year's birthday has not happened yet.
    birthday_passed = (today.month, today.day) >= (birthdate.month, birthdate.day)
    return today.year - birthdate.year - (0 if birthday_passed else 1)


def get_years(year1, year2):
    """Return the absolute number of years between two years.

    Args:
        year1: First year (numeric).
        year2: Second year (numeric).

    Returns:
        The non-negative difference between ``year1`` and ``year2``,
        regardless of their order.
    """
    # abs() replaces the two mirrored comparisons and removes the implicit
    # ``None`` result when neither comparison held.
    return abs(year1 - year2)


def get_age_from_education(education, value, additional_value):
    """Estimate an age from the first usable year in education records.

    Records are scanned in order. The first one whose *value* field holds a
    positive year determines the result; records without a usable year are
    skipped instead of ending the search.

    Args:
        education: List of dict records (e.g. universities or schools).
        value: Key of the year field to read from each record.
        additional_value: Age associated with that year; it is added to the
            number of years between that year and the current year.

    Returns:
        int: The estimated age, or ``EMPTY_AGE`` when no record provides a
        usable year.
    """
    if is_empty_collection(education):
        return EMPTY_AGE
    current_year = date.today().year
    for item in education:
        # A record may omit the key entirely; treat that like an empty value
        # rather than raising KeyError.
        raw_year = item.get(value) if isinstance(item, dict) else None
        if is_empty_number(raw_year):
            continue
        try:
            # is_empty_number also accepts numeric strings, so normalise to
            # int before doing arithmetic.
            year = int(raw_year)
        except (TypeError, ValueError):
            continue
        if year <= 0:
            # A non-positive year cannot be a real calendar year and would
            # produce an absurd age; treat it as missing.
            continue
        # NOTE(review): get_years is an absolute difference, so a year in the
        # future increases the estimate instead of decreasing it - confirm
        # whether that is intended.
        return get_years(year, current_year) + additional_value
    return EMPTY_AGE


def prepare_dataset_age(df):
    """Add an ``age`` column derived from ``bdate`` with education fallbacks.

    The age is first computed from the birth date. Rows still at
    ``EMPTY_AGE`` are then filled, in order, from the university graduation
    year, the school graduation year and the school start year.

    Args:
        df: DataFrame with ``bdate``, ``universities`` and ``schools`` columns.

    Returns:
        The same DataFrame with the ``age`` column set.
    """
    df['age'] = df['bdate'].apply(get_age)

    # (source column, year key inside each record, age associated with it)
    fallbacks = (
        ('universities', 'graduation', UNIVERSITY_AGE),
        ('schools', 'year_graduated', SCHOOL_GRADUATED_AGE),
        ('schools', 'year_from', SCHOOL_BEGIN_AGE),
    )
    for column, year_key, base_age in fallbacks:
        # Recomputed on every pass so rows filled by an earlier fallback are
        # not overwritten by a later one.
        unresolved = (df['age'] == EMPTY_AGE) & (df[column].str.len() > 0)
        estimates = df.loc[unresolved, column].apply(
            lambda records, key=year_key, base=base_age:
            get_age_from_education(records, key, base)
        )
        df.loc[unresolved, 'age'] = estimates

    return df


def prepare_dataset_status(df):
    """Add boolean status columns derived from age and occupation.

    Sets ``is_university``, ``is_work``, ``is_student`` and ``is_schoolboy``
    from the ``age``, ``universities`` and ``occupation_type`` columns.

    Args:
        df: DataFrame that already contains the ``age`` column.

    Returns:
        The same DataFrame with the four status columns set.
    """
    age = df['age']
    occupation = df['occupation_type']
    age_unknown = age == EMPTY_AGE

    # University: old enough (or age unknown) and some university evidence.
    university_evidence = (df['universities'].str.len() > 0) | (occupation == 'university')
    university_flags = ((age >= UNIVERSITY_AGE) | age_unknown) & university_evidence
    df['is_university'] = np.where(university_flags, True, False)

    # Work: past school age (or unknown) with university/work evidence, or
    # simply older than the university age.
    past_school = (age > SCHOOL_GRADUATED_AGE) | age_unknown
    work_evidence = df['is_university'] | (occupation == 'work')
    work_flags = (past_school & work_evidence) | (age > UNIVERSITY_AGE)
    df['is_work'] = np.where(work_flags, True, False)

    # Student: university occupation within the school-leaving..university age range.
    in_student_range = (age >= SCHOOL_GRADUATED_AGE) & (age <= UNIVERSITY_AGE)
    student_flags = (occupation == 'university') & in_student_range
    df['is_student'] = np.where(student_flags, True, False)

    # Schoolboy: known age below school-leaving age, or unknown age with a
    # school occupation.
    known_and_young = (age < SCHOOL_GRADUATED_AGE) & (age != EMPTY_AGE)
    unknown_at_school = age_unknown & (occupation == 'school')
    schoolboy_flags = known_and_young | unknown_at_school
    df['is_schoolboy'] = np.where(schoolboy_flags, True, False)

    return df


def load_geo_cache(json_file):
    """Merge geocoding results stored in *json_file* into ``geo_cache``.

    A missing file is not an error: it simply means nothing has been cached
    yet, so the in-memory cache is left unchanged.

    Args:
        json_file: Path of the JSON cache file.

    Raises:
        json.JSONDecodeError: If the file exists but is not valid JSON.
    """
    try:
        with open(json_file, 'r', encoding='utf-8') as rf:
            cached = json.load(rf)
    except FileNotFoundError:
        # First run: the cache file is only created once something is saved.
        return
    geo_cache.update(cached)


def save_geo_cache(json_file):
    """Write the whole ``geo_cache`` dict to *json_file* as JSON.

    Args:
        json_file: Path of the JSON cache file; it is overwritten.
    """
    with open(json_file, 'w') as wf:
        wf.write(json.dumps(geo_cache))
    print('Geocache saved')


def update_geo_cache(cities, json_file):
    """Geocode every city not yet resolved in ``geo_cache`` and persist it.

    Cities that are empty or already have coordinates are skipped. A city
    the geocoder cannot resolve is recorded as ``None`` so later lookups by
    city name still find a key; such entries are retried on the next call.
    The cache file is rewritten after every 50 lookups and once more at the
    end if anything is still unsaved.

    Args:
        cities: Iterable of city names (sized, since its length is printed).
        json_file: Path of the JSON cache file to write.
    """
    save_every = 50
    unsaved = 0
    for city in cities:
        if is_empty_str(city):
            continue
        if geo_cache.get(city) is not None:
            continue
        print(f'{len(geo_cache)}/{len(cities)} - Try to load geocode for {city}')
        location = geocode(city)
        # geocode() yields None when nothing was found; guard against it
        # instead of failing on attribute access.
        if location is None:
            geo_cache[city] = None
        else:
            geo_cache[city] = (location.latitude, location.longitude)
        unsaved += 1
        # Count lookups rather than cache size: re-trying an unresolved city
        # does not grow the cache, so a size-based check could misfire.
        if unsaved >= save_every:
            save_geo_cache(json_file)
            unsaved = 0

    if unsaved:
        save_geo_cache(json_file)


def prepare_dataset_location(df):
    """Add a ``location`` column with cached coordinates for each city.

    Loads the on-disk geocoding cache, resolves any city in ``df['city']``
    that is still missing, then maps every row's city to its cached value.
    Rows with an empty city get an empty string.

    Args:
        df: DataFrame with a ``city`` column.

    Returns:
        The same DataFrame with the ``location`` column set.
    """
    cache_path = 'geocache.json'
    load_geo_cache(cache_path)

    distinct_cities = df['city'].unique().tolist()
    update_geo_cache(distinct_cities, cache_path)

    def lookup(city):
        # Empty cities have nothing to look up.
        if is_empty_str(city):
            return ''
        return geo_cache[city]

    df['location'] = df['city'].apply(lookup)
    return df


def prepare_dataset(json_file):
    """Read the raw dataset and run all preparation steps on it.

    Args:
        json_file: Path of the raw dataset JSON file.

    Returns:
        DataFrame with the age, status and location columns added.
    """
    frame = pd.read_json(json_file)
    # Order matters: status relies on the age column computed first.
    pipeline = (
        prepare_dataset_age,
        prepare_dataset_status,
        prepare_dataset_location,
    )
    for step in pipeline:
        frame = step(frame)
    return frame


def __main(json_file):
    """Prepare the dataset from *json_file* and report completion.

    Args:
        json_file: Path of the raw dataset JSON file.
    """
    # The prepared frame is not used further here.
    prepare_dataset(json_file)
    print('done')


if __name__ == '__main__':
    # Expect exactly one argument: the path of the raw dataset JSON file.
    if len(sys.argv) != 2:
        print('You must specify the raw_dataset json file')
        sys.exit(1)
    if not os.path.isfile(sys.argv[1]):
        print(f'File {sys.argv[1]} is not exists')
        # Stop here; continuing would only fail later when the file is read.
        sys.exit(1)
    __main(sys.argv[1])