Add initial version of analyser

Unified dataset empty data values
2023-05-29 22:56:53 +04:00 · 2023-05-29 22:56:34 +04:00
7 changed files with 1539 additions and 10 deletions
--- a/geocache.json
+++ b/geocache.json
--- a/main.py
+++ b/main.py
@ -1,12 +1,171 @@
 #!/usr/bin/env python3
+import json
 import os
 import sys
+from datetime import date, datetime

+import numpy as np
 import pandas as pd
+from geopy.extra.rate_limiter import RateLimiter
+from geopy.geocoders import Nominatim
+
+EMPTY_AGE = 0
+UNIVERSITY_AGE = 21
+SCHOOL_BEGIN_AGE = 7
+SCHOOL_GRADUATED_AGE = 17
+
+geolocator = Nominatim(user_agent="MyApp")
+geocode = RateLimiter(geolocator.geocode, min_delay_seconds=1)
+
+geo_cache = {}
+
+
+def is_empty_str(value):
+    """Return True when *value* is None or its string form is blank.
+
+    The value is converted with ``str()`` first, so non-string inputs are
+    judged by their textual representation after stripping whitespace.
+    """
+    return value is None or not str(value).strip()
+
+
+def is_empty_number(value):
+    """Return True when *value* is blank or is not an (optionally negative) integer.
+
+    The check works on ``str(value)``: one leading minus sign is ignored and
+    the remainder must satisfy ``str.isnumeric()``.
+    """
+    if is_empty_str(value):
+        return True
+    text = str(value)
+    # Drop a single leading sign so negative integers count as numbers.
+    digits = text[1:] if text.startswith('-') else text
+    return not digits.isnumeric()
+
+
+def is_empty_collection(collection):
+    """Return True unless *collection* is a list with at least one element.
+
+    None, blank values and any non-list object are all reported as empty.
+    """
+    return (
+        is_empty_str(collection)
+        or not isinstance(collection, list)
+        or not collection
+    )
+
+
+def get_age(date_str):
+    """Return the age in full years for a ``DD.MM.YYYY`` birth date string.
+
+    Returns ``EMPTY_AGE`` when *date_str* is empty, is not a string, or does
+    not match the ``%d.%m.%Y`` format, so a single bad record cannot abort
+    the whole ``apply`` pass over the dataset.
+    """
+    if is_empty_str(date_str):
+        return EMPTY_AGE
+    try:
+        birthdate = datetime.strptime(date_str, '%d.%m.%Y')
+    except (TypeError, ValueError):
+        # TypeError: non-string input; ValueError: text in another format.
+        return EMPTY_AGE
+    today = date.today()
+    # Subtract one year when this year's birthday has not happened yet.
+    birthday_pending = (today.month, today.day) < (birthdate.month, birthdate.day)
+    return today.year - birthdate.year - birthday_pending
+
+
+def get_years(year1, year2):
+    """Return the absolute distance between two years.
+
+    The argument order does not matter. Using ``abs()`` guarantees a numeric
+    result for every numeric input; the previous pair of comparisons fell
+    through and returned ``None`` when the operands were unordered (NaN).
+    """
+    return abs(year1 - year2)
+
+
+def get_age_from_education(education, value, additional_value):
+    """Estimate an age from the first education entry that has a usable year.
+
+    :param education: list of education records (subscriptable by *value*).
+    :param value: key of the year field to read from each record.
+    :param additional_value: typical age at that year, added to the number
+        of years between that year and the current year.
+    :return: the estimated age, or ``EMPTY_AGE`` when *education* is empty
+        or no record carries a usable year.
+    """
+    if is_empty_collection(education):
+        return EMPTY_AGE
+    current_year = date.today().year
+    for item in education:
+        try:
+            year = item[value]
+        except (KeyError, IndexError, TypeError):
+            # Record without this field: try the next one.
+            continue
+        if is_empty_number(year):
+            continue
+        try:
+            year = int(year)
+        except (TypeError, ValueError):
+            continue
+        if year <= 0:
+            # A non-positive year is a placeholder, not real data; using it
+            # would yield an age of roughly the current year.
+            continue
+        return get_years(year, current_year) + additional_value
+    return EMPTY_AGE
+
+
+def prepare_dataset_age(df):
+    """Add an ``age`` column, derived from ``bdate`` with education fallbacks.
+
+    Rows whose age is still ``EMPTY_AGE`` are filled, in order, from the
+    university graduation year, the school graduation year and the school
+    start year. Each mask is recomputed so that a later fallback only
+    touches rows the earlier ones left empty.
+    """
+    df['age'] = df.loc[:, 'bdate'].apply(get_age)
+
+    fallbacks = (
+        ('universities', 'graduation', UNIVERSITY_AGE),
+        ('schools', 'year_graduated', SCHOOL_GRADUATED_AGE),
+        ('schools', 'year_from', SCHOOL_BEGIN_AGE),
+    )
+    for column, key, offset in fallbacks:
+        unknown = (df['age'] == EMPTY_AGE) & (df[column].str.len() > 0)
+        df.loc[unknown, 'age'] = df.loc[unknown, column].apply(
+            lambda val, key=key, offset=offset: get_age_from_education(val, key, offset))
+
+    return df
+
+
+def prepare_dataset_status(df):
+    """Add the boolean columns ``is_university``, ``is_work``, ``is_student``
+    and ``is_schoolboy``, derived from ``age``, ``universities`` and
+    ``occupation_type``.
+    """
+    age = df['age']
+    occupation = df['occupation_type']
+    age_unknown = age == EMPTY_AGE
+
+    old_enough_for_university = (age >= UNIVERSITY_AGE) | age_unknown
+    university_evidence = (df['universities'].str.len() > 0) | (occupation == 'university')
+    df['is_university'] = np.where(old_enough_for_university & university_evidence, True, False)
+
+    # ``&`` binds tighter than ``|``: anyone older than UNIVERSITY_AGE counts
+    # as working regardless of the other conditions.
+    past_school = (age > SCHOOL_GRADUATED_AGE) | age_unknown
+    work_evidence = df['is_university'] | (occupation == 'work')
+    df['is_work'] = np.where((past_school & work_evidence) | (age > UNIVERSITY_AGE), True, False)
+
+    student_age = (age >= SCHOOL_GRADUATED_AGE) & (age <= UNIVERSITY_AGE)
+    df['is_student'] = np.where((occupation == 'university') & student_age, True, False)
+
+    known_school_age = (age < SCHOOL_GRADUATED_AGE) & ~age_unknown
+    declared_school = age_unknown & (occupation == 'school')
+    df['is_schoolboy'] = np.where(known_school_age | declared_school, True, False)
+
+    return df
+
+
+def load_geo_cache(json_file):
+    """Merge the cached geocoding results stored in *json_file* into ``geo_cache``.
+
+    A missing file is not an error: the cache simply starts out empty (for
+    example on the first run). Malformed JSON still raises, so a corrupted
+    cache is noticed rather than silently discarded.
+    """
+    try:
+        with open(json_file, 'r', encoding='utf-8') as rf:
+            geo_cache.update(json.load(rf))
+    except FileNotFoundError:
+        print(f'Geocache file {json_file} not found, starting with an empty cache')
+
+
+def save_geo_cache(json_file):
+    """Write ``geo_cache`` to *json_file* as JSON and report it on stdout."""
+    with open(json_file, 'w') as wf:
+        wf.write(json.dumps(geo_cache))
+    print('Geocache saved')
+
+
+def update_geo_cache(cities, json_file):
+    """Geocode every city that has no coordinates in ``geo_cache`` yet.
+
+    :param cities: iterable of city names; blank entries are skipped.
+    :param json_file: path the cache is written to, every time its size hits
+        a multiple of 50 and once more at the end if anything changed.
+
+    A city the geocoder cannot resolve is stored as ``None`` instead of
+    crashing on ``location.latitude``. Because the lookup below treats
+    ``None`` as "not cached", such a city is retried on the next run.
+    """
+    is_changed = False
+    try:
+        for city in cities:
+            if is_empty_str(city) or geo_cache.get(city) is not None:
+                continue
+            print(f'{len(geo_cache)}/{len(cities)} - Try to load geocode for {city}')
+            location = geocode(city)
+            if location is None:
+                # geopy returns None when nothing matched (and, by default,
+                # RateLimiter also returns None after a swallowed error).
+                print(f'No geocode found for {city}')
+                geo_cache[city] = None
+                is_changed = True
+                continue
+            geo_cache[city] = (location.latitude, location.longitude)
+            is_changed = True
+            if len(geo_cache) % 50 == 0:
+                save_geo_cache(json_file)
+    finally:
+        # Persist whatever was fetched even if the loop was interrupted, so
+        # rate-limited lookups are not lost.
+        if is_changed:
+            save_geo_cache(json_file)
+
+
+def prepare_dataset_location(df):
+    """Add a ``location`` column holding the cached coordinates of ``city``.
+
+    The geocode cache is loaded from ``geocache.json``, extended with any
+    cities of *df* it does not cover yet, and then used for the lookup.
+    Rows with a blank city, or a city without cached coordinates, get ``''``.
+    """
+    json_file = 'geocache.json'
+
+    load_geo_cache(json_file)
+
+    update_geo_cache(df['city'].unique().tolist(), json_file)
+
+    def lookup(city):
+        if is_empty_str(city):
+            return ''
+        # ``.get`` keeps one unknown city from raising KeyError for the
+        # whole column; a missing or null entry maps to the empty value.
+        coordinates = geo_cache.get(city)
+        return '' if coordinates is None else coordinates
+
+    df['location'] = df['city'].apply(lookup)
+
+    return df
+
+
+def prepare_dataset(json_file):
+    """Read *json_file* into a DataFrame and run the preparation steps on it.
+
+    The steps run in order: age, status (which reads ``age``), location.
+    """
+    df = pd.read_json(json_file)
+    pipeline = (
+        prepare_dataset_age,
+        prepare_dataset_status,
+        prepare_dataset_location,
+    )
+    for step in pipeline:
+        df = step(df)
+    return df


 def __main(json_file):
-    df = pd.read_json(json_file)
+    df = prepare_dataset(json_file)
    print('done')


--- a/prepare_dataset.py
+++ b/prepare_dataset.py
@ -32,6 +32,8 @@ def __main(json_file_name):
            current_col.append(person[key])
        df[key] = pd.Series(current_col)

+    df = df.drop(columns=['is_closed', 'deactivated'])
+
    pathname, extension = os.path.splitext(json_file_name)
    filename = pathname.split('/')[-1]

--- a/requirements.txt
+++ b/requirements.txt
@ -1 +1,2 @@
-pandas==2.0.1
+pandas==2.0.1
+geopy==2.3.0
--- a/src/raw_data.py
+++ b/src/raw_data.py
@ -23,28 +23,37 @@ class RawData:
    @staticmethod
    def get_int_st(value, attr):
        if value is None:
-            return -1
+            return ''
        result = value[attr]
        if result is None:
-            return -1
+            return ''
        if not str(result).isnumeric():
            print(f'The value {result} is not a number')
-            return -1
+            return ''
        return result

+    @staticmethod
+    def str_to_date(value, str_format):
+        """Parse *value* using *str_format* and return it formatted as ``DD.MM.YYYY``."""
+        parsed = datetime.strptime(value, str_format)
+        return parsed.date().strftime('%d.%m.%Y')
+
    @staticmethod
    def get_date_st(value):
        if value is None:
            return ''
        try:
-            return datetime.strptime(value, '%d.%m.%Y').date()
+            return RawData.str_to_date(value, '%d.%m.%Y')
        except ValueError:
            try:
-                return datetime.strptime(value, '%d.%m.%y').date()
+                return RawData.str_to_date(value, '%d.%m.%y')
            except ValueError:
                print(f'Invalid date {value}')
                return ''

    @staticmethod
    def get_collection_st(collection, function):
-        return list(map(lambda item: function(item), [] if collection is None else collection))
+        if collection is None:
+            return ''
+        result_list = list(map(lambda item: function(item), collection))
+        if len(result_list) == 0:
+            return ''
+        return result_list
--- a/ulpressa.private.json
+++ b/ulpressa.private.json
--- a/ultra.private.json
+++ b/ultra.private.json
Author	SHA1	Message	Date
Aleksey Filippov	a9dd12af39	Add initial version of analyser	2023-05-29 22:56:53 +04:00
Aleksey Filippov	e16131b436	Unified dataset empty data values	2023-05-29 22:56:34 +04:00