From 86fbf158f0e968042f446ae39bbe649072c7b9af Mon Sep 17 00:00:00 2001 From: Aleksey Filippov Date: Fri, 26 May 2023 11:22:38 +0400 Subject: [PATCH] Improve models for prepare_dataset script for minimize null data --- src/career.py | 16 +++++------ src/military.py | 12 ++++----- src/person.py | 69 ++++++++++++++++++++++------------------------- src/raw_data.py | 33 ++++++++++++++++++----- src/school.py | 24 ++++++++--------- src/university.py | 26 +++++++++--------- 6 files changed, 97 insertions(+), 83 deletions(-) diff --git a/src/career.py b/src/career.py index 09e595b..3e657b1 100644 --- a/src/career.py +++ b/src/career.py @@ -4,11 +4,11 @@ from src.raw_data import RawData class Career: def __init__(self, raw_career): data = RawData(raw_career) - self.id = data.id - self.group_id = data.group_id - self.company = data.company - self.country_id = data.country_id - self.city_id = data.city_id - self.year_from = raw_career['from'] - self.until = data.until - self.position = data.position + self.id = data.get_int('id') + self.group_id = data.get_int('group_id') + self.company = data.get_str('company') + self.country_id = data.get_int('country_id') + self.city_id = data.get_int('city_id') + self.year_from = data.get_int('from') + self.until = data.get_int('until') + self.position = data.get_str('position') diff --git a/src/military.py b/src/military.py index 6da92f4..d869f51 100644 --- a/src/military.py +++ b/src/military.py @@ -4,9 +4,9 @@ from src.raw_data import RawData class Military: def __init__(self, raw_military): data = RawData(raw_military) - self.id = data.id - self.unit_id = data.unit_id - self.unit = data.unit - self.country_id = data.country_id - self.year_from = raw_military['from'] - self.until = data.until + self.id = data.get_int('id') + self.unit_id = data.get_int('unit_id') + self.unit = data.get_str('unit') + self.country_id = data.get_int('country_id') + self.year_from = data.get_int('from') + self.until = data.get_int('until') diff --git a/src/person.py b/src/person.py index 89c06cb..3e01138 100644 --- a/src/person.py +++ b/src/person.py @@ -13,51 +13,46 @@ class Person: self.deactivated = data.deactivated is not None self.has_photo = data.has_photo == 1 self.followers_count = data.followers_count - self.sex = data.sex - self.bdate = data.get_date(data.bdate) - self.relation = data.relation - self.country_id = data.get_int(data.country, 'id') - self.country = data.get_int(data.country, 'title') - self.city_id = data.get_int(data.city, 'id') - self.city = data.get_int(data.city, 'title') - self.home_town = data.home_town - self.status = data.status + self.sex = data.get_int('sex') + self.bdate = RawData.get_date_st(data.bdate) + self.relation = data.get_int('relation') + self.country_id = RawData.get_int_st(data.country, 'id') + self.country = RawData.get_str_st(data.country, 'title') + self.city_id = RawData.get_int_st(data.city, 'id') + self.city = RawData.get_str_st(data.city, 'title') + self.home_town = data.get_str('home_town') + self.status = data.get_str('status') self.site = data.site is not None self.facebook = data.facebook is not None self.twitter = data.twitter is not None self.instagram = data.instagram is not None self.mobile_phone = data.mobile_phone is not None - self.activities = data.activities - self.interests = data.interests - self.books = data.books - self.movies = data.movies - self.music = data.music - self.tv = data.tv - self.games = data.games - self.quotes = data.quotes - self.about = data.about + self.activities = data.get_str('activities') + self.interests = data.get_str('interests') + self.books = data.get_str('books') + self.movies = data.get_str('movies') + self.music = data.get_str('music') + self.tv = data.get_str('tv') + self.games = data.get_str('games') + self.quotes = data.get_str('quotes') + self.about = data.get_str('about') personal = None if data.personal is not None: if len(data.personal) == 1: personal = data.personal[0] if len(data.personal) > 1: raise Exception(f'PERSONAL {data.id}') - self.political = data.get_int(personal, 'political') - self.religion = data.get_str(personal, 'religion') - self.inspired_by = data.get_str(personal, 'inspired_by') - self.people_main = data.get_int(personal, 'people_main') - self.life_main = data.get_int(personal, 'life_main') - self.smoking = data.get_int(personal, 'smoking') - self.alcohol = data.get_int(personal, 'alcohol') - self.relatives = Person.__collection(data.relatives, lambda item: item['type']) - self.occupation_type = data.get_str(data.occupation, 'type') - self.occupation_place_id = data.get_int(data.occupation, 'id') - self.occupation_place_name = data.get_str(data.occupation, 'name') - self.universities = Person.__collection(data.universities, lambda item: University(item).__dict__) - self.schools = Person.__collection(data.schools, lambda item: School(item).__dict__) - self.career = Person.__collection(data.career, lambda item: Career(item).__dict__) - self.military = Person.__collection(data.military, lambda item: Military(item).__dict__) - - @staticmethod - def __collection(collection, function): - return list(map(lambda item: function(item), [] if collection is None else collection)) + self.political = RawData.get_int_st(personal, 'political') + self.religion = RawData.get_str_st(personal, 'religion') + self.inspired_by = RawData.get_str_st(personal, 'inspired_by') + self.people_main = RawData.get_int_st(personal, 'people_main') + self.life_main = RawData.get_int_st(personal, 'life_main') + self.smoking = RawData.get_int_st(personal, 'smoking') + self.alcohol = RawData.get_int_st(personal, 'alcohol') + self.relatives = RawData.get_collection_st(data.relatives, lambda item: item['type']) + self.occupation_type = RawData.get_str_st(data.occupation, 'type') + self.occupation_place_name = RawData.get_str_st(data.occupation, 'name') + self.universities = RawData.get_collection_st(data.universities, lambda item: University(item).__dict__) + self.schools = RawData.get_collection_st(data.schools, lambda item: School(item).__dict__) + self.career = RawData.get_collection_st(data.career, lambda item: Career(item).__dict__) + self.military = RawData.get_collection_st(data.military, lambda item: Military(item).__dict__) diff --git a/src/raw_data.py b/src/raw_data.py index 39c20e5..b2ab6e5 100644 --- a/src/raw_data.py +++ b/src/raw_data.py @@ -5,22 +5,37 @@ class RawData: def __init__(self, data): self.__dict__.update(data) + def get_str(self, attr): + return RawData.get_str_st(self.__dict__, attr) + @staticmethod - def get_str(value, attr): + def get_str_st(value, attr): if value is None: return '' - return '' if value is None else value[attr] + result = value[attr] + if result is None: + return '' + return result + + def get_int(self, attr): + return RawData.get_int_st(self.__dict__, attr) @staticmethod - def get_int(value, attr): + def get_int_st(value, attr): if value is None: return -1 - return -1 if value is None else value[attr] + result = value[attr] + if result is None: + return -1 + if not str(result).isnumeric(): + print(f'The value {result} is not a number') + return -1 + return result @staticmethod - def get_date(value): + def get_date_st(value): if value is None: - return None + return '' try: return datetime.strptime(value, '%d.%m.%Y').date() except ValueError: @@ -28,4 +43,8 @@ class RawData: return datetime.strptime(value, '%d.%m.%y').date() except ValueError: print(f'Invalid date {value}') - return None + return '' + + @staticmethod + def get_collection_st(collection, function): + return list(map(lambda item: function(item), [] if collection is None else collection)) diff --git a/src/school.py b/src/school.py index 1b50db2..a438481 100644 --- a/src/school.py +++ b/src/school.py @@ -4,15 +4,15 @@ from src.raw_data import RawData class School: def __init__(self, raw_school): data = RawData(raw_school) - self.id = data.id - self.name = data.name - self.country_id = data.get_int(data.country, 'id') - self.city_id = data.get_int(data.city, 'id') - self.year_from = data.year_from - self.year_to = data.year_to - self.year_graduated = data.year_graduated - self.speciality = data.speciality - self.type_id = data.type - self.type = data.type_str - self.litera = raw_school['class'] - self.litera_id = data.class_id + self.id = data.get_int('id') + self.name = data.get_str('name') + self.country_id = RawData.get_int_st(data.country, 'id') + self.city_id = RawData.get_int_st(data.city, 'id') + self.year_from = data.get_int('year_from') + self.year_to = data.get_int('year_to') + self.year_graduated = data.get_int('year_graduated') + self.speciality = data.get_str('speciality') + self.type_id = data.get_int('type') + self.type = data.get_str('type_str') + self.litera_id = data.get_int('class_id') + self.litera = data.get_str('class') diff --git a/src/university.py b/src/university.py index af395f4..5dbeb1a 100644 --- a/src/university.py +++ b/src/university.py @@ -4,16 +4,16 @@ from src.raw_data import RawData class University: def __init__(self, raw_university): data = RawData(raw_university) - self.id = data.id - self.name = data.name - self.country_id = data.get_int(data.country, 'id') - self.city_id = data.get_int(data.city, 'id') - self.faculty_id = data.faculty - self.faculty = data.faculty_name - self.chair_id = data.chair - self.chair = data.chair_name - self.graduation = data.graduation - self.form_id = data.education_form_id - self.form = data.education_form - self.status_id = data.education_status_id - self.status = data.education_status + self.id = data.get_int('id') + self.name = data.get_str('name') + self.country_id = RawData.get_int_st(data.country, 'id') + self.city_id = RawData.get_int_st(data.city, 'id') + self.faculty_id = data.get_int('faculty') + self.faculty = data.get_str('faculty_name') + self.chair_id = data.get_int('chair') + self.chair = data.get_str('chair_name') + self.graduation = data.get_int('graduation') + self.form_id = data.get_int('education_form_id') + self.form = data.get_str('education_form') + self.status_id = data.get_int('education_status_id') + self.status = data.get_str('education_status')