Improve models for the prepare_dataset script to minimize null data

This commit is contained in:
Aleksey Filippov 2023-05-26 11:22:38 +04:00
parent 3a51a23048
commit 86fbf158f0
6 changed files with 97 additions and 83 deletions

View File

@ -4,11 +4,11 @@ from src.raw_data import RawData
class Career: class Career:
def __init__(self, raw_career): def __init__(self, raw_career):
data = RawData(raw_career) data = RawData(raw_career)
self.id = data.id self.id = data.get_int('id')
self.group_id = data.group_id self.group_id = data.get_int('group_id')
self.company = data.company self.company = data.get_str('company')
self.country_id = data.country_id self.country_id = data.get_int('country_id')
self.city_id = data.city_id self.city_id = data.get_int('city_id')
self.year_from = raw_career['from'] self.year_from = data.get_int('from')
self.until = data.until self.until = data.get_int('until')
self.position = data.position self.position = data.get_str('position')

View File

@ -4,9 +4,9 @@ from src.raw_data import RawData
class Military: class Military:
def __init__(self, raw_military): def __init__(self, raw_military):
data = RawData(raw_military) data = RawData(raw_military)
self.id = data.id self.id = data.get_int('id')
self.unit_id = data.unit_id self.unit_id = data.get_int('unit_id')
self.unit = data.unit self.unit = data.get_str('unit')
self.country_id = data.country_id self.country_id = data.get_int('country_id')
self.year_from = raw_military['from'] self.year_from = data.get_int('from')
self.until = data.until self.until = data.get_int('until')

View File

@ -13,51 +13,46 @@ class Person:
self.deactivated = data.deactivated is not None self.deactivated = data.deactivated is not None
self.has_photo = data.has_photo == 1 self.has_photo = data.has_photo == 1
self.followers_count = data.followers_count self.followers_count = data.followers_count
self.sex = data.sex self.sex = data.get_int('sex')
self.bdate = data.get_date(data.bdate) self.bdate = RawData.get_date_st(data.bdate)
self.relation = data.relation self.relation = data.get_int('relation')
self.country_id = data.get_int(data.country, 'id') self.country_id = RawData.get_int_st(data.country, 'id')
self.country = data.get_int(data.country, 'title') self.country = RawData.get_str_st(data.country, 'title')
self.city_id = data.get_int(data.city, 'id') self.city_id = RawData.get_int_st(data.city, 'id')
self.city = data.get_int(data.city, 'title') self.city = RawData.get_str_st(data.city, 'title')
self.home_town = data.home_town self.home_town = data.get_str('home_town')
self.status = data.status self.status = data.get_str('status')
self.site = data.site is not None self.site = data.site is not None
self.facebook = data.facebook is not None self.facebook = data.facebook is not None
self.twitter = data.twitter is not None self.twitter = data.twitter is not None
self.instagram = data.instagram is not None self.instagram = data.instagram is not None
self.mobile_phone = data.mobile_phone is not None self.mobile_phone = data.mobile_phone is not None
self.activities = data.activities self.activities = data.get_str('activities')
self.interests = data.interests self.interests = data.get_str('interests')
self.books = data.books self.books = data.get_str('books')
self.movies = data.movies self.movies = data.get_str('movies')
self.music = data.music self.music = data.get_str('music')
self.tv = data.tv self.tv = data.get_str('tv')
self.games = data.games self.games = data.get_str('games')
self.quotes = data.quotes self.quotes = data.get_str('quotes')
self.about = data.about self.about = data.get_str('about')
personal = None personal = None
if data.personal is not None: if data.personal is not None:
if len(data.personal) == 1: if len(data.personal) == 1:
personal = data.personal[0] personal = data.personal[0]
if len(data.personal) > 1: if len(data.personal) > 1:
raise Exception(f'PERSONAL {data.id}') raise Exception(f'PERSONAL {data.id}')
self.political = data.get_int(personal, 'political') self.political = RawData.get_int_st(personal, 'political')
self.religion = data.get_str(personal, 'religion') self.religion = RawData.get_str_st(personal, 'religion')
self.inspired_by = data.get_str(personal, 'inspired_by') self.inspired_by = RawData.get_str_st(personal, 'inspired_by')
self.people_main = data.get_int(personal, 'people_main') self.people_main = RawData.get_int_st(personal, 'people_main')
self.life_main = data.get_int(personal, 'life_main') self.life_main = RawData.get_int_st(personal, 'life_main')
self.smoking = data.get_int(personal, 'smoking') self.smoking = RawData.get_int_st(personal, 'smoking')
self.alcohol = data.get_int(personal, 'alcohol') self.alcohol = RawData.get_int_st(personal, 'alcohol')
self.relatives = Person.__collection(data.relatives, lambda item: item['type']) self.relatives = RawData.get_collection_st(data.relatives, lambda item: item['type'])
self.occupation_type = data.get_str(data.occupation, 'type') self.occupation_type = RawData.get_str_st(data.occupation, 'type')
self.occupation_place_id = data.get_int(data.occupation, 'id') self.occupation_place_name = RawData.get_str_st(data.occupation, 'name')
self.occupation_place_name = data.get_str(data.occupation, 'name') self.universities = RawData.get_collection_st(data.universities, lambda item: University(item).__dict__)
self.universities = Person.__collection(data.universities, lambda item: University(item).__dict__) self.schools = RawData.get_collection_st(data.schools, lambda item: School(item).__dict__)
self.schools = Person.__collection(data.schools, lambda item: School(item).__dict__) self.career = RawData.get_collection_st(data.career, lambda item: Career(item).__dict__)
self.career = Person.__collection(data.career, lambda item: Career(item).__dict__) self.military = RawData.get_collection_st(data.military, lambda item: Military(item).__dict__)
self.military = Person.__collection(data.military, lambda item: Military(item).__dict__)
@staticmethod
def __collection(collection, function):
return list(map(lambda item: function(item), [] if collection is None else collection))

View File

@ -5,22 +5,37 @@ class RawData:
def __init__(self, data): def __init__(self, data):
self.__dict__.update(data) self.__dict__.update(data)
def get_str(self, attr):
return RawData.get_str_st(self.__dict__, attr)
@staticmethod @staticmethod
def get_str(value, attr): def get_str_st(value, attr):
if value is None: if value is None:
return '' return ''
return '' if value is None else value[attr] result = value[attr]
if result is None:
return ''
return result
def get_int(self, attr):
return RawData.get_int_st(self.__dict__, attr)
@staticmethod @staticmethod
def get_int(value, attr): def get_int_st(value, attr):
if value is None: if value is None:
return -1 return -1
return -1 if value is None else value[attr] result = value[attr]
if result is None:
return -1
if not str(result).isnumeric():
print(f'The value {result} is not a number')
return -1
return result
@staticmethod @staticmethod
def get_date(value): def get_date_st(value):
if value is None: if value is None:
return None return ''
try: try:
return datetime.strptime(value, '%d.%m.%Y').date() return datetime.strptime(value, '%d.%m.%Y').date()
except ValueError: except ValueError:
@ -28,4 +43,8 @@ class RawData:
return datetime.strptime(value, '%d.%m.%y').date() return datetime.strptime(value, '%d.%m.%y').date()
except ValueError: except ValueError:
print(f'Invalid date {value}') print(f'Invalid date {value}')
return None return ''
@staticmethod
def get_collection_st(collection, function):
return list(map(lambda item: function(item), [] if collection is None else collection))

View File

@ -4,15 +4,15 @@ from src.raw_data import RawData
class School: class School:
def __init__(self, raw_school): def __init__(self, raw_school):
data = RawData(raw_school) data = RawData(raw_school)
self.id = data.id self.id = data.get_int('id')
self.name = data.name self.name = data.get_str('name')
self.country_id = data.get_int(data.country, 'id') self.country_id = RawData.get_int_st(data.country, 'id')
self.city_id = data.get_int(data.city, 'id') self.city_id = RawData.get_int_st(data.city, 'id')
self.year_from = data.year_from self.year_from = data.get_int('year_from')
self.year_to = data.year_to self.year_to = data.get_int('year_to')
self.year_graduated = data.year_graduated self.year_graduated = data.get_int('year_graduated')
self.speciality = data.speciality self.speciality = data.get_str('speciality')
self.type_id = data.type self.type_id = data.get_int('type')
self.type = data.type_str self.type = data.get_str('type_str')
self.litera = raw_school['class'] self.litera_id = data.get_int('class_id')
self.litera_id = data.class_id self.litera = data.get_str('class')

View File

@ -4,16 +4,16 @@ from src.raw_data import RawData
class University: class University:
def __init__(self, raw_university): def __init__(self, raw_university):
data = RawData(raw_university) data = RawData(raw_university)
self.id = data.id self.id = data.get_int('id')
self.name = data.name self.name = data.get_str('name')
self.country_id = data.get_int(data.country, 'id') self.country_id = RawData.get_int_st(data.country, 'id')
self.city_id = data.get_int(data.city, 'id') self.city_id = RawData.get_int_st(data.city, 'id')
self.faculty_id = data.faculty self.faculty_id = data.get_int('faculty')
self.faculty = data.faculty_name self.faculty = data.get_str('faculty_name')
self.chair_id = data.chair self.chair_id = data.get_int('chair')
self.chair = data.chair_name self.chair = data.get_str('chair_name')
self.graduation = data.graduation self.graduation = data.get_int('graduation')
self.form_id = data.education_form_id self.form_id = data.get_int('education_form_id')
self.form = data.education_form self.form = data.get_str('education_form')
self.status_id = data.education_status_id self.status_id = data.get_int('education_status_id')
self.status = data.education_status self.status = data.get_str('education_status')