Improve models for prepare_dataset script to minimize null data

This commit is contained in:
Aleksey Filippov 2023-05-26 11:22:38 +04:00
parent 3a51a23048
commit 86fbf158f0
6 changed files with 97 additions and 83 deletions

View File

@ -4,11 +4,11 @@ from src.raw_data import RawData
class Career:
def __init__(self, raw_career):
data = RawData(raw_career)
self.id = data.id
self.group_id = data.group_id
self.company = data.company
self.country_id = data.country_id
self.city_id = data.city_id
self.year_from = raw_career['from']
self.until = data.until
self.position = data.position
self.id = data.get_int('id')
self.group_id = data.get_int('group_id')
self.company = data.get_str('company')
self.country_id = data.get_int('country_id')
self.city_id = data.get_int('city_id')
self.year_from = data.get_int('from')
self.until = data.get_int('until')
self.position = data.get_str('position')

View File

@ -4,9 +4,9 @@ from src.raw_data import RawData
class Military:
def __init__(self, raw_military):
data = RawData(raw_military)
self.id = data.id
self.unit_id = data.unit_id
self.unit = data.unit
self.country_id = data.country_id
self.year_from = raw_military['from']
self.until = data.until
self.id = data.get_int('id')
self.unit_id = data.get_int('unit_id')
self.unit = data.get_str('unit')
self.country_id = data.get_int('country_id')
self.year_from = data.get_int('from')
self.until = data.get_int('until')

View File

@ -13,51 +13,46 @@ class Person:
self.deactivated = data.deactivated is not None
self.has_photo = data.has_photo == 1
self.followers_count = data.followers_count
self.sex = data.sex
self.bdate = data.get_date(data.bdate)
self.relation = data.relation
self.country_id = data.get_int(data.country, 'id')
self.country = data.get_int(data.country, 'title')
self.city_id = data.get_int(data.city, 'id')
self.city = data.get_int(data.city, 'title')
self.home_town = data.home_town
self.status = data.status
self.sex = data.get_int('sex')
self.bdate = RawData.get_date_st(data.bdate)
self.relation = data.get_int('relation')
self.country_id = RawData.get_int_st(data.country, 'id')
self.country = RawData.get_str_st(data.country, 'title')
self.city_id = RawData.get_int_st(data.city, 'id')
self.city = RawData.get_str_st(data.city, 'title')
self.home_town = data.get_str('home_town')
self.status = data.get_str('status')
self.site = data.site is not None
self.facebook = data.facebook is not None
self.twitter = data.twitter is not None
self.instagram = data.instagram is not None
self.mobile_phone = data.mobile_phone is not None
self.activities = data.activities
self.interests = data.interests
self.books = data.books
self.movies = data.movies
self.music = data.music
self.tv = data.tv
self.games = data.games
self.quotes = data.quotes
self.about = data.about
self.activities = data.get_str('activities')
self.interests = data.get_str('interests')
self.books = data.get_str('books')
self.movies = data.get_str('movies')
self.music = data.get_str('music')
self.tv = data.get_str('tv')
self.games = data.get_str('games')
self.quotes = data.get_str('quotes')
self.about = data.get_str('about')
personal = None
if data.personal is not None:
if len(data.personal) == 1:
personal = data.personal[0]
if len(data.personal) > 1:
raise Exception(f'PERSONAL {data.id}')
self.political = data.get_int(personal, 'political')
self.religion = data.get_str(personal, 'religion')
self.inspired_by = data.get_str(personal, 'inspired_by')
self.people_main = data.get_int(personal, 'people_main')
self.life_main = data.get_int(personal, 'life_main')
self.smoking = data.get_int(personal, 'smoking')
self.alcohol = data.get_int(personal, 'alcohol')
self.relatives = Person.__collection(data.relatives, lambda item: item['type'])
self.occupation_type = data.get_str(data.occupation, 'type')
self.occupation_place_id = data.get_int(data.occupation, 'id')
self.occupation_place_name = data.get_str(data.occupation, 'name')
self.universities = Person.__collection(data.universities, lambda item: University(item).__dict__)
self.schools = Person.__collection(data.schools, lambda item: School(item).__dict__)
self.career = Person.__collection(data.career, lambda item: Career(item).__dict__)
self.military = Person.__collection(data.military, lambda item: Military(item).__dict__)
@staticmethod
def __collection(collection, function):
return list(map(lambda item: function(item), [] if collection is None else collection))
self.political = RawData.get_int_st(personal, 'political')
self.religion = RawData.get_str_st(personal, 'religion')
self.inspired_by = RawData.get_str_st(personal, 'inspired_by')
self.people_main = RawData.get_int_st(personal, 'people_main')
self.life_main = RawData.get_int_st(personal, 'life_main')
self.smoking = RawData.get_int_st(personal, 'smoking')
self.alcohol = RawData.get_int_st(personal, 'alcohol')
self.relatives = RawData.get_collection_st(data.relatives, lambda item: item['type'])
self.occupation_type = RawData.get_str_st(data.occupation, 'type')
self.occupation_place_name = RawData.get_str_st(data.occupation, 'name')
self.universities = RawData.get_collection_st(data.universities, lambda item: University(item).__dict__)
self.schools = RawData.get_collection_st(data.schools, lambda item: School(item).__dict__)
self.career = RawData.get_collection_st(data.career, lambda item: Career(item).__dict__)
self.military = RawData.get_collection_st(data.military, lambda item: Military(item).__dict__)

View File

@ -5,22 +5,37 @@ class RawData:
def __init__(self, data):
self.__dict__.update(data)
def get_str(self, attr):
return RawData.get_str_st(self.__dict__, attr)
@staticmethod
def get_str(value, attr):
def get_str_st(value, attr):
if value is None:
return ''
return '' if value is None else value[attr]
result = value[attr]
if result is None:
return ''
return result
def get_int(self, attr):
return RawData.get_int_st(self.__dict__, attr)
@staticmethod
def get_int(value, attr):
def get_int_st(value, attr):
if value is None:
return -1
return -1 if value is None else value[attr]
result = value[attr]
if result is None:
return -1
if not str(result).isnumeric():
print(f'The value {result} is not a number')
return -1
return result
@staticmethod
def get_date(value):
def get_date_st(value):
if value is None:
return None
return ''
try:
return datetime.strptime(value, '%d.%m.%Y').date()
except ValueError:
@ -28,4 +43,8 @@ class RawData:
return datetime.strptime(value, '%d.%m.%y').date()
except ValueError:
print(f'Invalid date {value}')
return None
return ''
@staticmethod
def get_collection_st(collection, function):
return list(map(lambda item: function(item), [] if collection is None else collection))

View File

@ -4,15 +4,15 @@ from src.raw_data import RawData
class School:
def __init__(self, raw_school):
data = RawData(raw_school)
self.id = data.id
self.name = data.name
self.country_id = data.get_int(data.country, 'id')
self.city_id = data.get_int(data.city, 'id')
self.year_from = data.year_from
self.year_to = data.year_to
self.year_graduated = data.year_graduated
self.speciality = data.speciality
self.type_id = data.type
self.type = data.type_str
self.litera = raw_school['class']
self.litera_id = data.class_id
self.id = data.get_int('id')
self.name = data.get_str('name')
self.country_id = RawData.get_int_st(data.country, 'id')
self.city_id = RawData.get_int_st(data.city, 'id')
self.year_from = data.get_int('year_from')
self.year_to = data.get_int('year_to')
self.year_graduated = data.get_int('year_graduated')
self.speciality = data.get_str('speciality')
self.type_id = data.get_int('type')
self.type = data.get_str('type_str')
self.litera_id = data.get_int('class_id')
self.litera = data.get_str('class')

View File

@ -4,16 +4,16 @@ from src.raw_data import RawData
class University:
def __init__(self, raw_university):
data = RawData(raw_university)
self.id = data.id
self.name = data.name
self.country_id = data.get_int(data.country, 'id')
self.city_id = data.get_int(data.city, 'id')
self.faculty_id = data.faculty
self.faculty = data.faculty_name
self.chair_id = data.chair
self.chair = data.chair_name
self.graduation = data.graduation
self.form_id = data.education_form_id
self.form = data.education_form
self.status_id = data.education_status_id
self.status = data.education_status
self.id = data.get_int('id')
self.name = data.get_str('name')
self.country_id = RawData.get_int_st(data.country, 'id')
self.city_id = RawData.get_int_st(data.city, 'id')
self.faculty_id = data.get_int('faculty')
self.faculty = data.get_str('faculty_name')
self.chair_id = data.get_int('chair')
self.chair = data.get_str('chair_name')
self.graduation = data.get_int('graduation')
self.form_id = data.get_int('education_form_id')
self.form = data.get_str('education_form')
self.status_id = data.get_int('education_status_id')
self.status = data.get_str('education_status')