Export main dataframe as numpy array for clustering

This commit is contained in:
Aleksey Filippov 2023-06-05 18:32:51 +04:00
parent 488857052d
commit 2dd770d578

View File

@@ -2,6 +2,7 @@ from datetime import date
import numpy as np import numpy as np
import pandas as pd import pandas as pd
from numpy import ndarray
from pandas import DataFrame from pandas import DataFrame
from src.main.constants import Constants as const from src.main.constants import Constants as const
@@ -10,6 +11,7 @@ from src.main.utils import Utils
class DfLoader: class DfLoader:
def __init__(self, json_file: str) -> None: def __init__(self, json_file: str) -> None:
self.__geocache: Geocache = Geocache() self.__geocache: Geocache = Geocache()
print(f'Try to load data from the {json_file} file') print(f'Try to load data from the {json_file} file')
@@ -68,5 +70,6 @@ class DfLoader:
self.__df['location'] = self.__df['city'] \ self.__df['location'] = self.__df['city'] \
.apply(lambda val: '' if Utils.is_empty_str(val) else self.__geocache.get_location(val)) .apply(lambda val: '' if Utils.is_empty_str(val) else self.__geocache.get_location(val))
def get_clustering_data(self) -> DataFrame: def get_clustering_data(self) -> ndarray:
return self.__df columns: [] = ['location', 'sex', 'age', 'is_university', 'is_work', 'is_student', 'is_schoolboy']
return self.__df[columns].to_numpy()