Add cluster centers extraction

Return ndarray instead of DataFrame in df_loader.py
2023-06-07 15:24:49 +04:00 · 2023-06-06 17:59:37 +04:00
3 changed files with 66 additions and 39 deletions
--- a/main.py
+++ b/main.py
@ -1,59 +1,73 @@
 #!/usr/bin/env python3
 import os
 import sys
+from typing import List

 import numpy
+import numpy as np
 import pandas as pd
-# import scipy.cluster.hierarchy as sc
+import scipy.cluster.hierarchy as sc
 from matplotlib import pyplot as plt
-from pandas import DataFrame
+from numpy import ndarray
+from pandas import Series
 from sklearn.cluster import AgglomerativeClustering
 from sklearn.decomposition import PCA

 from src.main.df_loader import DfLoader
+from src.main.georeverse import Georeverse
+
+is_plots: bool = False
+default_clusters: int = 3
+georeverse: Georeverse = Georeverse()


-def __clustering(data: DataFrame) -> None:
-    # clusters = round(math.sqrt(len(data) / 2))
-    # plt.figure(figsize=(20, 7))
-    # plt.title("Dendrograms")
-    # # Create dendrogram
-    # sc.dendrogram(sc.linkage(data.to_numpy(), method='ward'))
-    # plt.title('Dendrogram')
-    # plt.xlabel('Sample index')
-    # plt.ylabel('Euclidean distance')
-
-    clusters = 3
-    model = AgglomerativeClustering(n_clusters=clusters, metric='euclidean', linkage='ward')
-    model.fit(data)
-    labels = model.labels_
-
-    data_norm = (data - data.min()) / (data.max() - data.min())
-
-    pca = PCA(n_components=2)  # 2-dimensional PCA
-    transformed = pd.DataFrame(pca.fit_transform(data_norm))
-    # plt.scatter(x=transformed[:, 0], y=transformed[:, 1], c=labels, cmap='rainbow')
-    for i in range(clusters):
-        series = transformed.iloc[numpy.where(labels[:] == i)]
-        plt.scatter(series[0], series[1], label=f'Cluster {i + 1}')
-    plt.legend()
+def __plots(data: ndarray, labels: ndarray) -> None:
+    plt.figure(figsize=(12, 6))
+    plt.subplot(1, 2, 1)
+    sc.dendrogram(sc.linkage(data, method='ward'), p=4, truncate_mode='level')
+    plt.title('Dendrogram')
+    pca = PCA(n_components=2)
+    transformed = pd.DataFrame(pca.fit_transform(data)).to_numpy()
+    plt.subplot(1, 2, 2)
+    plt.scatter(x=transformed[:, 0], y=transformed[:, 1], c=labels, cmap='rainbow')
+    plt.title('Clustering')
    plt.show()

-    # fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(15, 5))
-    # sns.scatterplot(ax=axes[0], data=data, x='location-la,location-lo', y='age,sex').set_title('Without clustering')
-    # sns.scatterplot(ax=axes[1], data=data, x='location-la,location-lo', y='age,sex', hue=labels) \
-    #     .set_title('With clustering')
-    # plt.show()

-    # s = numpy.where(labels[:] == 34)
-    # print(labels)
+def __get_cluster_centers(data: ndarray, labels: ndarray) -> ndarray:
+    centers: List[List[float]] = list()
+    for label in set(labels):
+        center: Series = data[numpy.where(labels[:] == label)].mean(axis=0)
+        centers.append(list(center))
+    return np.array(centers)
+
+
+def __print_center(center: ndarray) -> None:
+    location: str = georeverse.get_city(center[0], center[1])
+    sex = round(center[2])
+    age = round(center[3])
+    is_university = bool(round(center[4]))
+    is_work = bool(round(center[5]))
+    is_student = bool(round(center[6]))
+    is_schoolboy = bool(round(center[7]))
+    print(f'location: {location}, sex: {sex}, age: {age},'
+          f' univer: {is_university}, work: {is_work}, student: {is_student}, school: {is_schoolboy}')
+
+
+def __clustering(data: ndarray, n_clusters: int = 3, plots: bool = False) -> None:
+    model = AgglomerativeClustering(n_clusters=n_clusters, metric='euclidean', linkage='ward')
+    model.fit(data)
+    labels = model.labels_
+    if plots:
+        __plots(data, labels)
+    centers = __get_cluster_centers(data, labels)
+    for center in centers:
+        __print_center(center)


 def __main(json_file):
-    df_loader: DfLoader = DfLoader(json_file)
-    data = df_loader.get_clustering_data()
-    print(data)
-    __clustering(data)
+    data: ndarray = DfLoader(json_file).get_data()
+    __clustering(data, default_clusters, is_plots)


 if __name__ == '__main__':
--- a/src/main/df_loader.py
+++ b/src/main/df_loader.py
@ -2,6 +2,7 @@ from datetime import date

 import numpy as np
 import pandas as pd
+from numpy import ndarray
 from pandas import DataFrame

 from src.main.constants import Constants as const
@ -73,8 +74,8 @@ class DfLoader:
        self.__df['location-lo'] = self.__df.loc[:, 'location'] \
            .apply(lambda val: 0 if Utils.is_empty_collection(val) else val[1])

-    def get_clustering_data(self) -> DataFrame:
+    def get_data(self) -> ndarray:
        columns: [] = ['location-la', 'location-lo',
                       'sex', 'age', 'is_university', 'is_work', 'is_student', 'is_schoolboy']
        df = self.__df
-        return df[columns]
+        return df[columns].to_numpy()
--- a/src/main/georeverse.py
+++ b/src/main/georeverse.py
@ -0,0 +1,12 @@
+from functools import partial
+
+from geopy import Nominatim
+
+
+class Georeverse:
+    def __init__(self) -> None:
+        geolocator: Nominatim = Nominatim(user_agent="MyApp")
+        self.__reverse = partial(geolocator.reverse, language="ru")
+
+    def get_city(self, latitude: float, longitude: float) -> str:
+        return self.__reverse(f'{latitude}, {longitude}')
Author	SHA1	Message	Date
Aleksey Filippov	155b350e1e	Add cluster centers extraction	2023-06-07 15:24:49 +04:00
Aleksey Filippov	f4a32bf57f	Return ndarray instead of DataFrame in df_loader.py	2023-06-06 17:59:37 +04:00