social-clusters/main.py

#!/usr/bin/env python3
import os
import sys
from typing import List

import numpy
import numpy as np
import pandas as pd
import scipy.cluster.hierarchy as sc
from matplotlib import pyplot as plt
from numpy import ndarray
from pandas import Series
from sklearn.cluster import AgglomerativeClustering
from sklearn.decomposition import PCA

from src.main.df_loader import DfLoader
from src.main.georeverse import Georeverse

# Whether the clustering step should also show the dendrogram / PCA scatter
# figure; forwarded to __clustering() as its `plots` argument.
is_plots: bool = False
# Number of clusters requested; forwarded to __clustering() as `n_clusters`.
default_clusters: int = 3
# Shared helper instance; __print_center() calls its get_city() with the first
# two features of a cluster center to obtain a location string.
# NOTE(review): presumably those two features are latitude/longitude - confirm.
georeverse: Georeverse = Georeverse()


def __plots(data: ndarray, labels: ndarray) -> None:
    """Show a two-panel figure: a Ward dendrogram and a 2-D PCA scatter.

    Args:
        data: Sample matrix that is both linked hierarchically and projected
            onto its first two principal components.
        labels: Per-sample cluster labels, used only to colour the scatter.
    """
    # Do the numeric work up front so the drawing section below is pure plotting.
    ward_linkage = sc.linkage(data, method='ward')
    projected = pd.DataFrame(PCA(n_components=2).fit_transform(data)).to_numpy()
    first_component = projected[:, 0]
    second_component = projected[:, 1]

    plt.figure(figsize=(12, 6))

    # Left panel: dendrogram truncated to the top 4 levels.
    plt.subplot(1, 2, 1)
    sc.dendrogram(ward_linkage, p=4, truncate_mode='level')
    plt.title('Dendrogram')

    # Right panel: samples in PCA space, coloured by cluster label.
    plt.subplot(1, 2, 2)
    plt.scatter(x=first_component, y=second_component, c=labels, cmap='rainbow')
    plt.title('Clustering')

    plt.show()


def __get_cluster_centers(data: ndarray, labels: ndarray) -> ndarray:
    """Compute the centroid (per-feature mean) of every cluster.

    Args:
        data: Sample array with one row per sample (expected to be 2-D,
            samples x features).
        labels: 1-D sequence of cluster labels, one per row of ``data``.

    Returns:
        An array with one row per distinct label. Rows are ordered by
        ascending label value, and each row is the mean over axis 0 of the
        samples carrying that label.
    """
    labels = np.asarray(labels)
    # np.unique returns the distinct labels sorted, which makes the row order
    # of the result deterministic; iterating a ``set`` gives no such guarantee.
    return np.array([data[labels == label].mean(axis=0) for label in np.unique(labels)])


def __print_center(center: ndarray) -> None:
    """Print a one-line, human-readable summary of a single cluster center.

    Features 0 and 1 are handed to ``georeverse.get_city`` to get a location;
    features 2 and 3 are rounded to integers (printed as sex and age);
    features 4 through 7 are rounded and printed as booleans.

    Args:
        center: Feature vector of a cluster center with at least 8 entries.
    """
    city: str = georeverse.get_city(center[0], center[1])
    sex, age = (round(center[idx]) for idx in (2, 3))
    # Averaged 0/1 flags: rounding gives the majority value within the cluster.
    univer, work, student, school = (bool(round(center[idx])) for idx in range(4, 8))
    print(f'location: {city}, sex: {sex}, age: {age},'
          f' univer: {univer}, work: {work}, student: {student}, school: {school}')


def __clustering(data: ndarray, n_clusters: int = 3, plots: bool = False) -> None:
    """Cluster ``data`` agglomeratively and print a summary of each cluster center.

    Args:
        data: Sample matrix passed to the clustering model.
        n_clusters: Number of clusters to form.
        plots: When true, also show the dendrogram / PCA figure.
    """
    clusterer = AgglomerativeClustering(n_clusters=n_clusters, metric='euclidean', linkage='ward')
    # fit() returns the fitted estimator, so the labels can be read off directly.
    assignments = clusterer.fit(data).labels_
    if plots:
        __plots(data, assignments)
    for centroid in __get_cluster_centers(data, assignments):
        __print_center(centroid)


def __main(json_file):
    """Load the dataset from ``json_file`` and cluster it with the module defaults.

    Args:
        json_file: Path handed to ``DfLoader``.
    """
    loader = DfLoader(json_file)
    samples: ndarray = loader.get_data()
    __clustering(samples, default_clusters, is_plots)


if __name__ == '__main__':
    # Script entry point: expects exactly one argument, the dataset JSON path.
    if len(sys.argv) != 2:
        print('You must specify the raw_dataset json file')
        # sys.exit() rather than the bare exit(), which is only injected by
        # the ``site`` module and is not guaranteed to exist.
        sys.exit(1)
    json_path = sys.argv[1]
    if not os.path.isfile(json_path):
        print(f'File {json_path} is not exists')
        # Stop here: without this exit the script fell through and tried to
        # load the missing file anyway.
        sys.exit(1)
    __main(json_path)
Add initial main analysis script 2023-05-26 10:33:54 +04:00			`#!/usr/bin/env python3`
			`import os`
			`import sys`
Add cluster centers extraction 2023-06-07 15:24:49 +04:00			`from typing import List`
Add initial main analysis script 2023-05-26 10:33:54 +04:00
Add agglomerative clustering 2023-06-06 00:33:45 +04:00			`import numpy`
Add cluster centers extraction 2023-06-07 15:24:49 +04:00			`import numpy as np`
Add agglomerative clustering 2023-06-06 00:33:45 +04:00			`import pandas as pd`
Add cluster centers extraction 2023-06-07 15:24:49 +04:00			`import scipy.cluster.hierarchy as sc`
Add agglomerative clustering 2023-06-06 00:33:45 +04:00			`from matplotlib import pyplot as plt`
Add cluster centers extraction 2023-06-07 15:24:49 +04:00			`from numpy import ndarray`
			`from pandas import Series`
Add agglomerative clustering 2023-06-06 00:33:45 +04:00			`from sklearn.cluster import AgglomerativeClustering`
			`from sklearn.decomposition import PCA`

Separate main script to classes 2023-06-05 18:18:18 +04:00			`from src.main.df_loader import DfLoader`
Add cluster centers extraction 2023-06-07 15:24:49 +04:00			`from src.main.georeverse import Georeverse`
Add initial version of analyser 2023-05-29 22:56:53 +04:00
Add cluster centers extraction 2023-06-07 15:24:49 +04:00			`is_plots: bool = False`
			`default_clusters: int = 3`
			`georeverse: Georeverse = Georeverse()`
Add initial version of analyser 2023-05-29 22:56:53 +04:00
Add agglomerative clustering 2023-06-06 00:33:45 +04:00
Add cluster centers extraction 2023-06-07 15:24:49 +04:00			`def __plots(data: ndarray, labels: ndarray) -> None:`
			`plt.figure(figsize=(12, 6))`
			`plt.subplot(1, 2, 1)`
			`sc.dendrogram(sc.linkage(data, method='ward'), p=4, truncate_mode='level')`
			`plt.title('Dendrogram')`
			`pca = PCA(n_components=2)`
			`transformed = pd.DataFrame(pca.fit_transform(data)).to_numpy()`
			`plt.subplot(1, 2, 2)`
			`plt.scatter(x=transformed[:, 0], y=transformed[:, 1], c=labels, cmap='rainbow')`
			`plt.title('Clustering')`
			`plt.show()`
Add agglomerative clustering 2023-06-06 00:33:45 +04:00

Add cluster centers extraction 2023-06-07 15:24:49 +04:00			`def __get_cluster_centers(data: ndarray, labels: ndarray) -> ndarray:`
			`centers: List[List[float]] = list()`
			`for label in set(labels):`
			`center: Series = data[numpy.where(labels[:] == label)].mean(axis=0)`
			`centers.append(list(center))`
			`return np.array(centers)`
Add agglomerative clustering 2023-06-06 00:33:45 +04:00

Add cluster centers extraction 2023-06-07 15:24:49 +04:00			`def __print_center(center: ndarray) -> None:`
			`location: str = georeverse.get_city(center[0], center[1])`
			`sex = round(center[2])`
			`age = round(center[3])`
			`is_university = bool(round(center[4]))`
			`is_work = bool(round(center[5]))`
			`is_student = bool(round(center[6]))`
			`is_schoolboy = bool(round(center[7]))`
			`print(f'location: {location}, sex: {sex}, age: {age},'`
			`f' univer: {is_university}, work: {is_work}, student: {is_student}, school: {is_schoolboy}')`


			`def __clustering(data: ndarray, n_clusters: int = 3, plots: bool = False) -> None:`
			`model = AgglomerativeClustering(n_clusters=n_clusters, metric='euclidean', linkage='ward')`
			`model.fit(data)`
			`labels = model.labels_`
			`if plots:`
			`__plots(data, labels)`
			`centers = __get_cluster_centers(data, labels)`
			`for center in centers:`
			`__print_center(center)`
Add agglomerative clustering 2023-06-06 00:33:45 +04:00

Add initial version of analyser 2023-05-29 22:56:53 +04:00			`def __main(json_file):`
Add cluster centers extraction 2023-06-07 15:24:49 +04:00			`data: ndarray = DfLoader(json_file).get_data()`
			`__clustering(data, default_clusters, is_plots)`
Add initial main analysis script 2023-05-26 10:33:54 +04:00

			`if __name__ == '__main__':`
			`if len(sys.argv) != 2:`
			`print('You must specify the raw_dataset json file')`
			`exit(1)`
			`if not os.path.isfile(sys.argv[1]):`
			`print(f'File {sys.argv[1]} is not exists')`
			`__main(sys.argv[1])`