Compare commits

..

5 Commits

5 changed files with 865 additions and 24 deletions

View File

@ -1354,5 +1354,783 @@
"Saint Petersburg": [
27.7703796,
-82.6695085
],
"\u041b\u0438\u043f\u0435\u0446\u043a": [
52.6041877,
39.5936899
],
"Toronto": [
43.6534817,
-79.3839347
],
"\u041a\u0430\u0434\u0430\u0442": [
37.89127,
72.26013
],
"\u0427\u0435\u0431\u0430\u0440\u043a\u0443\u043b\u044c": [
54.977795,
60.37009
],
"Hong Kong": [
22.2793278,
114.1628131
],
"\u041a\u0443\u0440\u0441\u043a": [
51.739433,
36.179604
],
"\u0414\u0437\u0435\u0440\u0436\u0438\u043d\u0441\u043a": [
56.2382157,
43.4617405
],
"\u041a\u043e\u0441\u0443\u043b\u0438\u043d\u043e": [
54.8081899,
64.052749
],
"\u0412\u044f\u0440\u0442\u0441\u0438\u043b\u044f": [
62.176053,
30.6961678
],
"Maroochydore": [
-26.6556523,
153.0946684
],
"\u0427\u0435\u0440\u043a\u0435\u0441\u0441\u043a": [
44.2285229,
42.048257
],
"\u0411\u0440\u044f\u043d\u0441\u043a": [
53.2423778,
34.3668288
],
"\u0412\u043d\u0443\u043a\u043e\u0432\u043e (\u043f\u043e\u0441\u0435\u043b\u043e\u043a)": [
55.6368166,
37.261665
],
"Norderstedt": [
53.7089898,
9.9891914
],
"\u0418\u0432\u0430\u043d\u043e\u0432\u043e": [
56.9984452,
40.9737394
],
"\u041c\u0438\u043d\u0441\u043a": [
53.9024716,
27.5618225
],
"\u041d\u043e\u0432\u043e\u0447\u0435\u0440\u043a\u0430\u0441\u0441\u043a": [
47.41066,
40.101986
],
"\u0425\u043e\u0440\u043e\u0448\u0438\u0439": [
64.1832122,
141.7068097
],
"\u0412\u044b\u0431\u043e\u0440\u0433": [
60.709217,
28.744051
],
"Kuta Selatan": [
-8.8014264,
115.17669285453046
],
"\u0428\u0443\u0440\u044b\u0433\u0438\u043d\u043e": [
54.075077,
82.86483
],
"\u0414\u0443\u0431\u043d\u0430": [
56.7362705,
37.1623696
],
"\u041f\u043e\u0434\u043e\u043b\u044c\u0441\u043a": [
55.4308841,
37.5453056
],
"\u041a\u0430\u043c\u0435\u043d\u043a\u0430": [
53.1850012,
44.0529066
],
"\u0414\u043e\u043d\u0435\u0446\u043a": [
48.0158753,
37.8013407
],
"\u0411\u0438\u043b\u0438\u0431\u0438\u043d\u043e": [
68.0550305,
166.4448515
],
"Kingdom of Cambodia": [
12.5433216,
104.8144914
],
"Vancouver": [
49.2608724,
-123.113952
],
"\u041f\u0443\u0448\u043a\u0438\u043d": [
59.722256,
30.415731
],
"\u0421\u0442\u0430\u0440\u044b\u0439 \u041e\u0441\u043a\u043e\u043b": [
51.298038,
37.833202
],
"\u0421\u043e\u0441\u043d\u043e\u0432\u044b\u0439 \u0411\u043e\u0440": [
59.8968655,
29.0765628
],
"\u0410\u043a\u0442\u043e\u0431\u0435": [
50.2836268,
57.2298651
],
"\u0410\u043b\u043c\u0430\u0442\u044b": [
43.2363924,
76.9457275
],
"\u041e\u0439\u043a\u0430\u0441-\u041a\u0438\u0431\u0435\u043a\u0438": [
55.613743,
47.166279
],
"\u0412\u0435\u043b\u044c\u0441\u043a": [
61.0756446,
42.0979582
],
"\u041a\u0443\u0439\u0431\u044b\u0448\u0435\u0432": [
55.4454258,
78.3222294
],
"\u041b\u0443\u0433\u0430\u043d\u0441\u043a": [
48.5717084,
39.2973153
],
"\u0412\u0438\u0442\u0435\u0431\u0441\u043a": [
55.1930197,
30.2070437
],
"Wroc\u0142aw": [
51.1089776,
17.0326689
],
"\u041f\u0441\u043a\u043e\u0432": [
57.8173923,
28.3343465
],
"\u041c\u0438\u0440\u043d\u044b\u0439": [
62.541077,
113.978584
],
"\u0414\u043e\u043b\u0433\u043e\u043f\u0440\u0443\u0434\u043d\u044b\u0439": [
55.9341491,
37.5142417
],
"Gu'an County": [
39.3372611,
116.2936501
],
"\u041a\u0440\u0430\u0441\u043d\u043e\u0433\u043e\u0440\u043e\u0434\u0441\u043a": [
56.830654,
28.284595
],
"\u041c\u043e\u0441\u043a\u043e\u0432\u0441\u043a\u0438\u0439": [
55.6026007,
37.3479176
],
"\u041a\u0440\u0435\u043c\u0435\u043d\u0447\u0443\u0433": [
49.0629553,
33.403516
],
"\u041d\u0430\u0434\u044b\u043c": [
65.532,
72.52
],
"\u041a\u0440\u0438\u0432\u043e\u0439 \u0420\u043e\u0433": [
47.9102734,
33.3917703
],
"\u0425\u0430\u0431\u0430\u0440\u043e\u0432\u0441\u043a": [
48.4812568,
135.0762968
],
"\u0422\u0443\u0439\u043c\u0430\u0437\u044b": [
54.606537,
53.6974004
],
"\u0421\u0430\u043c\u0430\u0440\u0430 (\u043f\u043e\u0441\u0435\u043b\u043e\u043a)": [
41.3495987,
69.3871141
],
"Bruxelles": [
50.8465573,
4.351697
],
"\u0421\u0435\u0432\u0435\u0440\u043e\u0434\u0432\u0438\u043d\u0441\u043a": [
64.563385,
39.823769
],
"\u0410\u043b\u044c\u043c\u0435\u0442\u044c\u0435\u0432\u0441\u043a": [
54.9005008,
52.2963777
],
"\u0420\u0443\u0437\u0430\u0435\u0432\u043a\u0430": [
54.058735,
44.954391
],
"\u0421\u0442\u0435\u0440\u043b\u0438\u0442\u0430\u043c\u0430\u043a": [
53.632374,
55.952259
],
"\u041e\u0440\u0434\u0436\u043e\u043d\u0438\u043a\u0438\u0434\u0437\u0435": [
44.9640797,
35.3580294
],
"Berlin": [
52.5170365,
13.3888599
],
"\u0411\u0435\u043b\u043e\u0432\u043e": [
54.419323,
86.303429
],
"\u0417\u0430\u0432\u043e\u043b\u0436\u044c\u0435": [
56.639992,
43.399353
],
"\u0416\u0443\u043a\u043e\u0432\u0441\u043a\u0438\u0439": [
55.5972801,
38.1199863
],
"Palermo": [
38.1112268,
13.3524434
],
"Seoul": [
37.5666791,
126.9782914
],
"\u0410\u0441\u0442\u0440\u0430\u0445\u0430\u043d\u044c": [
46.3498308,
48.0326203
],
"\u041d\u043e\u0432\u0430\u044f \u041a\u0430\u0445\u043e\u0432\u043a\u0430": [
46.7583404,
33.3566678
],
"Santo Domingo": [
18.4801972,
-69.942111
],
"Frankfurt am Main": [
50.1106444,
8.6820917
],
"D\u00fcsseldorf": [
51.2254018,
6.7763137
],
"\u0413\u0430\u0433\u0440\u0430": [
43.289035999999996,
40.26846983800224
],
"Myrtle Beach": [
33.6956461,
-78.8900409
],
"Speightstown": [
13.2500531,
-59.6423782
],
"\u0423\u0440\u0430\u043b\u044c\u0441\u043a": [
51.227984750000005,
51.40139534719691
],
"\u041f\u0430\u0432\u043b\u043e\u0432\u043e": [
55.965137,
43.071114
],
"\u0425\u0430\u0440\u044c\u043a\u043e\u0432": [
49.9923181,
36.2310146
],
"\u0421\u043d\u0435\u0436\u0438\u043d\u0441\u043a": [
56.087009,
60.732674
],
"\u041a\u043e\u0432\u0440\u043e\u0432": [
56.374603,
41.311676
],
"\u041c\u043e\u0441\u0435\u0435\u0432\u043a\u0430": [
52.7334757,
47.2789831
],
"Malta": [
35.8885993,
14.4476911
],
"\u0422\u043e\u0431\u043e\u043b\u044c\u0441\u043a": [
58.210710750000004,
68.36961038981468
],
"\u0427\u0435\u0440\u043a\u0430\u0441\u0441\u044b": [
49.4447056,
32.0588085
],
"\u041a\u0443\u0440\u043c\u0430\u043d\u0430\u0435\u0432\u043e": [
54.137821,
55.637344
],
"\u0411\u043e\u043b\u044c\u0448\u0430\u044f \u0411\u043e\u0440\u043b\u0430": [
53.5224018,
48.0598897
],
"M\u00fcnchen": [
48.1371079,
11.5753822
],
"New Orleans": [
29.9759983,
-90.0782127
],
"\u0410\u0445\u0442\u0443\u0431\u0438\u043d\u0441\u043a": [
48.2877616,
46.1744879
],
"London": [
51.5073359,
-0.12765
],
"\u0423\u0434\u0430\u0447\u043d\u044b\u0439": [
66.411768,
112.2517294
],
"\u0423\u0445\u0442\u0430": [
63.5623797,
53.6842376
],
"\u0427\u0438\u0442\u0430": [
52.033409,
113.500893
],
"\u0421\u043c\u043e\u043b\u0435\u043d\u0441\u043a": [
54.77897005,
32.04718121915615
],
"\u041f\u043e\u0445\u0432\u0438\u0441\u0442\u043d\u0435\u0432\u043e": [
53.646797,
52.123344
],
"\u0422\u0430\u0440\u0430": [
47.3900474,
0.6889268
],
"\u0421\u0442\u0430\u0440\u044b\u0439 \u041f\u0438\u0447\u0435\u0443\u0440": [
52.8707967,
47.0608188
],
"\u041d\u0435\u0444\u0442\u0435\u043a\u0430\u043c\u0441\u043a": [
56.0884031,
54.2478094
],
"\u041b\u044e\u0431\u0435\u0440\u0446\u044b": [
55.6783142,
37.89377
],
"\u0413\u0430\u0439": [
51.471443,
58.440517
],
"Fort Lauderdale": [
26.1223084,
-80.1433786
],
"\u0130stanbul": [
41.0091982,
28.9662187
],
"\u0421\u0430\u0440\u043e\u0432": [
54.934631,
43.334606
],
"\u0411\u0430\u0440\u0430\u043d\u043e\u0432\u0438\u0447\u0438": [
53.1322925,
26.0184156
],
"\u0420\u0443\u0431\u0446\u043e\u0432\u0441\u043a": [
51.5276264,
81.2176174
],
"\u041f\u044f\u0440\u043d\u0443": [
58.3835136,
24.5081751
],
"\u041d\u043e\u0432\u0430\u044f \u041c\u0430\u0439\u043d\u0430 1-\u044f": [
54.1483754,
49.7633615
],
"\u0423\u0441\u0438\u043d\u0441\u043a": [
65.995731,
57.5571267
],
"\u0423\u0441\u0442\u044c-\u041d\u0435\u0440\u0430": [
64.566376,
143.237839
],
"\u0421\u0435\u0440\u044b\u0448\u0435\u0432\u043e": [
51.091206,
128.38092
],
"Manchester": [
53.4794892,
-2.2451148
],
"Sevilla": [
37.3886303,
-5.9953403
],
"\u0414\u043d\u0435\u043f\u0440 (\u0414\u043d\u0435\u043f\u0440\u043e\u043f\u0435\u0442\u0440\u043e\u0432\u0441\u043a)": [
48.4680221,
35.0417711
],
"\u0421\u0430\u043b\u0430\u0432\u0430\u0442": [
53.361687,
55.924641
],
"Erlangen": [
49.5928616,
11.0056
],
"Z\u00fcrich": [
47.3744489,
8.5410422
],
"\u041e\u0440\u0441\u043a": [
51.2305015,
58.4738015
],
"Lake Kiowa": [
33.5770541,
-97.0130633
],
"\u0411\u043b\u0430\u0433\u043e\u0432\u0435\u0449\u0435\u043d\u0441\u043a": [
50.290527,
127.527161
],
"\u0416\u0435\u043b\u0435\u0437\u043d\u043e\u0433\u043e\u0440\u0441\u043a": [
56.250938,
93.53286
],
"\u0422\u0430\u043b\u043b\u0438\u043d": [
59.4372155,
24.7453688
],
"\u0411\u0435\u0440\u0435\u0437\u043d\u0438\u043a\u0438": [
59.4084171,
56.8036958
],
"Stockholm": [
59.3251172,
18.0710935
],
"Beverly Hills": [
34.0696501,
-118.3963062
],
"\u041b\u043e\u043a\u043e\u043c\u043e\u0442\u0438\u0432\u043d\u044b\u0439": [
53.042561,
60.614517
],
"\u041f\u0430\u043d\u0435\u0432\u0435\u0436\u0438\u0441": [
55.7344985,
24.3578055
],
"\u041d\u0435\u0440\u0447\u0438\u043d\u0441\u043a": [
51.97694,
116.588341
],
"\u0410\u043d\u0442\u0440\u0430\u0446\u0438\u0442": [
48.1164285,
39.0886375
],
"Tokyo": [
35.6812665,
139.757653
],
"Nantucket": [
41.315731400000004,
-70.00901408779944
],
"\u0421\u0443\u0440\u0433\u0443\u0442": [
61.254032,
73.3964
],
"\u0411\u0435\u043b\u0430\u044f \u0413\u043b\u0438\u043d\u0430": [
46.078419,
40.863701
],
"\u041a\u0443\u0442\u0430\u0438\u0441\u0438": [
42.2716078,
42.7054475
],
"\u0417\u0435\u043b\u0435\u043d\u043e\u0433\u043e\u0440\u0441\u043a": [
60.1970751,
29.7071585
],
"\u0410\u043a\u0442\u0430\u043d\u044b\u0448": [
55.7221799,
54.0577272
],
"Praha": [
50.0874654,
14.4212535
],
"Caracas": [
10.5060934,
-66.9146008
],
"\u0427\u0435\u0440\u0435\u043f\u043e\u0432\u0435\u0446": [
59.128696,
37.916389
],
"Bar": [
48.4088704,
7.4492808
],
"\u0427\u0430\u043f\u0430\u0435\u0432\u0441\u043a": [
52.981285,
49.711193
],
"\u0412\u0435\u043d\u0435\u0432": [
54.3504,
38.2656
],
"\u041c\u0430\u044f\u043a\u0438": [
46.413799,
30.26836
],
"\u042e\u0440\u043c\u0430\u043b\u0430": [
56.9727164,
23.7886979
],
"\u041c\u0438\u043d\u0435\u0440\u0430\u043b\u044c\u043d\u044b\u0435 \u0412\u043e\u0434\u044b": [
44.2107377,
43.134969
],
"Saint John's": [
17.1184569,
-61.8448509
],
"\u0421\u0430\u043c\u0430\u0440\u043a\u0430\u043d\u0434": [
39.6634028,
66.94416098427573
],
"\u041a\u043e\u043a\u0448\u0435\u0442\u0430\u0443": [
53.2852054,
69.3814676
],
"\u0411\u0435\u043b\u0435\u0431\u0435\u0439": [
54.106495,
54.107811
],
"\u041a\u043e\u043f\u0435\u0439\u0441\u043a": [
55.1131951,
61.6216332
],
"Nazareth": [
32.7066301,
35.3048161
],
"\u041a\u043e\u0440\u043e\u043b\u0451\u0432": [
55.9204898,
37.8326289
],
"\u0421\u0432\u043e\u0431\u043e\u0434\u0430": [
50.9773982,
39.5047959
],
"\u0416\u0438\u0433\u0443\u043b\u0435\u0432\u0441\u043a": [
53.4005122,
49.4955657
],
"\u041c\u043e\u0436\u0433\u0430": [
56.44903635,
52.21859474400328
],
"\u0417\u0430\u0440\u0438\u043d\u0441\u043a": [
53.707915,
84.9348606
],
"\u042f\u043b\u0442\u0430 (\u043f\u0433\u0442)": "",
"\u041a\u0443\u0440\u0433\u0430\u043d-\u0422\u044e\u0431\u0435": [
37.8357217,
68.7820957
],
"\u0414\u0443\u0448\u0430\u043d\u0431\u0435": [
38.585694700000005,
68.7603746751885
],
"Abu Dhabi": [
24.4538352,
54.3774014
],
"\u042e\u0436\u043d\u043e\u0443\u0440\u0430\u043b\u044c\u0441\u043a": [
54.4425364,
61.2682731
],
"\u041b\u0438\u0441\u043a\u0438": [
50.9773982,
39.5047959
],
"\u0410\u0437\u043e\u0432": [
46.196729,
36.73570655151333
],
"\u0423\u043b\u044c\u044f\u043d\u043e\u0432\u0441\u043a\u043e\u0435": [
43.629871,
45.602955
],
"Si\u0101lkot": [
32.4935378,
74.5411575
],
"Col\u00f3n": [
9.3553005,
-79.8974085
],
"\u0422\u0435\u043b\u044c-\u0410\u0432\u0438\u0432": [
32.0852997,
34.7818064
],
"\u0425\u0430\u0439\u0444\u0430": [
32.8191218,
34.9983856
],
"\u0411\u0438\u0447\u0443\u0440\u0433\u0430-\u0411\u0430\u0438\u0448\u0435\u0432\u043e": [
54.7619003,
47.3070718
],
"Bethlehem": [
40.6178915,
-75.3786521
],
"\u0428\u0438\u0433\u043e\u043d\u044b (\u0441\u0435\u043b\u043e)": "",
"\u0423\u0441\u0441\u0443\u0440\u0438\u0439\u0441\u043a": [
43.7972447,
131.9520752
],
"\u0413\u0443\u0431\u0430\u0445\u0430": [
58.842407,
57.555058
],
"\u0421\u0432\u0435\u0442\u0438 \u0412\u043b\u0430\u0441": [
42.7138408,
27.7588044
],
"\u0413\u043e\u0440\u043d\u043e-\u0410\u043b\u0442\u0430\u0439\u0441\u043a": [
51.957775,
85.963653
],
"\u0429\u0435\u043a\u0438\u043d\u043e": [
54.004465,
37.5179079
],
"\u0421\u0442\u0443\u043f\u0438\u043d\u043e": [
54.886598,
38.0772589
],
"\u0421\u0435\u0439\u0434\u0438": [
39.477026300000006,
62.9115776115831
],
"\u0422\u0438\u0440\u0430\u0441\u043f\u043e\u043b\u044c": [
46.8566229,
29.605918132910546
],
"\u0424\u0440\u044f\u0437\u0438\u043d\u043e": [
55.954618,
38.0567691
],
"\u0411\u043e\u043b\u0434\u0443\u043c\u0441\u0430\u0437": [
42.1276285,
59.6722993
],
"\u0422\u0443\u0430\u043f\u0441\u0435": [
44.0984747,
39.0718875
],
"\u0411\u0435\u0440\u0435\u0437\u043e\u0432\u0441\u043a\u0438\u0439": [
56.9097871,
60.8120254
],
"\u0417\u0430\u0440\u0435\u0447\u043d\u044b\u0439": [
56.8149706,
61.3205936
],
"Jinhua": [
29.1080344,
119.6486487
],
"\u041d\u0430\u0445\u043e\u0434\u043a\u0430": [
42.8246489,
132.8926
],
"\u041d\u043e\u0447\u043a\u0430 (\u0434\u0435\u0440\u0435\u0432\u043d\u044f)": [
53.880704,
46.1061883
],
"\u041f\u0438\u043a\u0430\u043b\u0435\u0432\u043e": [
59.5129787,
34.1773573
],
"Buenos Aires": [
-34.6075682,
-58.4370894
],
"\u042f\u043a\u0443\u0442\u0441\u043a": [
62.0274078,
129.7319787
],
"\u041d\u043e\u0432\u043e\u0447\u0435\u0431\u043e\u043a\u0441\u0430\u0440\u0441\u043a": [
56.1140023,
47.4870341
],
"Kingston": [
17.9712148,
-76.7928128
],
"\u0420\u043e\u0432\u043d\u043e\u0435": [
54.463985,
20.975958
],
"\u0410\u0448\u0445\u0430\u0431\u0430\u0434": [
37.9404648,
58.3823487
],
"\u0410\u043d\u0433\u0430\u0440\u0441\u043a": [
52.5311117,
103.8826109
],
"\u041d\u043e\u044f\u0431\u0440\u044c\u0441\u043a": [
63.2002917,
75.4475807
],
"\u0130zmir": [
38.4224548,
27.1310699
],
"\u041f\u0440\u043e\u0442\u0432\u0438\u043d\u043e": [
54.8707703,
37.2188629
],
"\u0420\u0443\u0433\u043e\u0437\u0435\u0440\u043e": [
64.08017,
32.780212
],
"\u0421\u0442\u0430\u0440\u0430\u044f \u0421\u0430\u0445\u0447\u0430": [
54.4187729,
49.9492337
],
"Boston": [
42.3554334,
-71.060511
]
}

49
main.py
View File

@ -2,13 +2,58 @@
import os
import sys
import numpy
import pandas as pd
# import scipy.cluster.hierarchy as sc
from matplotlib import pyplot as plt
from pandas import DataFrame
from sklearn.cluster import AgglomerativeClustering
from sklearn.decomposition import PCA
from src.main.df_loader import DfLoader
def __clustering(data: DataFrame, clusters: int = 3) -> None:
    """Cluster the rows of *data* and show the clusters on a 2-D PCA scatter plot.

    The rows are grouped with Ward-linkage agglomerative clustering on the
    values exactly as passed in. For display only, the data is min-max
    normalised and projected onto its first two principal components; every
    cluster is drawn as its own scatter series so the legend can name it.

    :param data: feature table, one row per sample
    :param clusters: number of clusters to build (defaults to the previously
        hard-coded value of 3)
    """
    # NOTE(review): the model is fitted on the un-normalised values while the
    # plot uses the normalised ones - confirm that this is intentional.
    model = AgglomerativeClustering(n_clusters=clusters, metric='euclidean', linkage='ward')
    model.fit(data)
    labels = model.labels_

    # Min-max normalisation. A column holding a single distinct value has a
    # range of 0; dividing by it would produce NaN, which PCA refuses, so
    # such a range is replaced by 1 (the column then normalises to all zeros).
    lowest = data.min()
    value_range = (data.max() - lowest).replace(0, 1)
    data_norm = (data - lowest) / value_range

    pca = PCA(n_components=2)  # 2-dimensional PCA
    transformed = pd.DataFrame(pca.fit_transform(data_norm))

    for i in range(clusters):
        # Boolean mask picks the projected rows assigned to cluster i.
        series = transformed[labels == i]
        plt.scatter(series[0], series[1], label=f'Cluster {i + 1}')
    plt.legend()
    plt.show()
def __main(json_file):
    """Build a DfLoader for *json_file*, print its clustering features and plot the clusters."""
    loader: DfLoader = DfLoader(json_file)
    # The returned frame is not used here; the call itself is kept as-is.
    loader.get_data_frame()
    print('done')

    features = loader.get_clustering_data()
    print(features)
    __clustering(features)
if __name__ == '__main__':

View File

@ -1,3 +1,7 @@
pandas==2.0.1
geopy==2.3.0
numpy==1.24.3
scikit-learn==1.2.2
matplotlib==3.7.1
seaborn==0.12.2
scipy==1.10.1

View File

@ -10,6 +10,7 @@ from src.main.utils import Utils
class DfLoader:
def __init__(self, json_file: str) -> None:
self.__geocache: Geocache = Geocache()
print(f'Try to load data from the {json_file} file')
@ -47,26 +48,33 @@ class DfLoader:
def __prepare_dataset_status(self) -> None:
    """Add the ``is_university``, ``is_work``, ``is_student`` and ``is_schoolboy`` columns.

    Each column is derived from ``age``, ``universities`` and
    ``occupation_type`` and is stored as 1/0 rather than True/False.
    ``const.empty_age()`` is the marker used for an unknown age.
    """
    df = self.__df
    age = df['age']
    occupation = df['occupation_type']
    age_unknown = age == const.empty_age()

    # University: old enough to have graduated (or age unknown) and some
    # university information is present.
    is_univer_mask = ((age >= const.university_gr_age()) | age_unknown) & \
                     ((df['universities'].str.len() > 0) | (occupation == 'university'))
    df['is_university'] = np.where(is_univer_mask, 1, 0)

    # Work: past school age (or age unknown) with a university or work record,
    # or simply older than the university graduation age. `&` binds tighter
    # than `|`; the extra parentheses only make that grouping explicit.
    is_work_mask = (((age > const.school_gr_age()) | age_unknown) &
                    ((df['is_university'] == 1) | (occupation == 'work'))) | \
                   (age > const.university_gr_age())
    df['is_work'] = np.where(is_work_mask, 1, 0)

    # Student: occupation is 'university' and the age lies between the school
    # and university graduation ages (both inclusive).
    is_student_mask = ((occupation == 'university') &
                       ((age >= const.school_gr_age()) &
                        (age <= const.university_gr_age())))
    df['is_student'] = np.where(is_student_mask, 1, 0)

    # Schoolboy: known age below the school graduation age, or unknown age
    # with occupation 'school'.
    is_schoolboy_mask = ((age < const.school_gr_age()) & (age != const.empty_age())) | \
                        (age_unknown & (occupation == 'school'))
    df['is_schoolboy'] = np.where(is_schoolboy_mask, 1, 0)
def __prepare_dataset_location(self) -> None:
    """Geocode the ``city`` column into ``location``, ``location-la`` and ``location-lo``.

    The geocache is first updated with every distinct city. ``location``
    then holds the cached value for the city ('' for an empty city name),
    and the two coordinate columns hold elements 0 and 1 of that value,
    or 0 when the location is empty.
    """
    frame = self.__df
    distinct_cities = frame['city'].unique().tolist()
    self.__geocache.update_geo_cache(distinct_cities)

    def resolve(city):
        # Empty city names are not looked up at all.
        if Utils.is_empty_str(city):
            return ''
        return self.__geocache.get_location(city)

    def coordinate(position):
        # Build an extractor for one element of the location value.
        def pick(location):
            if Utils.is_empty_collection(location):
                return 0
            return location[position]
        return pick

    frame['location'] = frame['city'].apply(resolve)
    frame['location-la'] = frame.loc[:, 'location'].apply(coordinate(0))
    frame['location-lo'] = frame.loc[:, 'location'].apply(coordinate(1))
def get_clustering_data(self) -> DataFrame:
    """Return the subset of the loaded frame that is used as clustering input.

    Only the coordinate, sex, age and status-flag columns are selected; all
    other columns of the underlying frame are left out.
    """
    columns: list = ['location-la', 'location-lo',
                     'sex', 'age', 'is_university', 'is_work', 'is_student', 'is_schoolboy']
    return self.__df[columns]

View File

@ -22,29 +22,35 @@ class Geocache:
if os.path.isfile(self.JSON_FILE):
with open(self.JSON_FILE, 'r') as rf:
self.__geo_cache.update(json.load(rf))
print(f'Geocache loaded from {self.JSON_FILE}')
def __save_geo_cache(self) -> None:
    """Write the in-memory geocache to ``self.JSON_FILE`` as JSON and report the path."""
    # json.dump escapes non-ASCII characters by default, so the written file
    # is plain ASCII whatever encoding the platform picks for text files.
    with open(self.JSON_FILE, 'w') as wf:
        json.dump(self.__geo_cache, wf)
    print(f'Geocache saved to {self.JSON_FILE}')
def update_geo_cache(self, cities: List[str]) -> None:
    """Geocode every city in *cities* that is not cached yet and persist the cache.

    Empty names and cities that already have a cache entry are skipped. A
    successful lookup is cached as ``[latitude, longitude]``; a lookup that
    returns ``None`` is cached as ``''``. The cache is written out each time
    its size reaches a multiple of 50 and once more at the end if anything
    was added - also when the loop is left through an exception, so the
    lookups done so far are not lost.

    :param cities: city names to resolve
    """
    is_changed: bool = False
    try:
        for city in cities:
            if Utils.is_empty_str(city):
                continue
            # Any stored value, including '' for a failed lookup, counts as cached.
            if self.__geo_cache.get(city) is not None:
                continue
            print(f'{len(self.__geo_cache)} - Try to load geocode for {city}')
            location = self.__geocode(city)
            if location is None:
                # Remember the failure so the city is not queried again.
                self.__geo_cache[city] = ''
            else:
                self.__geo_cache[city] = [location.latitude, location.longitude]
            is_changed = True
            # Periodic checkpoint while a long batch is being geocoded.
            if len(self.__geo_cache) % 50 == 0:
                self.__save_geo_cache()
    finally:
        if is_changed:
            self.__save_geo_cache()
def get_location(self, city: str) -> 'list[float] | str | None':
    """Return the cached geocode entry for *city*.

    :param city: city name used as the cache key
    :return: the stored entry - ``[latitude, longitude]`` for a resolved
        city, ``''`` for a city whose lookup failed - or ``None`` when the
        city has no cache entry at all
    """
    return self.__geo_cache.get(city)