fuzzy-rules-generator/cardio.ipynb

106 KiB
Raw Blame History

In [403]:
import pandas as pd

df = pd.read_csv("data-cardio/cardio_train.csv", sep=";", index_col="id")
df["age"] = df["age"] / 365.24
print(df.cardio.value_counts())
display(df.info(), df.cardio.value_counts())

df
cardio
0    35021
1    34979
Name: count, dtype: int64
<class 'pandas.core.frame.DataFrame'>
Index: 70000 entries, 0 to 99999
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   age          70000 non-null  float64
 1   gender       70000 non-null  int64  
 2   height       70000 non-null  int64  
 3   weight       70000 non-null  float64
 4   ap_hi        70000 non-null  int64  
 5   ap_lo        70000 non-null  int64  
 6   cholesterol  70000 non-null  int64  
 7   gluc         70000 non-null  int64  
 8   smoke        70000 non-null  int64  
 9   alco         70000 non-null  int64  
 10  active       70000 non-null  int64  
 11  cardio       70000 non-null  int64  
dtypes: float64(2), int64(10)
memory usage: 6.9 MB
None
cardio
0    35021
1    34979
Name: count, dtype: int64
Out[403]:
age gender height weight ap_hi ap_lo cholesterol gluc smoke alco active cardio
id
0 50.358668 2 168 62.0 110 80 1 1 0 0 1 0
1 55.382762 1 156 85.0 140 90 3 1 0 0 1 1
2 51.629066 1 165 64.0 130 70 3 1 0 0 0 1
3 48.250465 2 169 82.0 150 100 1 1 0 0 1 1
4 47.842515 1 156 56.0 100 60 1 1 0 0 0 0
... ... ... ... ... ... ... ... ... ... ... ... ...
99993 52.677691 2 168 76.0 120 80 1 1 1 0 1 0
99995 61.879860 1 158 126.0 140 90 2 2 0 0 1 1
99996 52.201292 2 183 105.0 180 90 3 1 0 1 0 1
99998 61.414412 1 163 72.0 135 80 1 2 0 0 0 1
99999 56.236995 1 170 72.0 120 80 2 1 0 0 1 0

70000 rows × 12 columns

In [404]:
df.describe().transpose()
Out[404]:
count mean std min 25% 50% 75% max
age 70000.0 53.304309 6.755152 29.564122 48.36272 53.945351 58.391742 64.924433
gender 70000.0 1.349571 0.476838 1.000000 1.00000 1.000000 2.000000 2.000000
height 70000.0 164.359229 8.210126 55.000000 159.00000 165.000000 170.000000 250.000000
weight 70000.0 74.205690 14.395757 10.000000 65.00000 72.000000 82.000000 200.000000
ap_hi 70000.0 128.817286 154.011419 -150.000000 120.00000 120.000000 140.000000 16020.000000
ap_lo 70000.0 96.630414 188.472530 -70.000000 80.00000 80.000000 90.000000 11000.000000
cholesterol 70000.0 1.366871 0.680250 1.000000 1.00000 1.000000 2.000000 3.000000
gluc 70000.0 1.226457 0.572270 1.000000 1.00000 1.000000 1.000000 3.000000
smoke 70000.0 0.088129 0.283484 0.000000 0.00000 0.000000 0.000000 1.000000
alco 70000.0 0.053771 0.225568 0.000000 0.00000 0.000000 0.000000 1.000000
active 70000.0 0.803729 0.397179 0.000000 1.00000 1.000000 1.000000 1.000000
cardio 70000.0 0.499700 0.500003 0.000000 0.00000 0.000000 1.000000 1.000000
In [405]:
df = df.query("ap_hi > 0 and ap_hi < 370")
df = df.query("ap_lo > 0 and ap_lo < 370")
df.describe().transpose()
Out[405]:
count mean std min 25% 50% 75% max
age 68985.0 53.290421 6.757633 29.564122 48.340817 53.939875 58.380791 64.924433
gender 68985.0 1.348670 0.476553 1.000000 1.000000 1.000000 2.000000 2.000000
height 68985.0 164.359672 8.204273 55.000000 159.000000 165.000000 170.000000 250.000000
weight 68985.0 74.118961 14.328938 11.000000 65.000000 72.000000 82.000000 200.000000
ap_hi 68985.0 126.325027 17.698621 7.000000 120.000000 120.000000 140.000000 240.000000
ap_lo 68985.0 81.350482 9.805666 1.000000 80.000000 80.000000 90.000000 190.000000
cholesterol 68985.0 1.364384 0.678691 1.000000 1.000000 1.000000 1.000000 3.000000
gluc 68985.0 1.225875 0.571822 1.000000 1.000000 1.000000 1.000000 3.000000
smoke 68985.0 0.087860 0.283093 0.000000 0.000000 0.000000 0.000000 1.000000
alco 68985.0 0.053591 0.225211 0.000000 0.000000 0.000000 0.000000 1.000000
active 68985.0 0.803276 0.397525 0.000000 1.000000 1.000000 1.000000 1.000000
cardio 68985.0 0.494905 0.499978 0.000000 0.000000 0.000000 1.000000 1.000000
In [406]:
df["bmi"] = df["weight"] / (df["height"] / 100) ** 2
df.drop(["weight", "height"], axis=1, inplace=True)
df
Out[406]:
age gender ap_hi ap_lo cholesterol gluc smoke alco active cardio bmi
id
0 50.358668 2 110 80 1 1 0 0 1 0 21.967120
1 55.382762 1 140 90 3 1 0 0 1 1 34.927679
2 51.629066 1 130 70 3 1 0 0 0 1 23.507805
3 48.250465 2 150 100 1 1 0 0 1 1 28.710479
4 47.842515 1 100 60 1 1 0 0 0 0 23.011177
... ... ... ... ... ... ... ... ... ... ... ...
99993 52.677691 2 120 80 1 1 1 0 1 0 26.927438
99995 61.879860 1 140 90 2 2 0 0 1 1 50.472681
99996 52.201292 2 180 90 3 1 0 1 0 1 31.353579
99998 61.414412 1 135 80 1 2 0 0 0 1 27.099251
99999 56.236995 1 120 80 2 1 0 0 1 0 24.913495

68985 rows × 11 columns

In [407]:
import seaborn as sns

sns.heatmap(df.corr())
Out[407]:
<Axes: >
In [411]:
from sklearn.model_selection import train_test_split
import statsmodels.api as sm
import numpy as np

random_state = 9

stat_y = df["cardio"]
stat_X = df.drop(["cardio"], axis=1).copy()
stat_X_train, stat_X_test, stat_y_train, stat_y_test = train_test_split(
    stat_X, stat_y, test_size=0.15, random_state=9
)

log_model = sm.Logit(stat_y_train, sm.add_constant(stat_X_train))
log_result = log_model.fit()
display(log_result.summary2())
np.exp(log_result.params).sort_values(ascending=False)
Optimization terminated successfully.
         Current function value: 0.567793
         Iterations 6
Model: Logit Method: MLE
Dependent Variable: cardio Pseudo R-squared: 0.181
Date: 2025-02-21 23:59 AIC: 66609.4135
No. Observations: 58637 BIC: 66708.1838
Df Model: 10 Log-Likelihood: -33294.
Df Residuals: 58626 LL-Null: -40641.
Converged: 1.0000 LLR p-value: 0.0000
No. Iterations: 6.0000 Scale: 1.0000
Coef. Std.Err. z P>|z| [0.025 0.975]
const -11.5426 0.1324 -87.1794 0.0000 -11.8021 -11.2831
age 0.0523 0.0014 36.0898 0.0000 0.0494 0.0551
gender 0.0206 0.0211 0.9792 0.3275 -0.0207 0.0619
ap_hi 0.0469 0.0009 52.1945 0.0000 0.0451 0.0486
ap_lo 0.0232 0.0014 16.3321 0.0000 0.0204 0.0259
cholesterol 0.4859 0.0168 29.0062 0.0000 0.4531 0.5187
gluc -0.1116 0.0190 -5.8803 0.0000 -0.1488 -0.0744
smoke -0.1279 0.0373 -3.4313 0.0006 -0.2009 -0.0548
alco -0.1782 0.0452 -3.9429 0.0001 -0.2668 -0.0896
active -0.2377 0.0235 -10.1027 0.0000 -0.2838 -0.1916
bmi 0.0226 0.0018 12.4231 0.0000 0.0190 0.0261

Out[411]:
cholesterol    1.625611
age            1.053639
ap_hi          1.047999
ap_lo          1.023425
bmi            1.022841
gender         1.020836
gluc           0.894398
smoke          0.879970
alco           0.836750
active         0.788470
const          0.000010
dtype: float64
In [421]:
# data = df.drop(["gluc", "smoke", "alco", "active", "ap_lo"], axis=1)
data = df.drop(
    [
        "gluc",
        "smoke",
        "alco",
        "ap_lo",
        "gender",
        # "cholesterol",
        "active",
    ],
    axis=1,
)
# data = df.copy()
data
Out[421]:
age ap_hi cholesterol cardio bmi
id
0 50.358668 110 1 0 21.967120
1 55.382762 140 3 1 34.927679
2 51.629066 130 3 1 23.507805
3 48.250465 150 1 1 28.710479
4 47.842515 100 1 0 23.011177
... ... ... ... ... ...
99993 52.677691 120 1 0 26.927438
99995 61.879860 140 2 1 50.472681
99996 52.201292 180 3 1 31.353579
99998 61.414412 135 1 1 27.099251
99999 56.236995 120 2 0 24.913495

68985 rows × 5 columns

In [431]:
data.to_csv("data-cardio/cardio_clear.csv")
In [422]:
y = data["cardio"]
X = data.drop(["cardio"], axis=1).copy()
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=random_state
)
display(X_train, y_train, X_test, y_test)
age ap_hi cholesterol bmi
id
94960 62.014018 120 1 26.892323
30807 57.745592 120 1 28.393726
26485 59.670354 120 3 23.875115
3868 49.715256 110 1 20.820940
45890 59.785347 160 1 23.529412
... ... ... ... ...
61975 62.558865 120 1 28.196921
32741 57.882488 120 1 29.043709
94833 51.371701 120 1 29.242109
95660 45.767167 120 1 24.977043
81002 55.544300 150 1 27.053803

55188 rows × 4 columns

id
94960    0
30807    0
26485    0
3868     1
45890    1
        ..
61975    1
32741    0
94833    0
95660    0
81002    1
Name: cardio, Length: 55188, dtype: int64
age ap_hi cholesterol bmi
id
42270 60.078305 140 1 45.918367
10780 55.360859 120 2 24.998904
42436 48.198445 100 3 21.926126
88647 41.517906 130 2 27.764650
62336 51.692038 110 1 22.230987
... ... ... ... ...
30330 47.697404 100 1 22.724403
62907 58.597087 120 1 23.828125
98612 51.404556 110 1 22.589551
5767 62.033184 120 1 23.875115
14769 41.506954 120 2 22.948116

13797 rows × 4 columns

id
42270    1
10780    0
42436    1
88647    1
62336    0
        ..
30330    1
62907    0
98612    0
5767     0
14769    1
Name: cardio, Length: 13797, dtype: int64
In [ ]:
from src.utils import run_classification
from sklearn import tree
from sklearn import metrics
import numpy as np


model = tree.DecisionTreeClassifier(
    random_state=random_state,
    max_depth=6,
).fit(X_train.values, y_train.values.ravel())


y_train_predict = model.predict(X_train)
y_test_probs = model.predict_proba(X_test)
y_test_predict = model.predict(X_test)


display("Precision_test", metrics.precision_score(y_test, y_test_predict))
display("Recall_test", metrics.recall_score(y_test, y_test_predict))
display("Accuracy_test", metrics.accuracy_score(y_test, y_test_predict))
display("ROC_AUC_test", metrics.roc_auc_score(y_test, y_test_probs[:, 1])) # type: ignore
display("F1_test", metrics.f1_score(y_test, y_test_predict))
display("MCC_test", metrics.matthews_corrcoef(y_test, y_test_predict))
display("Cohen_kappa_test", metrics.cohen_kappa_score(y_test, y_test_predict))
display("Confusion_matrix", metrics.confusion_matrix(y_test, y_test_predict))
/Users/user/Projects/python/fuzzy-rules-generator/.venv/lib/python3.12/site-packages/sklearn/base.py:486: UserWarning: X has feature names, but DecisionTreeClassifier was fitted without feature names
  warnings.warn(
/Users/user/Projects/python/fuzzy-rules-generator/.venv/lib/python3.12/site-packages/sklearn/base.py:486: UserWarning: X has feature names, but DecisionTreeClassifier was fitted without feature names
  warnings.warn(
/Users/user/Projects/python/fuzzy-rules-generator/.venv/lib/python3.12/site-packages/sklearn/base.py:486: UserWarning: X has feature names, but DecisionTreeClassifier was fitted without feature names
  warnings.warn(
'Precision_test'
np.float64(0.7368262116865468)
'Recall_test'
np.float64(0.7180694526191878)
'Accuracy_test'
0.7347974197289265
'ROC_AUC_test'
np.float64(0.7917285464726767)
'F1_test'
np.float64(0.7273269245100231)
'MCC_test'
np.float64(0.46942772902650703)
'Cohen_kappa_test'
np.float64(0.4692799184358021)
'Confusion_matrix'
array([[5258, 1743],
       [1916, 4880]])
In [429]:
rules = tree.export_text(model, feature_names=X_train.columns.values.tolist())
print(rules)
|--- ap_hi <= 129.50
|   |--- age <= 54.65
|   |   |--- cholesterol <= 2.50
|   |   |   |--- age <= 43.79
|   |   |   |   |--- cholesterol <= 1.50
|   |   |   |   |   |--- ap_hi <= 114.50
|   |   |   |   |   |   |--- class: 0
|   |   |   |   |   |--- ap_hi >  114.50
|   |   |   |   |   |   |--- class: 0
|   |   |   |   |--- cholesterol >  1.50
|   |   |   |   |   |--- bmi <= 28.87
|   |   |   |   |   |   |--- class: 0
|   |   |   |   |   |--- bmi >  28.87
|   |   |   |   |   |   |--- class: 0
|   |   |   |--- age >  43.79
|   |   |   |   |--- ap_hi <= 119.50
|   |   |   |   |   |--- bmi <= 22.05
|   |   |   |   |   |   |--- class: 0
|   |   |   |   |   |--- bmi >  22.05
|   |   |   |   |   |   |--- class: 0
|   |   |   |   |--- ap_hi >  119.50
|   |   |   |   |   |--- bmi <= 27.71
|   |   |   |   |   |   |--- class: 0
|   |   |   |   |   |--- bmi >  27.71
|   |   |   |   |   |   |--- class: 0
|   |   |--- cholesterol >  2.50
|   |   |   |--- bmi <= 29.04
|   |   |   |   |--- age <= 41.60
|   |   |   |   |   |--- ap_hi <= 115.00
|   |   |   |   |   |   |--- class: 0
|   |   |   |   |   |--- ap_hi >  115.00
|   |   |   |   |   |   |--- class: 0
|   |   |   |   |--- age >  41.60
|   |   |   |   |   |--- age <= 54.17
|   |   |   |   |   |   |--- class: 1
|   |   |   |   |   |--- age >  54.17
|   |   |   |   |   |   |--- class: 0
|   |   |   |--- bmi >  29.04
|   |   |   |   |--- age <= 54.01
|   |   |   |   |   |--- age <= 39.75
|   |   |   |   |   |   |--- class: 0
|   |   |   |   |   |--- age >  39.75
|   |   |   |   |   |   |--- class: 1
|   |   |   |   |--- age >  54.01
|   |   |   |   |   |--- bmi <= 35.02
|   |   |   |   |   |   |--- class: 0
|   |   |   |   |   |--- bmi >  35.02
|   |   |   |   |   |   |--- class: 1
|   |--- age >  54.65
|   |   |--- cholesterol <= 2.50
|   |   |   |--- age <= 60.71
|   |   |   |   |--- ap_hi <= 118.50
|   |   |   |   |   |--- bmi <= 23.33
|   |   |   |   |   |   |--- class: 0
|   |   |   |   |   |--- bmi >  23.33
|   |   |   |   |   |   |--- class: 0
|   |   |   |   |--- ap_hi >  118.50
|   |   |   |   |   |--- bmi <= 32.89
|   |   |   |   |   |   |--- class: 0
|   |   |   |   |   |--- bmi >  32.89
|   |   |   |   |   |   |--- class: 1
|   |   |   |--- age >  60.71
|   |   |   |   |--- bmi <= 20.51
|   |   |   |   |   |--- age <= 64.31
|   |   |   |   |   |   |--- class: 0
|   |   |   |   |   |--- age >  64.31
|   |   |   |   |   |   |--- class: 1
|   |   |   |   |--- bmi >  20.51
|   |   |   |   |   |--- ap_hi <= 115.50
|   |   |   |   |   |   |--- class: 0
|   |   |   |   |   |--- ap_hi >  115.50
|   |   |   |   |   |   |--- class: 1
|   |   |--- cholesterol >  2.50
|   |   |   |--- bmi <= 26.03
|   |   |   |   |--- age <= 60.89
|   |   |   |   |   |--- age <= 60.48
|   |   |   |   |   |   |--- class: 1
|   |   |   |   |   |--- age >  60.48
|   |   |   |   |   |   |--- class: 0
|   |   |   |   |--- age >  60.89
|   |   |   |   |   |--- bmi <= 25.91
|   |   |   |   |   |   |--- class: 1
|   |   |   |   |   |--- bmi >  25.91
|   |   |   |   |   |   |--- class: 0
|   |   |   |--- bmi >  26.03
|   |   |   |   |--- age <= 59.39
|   |   |   |   |   |--- bmi <= 35.93
|   |   |   |   |   |   |--- class: 1
|   |   |   |   |   |--- bmi >  35.93
|   |   |   |   |   |   |--- class: 1
|   |   |   |   |--- age >  59.39
|   |   |   |   |   |--- bmi <= 35.12
|   |   |   |   |   |   |--- class: 1
|   |   |   |   |   |--- bmi >  35.12
|   |   |   |   |   |   |--- class: 1
|--- ap_hi >  129.50
|   |--- ap_hi <= 138.50
|   |   |--- cholesterol <= 2.50
|   |   |   |--- age <= 59.54
|   |   |   |   |--- bmi <= 21.64
|   |   |   |   |   |--- bmi <= 17.30
|   |   |   |   |   |   |--- class: 1
|   |   |   |   |   |--- bmi >  17.30
|   |   |   |   |   |   |--- class: 0
|   |   |   |   |--- bmi >  21.64
|   |   |   |   |   |--- age <= 39.99
|   |   |   |   |   |   |--- class: 0
|   |   |   |   |   |--- age >  39.99
|   |   |   |   |   |   |--- class: 1
|   |   |   |--- age >  59.54
|   |   |   |   |--- age <= 62.46
|   |   |   |   |   |--- bmi <= 20.61
|   |   |   |   |   |   |--- class: 0
|   |   |   |   |   |--- bmi >  20.61
|   |   |   |   |   |   |--- class: 1
|   |   |   |   |--- age >  62.46
|   |   |   |   |   |--- age <= 64.00
|   |   |   |   |   |   |--- class: 1
|   |   |   |   |   |--- age >  64.00
|   |   |   |   |   |   |--- class: 1
|   |   |--- cholesterol >  2.50
|   |   |   |--- bmi <= 30.74
|   |   |   |   |--- bmi <= 30.06
|   |   |   |   |   |--- bmi <= 23.93
|   |   |   |   |   |   |--- class: 1
|   |   |   |   |   |--- bmi >  23.93
|   |   |   |   |   |   |--- class: 1
|   |   |   |   |--- bmi >  30.06
|   |   |   |   |   |--- bmi <= 30.69
|   |   |   |   |   |   |--- class: 1
|   |   |   |   |   |--- bmi >  30.69
|   |   |   |   |   |   |--- class: 0
|   |   |   |--- bmi >  30.74
|   |   |   |   |--- bmi <= 32.05
|   |   |   |   |   |--- age <= 43.63
|   |   |   |   |   |   |--- class: 0
|   |   |   |   |   |--- age >  43.63
|   |   |   |   |   |   |--- class: 1
|   |   |   |   |--- bmi >  32.05
|   |   |   |   |   |--- bmi <= 32.34
|   |   |   |   |   |   |--- class: 1
|   |   |   |   |   |--- bmi >  32.34
|   |   |   |   |   |   |--- class: 1
|   |--- ap_hi >  138.50
|   |   |--- ap_hi <= 149.50
|   |   |   |--- age <= 39.56
|   |   |   |   |--- bmi <= 38.19
|   |   |   |   |   |--- age <= 39.54
|   |   |   |   |   |   |--- class: 1
|   |   |   |   |   |--- age >  39.54
|   |   |   |   |   |   |--- class: 0
|   |   |   |   |--- bmi >  38.19
|   |   |   |   |   |--- bmi <= 50.55
|   |   |   |   |   |   |--- class: 0
|   |   |   |   |   |--- bmi >  50.55
|   |   |   |   |   |   |--- class: 1
|   |   |   |--- age >  39.56
|   |   |   |   |--- age <= 47.57
|   |   |   |   |   |--- bmi <= 19.23
|   |   |   |   |   |   |--- class: 0
|   |   |   |   |   |--- bmi >  19.23
|   |   |   |   |   |   |--- class: 1
|   |   |   |   |--- age >  47.57
|   |   |   |   |   |--- age <= 61.57
|   |   |   |   |   |   |--- class: 1
|   |   |   |   |   |--- age >  61.57
|   |   |   |   |   |   |--- class: 1
|   |   |--- ap_hi >  149.50
|   |   |   |--- bmi <= 20.48
|   |   |   |   |--- age <= 64.27
|   |   |   |   |   |--- age <= 55.82
|   |   |   |   |   |   |--- class: 1
|   |   |   |   |   |--- age >  55.82
|   |   |   |   |   |   |--- class: 1
|   |   |   |   |--- age >  64.27
|   |   |   |   |   |--- class: 0
|   |   |   |--- bmi >  20.48
|   |   |   |   |--- age <= 64.35
|   |   |   |   |   |--- age <= 49.82
|   |   |   |   |   |   |--- class: 1
|   |   |   |   |   |--- age >  49.82
|   |   |   |   |   |   |--- class: 1
|   |   |   |   |--- age >  64.35
|   |   |   |   |   |--- bmi <= 36.80
|   |   |   |   |   |   |--- class: 1
|   |   |   |   |   |--- bmi >  36.80
|   |   |   |   |   |   |--- class: 0

In [430]:
import pickle

pickle.dump(model, open("data-cardio/cardio.model.sav", "wb"))