fuzzy-rules-generator/cardio.ipynb at b2534ba05e5772003abf0ab4a0ddd3d98709b5c7

In [403]:

import pandas as pd

df = pd.read_csv("data-cardio/cardio_train.csv", sep=";", index_col="id")
df["age"] = df["age"] / 365.24
print(df.cardio.value_counts())
display(df.info(), df.cardio.value_counts())

df

cardio
0    35021
1    34979
Name: count, dtype: int64
<class 'pandas.core.frame.DataFrame'>
Index: 70000 entries, 0 to 99999
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   age          70000 non-null  float64
 1   gender       70000 non-null  int64  
 2   height       70000 non-null  int64  
 3   weight       70000 non-null  float64
 4   ap_hi        70000 non-null  int64  
 5   ap_lo        70000 non-null  int64  
 6   cholesterol  70000 non-null  int64  
 7   gluc         70000 non-null  int64  
 8   smoke        70000 non-null  int64  
 9   alco         70000 non-null  int64  
 10  active       70000 non-null  int64  
 11  cardio       70000 non-null  int64  
dtypes: float64(2), int64(10)
memory usage: 6.9 MB

None

cardio
0    35021
1    34979
Name: count, dtype: int64

Out[403]:

	age	gender	height	weight	ap_hi	ap_lo	cholesterol	gluc	smoke	alco	active	cardio
id
0	50.358668	2	168	62.0	110	80	1	1	0	0	1	0
1	55.382762	1	156	85.0	140	90	3	1	0	0	1	1
2	51.629066	1	165	64.0	130	70	3	1	0	0	0	1
3	48.250465	2	169	82.0	150	100	1	1	0	0	1	1
4	47.842515	1	156	56.0	100	60	1	1	0	0	0	0
...	...	...	...	...	...	...	...	...	...	...	...	...
99993	52.677691	2	168	76.0	120	80	1	1	1	0	1	0
99995	61.879860	1	158	126.0	140	90	2	2	0	0	1	1
99996	52.201292	2	183	105.0	180	90	3	1	0	1	0	1
99998	61.414412	1	163	72.0	135	80	1	2	0	0	0	1
99999	56.236995	1	170	72.0	120	80	2	1	0	0	1	0

70000 rows × 12 columns

In [404]:

df.describe().transpose()

Out[404]:

	count	mean	std	min	25%	50%	75%	max
age	70000.0	53.304309	6.755152	29.564122	48.36272	53.945351	58.391742	64.924433
gender	70000.0	1.349571	0.476838	1.000000	1.00000	1.000000	2.000000	2.000000
height	70000.0	164.359229	8.210126	55.000000	159.00000	165.000000	170.000000	250.000000
weight	70000.0	74.205690	14.395757	10.000000	65.00000	72.000000	82.000000	200.000000
ap_hi	70000.0	128.817286	154.011419	-150.000000	120.00000	120.000000	140.000000	16020.000000
ap_lo	70000.0	96.630414	188.472530	-70.000000	80.00000	80.000000	90.000000	11000.000000
cholesterol	70000.0	1.366871	0.680250	1.000000	1.00000	1.000000	2.000000	3.000000
gluc	70000.0	1.226457	0.572270	1.000000	1.00000	1.000000	1.000000	3.000000
smoke	70000.0	0.088129	0.283484	0.000000	0.00000	0.000000	0.000000	1.000000
alco	70000.0	0.053771	0.225568	0.000000	0.00000	0.000000	0.000000	1.000000
active	70000.0	0.803729	0.397179	0.000000	1.00000	1.000000	1.000000	1.000000
cardio	70000.0	0.499700	0.500003	0.000000	0.00000	0.000000	1.000000	1.000000

In [405]:

df = df.query("ap_hi > 0 and ap_hi < 370")
df = df.query("ap_lo > 0 and ap_lo < 370")
df.describe().transpose()

Out[405]:

	count	mean	std	min	25%	50%	75%	max
age	68985.0	53.290421	6.757633	29.564122	48.340817	53.939875	58.380791	64.924433
gender	68985.0	1.348670	0.476553	1.000000	1.000000	1.000000	2.000000	2.000000
height	68985.0	164.359672	8.204273	55.000000	159.000000	165.000000	170.000000	250.000000
weight	68985.0	74.118961	14.328938	11.000000	65.000000	72.000000	82.000000	200.000000
ap_hi	68985.0	126.325027	17.698621	7.000000	120.000000	120.000000	140.000000	240.000000
ap_lo	68985.0	81.350482	9.805666	1.000000	80.000000	80.000000	90.000000	190.000000
cholesterol	68985.0	1.364384	0.678691	1.000000	1.000000	1.000000	1.000000	3.000000
gluc	68985.0	1.225875	0.571822	1.000000	1.000000	1.000000	1.000000	3.000000
smoke	68985.0	0.087860	0.283093	0.000000	0.000000	0.000000	0.000000	1.000000
alco	68985.0	0.053591	0.225211	0.000000	0.000000	0.000000	0.000000	1.000000
active	68985.0	0.803276	0.397525	0.000000	1.000000	1.000000	1.000000	1.000000
cardio	68985.0	0.494905	0.499978	0.000000	0.000000	0.000000	1.000000	1.000000

In [406]:

df["bmi"] = df["weight"] / (df["height"] / 100) ** 2
df.drop(["weight", "height"], axis=1, inplace=True)
df

Out[406]:

	age	gender	ap_hi	ap_lo	cholesterol	gluc	smoke	alco	active	cardio	bmi
id
0	50.358668	2	110	80	1	1	0	0	1	0	21.967120
1	55.382762	1	140	90	3	1	0	0	1	1	34.927679
2	51.629066	1	130	70	3	1	0	0	0	1	23.507805
3	48.250465	2	150	100	1	1	0	0	1	1	28.710479
4	47.842515	1	100	60	1	1	0	0	0	0	23.011177
...	...	...	...	...	...	...	...	...	...	...	...
99993	52.677691	2	120	80	1	1	1	0	1	0	26.927438
99995	61.879860	1	140	90	2	2	0	0	1	1	50.472681
99996	52.201292	2	180	90	3	1	0	1	0	1	31.353579
99998	61.414412	1	135	80	1	2	0	0	0	1	27.099251
99999	56.236995	1	120	80	2	1	0	0	1	0	24.913495

68985 rows × 11 columns

In [407]:

import seaborn as sns

sns.heatmap(df.corr())

Out[407]:

<Axes: >

In [411]:

from sklearn.model_selection import train_test_split
import statsmodels.api as sm
import numpy as np

random_state = 9

stat_y = df["cardio"]
stat_X = df.drop(["cardio"], axis=1).copy()
stat_X_train, stat_X_test, stat_y_train, stat_y_test = train_test_split(
    stat_X, stat_y, test_size=0.15, random_state=9
)

log_model = sm.Logit(stat_y_train, sm.add_constant(stat_X_train))
log_result = log_model.fit()
display(log_result.summary2())
np.exp(log_result.params).sort_values(ascending=False)

Optimization terminated successfully.
         Current function value: 0.567793
         Iterations 6

Model:	Logit	Method:	MLE
Dependent Variable:	cardio	Pseudo R-squared:	0.181
Date:	2025-02-21 23:59	AIC:	66609.4135
No. Observations:	58637	BIC:	66708.1838
Df Model:	10	Log-Likelihood:	-33294.
Df Residuals:	58626	LL-Null:	-40641.
Converged:	1.0000	LLR p-value:	0.0000
No. Iterations:	6.0000	Scale:	1.0000

	Coef.	Std.Err.	z	P>\|z\|	[0.025	0.975]
const	-11.5426	0.1324	-87.1794	0.0000	-11.8021	-11.2831
age	0.0523	0.0014	36.0898	0.0000	0.0494	0.0551
gender	0.0206	0.0211	0.9792	0.3275	-0.0207	0.0619
ap_hi	0.0469	0.0009	52.1945	0.0000	0.0451	0.0486
ap_lo	0.0232	0.0014	16.3321	0.0000	0.0204	0.0259
cholesterol	0.4859	0.0168	29.0062	0.0000	0.4531	0.5187
gluc	-0.1116	0.0190	-5.8803	0.0000	-0.1488	-0.0744
smoke	-0.1279	0.0373	-3.4313	0.0006	-0.2009	-0.0548
alco	-0.1782	0.0452	-3.9429	0.0001	-0.2668	-0.0896
active	-0.2377	0.0235	-10.1027	0.0000	-0.2838	-0.1916
bmi	0.0226	0.0018	12.4231	0.0000	0.0190	0.0261

Out[411]:

cholesterol    1.625611
age            1.053639
ap_hi          1.047999
ap_lo          1.023425
bmi            1.022841
gender         1.020836
gluc           0.894398
smoke          0.879970
alco           0.836750
active         0.788470
const          0.000010
dtype: float64

In [421]:

# data = df.drop(["gluc", "smoke", "alco", "active", "ap_lo"], axis=1)
data = df.drop(
    [
        "gluc",
        "smoke",
        "alco",
        "ap_lo",
        "gender",
        # "cholesterol",
        "active",
    ],
    axis=1,
)
# data = df.copy()
data

Out[421]:

	age	ap_hi	cholesterol	cardio	bmi
id
0	50.358668	110	1	0	21.967120
1	55.382762	140	3	1	34.927679
2	51.629066	130	3	1	23.507805
3	48.250465	150	1	1	28.710479
4	47.842515	100	1	0	23.011177
...	...	...	...	...	...
99993	52.677691	120	1	0	26.927438
99995	61.879860	140	2	1	50.472681
99996	52.201292	180	3	1	31.353579
99998	61.414412	135	1	1	27.099251
99999	56.236995	120	2	0	24.913495

68985 rows × 5 columns

In [431]:

data.to_csv("data-cardio/cardio_clear.csv")

In [422]:

y = data["cardio"]
X = data.drop(["cardio"], axis=1).copy()
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=random_state
)
display(X_train, y_train, X_test, y_test)

	age	ap_hi	cholesterol	bmi
id
94960	62.014018	120	1	26.892323
30807	57.745592	120	1	28.393726
26485	59.670354	120	3	23.875115
3868	49.715256	110	1	20.820940
45890	59.785347	160	1	23.529412
...	...	...	...	...
61975	62.558865	120	1	28.196921
32741	57.882488	120	1	29.043709
94833	51.371701	120	1	29.242109
95660	45.767167	120	1	24.977043
81002	55.544300	150	1	27.053803

55188 rows × 4 columns

id
94960    0
30807    0
26485    0
3868     1
45890    1
        ..
61975    1
32741    0
94833    0
95660    0
81002    1
Name: cardio, Length: 55188, dtype: int64

	age	ap_hi	cholesterol	bmi
id
42270	60.078305	140	1	45.918367
10780	55.360859	120	2	24.998904
42436	48.198445	100	3	21.926126
88647	41.517906	130	2	27.764650
62336	51.692038	110	1	22.230987
...	...	...	...	...
30330	47.697404	100	1	22.724403
62907	58.597087	120	1	23.828125
98612	51.404556	110	1	22.589551
5767	62.033184	120	1	23.875115
14769	41.506954	120	2	22.948116

13797 rows × 4 columns

id
42270    1
10780    0
42436    1
88647    1
62336    0
        ..
30330    1
62907    0
98612    0
5767     0
14769    1
Name: cardio, Length: 13797, dtype: int64

In [ ]:

from src.utils import run_classification
from sklearn import tree
from sklearn import metrics
import numpy as np


model = tree.DecisionTreeClassifier(
    random_state=random_state,
    max_depth=6,
).fit(X_train.values, y_train.values.ravel())


y_train_predict = model.predict(X_train)
y_test_probs = model.predict_proba(X_test)
y_test_predict = model.predict(X_test)


display("Precision_test", metrics.precision_score(y_test, y_test_predict))
display("Recall_test", metrics.recall_score(y_test, y_test_predict))
display("Accuracy_test", metrics.accuracy_score(y_test, y_test_predict))
display("ROC_AUC_test", metrics.roc_auc_score(y_test, y_test_probs[:, 1])) # type: ignore
display("F1_test", metrics.f1_score(y_test, y_test_predict))
display("MCC_test", metrics.matthews_corrcoef(y_test, y_test_predict))
display("Cohen_kappa_test", metrics.cohen_kappa_score(y_test, y_test_predict))
display("Confusion_matrix", metrics.confusion_matrix(y_test, y_test_predict))

/Users/user/Projects/python/fuzzy-rules-generator/.venv/lib/python3.12/site-packages/sklearn/base.py:486: UserWarning: X has feature names, but DecisionTreeClassifier was fitted without feature names
  warnings.warn(
/Users/user/Projects/python/fuzzy-rules-generator/.venv/lib/python3.12/site-packages/sklearn/base.py:486: UserWarning: X has feature names, but DecisionTreeClassifier was fitted without feature names
  warnings.warn(
/Users/user/Projects/python/fuzzy-rules-generator/.venv/lib/python3.12/site-packages/sklearn/base.py:486: UserWarning: X has feature names, but DecisionTreeClassifier was fitted without feature names
  warnings.warn(

'Precision_test'

np.float64(0.7368262116865468)

'Recall_test'

np.float64(0.7180694526191878)

'Accuracy_test'

0.7347974197289265

'ROC_AUC_test'

np.float64(0.7917285464726767)

'F1_test'

np.float64(0.7273269245100231)

'MCC_test'

np.float64(0.46942772902650703)

'Cohen_kappa_test'

np.float64(0.4692799184358021)

'Confusion_matrix'

array([[5258, 1743],
       [1916, 4880]])

In [429]:

rules = tree.export_text(model, feature_names=X_train.columns.values.tolist())
print(rules)

|--- ap_hi <= 129.50
|   |--- age <= 54.65
|   |   |--- cholesterol <= 2.50
|   |   |   |--- age <= 43.79
|   |   |   |   |--- cholesterol <= 1.50
|   |   |   |   |   |--- ap_hi <= 114.50
|   |   |   |   |   |   |--- class: 0
|   |   |   |   |   |--- ap_hi >  114.50
|   |   |   |   |   |   |--- class: 0
|   |   |   |   |--- cholesterol >  1.50
|   |   |   |   |   |--- bmi <= 28.87
|   |   |   |   |   |   |--- class: 0
|   |   |   |   |   |--- bmi >  28.87
|   |   |   |   |   |   |--- class: 0
|   |   |   |--- age >  43.79
|   |   |   |   |--- ap_hi <= 119.50
|   |   |   |   |   |--- bmi <= 22.05
|   |   |   |   |   |   |--- class: 0
|   |   |   |   |   |--- bmi >  22.05
|   |   |   |   |   |   |--- class: 0
|   |   |   |   |--- ap_hi >  119.50
|   |   |   |   |   |--- bmi <= 27.71
|   |   |   |   |   |   |--- class: 0
|   |   |   |   |   |--- bmi >  27.71
|   |   |   |   |   |   |--- class: 0
|   |   |--- cholesterol >  2.50
|   |   |   |--- bmi <= 29.04
|   |   |   |   |--- age <= 41.60
|   |   |   |   |   |--- ap_hi <= 115.00
|   |   |   |   |   |   |--- class: 0
|   |   |   |   |   |--- ap_hi >  115.00
|   |   |   |   |   |   |--- class: 0
|   |   |   |   |--- age >  41.60
|   |   |   |   |   |--- age <= 54.17
|   |   |   |   |   |   |--- class: 1
|   |   |   |   |   |--- age >  54.17
|   |   |   |   |   |   |--- class: 0
|   |   |   |--- bmi >  29.04
|   |   |   |   |--- age <= 54.01
|   |   |   |   |   |--- age <= 39.75
|   |   |   |   |   |   |--- class: 0
|   |   |   |   |   |--- age >  39.75
|   |   |   |   |   |   |--- class: 1
|   |   |   |   |--- age >  54.01
|   |   |   |   |   |--- bmi <= 35.02
|   |   |   |   |   |   |--- class: 0
|   |   |   |   |   |--- bmi >  35.02
|   |   |   |   |   |   |--- class: 1
|   |--- age >  54.65
|   |   |--- cholesterol <= 2.50
|   |   |   |--- age <= 60.71
|   |   |   |   |--- ap_hi <= 118.50
|   |   |   |   |   |--- bmi <= 23.33
|   |   |   |   |   |   |--- class: 0
|   |   |   |   |   |--- bmi >  23.33
|   |   |   |   |   |   |--- class: 0
|   |   |   |   |--- ap_hi >  118.50
|   |   |   |   |   |--- bmi <= 32.89
|   |   |   |   |   |   |--- class: 0
|   |   |   |   |   |--- bmi >  32.89
|   |   |   |   |   |   |--- class: 1
|   |   |   |--- age >  60.71
|   |   |   |   |--- bmi <= 20.51
|   |   |   |   |   |--- age <= 64.31
|   |   |   |   |   |   |--- class: 0
|   |   |   |   |   |--- age >  64.31
|   |   |   |   |   |   |--- class: 1
|   |   |   |   |--- bmi >  20.51
|   |   |   |   |   |--- ap_hi <= 115.50
|   |   |   |   |   |   |--- class: 0
|   |   |   |   |   |--- ap_hi >  115.50
|   |   |   |   |   |   |--- class: 1
|   |   |--- cholesterol >  2.50
|   |   |   |--- bmi <= 26.03
|   |   |   |   |--- age <= 60.89
|   |   |   |   |   |--- age <= 60.48
|   |   |   |   |   |   |--- class: 1
|   |   |   |   |   |--- age >  60.48
|   |   |   |   |   |   |--- class: 0
|   |   |   |   |--- age >  60.89
|   |   |   |   |   |--- bmi <= 25.91
|   |   |   |   |   |   |--- class: 1
|   |   |   |   |   |--- bmi >  25.91
|   |   |   |   |   |   |--- class: 0
|   |   |   |--- bmi >  26.03
|   |   |   |   |--- age <= 59.39
|   |   |   |   |   |--- bmi <= 35.93
|   |   |   |   |   |   |--- class: 1
|   |   |   |   |   |--- bmi >  35.93
|   |   |   |   |   |   |--- class: 1
|   |   |   |   |--- age >  59.39
|   |   |   |   |   |--- bmi <= 35.12
|   |   |   |   |   |   |--- class: 1
|   |   |   |   |   |--- bmi >  35.12
|   |   |   |   |   |   |--- class: 1
|--- ap_hi >  129.50
|   |--- ap_hi <= 138.50
|   |   |--- cholesterol <= 2.50
|   |   |   |--- age <= 59.54
|   |   |   |   |--- bmi <= 21.64
|   |   |   |   |   |--- bmi <= 17.30
|   |   |   |   |   |   |--- class: 1
|   |   |   |   |   |--- bmi >  17.30
|   |   |   |   |   |   |--- class: 0
|   |   |   |   |--- bmi >  21.64
|   |   |   |   |   |--- age <= 39.99
|   |   |   |   |   |   |--- class: 0
|   |   |   |   |   |--- age >  39.99
|   |   |   |   |   |   |--- class: 1
|   |   |   |--- age >  59.54
|   |   |   |   |--- age <= 62.46
|   |   |   |   |   |--- bmi <= 20.61
|   |   |   |   |   |   |--- class: 0
|   |   |   |   |   |--- bmi >  20.61
|   |   |   |   |   |   |--- class: 1
|   |   |   |   |--- age >  62.46
|   |   |   |   |   |--- age <= 64.00
|   |   |   |   |   |   |--- class: 1
|   |   |   |   |   |--- age >  64.00
|   |   |   |   |   |   |--- class: 1
|   |   |--- cholesterol >  2.50
|   |   |   |--- bmi <= 30.74
|   |   |   |   |--- bmi <= 30.06
|   |   |   |   |   |--- bmi <= 23.93
|   |   |   |   |   |   |--- class: 1
|   |   |   |   |   |--- bmi >  23.93
|   |   |   |   |   |   |--- class: 1
|   |   |   |   |--- bmi >  30.06
|   |   |   |   |   |--- bmi <= 30.69
|   |   |   |   |   |   |--- class: 1
|   |   |   |   |   |--- bmi >  30.69
|   |   |   |   |   |   |--- class: 0
|   |   |   |--- bmi >  30.74
|   |   |   |   |--- bmi <= 32.05
|   |   |   |   |   |--- age <= 43.63
|   |   |   |   |   |   |--- class: 0
|   |   |   |   |   |--- age >  43.63
|   |   |   |   |   |   |--- class: 1
|   |   |   |   |--- bmi >  32.05
|   |   |   |   |   |--- bmi <= 32.34
|   |   |   |   |   |   |--- class: 1
|   |   |   |   |   |--- bmi >  32.34
|   |   |   |   |   |   |--- class: 1
|   |--- ap_hi >  138.50
|   |   |--- ap_hi <= 149.50
|   |   |   |--- age <= 39.56
|   |   |   |   |--- bmi <= 38.19
|   |   |   |   |   |--- age <= 39.54
|   |   |   |   |   |   |--- class: 1
|   |   |   |   |   |--- age >  39.54
|   |   |   |   |   |   |--- class: 0
|   |   |   |   |--- bmi >  38.19
|   |   |   |   |   |--- bmi <= 50.55
|   |   |   |   |   |   |--- class: 0
|   |   |   |   |   |--- bmi >  50.55
|   |   |   |   |   |   |--- class: 1
|   |   |   |--- age >  39.56
|   |   |   |   |--- age <= 47.57
|   |   |   |   |   |--- bmi <= 19.23
|   |   |   |   |   |   |--- class: 0
|   |   |   |   |   |--- bmi >  19.23
|   |   |   |   |   |   |--- class: 1
|   |   |   |   |--- age >  47.57
|   |   |   |   |   |--- age <= 61.57
|   |   |   |   |   |   |--- class: 1
|   |   |   |   |   |--- age >  61.57
|   |   |   |   |   |   |--- class: 1
|   |   |--- ap_hi >  149.50
|   |   |   |--- bmi <= 20.48
|   |   |   |   |--- age <= 64.27
|   |   |   |   |   |--- age <= 55.82
|   |   |   |   |   |   |--- class: 1
|   |   |   |   |   |--- age >  55.82
|   |   |   |   |   |   |--- class: 1
|   |   |   |   |--- age >  64.27
|   |   |   |   |   |--- class: 0
|   |   |   |--- bmi >  20.48
|   |   |   |   |--- age <= 64.35
|   |   |   |   |   |--- age <= 49.82
|   |   |   |   |   |   |--- class: 1
|   |   |   |   |   |--- age >  49.82
|   |   |   |   |   |   |--- class: 1
|   |   |   |   |--- age >  64.35
|   |   |   |   |   |--- bmi <= 36.80
|   |   |   |   |   |   |--- class: 1
|   |   |   |   |   |--- bmi >  36.80
|   |   |   |   |   |   |--- class: 0

In [430]:

import pickle

pickle.dump(model, open("data-cardio/cardio.model.sav", "wb"))

106 KiB Raw Blame History Unescape Escape

106 KiB

Raw Blame History