{ "cells": [ { "cell_type": "code", "execution_count": 403, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "cardio\n", "0 35021\n", "1 34979\n", "Name: count, dtype: int64\n", "\n", "Index: 70000 entries, 0 to 99999\n", "Data columns (total 12 columns):\n", " # Column Non-Null Count Dtype \n", "--- ------ -------------- ----- \n", " 0 age 70000 non-null float64\n", " 1 gender 70000 non-null int64 \n", " 2 height 70000 non-null int64 \n", " 3 weight 70000 non-null float64\n", " 4 ap_hi 70000 non-null int64 \n", " 5 ap_lo 70000 non-null int64 \n", " 6 cholesterol 70000 non-null int64 \n", " 7 gluc 70000 non-null int64 \n", " 8 smoke 70000 non-null int64 \n", " 9 alco 70000 non-null int64 \n", " 10 active 70000 non-null int64 \n", " 11 cardio 70000 non-null int64 \n", "dtypes: float64(2), int64(10)\n", "memory usage: 6.9 MB\n" ] }, { "data": { "text/plain": [ "None" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/plain": [ "cardio\n", "0 35021\n", "1 34979\n", "Name: count, dtype: int64" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
agegenderheightweightap_hiap_locholesterolglucsmokealcoactivecardio
id
050.358668216862.011080110010
155.382762115685.014090310011
251.629066116564.013070310001
348.250465216982.0150100110011
447.842515115656.010060110000
.......................................
9999352.677691216876.012080111010
9999561.8798601158126.014090220011
9999652.2012922183105.018090310101
9999861.414412116372.013580120001
9999956.236995117072.012080210010
\n", "

70000 rows × 12 columns

\n", "
" ], "text/plain": [ " age gender height weight ap_hi ap_lo cholesterol gluc \\\n", "id \n", "0 50.358668 2 168 62.0 110 80 1 1 \n", "1 55.382762 1 156 85.0 140 90 3 1 \n", "2 51.629066 1 165 64.0 130 70 3 1 \n", "3 48.250465 2 169 82.0 150 100 1 1 \n", "4 47.842515 1 156 56.0 100 60 1 1 \n", "... ... ... ... ... ... ... ... ... \n", "99993 52.677691 2 168 76.0 120 80 1 1 \n", "99995 61.879860 1 158 126.0 140 90 2 2 \n", "99996 52.201292 2 183 105.0 180 90 3 1 \n", "99998 61.414412 1 163 72.0 135 80 1 2 \n", "99999 56.236995 1 170 72.0 120 80 2 1 \n", "\n", " smoke alco active cardio \n", "id \n", "0 0 0 1 0 \n", "1 0 0 1 1 \n", "2 0 0 0 1 \n", "3 0 0 1 1 \n", "4 0 0 0 0 \n", "... ... ... ... ... \n", "99993 1 0 1 0 \n", "99995 0 0 1 1 \n", "99996 0 1 0 1 \n", "99998 0 0 0 1 \n", "99999 0 0 1 0 \n", "\n", "[70000 rows x 12 columns]" ] }, "execution_count": 403, "metadata": {}, "output_type": "execute_result" } ], "source": [ "import pandas as pd\n", "\n", "df = pd.read_csv(\"data-cardio/cardio_train.csv\", sep=\";\", index_col=\"id\")\n", "# NOTE(review): presumably the raw age column is in days, so dividing by 365.24\n", "# yields years - confirm against the data set description.\n", "df[\"age\"] = df[\"age\"] / 365.24\n", "\n", "# DataFrame.info() prints its report and returns None, so call it on its own\n", "# instead of handing that None to display().\n", "df.info()\n", "# Class balance of the target column, shown once.\n", "display(df.cardio.value_counts())\n", "\n", "df" ] }, { "cell_type": "code", "execution_count": 404, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
countmeanstdmin25%50%75%max
age70000.053.3043096.75515229.56412248.3627253.94535158.39174264.924433
gender70000.01.3495710.4768381.0000001.000001.0000002.0000002.000000
height70000.0164.3592298.21012655.000000159.00000165.000000170.000000250.000000
weight70000.074.20569014.39575710.00000065.0000072.00000082.000000200.000000
ap_hi70000.0128.817286154.011419-150.000000120.00000120.000000140.00000016020.000000
ap_lo70000.096.630414188.472530-70.00000080.0000080.00000090.00000011000.000000
cholesterol70000.01.3668710.6802501.0000001.000001.0000002.0000003.000000
gluc70000.01.2264570.5722701.0000001.000001.0000001.0000003.000000
smoke70000.00.0881290.2834840.0000000.000000.0000000.0000001.000000
alco70000.00.0537710.2255680.0000000.000000.0000000.0000001.000000
active70000.00.8037290.3971790.0000001.000001.0000001.0000001.000000
cardio70000.00.4997000.5000030.0000000.000000.0000001.0000001.000000
\n", "
" ], "text/plain": [ " count mean std min 25% \\\n", "age 70000.0 53.304309 6.755152 29.564122 48.36272 \n", "gender 70000.0 1.349571 0.476838 1.000000 1.00000 \n", "height 70000.0 164.359229 8.210126 55.000000 159.00000 \n", "weight 70000.0 74.205690 14.395757 10.000000 65.00000 \n", "ap_hi 70000.0 128.817286 154.011419 -150.000000 120.00000 \n", "ap_lo 70000.0 96.630414 188.472530 -70.000000 80.00000 \n", "cholesterol 70000.0 1.366871 0.680250 1.000000 1.00000 \n", "gluc 70000.0 1.226457 0.572270 1.000000 1.00000 \n", "smoke 70000.0 0.088129 0.283484 0.000000 0.00000 \n", "alco 70000.0 0.053771 0.225568 0.000000 0.00000 \n", "active 70000.0 0.803729 0.397179 0.000000 1.00000 \n", "cardio 70000.0 0.499700 0.500003 0.000000 0.00000 \n", "\n", " 50% 75% max \n", "age 53.945351 58.391742 64.924433 \n", "gender 1.000000 2.000000 2.000000 \n", "height 165.000000 170.000000 250.000000 \n", "weight 72.000000 82.000000 200.000000 \n", "ap_hi 120.000000 140.000000 16020.000000 \n", "ap_lo 80.000000 90.000000 11000.000000 \n", "cholesterol 1.000000 2.000000 3.000000 \n", "gluc 1.000000 1.000000 3.000000 \n", "smoke 0.000000 0.000000 1.000000 \n", "alco 0.000000 0.000000 1.000000 \n", "active 1.000000 1.000000 1.000000 \n", "cardio 0.000000 1.000000 1.000000 " ] }, "execution_count": 404, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Summary statistics with one row per column (describe() output transposed).\n", "df.describe().T" ] }, { "cell_type": "code", "execution_count": 405, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
countmeanstdmin25%50%75%max
age68985.053.2904216.75763329.56412248.34081753.93987558.38079164.924433
gender68985.01.3486700.4765531.0000001.0000001.0000002.0000002.000000
height68985.0164.3596728.20427355.000000159.000000165.000000170.000000250.000000
weight68985.074.11896114.32893811.00000065.00000072.00000082.000000200.000000
ap_hi68985.0126.32502717.6986217.000000120.000000120.000000140.000000240.000000
ap_lo68985.081.3504829.8056661.00000080.00000080.00000090.000000190.000000
cholesterol68985.01.3643840.6786911.0000001.0000001.0000001.0000003.000000
gluc68985.01.2258750.5718221.0000001.0000001.0000001.0000003.000000
smoke68985.00.0878600.2830930.0000000.0000000.0000000.0000001.000000
alco68985.00.0535910.2252110.0000000.0000000.0000000.0000001.000000
active68985.00.8032760.3975250.0000001.0000001.0000001.0000001.000000
cardio68985.00.4949050.4999780.0000000.0000000.0000001.0000001.000000
\n", "
" ], "text/plain": [ " count mean std min 25% \\\n", "age 68985.0 53.290421 6.757633 29.564122 48.340817 \n", "gender 68985.0 1.348670 0.476553 1.000000 1.000000 \n", "height 68985.0 164.359672 8.204273 55.000000 159.000000 \n", "weight 68985.0 74.118961 14.328938 11.000000 65.000000 \n", "ap_hi 68985.0 126.325027 17.698621 7.000000 120.000000 \n", "ap_lo 68985.0 81.350482 9.805666 1.000000 80.000000 \n", "cholesterol 68985.0 1.364384 0.678691 1.000000 1.000000 \n", "gluc 68985.0 1.225875 0.571822 1.000000 1.000000 \n", "smoke 68985.0 0.087860 0.283093 0.000000 0.000000 \n", "alco 68985.0 0.053591 0.225211 0.000000 0.000000 \n", "active 68985.0 0.803276 0.397525 0.000000 1.000000 \n", "cardio 68985.0 0.494905 0.499978 0.000000 0.000000 \n", "\n", " 50% 75% max \n", "age 53.939875 58.380791 64.924433 \n", "gender 1.000000 2.000000 2.000000 \n", "height 165.000000 170.000000 250.000000 \n", "weight 72.000000 82.000000 200.000000 \n", "ap_hi 120.000000 140.000000 240.000000 \n", "ap_lo 80.000000 90.000000 190.000000 \n", "cholesterol 1.000000 1.000000 3.000000 \n", "gluc 1.000000 1.000000 3.000000 \n", "smoke 0.000000 0.000000 1.000000 \n", "alco 0.000000 0.000000 1.000000 \n", "active 1.000000 1.000000 1.000000 \n", "cardio 0.000000 1.000000 1.000000 " ] }, "execution_count": 405, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Keep only rows where both ap_hi and ap_lo lie strictly between 0 and 370;\n", "# the unfiltered describe() shows minima below zero and maxima above 10000.\n", "df = df.query(\"0 < ap_hi < 370 and 0 < ap_lo < 370\")\n", "df.describe().T" ] }, { "cell_type": "code", "execution_count": 406, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
agegenderap_hiap_locholesterolglucsmokealcoactivecardiobmi
id
050.35866821108011001021.967120
155.38276211409031001134.927679
251.62906611307031000123.507805
348.250465215010011001128.710479
447.84251511006011000023.011177
....................................
9999352.67769121208011101026.927438
9999561.87986011409022001150.472681
9999652.20129221809031010131.353579
9999861.41441211358012000127.099251
9999956.23699511208021001024.913495
\n", "

68985 rows × 11 columns

\n", "
" ], "text/plain": [ " age gender ap_hi ap_lo cholesterol gluc smoke alco \\\n", "id \n", "0 50.358668 2 110 80 1 1 0 0 \n", "1 55.382762 1 140 90 3 1 0 0 \n", "2 51.629066 1 130 70 3 1 0 0 \n", "3 48.250465 2 150 100 1 1 0 0 \n", "4 47.842515 1 100 60 1 1 0 0 \n", "... ... ... ... ... ... ... ... ... \n", "99993 52.677691 2 120 80 1 1 1 0 \n", "99995 61.879860 1 140 90 2 2 0 0 \n", "99996 52.201292 2 180 90 3 1 0 1 \n", "99998 61.414412 1 135 80 1 2 0 0 \n", "99999 56.236995 1 120 80 2 1 0 0 \n", "\n", " active cardio bmi \n", "id \n", "0 1 0 21.967120 \n", "1 1 1 34.927679 \n", "2 0 1 23.507805 \n", "3 1 1 28.710479 \n", "4 0 0 23.011177 \n", "... ... ... ... \n", "99993 1 0 26.927438 \n", "99995 1 1 50.472681 \n", "99996 0 1 31.353579 \n", "99998 0 1 27.099251 \n", "99999 1 0 24.913495 \n", "\n", "[68985 rows x 11 columns]" ] }, "execution_count": 406, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Derive bmi = weight / (height / 100) ** 2 and drop the two source columns,\n", "# building a new frame rather than mutating the current one in place.\n", "height_scaled = df[\"height\"] / 100\n", "df = df.assign(bmi=df[\"weight\"] / height_scaled**2).drop(columns=[\"weight\", \"height\"])\n", "df" ] }, { "cell_type": "code", "execution_count": 407, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "" ] }, "execution_count": 407, "metadata": {}, "output_type": "execute_result" }, { "data": { "image/png": "", "text/plain": [ "
" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "import seaborn as sns\n", "\n", "# Pairwise correlation of every remaining column, rendered as a heatmap.\n", "correlations = df.corr()\n", "sns.heatmap(correlations)" ] }, { "cell_type": "code", "execution_count": 411, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Optimization terminated successfully.\n", " Current function value: 0.567793\n", " Iterations 6\n" ] }, { "data": { "text/html": [ "\n", "\n", " \n", "\n", "\n", " \n", "\n", "\n", " \n", "\n", "\n", " \n", "\n", "\n", " \n", "\n", "\n", " \n", "\n", "\n", " \n", "\n", "\n", " \n", "\n", "
Model: Logit Method: MLE
Dependent Variable: cardio Pseudo R-squared: 0.181
Date: 2025-02-21 23:59 AIC: 66609.4135
No. Observations: 58637 BIC: 66708.1838
Df Model: 10 Log-Likelihood: -33294.
Df Residuals: 58626 LL-Null: -40641.
Converged: 1.0000 LLR p-value: 0.0000
No. Iterations: 6.0000 Scale: 1.0000
\n", "\n", "\n", " \n", "\n", "\n", " \n", "\n", "\n", " \n", "\n", "\n", " \n", "\n", "\n", " \n", "\n", "\n", " \n", "\n", "\n", " \n", "\n", "\n", " \n", "\n", "\n", " \n", "\n", "\n", " \n", "\n", "\n", " \n", "\n", "\n", " \n", "\n", "
Coef. Std.Err. z P>|z| [0.025 0.975]
const -11.5426 0.1324 -87.1794 0.0000 -11.8021 -11.2831
age 0.0523 0.0014 36.0898 0.0000 0.0494 0.0551
gender 0.0206 0.0211 0.9792 0.3275 -0.0207 0.0619
ap_hi 0.0469 0.0009 52.1945 0.0000 0.0451 0.0486
ap_lo 0.0232 0.0014 16.3321 0.0000 0.0204 0.0259
cholesterol 0.4859 0.0168 29.0062 0.0000 0.4531 0.5187
gluc -0.1116 0.0190 -5.8803 0.0000 -0.1488 -0.0744
smoke -0.1279 0.0373 -3.4313 0.0006 -0.2009 -0.0548
alco -0.1782 0.0452 -3.9429 0.0001 -0.2668 -0.0896
active -0.2377 0.0235 -10.1027 0.0000 -0.2838 -0.1916
bmi 0.0226 0.0018 12.4231 0.0000 0.0190 0.0261

\n" ], "text/latex": [ "\\begin{table}\n", "\\caption{Results: Logit}\n", "\\label{}\n", "\\begin{center}\n", "\\begin{tabular}{llll}\n", "\\hline\n", "Model: & Logit & Method: & MLE \\\\\n", "Dependent Variable: & cardio & Pseudo R-squared: & 0.181 \\\\\n", "Date: & 2025-02-21 23:59 & AIC: & 66609.4135 \\\\\n", "No. Observations: & 58637 & BIC: & 66708.1838 \\\\\n", "Df Model: & 10 & Log-Likelihood: & -33294. \\\\\n", "Df Residuals: & 58626 & LL-Null: & -40641. \\\\\n", "Converged: & 1.0000 & LLR p-value: & 0.0000 \\\\\n", "No. Iterations: & 6.0000 & Scale: & 1.0000 \\\\\n", "\\hline\n", "\\end{tabular}\n", "\\end{center}\n", "\n", "\\begin{center}\n", "\\begin{tabular}{lrrrrrr}\n", "\\hline\n", " & Coef. & Std.Err. & z & P$> |$z$|$ & [0.025 & 0.975] \\\\\n", "\\hline\n", "const & -11.5426 & 0.1324 & -87.1794 & 0.0000 & -11.8021 & -11.2831 \\\\\n", "age & 0.0523 & 0.0014 & 36.0898 & 0.0000 & 0.0494 & 0.0551 \\\\\n", "gender & 0.0206 & 0.0211 & 0.9792 & 0.3275 & -0.0207 & 0.0619 \\\\\n", "ap\\_hi & 0.0469 & 0.0009 & 52.1945 & 0.0000 & 0.0451 & 0.0486 \\\\\n", "ap\\_lo & 0.0232 & 0.0014 & 16.3321 & 0.0000 & 0.0204 & 0.0259 \\\\\n", "cholesterol & 0.4859 & 0.0168 & 29.0062 & 0.0000 & 0.4531 & 0.5187 \\\\\n", "gluc & -0.1116 & 0.0190 & -5.8803 & 0.0000 & -0.1488 & -0.0744 \\\\\n", "smoke & -0.1279 & 0.0373 & -3.4313 & 0.0006 & -0.2009 & -0.0548 \\\\\n", "alco & -0.1782 & 0.0452 & -3.9429 & 0.0001 & -0.2668 & -0.0896 \\\\\n", "active & -0.2377 & 0.0235 & -10.1027 & 0.0000 & -0.2838 & -0.1916 \\\\\n", "bmi & 0.0226 & 0.0018 & 12.4231 & 0.0000 & 0.0190 & 0.0261 \\\\\n", "\\hline\n", "\\end{tabular}\n", "\\end{center}\n", "\\end{table}\n", "\\bigskip\n" ], "text/plain": [ "\n", "\"\"\"\n", " Results: Logit\n", "=================================================================\n", "Model: Logit Method: MLE \n", "Dependent Variable: cardio Pseudo R-squared: 0.181 \n", "Date: 2025-02-21 23:59 AIC: 66609.4135\n", "No. 
Observations: 58637 BIC: 66708.1838\n", "Df Model: 10 Log-Likelihood: -33294. \n", "Df Residuals: 58626 LL-Null: -40641. \n", "Converged: 1.0000 LLR p-value: 0.0000 \n", "No. Iterations: 6.0000 Scale: 1.0000 \n", "-----------------------------------------------------------------\n", " Coef. Std.Err. z P>|z| [0.025 0.975] \n", "-----------------------------------------------------------------\n", "const -11.5426 0.1324 -87.1794 0.0000 -11.8021 -11.2831\n", "age 0.0523 0.0014 36.0898 0.0000 0.0494 0.0551\n", "gender 0.0206 0.0211 0.9792 0.3275 -0.0207 0.0619\n", "ap_hi 0.0469 0.0009 52.1945 0.0000 0.0451 0.0486\n", "ap_lo 0.0232 0.0014 16.3321 0.0000 0.0204 0.0259\n", "cholesterol 0.4859 0.0168 29.0062 0.0000 0.4531 0.5187\n", "gluc -0.1116 0.0190 -5.8803 0.0000 -0.1488 -0.0744\n", "smoke -0.1279 0.0373 -3.4313 0.0006 -0.2009 -0.0548\n", "alco -0.1782 0.0452 -3.9429 0.0001 -0.2668 -0.0896\n", "active -0.2377 0.0235 -10.1027 0.0000 -0.2838 -0.1916\n", "bmi 0.0226 0.0018 12.4231 0.0000 0.0190 0.0261\n", "=================================================================\n", "\n", "\"\"\"" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/plain": [ "cholesterol 1.625611\n", "age 1.053639\n", "ap_hi 1.047999\n", "ap_lo 1.023425\n", "bmi 1.022841\n", "gender 1.020836\n", "gluc 0.894398\n", "smoke 0.879970\n", "alco 0.836750\n", "active 0.788470\n", "const 0.000010\n", "dtype: float64" ] }, "execution_count": 411, "metadata": {}, "output_type": "execute_result" } ], "source": [ "from sklearn.model_selection import train_test_split\n", "import statsmodels.api as sm\n", "import numpy as np\n", "\n", "# Seed reused by the train/test splits and the tree model in later cells.\n", "random_state = 9\n", "\n", "stat_y = df[\"cardio\"]\n", "# drop() already returns a new frame, so no extra .copy() is needed.\n", "stat_X = df.drop(columns=[\"cardio\"])\n", "# Use the shared seed instead of repeating the literal 9, so changing\n", "# random_state above keeps this split in sync with the later cells.\n", "stat_X_train, stat_X_test, stat_y_train, stat_y_test = train_test_split(\n", "    stat_X, stat_y, test_size=0.15, random_state=random_state\n", ")\n", "\n", "# Logit on every remaining column; add_constant appends the intercept column.\n", "log_model = sm.Logit(stat_y_train, sm.add_constant(stat_X_train))\n", "log_result = log_model.fit()\n", 
"display(log_result.summary2())\n", "# Exponentiated coefficients, largest first.\n", "np.exp(log_result.params).sort_values(ascending=False)" ] }, { "cell_type": "code", "execution_count": 421, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
ageap_hicholesterolcardiobmi
id
050.3586681101021.967120
155.3827621403134.927679
251.6290661303123.507805
348.2504651501128.710479
447.8425151001023.011177
..................
9999352.6776911201026.927438
9999561.8798601402150.472681
9999652.2012921803131.353579
9999861.4144121351127.099251
9999956.2369951202024.913495
\n", "

68985 rows × 5 columns

\n", "
" ], "text/plain": [ " age ap_hi cholesterol cardio bmi\n", "id \n", "0 50.358668 110 1 0 21.967120\n", "1 55.382762 140 3 1 34.927679\n", "2 51.629066 130 3 1 23.507805\n", "3 48.250465 150 1 1 28.710479\n", "4 47.842515 100 1 0 23.011177\n", "... ... ... ... ... ...\n", "99993 52.677691 120 1 0 26.927438\n", "99995 61.879860 140 2 1 50.472681\n", "99996 52.201292 180 3 1 31.353579\n", "99998 61.414412 135 1 1 27.099251\n", "99999 56.236995 120 2 0 24.913495\n", "\n", "[68985 rows x 5 columns]" ] }, "execution_count": 421, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Columns left out of the reduced data set; the rest are kept as-is.\n", "dropped_columns = [\"gluc\", \"smoke\", \"alco\", \"ap_lo\", \"gender\", \"active\"]\n", "data = df.drop(columns=dropped_columns)\n", "data" ] }, { "cell_type": "code", "execution_count": 431, "metadata": {}, "outputs": [], "source": [ "data.to_csv(\"data-cardio/cardio_clear.csv\")" ] }, { "cell_type": "code", "execution_count": 422, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
ageap_hicholesterolbmi
id
9496062.014018120126.892323
3080757.745592120128.393726
2648559.670354120323.875115
386849.715256110120.820940
4589059.785347160123.529412
...............
6197562.558865120128.196921
3274157.882488120129.043709
9483351.371701120129.242109
9566045.767167120124.977043
8100255.544300150127.053803
\n", "

55188 rows × 4 columns

\n", "
" ], "text/plain": [ " age ap_hi cholesterol bmi\n", "id \n", "94960 62.014018 120 1 26.892323\n", "30807 57.745592 120 1 28.393726\n", "26485 59.670354 120 3 23.875115\n", "3868 49.715256 110 1 20.820940\n", "45890 59.785347 160 1 23.529412\n", "... ... ... ... ...\n", "61975 62.558865 120 1 28.196921\n", "32741 57.882488 120 1 29.043709\n", "94833 51.371701 120 1 29.242109\n", "95660 45.767167 120 1 24.977043\n", "81002 55.544300 150 1 27.053803\n", "\n", "[55188 rows x 4 columns]" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/plain": [ "id\n", "94960 0\n", "30807 0\n", "26485 0\n", "3868 1\n", "45890 1\n", " ..\n", "61975 1\n", "32741 0\n", "94833 0\n", "95660 0\n", "81002 1\n", "Name: cardio, Length: 55188, dtype: int64" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
ageap_hicholesterolbmi
id
4227060.078305140145.918367
1078055.360859120224.998904
4243648.198445100321.926126
8864741.517906130227.764650
6233651.692038110122.230987
...............
3033047.697404100122.724403
6290758.597087120123.828125
9861251.404556110122.589551
576762.033184120123.875115
1476941.506954120222.948116
\n", "

13797 rows × 4 columns

\n", "
" ], "text/plain": [ " age ap_hi cholesterol bmi\n", "id \n", "42270 60.078305 140 1 45.918367\n", "10780 55.360859 120 2 24.998904\n", "42436 48.198445 100 3 21.926126\n", "88647 41.517906 130 2 27.764650\n", "62336 51.692038 110 1 22.230987\n", "... ... ... ... ...\n", "30330 47.697404 100 1 22.724403\n", "62907 58.597087 120 1 23.828125\n", "98612 51.404556 110 1 22.589551\n", "5767 62.033184 120 1 23.875115\n", "14769 41.506954 120 2 22.948116\n", "\n", "[13797 rows x 4 columns]" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/plain": [ "id\n", "42270 1\n", "10780 0\n", "42436 1\n", "88647 1\n", "62336 0\n", " ..\n", "30330 1\n", "62907 0\n", "98612 0\n", "5767 0\n", "14769 1\n", "Name: cardio, Length: 13797, dtype: int64" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "# Split the reduced data set 80/20 using the shared seed.\n", "y = data[\"cardio\"]\n", "# drop() already returns a new frame, so no extra .copy() is needed.\n", "X = data.drop(columns=[\"cardio\"])\n", "X_train, X_test, y_train, y_test = train_test_split(\n", "    X, y, test_size=0.2, random_state=random_state\n", ")\n", "display(X_train, y_train, X_test, y_test)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "/Users/user/Projects/python/fuzzy-rules-generator/.venv/lib/python3.12/site-packages/sklearn/base.py:486: UserWarning: X has feature names, but DecisionTreeClassifier was fitted without feature names\n", " warnings.warn(\n", "/Users/user/Projects/python/fuzzy-rules-generator/.venv/lib/python3.12/site-packages/sklearn/base.py:486: UserWarning: X has feature names, but DecisionTreeClassifier was fitted without feature names\n", " warnings.warn(\n", "/Users/user/Projects/python/fuzzy-rules-generator/.venv/lib/python3.12/site-packages/sklearn/base.py:486: UserWarning: X has feature names, but DecisionTreeClassifier was fitted without feature names\n", " warnings.warn(\n" ] }, { "data": { "text/plain": [ "'Precision_test'" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { 
"text/plain": [ "np.float64(0.7368262116865468)" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/plain": [ "'Recall_test'" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/plain": [ "np.float64(0.7180694526191878)" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/plain": [ "'Accuracy_test'" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/plain": [ "0.7347974197289265" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/plain": [ "'ROC_AUC_test'" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/plain": [ "np.float64(0.7917285464726767)" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/plain": [ "'F1_test'" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/plain": [ "np.float64(0.7273269245100231)" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/plain": [ "'MCC_test'" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/plain": [ "np.float64(0.46942772902650703)" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/plain": [ "'Cohen_kappa_test'" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/plain": [ "np.float64(0.4692799184358021)" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/plain": [ "'Confusion_matrix'" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/plain": [ "array([[5258, 1743],\n", " [1916, 4880]])" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "from src.utils import run_classification\n", "from sklearn import tree\n", "from sklearn import metrics\n", "import numpy as np\n", "\n", "\n", "# The tree is fitted on plain arrays (no feature names), so every predict call\n", "# below passes plain arrays as well. Passing DataFrames here made scikit-learn\n", "# warn: X has feature names, but DecisionTreeClassifier was fitted without feature names.\n", "model = tree.DecisionTreeClassifier(\n", "    random_state=random_state,\n", "    max_depth=6,\n", ").fit(X_train.values, y_train.values.ravel())\n", "\n", "\n", "y_train_predict = model.predict(X_train.values)\n", 
"y_test_probs = model.predict_proba(X_test.values)\n", "y_test_predict = model.predict(X_test.values)\n", "\n", "\n", "# Hold-out metrics; ROC AUC uses column 1 of predict_proba.\n", "display(\"Precision_test\", metrics.precision_score(y_test, y_test_predict))\n", "display(\"Recall_test\", metrics.recall_score(y_test, y_test_predict))\n", "display(\"Accuracy_test\", metrics.accuracy_score(y_test, y_test_predict))\n", "display(\"ROC_AUC_test\", metrics.roc_auc_score(y_test, y_test_probs[:, 1])) # type: ignore\n", "display(\"F1_test\", metrics.f1_score(y_test, y_test_predict))\n", "display(\"MCC_test\", metrics.matthews_corrcoef(y_test, y_test_predict))\n", "display(\"Cohen_kappa_test\", metrics.cohen_kappa_score(y_test, y_test_predict))\n", "display(\"Confusion_matrix\", metrics.confusion_matrix(y_test, y_test_predict))" ] }, { "cell_type": "code", "execution_count": 429, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "|--- ap_hi <= 129.50\n", "| |--- age <= 54.65\n", "| | |--- cholesterol <= 2.50\n", "| | | |--- age <= 43.79\n", "| | | | |--- cholesterol <= 1.50\n", "| | | | | |--- ap_hi <= 114.50\n", "| | | | | | |--- class: 0\n", "| | | | | |--- ap_hi > 114.50\n", "| | | | | | |--- class: 0\n", "| | | | |--- cholesterol > 1.50\n", "| | | | | |--- bmi <= 28.87\n", "| | | | | | |--- class: 0\n", "| | | | | |--- bmi > 28.87\n", "| | | | | | |--- class: 0\n", "| | | |--- age > 43.79\n", "| | | | |--- ap_hi <= 119.50\n", "| | | | | |--- bmi <= 22.05\n", "| | | | | | |--- class: 0\n", "| | | | | |--- bmi > 22.05\n", "| | | | | | |--- class: 0\n", "| | | | |--- ap_hi > 119.50\n", "| | | | | |--- bmi <= 27.71\n", "| | | | | | |--- class: 0\n", "| | | | | |--- bmi > 27.71\n", "| | | | | | |--- class: 0\n", "| | |--- cholesterol > 2.50\n", "| | | |--- bmi <= 29.04\n", "| | | | |--- age <= 41.60\n", "| | | | | |--- ap_hi <= 115.00\n", "| | | | | | |--- class: 0\n", "| | | | | |--- ap_hi > 115.00\n", "| | | | | | |--- class: 0\n", "| | | | |--- age > 41.60\n", "| | | | | |--- age <= 54.17\n", "| | | | | | |--- class: 1\n", "| | | | 
| |--- age > 54.17\n", "| | | | | | |--- class: 0\n", "| | | |--- bmi > 29.04\n", "| | | | |--- age <= 54.01\n", "| | | | | |--- age <= 39.75\n", "| | | | | | |--- class: 0\n", "| | | | | |--- age > 39.75\n", "| | | | | | |--- class: 1\n", "| | | | |--- age > 54.01\n", "| | | | | |--- bmi <= 35.02\n", "| | | | | | |--- class: 0\n", "| | | | | |--- bmi > 35.02\n", "| | | | | | |--- class: 1\n", "| |--- age > 54.65\n", "| | |--- cholesterol <= 2.50\n", "| | | |--- age <= 60.71\n", "| | | | |--- ap_hi <= 118.50\n", "| | | | | |--- bmi <= 23.33\n", "| | | | | | |--- class: 0\n", "| | | | | |--- bmi > 23.33\n", "| | | | | | |--- class: 0\n", "| | | | |--- ap_hi > 118.50\n", "| | | | | |--- bmi <= 32.89\n", "| | | | | | |--- class: 0\n", "| | | | | |--- bmi > 32.89\n", "| | | | | | |--- class: 1\n", "| | | |--- age > 60.71\n", "| | | | |--- bmi <= 20.51\n", "| | | | | |--- age <= 64.31\n", "| | | | | | |--- class: 0\n", "| | | | | |--- age > 64.31\n", "| | | | | | |--- class: 1\n", "| | | | |--- bmi > 20.51\n", "| | | | | |--- ap_hi <= 115.50\n", "| | | | | | |--- class: 0\n", "| | | | | |--- ap_hi > 115.50\n", "| | | | | | |--- class: 1\n", "| | |--- cholesterol > 2.50\n", "| | | |--- bmi <= 26.03\n", "| | | | |--- age <= 60.89\n", "| | | | | |--- age <= 60.48\n", "| | | | | | |--- class: 1\n", "| | | | | |--- age > 60.48\n", "| | | | | | |--- class: 0\n", "| | | | |--- age > 60.89\n", "| | | | | |--- bmi <= 25.91\n", "| | | | | | |--- class: 1\n", "| | | | | |--- bmi > 25.91\n", "| | | | | | |--- class: 0\n", "| | | |--- bmi > 26.03\n", "| | | | |--- age <= 59.39\n", "| | | | | |--- bmi <= 35.93\n", "| | | | | | |--- class: 1\n", "| | | | | |--- bmi > 35.93\n", "| | | | | | |--- class: 1\n", "| | | | |--- age > 59.39\n", "| | | | | |--- bmi <= 35.12\n", "| | | | | | |--- class: 1\n", "| | | | | |--- bmi > 35.12\n", "| | | | | | |--- class: 1\n", "|--- ap_hi > 129.50\n", "| |--- ap_hi <= 138.50\n", "| | |--- cholesterol <= 2.50\n", "| | | |--- age <= 59.54\n", "| | | | 
|--- bmi <= 21.64\n", "| | | | | |--- bmi <= 17.30\n", "| | | | | | |--- class: 1\n", "| | | | | |--- bmi > 17.30\n", "| | | | | | |--- class: 0\n", "| | | | |--- bmi > 21.64\n", "| | | | | |--- age <= 39.99\n", "| | | | | | |--- class: 0\n", "| | | | | |--- age > 39.99\n", "| | | | | | |--- class: 1\n", "| | | |--- age > 59.54\n", "| | | | |--- age <= 62.46\n", "| | | | | |--- bmi <= 20.61\n", "| | | | | | |--- class: 0\n", "| | | | | |--- bmi > 20.61\n", "| | | | | | |--- class: 1\n", "| | | | |--- age > 62.46\n", "| | | | | |--- age <= 64.00\n", "| | | | | | |--- class: 1\n", "| | | | | |--- age > 64.00\n", "| | | | | | |--- class: 1\n", "| | |--- cholesterol > 2.50\n", "| | | |--- bmi <= 30.74\n", "| | | | |--- bmi <= 30.06\n", "| | | | | |--- bmi <= 23.93\n", "| | | | | | |--- class: 1\n", "| | | | | |--- bmi > 23.93\n", "| | | | | | |--- class: 1\n", "| | | | |--- bmi > 30.06\n", "| | | | | |--- bmi <= 30.69\n", "| | | | | | |--- class: 1\n", "| | | | | |--- bmi > 30.69\n", "| | | | | | |--- class: 0\n", "| | | |--- bmi > 30.74\n", "| | | | |--- bmi <= 32.05\n", "| | | | | |--- age <= 43.63\n", "| | | | | | |--- class: 0\n", "| | | | | |--- age > 43.63\n", "| | | | | | |--- class: 1\n", "| | | | |--- bmi > 32.05\n", "| | | | | |--- bmi <= 32.34\n", "| | | | | | |--- class: 1\n", "| | | | | |--- bmi > 32.34\n", "| | | | | | |--- class: 1\n", "| |--- ap_hi > 138.50\n", "| | |--- ap_hi <= 149.50\n", "| | | |--- age <= 39.56\n", "| | | | |--- bmi <= 38.19\n", "| | | | | |--- age <= 39.54\n", "| | | | | | |--- class: 1\n", "| | | | | |--- age > 39.54\n", "| | | | | | |--- class: 0\n", "| | | | |--- bmi > 38.19\n", "| | | | | |--- bmi <= 50.55\n", "| | | | | | |--- class: 0\n", "| | | | | |--- bmi > 50.55\n", "| | | | | | |--- class: 1\n", "| | | |--- age > 39.56\n", "| | | | |--- age <= 47.57\n", "| | | | | |--- bmi <= 19.23\n", "| | | | | | |--- class: 0\n", "| | | | | |--- bmi > 19.23\n", "| | | | | | |--- class: 1\n", "| | | | |--- age > 47.57\n", "| | | | | 
|--- age <= 61.57\n", "| | | | | | |--- class: 1\n", "| | | | | |--- age > 61.57\n", "| | | | | | |--- class: 1\n", "| | |--- ap_hi > 149.50\n", "| | | |--- bmi <= 20.48\n", "| | | | |--- age <= 64.27\n", "| | | | | |--- age <= 55.82\n", "| | | | | | |--- class: 1\n", "| | | | | |--- age > 55.82\n", "| | | | | | |--- class: 1\n", "| | | | |--- age > 64.27\n", "| | | | | |--- class: 0\n", "| | | |--- bmi > 20.48\n", "| | | | |--- age <= 64.35\n", "| | | | | |--- age <= 49.82\n", "| | | | | | |--- class: 1\n", "| | | | | |--- age > 49.82\n", "| | | | | | |--- class: 1\n", "| | | | |--- age > 64.35\n", "| | | | | |--- bmi <= 36.80\n", "| | | | | | |--- class: 1\n", "| | | | | |--- bmi > 36.80\n", "| | | | | | |--- class: 0\n", "\n" ] } ], "source": [ "# Render the tree's decision rules as indented text, labelling each split with the X_train column names.\n", "rules = tree.export_text(model, feature_names=X_train.columns.values.tolist())\n", "# print() keeps the line breaks readable; a bare `rules` expression would show the escaped repr.\n", "print(rules)" ] }, { "cell_type": "code", "execution_count": 430, "metadata": {}, "outputs": [], "source": [ "import pickle\n", "\n", "# Persist the model to disk. The context manager flushes and closes the file handle\n", "# even if pickle.dump raises, instead of leaving the close to the garbage collector.\n", "# NOTE: only unpickle this file from a trusted location; pickle.load can execute arbitrary code.\n", "with open(\"data-cardio/cardio.model.sav\", \"wb\") as model_file:\n", "    pickle.dump(model, model_file)" ] } ], "metadata": { "kernelspec": { "display_name": ".venv", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.12.9" } }, "nbformat": 4, "nbformat_minor": 2 }