fuzzy-rules-generator/cardio.ipynb

2188 lines
106 KiB
Plaintext
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

{
"cells": [
{
"cell_type": "code",
"execution_count": 403,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"cardio\n",
"0 35021\n",
"1 34979\n",
"Name: count, dtype: int64\n",
"<class 'pandas.core.frame.DataFrame'>\n",
"Index: 70000 entries, 0 to 99999\n",
"Data columns (total 12 columns):\n",
" # Column Non-Null Count Dtype \n",
"--- ------ -------------- ----- \n",
" 0 age 70000 non-null float64\n",
" 1 gender 70000 non-null int64 \n",
" 2 height 70000 non-null int64 \n",
" 3 weight 70000 non-null float64\n",
" 4 ap_hi 70000 non-null int64 \n",
" 5 ap_lo 70000 non-null int64 \n",
" 6 cholesterol 70000 non-null int64 \n",
" 7 gluc 70000 non-null int64 \n",
" 8 smoke 70000 non-null int64 \n",
" 9 alco 70000 non-null int64 \n",
" 10 active 70000 non-null int64 \n",
" 11 cardio 70000 non-null int64 \n",
"dtypes: float64(2), int64(10)\n",
"memory usage: 6.9 MB\n"
]
},
{
"data": {
"text/plain": [
"None"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/plain": [
"cardio\n",
"0 35021\n",
"1 34979\n",
"Name: count, dtype: int64"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>age</th>\n",
" <th>gender</th>\n",
" <th>height</th>\n",
" <th>weight</th>\n",
" <th>ap_hi</th>\n",
" <th>ap_lo</th>\n",
" <th>cholesterol</th>\n",
" <th>gluc</th>\n",
" <th>smoke</th>\n",
" <th>alco</th>\n",
" <th>active</th>\n",
" <th>cardio</th>\n",
" </tr>\n",
" <tr>\n",
" <th>id</th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>50.358668</td>\n",
" <td>2</td>\n",
" <td>168</td>\n",
" <td>62.0</td>\n",
" <td>110</td>\n",
" <td>80</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>55.382762</td>\n",
" <td>1</td>\n",
" <td>156</td>\n",
" <td>85.0</td>\n",
" <td>140</td>\n",
" <td>90</td>\n",
" <td>3</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>51.629066</td>\n",
" <td>1</td>\n",
" <td>165</td>\n",
" <td>64.0</td>\n",
" <td>130</td>\n",
" <td>70</td>\n",
" <td>3</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>48.250465</td>\n",
" <td>2</td>\n",
" <td>169</td>\n",
" <td>82.0</td>\n",
" <td>150</td>\n",
" <td>100</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>47.842515</td>\n",
" <td>1</td>\n",
" <td>156</td>\n",
" <td>56.0</td>\n",
" <td>100</td>\n",
" <td>60</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>99993</th>\n",
" <td>52.677691</td>\n",
" <td>2</td>\n",
" <td>168</td>\n",
" <td>76.0</td>\n",
" <td>120</td>\n",
" <td>80</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>99995</th>\n",
" <td>61.879860</td>\n",
" <td>1</td>\n",
" <td>158</td>\n",
" <td>126.0</td>\n",
" <td>140</td>\n",
" <td>90</td>\n",
" <td>2</td>\n",
" <td>2</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>99996</th>\n",
" <td>52.201292</td>\n",
" <td>2</td>\n",
" <td>183</td>\n",
" <td>105.0</td>\n",
" <td>180</td>\n",
" <td>90</td>\n",
" <td>3</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>99998</th>\n",
" <td>61.414412</td>\n",
" <td>1</td>\n",
" <td>163</td>\n",
" <td>72.0</td>\n",
" <td>135</td>\n",
" <td>80</td>\n",
" <td>1</td>\n",
" <td>2</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>99999</th>\n",
" <td>56.236995</td>\n",
" <td>1</td>\n",
" <td>170</td>\n",
" <td>72.0</td>\n",
" <td>120</td>\n",
" <td>80</td>\n",
" <td>2</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>70000 rows × 12 columns</p>\n",
"</div>"
],
"text/plain": [
" age gender height weight ap_hi ap_lo cholesterol gluc \\\n",
"id \n",
"0 50.358668 2 168 62.0 110 80 1 1 \n",
"1 55.382762 1 156 85.0 140 90 3 1 \n",
"2 51.629066 1 165 64.0 130 70 3 1 \n",
"3 48.250465 2 169 82.0 150 100 1 1 \n",
"4 47.842515 1 156 56.0 100 60 1 1 \n",
"... ... ... ... ... ... ... ... ... \n",
"99993 52.677691 2 168 76.0 120 80 1 1 \n",
"99995 61.879860 1 158 126.0 140 90 2 2 \n",
"99996 52.201292 2 183 105.0 180 90 3 1 \n",
"99998 61.414412 1 163 72.0 135 80 1 2 \n",
"99999 56.236995 1 170 72.0 120 80 2 1 \n",
"\n",
" smoke alco active cardio \n",
"id \n",
"0 0 0 1 0 \n",
"1 0 0 1 1 \n",
"2 0 0 0 1 \n",
"3 0 0 1 1 \n",
"4 0 0 0 0 \n",
"... ... ... ... ... \n",
"99993 1 0 1 0 \n",
"99995 0 0 1 1 \n",
"99996 0 1 0 1 \n",
"99998 0 0 0 1 \n",
"99999 0 0 1 0 \n",
"\n",
"[70000 rows x 12 columns]"
]
},
"execution_count": 403,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"import pandas as pd\n",
"\n",
"# Divisor used to rescale the raw `age` column (presumably days -> years;\n",
"# confirm against the dataset description).\n",
"DAYS_PER_YEAR = 365.24\n",
"\n",
"# Load the semicolon-separated training file, using the `id` column as index.\n",
"df = pd.read_csv(\"data-cardio/cardio_train.csv\", sep=\";\", index_col=\"id\")\n",
"df[\"age\"] = df[\"age\"] / DAYS_PER_YEAR\n",
"\n",
"# DataFrame.info() prints its report itself and returns None, so it is called\n",
"# as a plain statement; passing it to display() rendered a stray `None`.\n",
"df.info()\n",
"# Distribution of the `cardio` target, shown once.\n",
"display(df[\"cardio\"].value_counts())\n",
"\n",
"df"
]
},
{
"cell_type": "code",
"execution_count": 404,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>count</th>\n",
" <th>mean</th>\n",
" <th>std</th>\n",
" <th>min</th>\n",
" <th>25%</th>\n",
" <th>50%</th>\n",
" <th>75%</th>\n",
" <th>max</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>age</th>\n",
" <td>70000.0</td>\n",
" <td>53.304309</td>\n",
" <td>6.755152</td>\n",
" <td>29.564122</td>\n",
" <td>48.36272</td>\n",
" <td>53.945351</td>\n",
" <td>58.391742</td>\n",
" <td>64.924433</td>\n",
" </tr>\n",
" <tr>\n",
" <th>gender</th>\n",
" <td>70000.0</td>\n",
" <td>1.349571</td>\n",
" <td>0.476838</td>\n",
" <td>1.000000</td>\n",
" <td>1.00000</td>\n",
" <td>1.000000</td>\n",
" <td>2.000000</td>\n",
" <td>2.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>height</th>\n",
" <td>70000.0</td>\n",
" <td>164.359229</td>\n",
" <td>8.210126</td>\n",
" <td>55.000000</td>\n",
" <td>159.00000</td>\n",
" <td>165.000000</td>\n",
" <td>170.000000</td>\n",
" <td>250.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>weight</th>\n",
" <td>70000.0</td>\n",
" <td>74.205690</td>\n",
" <td>14.395757</td>\n",
" <td>10.000000</td>\n",
" <td>65.00000</td>\n",
" <td>72.000000</td>\n",
" <td>82.000000</td>\n",
" <td>200.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>ap_hi</th>\n",
" <td>70000.0</td>\n",
" <td>128.817286</td>\n",
" <td>154.011419</td>\n",
" <td>-150.000000</td>\n",
" <td>120.00000</td>\n",
" <td>120.000000</td>\n",
" <td>140.000000</td>\n",
" <td>16020.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>ap_lo</th>\n",
" <td>70000.0</td>\n",
" <td>96.630414</td>\n",
" <td>188.472530</td>\n",
" <td>-70.000000</td>\n",
" <td>80.00000</td>\n",
" <td>80.000000</td>\n",
" <td>90.000000</td>\n",
" <td>11000.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>cholesterol</th>\n",
" <td>70000.0</td>\n",
" <td>1.366871</td>\n",
" <td>0.680250</td>\n",
" <td>1.000000</td>\n",
" <td>1.00000</td>\n",
" <td>1.000000</td>\n",
" <td>2.000000</td>\n",
" <td>3.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>gluc</th>\n",
" <td>70000.0</td>\n",
" <td>1.226457</td>\n",
" <td>0.572270</td>\n",
" <td>1.000000</td>\n",
" <td>1.00000</td>\n",
" <td>1.000000</td>\n",
" <td>1.000000</td>\n",
" <td>3.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>smoke</th>\n",
" <td>70000.0</td>\n",
" <td>0.088129</td>\n",
" <td>0.283484</td>\n",
" <td>0.000000</td>\n",
" <td>0.00000</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>1.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>alco</th>\n",
" <td>70000.0</td>\n",
" <td>0.053771</td>\n",
" <td>0.225568</td>\n",
" <td>0.000000</td>\n",
" <td>0.00000</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>1.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>active</th>\n",
" <td>70000.0</td>\n",
" <td>0.803729</td>\n",
" <td>0.397179</td>\n",
" <td>0.000000</td>\n",
" <td>1.00000</td>\n",
" <td>1.000000</td>\n",
" <td>1.000000</td>\n",
" <td>1.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>cardio</th>\n",
" <td>70000.0</td>\n",
" <td>0.499700</td>\n",
" <td>0.500003</td>\n",
" <td>0.000000</td>\n",
" <td>0.00000</td>\n",
" <td>0.000000</td>\n",
" <td>1.000000</td>\n",
" <td>1.000000</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" count mean std min 25% \\\n",
"age 70000.0 53.304309 6.755152 29.564122 48.36272 \n",
"gender 70000.0 1.349571 0.476838 1.000000 1.00000 \n",
"height 70000.0 164.359229 8.210126 55.000000 159.00000 \n",
"weight 70000.0 74.205690 14.395757 10.000000 65.00000 \n",
"ap_hi 70000.0 128.817286 154.011419 -150.000000 120.00000 \n",
"ap_lo 70000.0 96.630414 188.472530 -70.000000 80.00000 \n",
"cholesterol 70000.0 1.366871 0.680250 1.000000 1.00000 \n",
"gluc 70000.0 1.226457 0.572270 1.000000 1.00000 \n",
"smoke 70000.0 0.088129 0.283484 0.000000 0.00000 \n",
"alco 70000.0 0.053771 0.225568 0.000000 0.00000 \n",
"active 70000.0 0.803729 0.397179 0.000000 1.00000 \n",
"cardio 70000.0 0.499700 0.500003 0.000000 0.00000 \n",
"\n",
" 50% 75% max \n",
"age 53.945351 58.391742 64.924433 \n",
"gender 1.000000 2.000000 2.000000 \n",
"height 165.000000 170.000000 250.000000 \n",
"weight 72.000000 82.000000 200.000000 \n",
"ap_hi 120.000000 140.000000 16020.000000 \n",
"ap_lo 80.000000 90.000000 11000.000000 \n",
"cholesterol 1.000000 2.000000 3.000000 \n",
"gluc 1.000000 1.000000 3.000000 \n",
"smoke 0.000000 0.000000 1.000000 \n",
"alco 0.000000 0.000000 1.000000 \n",
"active 1.000000 1.000000 1.000000 \n",
"cardio 0.000000 1.000000 1.000000 "
]
},
"execution_count": 404,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Summary statistics, transposed so that each column of `df` becomes a row.\n",
"df.describe().T"
]
},
{
"cell_type": "code",
"execution_count": 405,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>count</th>\n",
" <th>mean</th>\n",
" <th>std</th>\n",
" <th>min</th>\n",
" <th>25%</th>\n",
" <th>50%</th>\n",
" <th>75%</th>\n",
" <th>max</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>age</th>\n",
" <td>68985.0</td>\n",
" <td>53.290421</td>\n",
" <td>6.757633</td>\n",
" <td>29.564122</td>\n",
" <td>48.340817</td>\n",
" <td>53.939875</td>\n",
" <td>58.380791</td>\n",
" <td>64.924433</td>\n",
" </tr>\n",
" <tr>\n",
" <th>gender</th>\n",
" <td>68985.0</td>\n",
" <td>1.348670</td>\n",
" <td>0.476553</td>\n",
" <td>1.000000</td>\n",
" <td>1.000000</td>\n",
" <td>1.000000</td>\n",
" <td>2.000000</td>\n",
" <td>2.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>height</th>\n",
" <td>68985.0</td>\n",
" <td>164.359672</td>\n",
" <td>8.204273</td>\n",
" <td>55.000000</td>\n",
" <td>159.000000</td>\n",
" <td>165.000000</td>\n",
" <td>170.000000</td>\n",
" <td>250.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>weight</th>\n",
" <td>68985.0</td>\n",
" <td>74.118961</td>\n",
" <td>14.328938</td>\n",
" <td>11.000000</td>\n",
" <td>65.000000</td>\n",
" <td>72.000000</td>\n",
" <td>82.000000</td>\n",
" <td>200.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>ap_hi</th>\n",
" <td>68985.0</td>\n",
" <td>126.325027</td>\n",
" <td>17.698621</td>\n",
" <td>7.000000</td>\n",
" <td>120.000000</td>\n",
" <td>120.000000</td>\n",
" <td>140.000000</td>\n",
" <td>240.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>ap_lo</th>\n",
" <td>68985.0</td>\n",
" <td>81.350482</td>\n",
" <td>9.805666</td>\n",
" <td>1.000000</td>\n",
" <td>80.000000</td>\n",
" <td>80.000000</td>\n",
" <td>90.000000</td>\n",
" <td>190.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>cholesterol</th>\n",
" <td>68985.0</td>\n",
" <td>1.364384</td>\n",
" <td>0.678691</td>\n",
" <td>1.000000</td>\n",
" <td>1.000000</td>\n",
" <td>1.000000</td>\n",
" <td>1.000000</td>\n",
" <td>3.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>gluc</th>\n",
" <td>68985.0</td>\n",
" <td>1.225875</td>\n",
" <td>0.571822</td>\n",
" <td>1.000000</td>\n",
" <td>1.000000</td>\n",
" <td>1.000000</td>\n",
" <td>1.000000</td>\n",
" <td>3.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>smoke</th>\n",
" <td>68985.0</td>\n",
" <td>0.087860</td>\n",
" <td>0.283093</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>1.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>alco</th>\n",
" <td>68985.0</td>\n",
" <td>0.053591</td>\n",
" <td>0.225211</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>1.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>active</th>\n",
" <td>68985.0</td>\n",
" <td>0.803276</td>\n",
" <td>0.397525</td>\n",
" <td>0.000000</td>\n",
" <td>1.000000</td>\n",
" <td>1.000000</td>\n",
" <td>1.000000</td>\n",
" <td>1.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>cardio</th>\n",
" <td>68985.0</td>\n",
" <td>0.494905</td>\n",
" <td>0.499978</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>1.000000</td>\n",
" <td>1.000000</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" count mean std min 25% \\\n",
"age 68985.0 53.290421 6.757633 29.564122 48.340817 \n",
"gender 68985.0 1.348670 0.476553 1.000000 1.000000 \n",
"height 68985.0 164.359672 8.204273 55.000000 159.000000 \n",
"weight 68985.0 74.118961 14.328938 11.000000 65.000000 \n",
"ap_hi 68985.0 126.325027 17.698621 7.000000 120.000000 \n",
"ap_lo 68985.0 81.350482 9.805666 1.000000 80.000000 \n",
"cholesterol 68985.0 1.364384 0.678691 1.000000 1.000000 \n",
"gluc 68985.0 1.225875 0.571822 1.000000 1.000000 \n",
"smoke 68985.0 0.087860 0.283093 0.000000 0.000000 \n",
"alco 68985.0 0.053591 0.225211 0.000000 0.000000 \n",
"active 68985.0 0.803276 0.397525 0.000000 1.000000 \n",
"cardio 68985.0 0.494905 0.499978 0.000000 0.000000 \n",
"\n",
" 50% 75% max \n",
"age 53.939875 58.380791 64.924433 \n",
"gender 1.000000 2.000000 2.000000 \n",
"height 165.000000 170.000000 250.000000 \n",
"weight 72.000000 82.000000 200.000000 \n",
"ap_hi 120.000000 140.000000 240.000000 \n",
"ap_lo 80.000000 90.000000 190.000000 \n",
"cholesterol 1.000000 1.000000 3.000000 \n",
"gluc 1.000000 1.000000 3.000000 \n",
"smoke 0.000000 0.000000 1.000000 \n",
"alco 0.000000 0.000000 1.000000 \n",
"active 1.000000 1.000000 1.000000 \n",
"cardio 0.000000 1.000000 1.000000 "
]
},
"execution_count": 405,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Keep only rows where both `ap_hi` and `ap_lo` lie strictly between 0 and 370.\n",
"ap_hi_ok = (df[\"ap_hi\"] > 0) & (df[\"ap_hi\"] < 370)\n",
"ap_lo_ok = (df[\"ap_lo\"] > 0) & (df[\"ap_lo\"] < 370)\n",
"df = df[ap_hi_ok & ap_lo_ok]\n",
"# Re-check the summary statistics after filtering.\n",
"df.describe().T"
]
},
{
"cell_type": "code",
"execution_count": 406,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>age</th>\n",
" <th>gender</th>\n",
" <th>ap_hi</th>\n",
" <th>ap_lo</th>\n",
" <th>cholesterol</th>\n",
" <th>gluc</th>\n",
" <th>smoke</th>\n",
" <th>alco</th>\n",
" <th>active</th>\n",
" <th>cardio</th>\n",
" <th>bmi</th>\n",
" </tr>\n",
" <tr>\n",
" <th>id</th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>50.358668</td>\n",
" <td>2</td>\n",
" <td>110</td>\n",
" <td>80</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>21.967120</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>55.382762</td>\n",
" <td>1</td>\n",
" <td>140</td>\n",
" <td>90</td>\n",
" <td>3</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>34.927679</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>51.629066</td>\n",
" <td>1</td>\n",
" <td>130</td>\n",
" <td>70</td>\n",
" <td>3</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>23.507805</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>48.250465</td>\n",
" <td>2</td>\n",
" <td>150</td>\n",
" <td>100</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>28.710479</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>47.842515</td>\n",
" <td>1</td>\n",
" <td>100</td>\n",
" <td>60</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>23.011177</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>99993</th>\n",
" <td>52.677691</td>\n",
" <td>2</td>\n",
" <td>120</td>\n",
" <td>80</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>26.927438</td>\n",
" </tr>\n",
" <tr>\n",
" <th>99995</th>\n",
" <td>61.879860</td>\n",
" <td>1</td>\n",
" <td>140</td>\n",
" <td>90</td>\n",
" <td>2</td>\n",
" <td>2</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>50.472681</td>\n",
" </tr>\n",
" <tr>\n",
" <th>99996</th>\n",
" <td>52.201292</td>\n",
" <td>2</td>\n",
" <td>180</td>\n",
" <td>90</td>\n",
" <td>3</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>31.353579</td>\n",
" </tr>\n",
" <tr>\n",
" <th>99998</th>\n",
" <td>61.414412</td>\n",
" <td>1</td>\n",
" <td>135</td>\n",
" <td>80</td>\n",
" <td>1</td>\n",
" <td>2</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>27.099251</td>\n",
" </tr>\n",
" <tr>\n",
" <th>99999</th>\n",
" <td>56.236995</td>\n",
" <td>1</td>\n",
" <td>120</td>\n",
" <td>80</td>\n",
" <td>2</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>24.913495</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>68985 rows × 11 columns</p>\n",
"</div>"
],
"text/plain": [
" age gender ap_hi ap_lo cholesterol gluc smoke alco \\\n",
"id \n",
"0 50.358668 2 110 80 1 1 0 0 \n",
"1 55.382762 1 140 90 3 1 0 0 \n",
"2 51.629066 1 130 70 3 1 0 0 \n",
"3 48.250465 2 150 100 1 1 0 0 \n",
"4 47.842515 1 100 60 1 1 0 0 \n",
"... ... ... ... ... ... ... ... ... \n",
"99993 52.677691 2 120 80 1 1 1 0 \n",
"99995 61.879860 1 140 90 2 2 0 0 \n",
"99996 52.201292 2 180 90 3 1 0 1 \n",
"99998 61.414412 1 135 80 1 2 0 0 \n",
"99999 56.236995 1 120 80 2 1 0 0 \n",
"\n",
" active cardio bmi \n",
"id \n",
"0 1 0 21.967120 \n",
"1 1 1 34.927679 \n",
"2 0 1 23.507805 \n",
"3 1 1 28.710479 \n",
"4 0 0 23.011177 \n",
"... ... ... ... \n",
"99993 1 0 26.927438 \n",
"99995 1 1 50.472681 \n",
"99996 0 1 31.353579 \n",
"99998 0 1 27.099251 \n",
"99999 1 0 24.913495 \n",
"\n",
"[68985 rows x 11 columns]"
]
},
"execution_count": 406,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Replace `weight` and `height` with a derived `bmi` column, computed as\n",
"# weight / (height / 100) ** 2 (the /100 presumably converts cm to m; confirm).\n",
"# assign()/drop() build a new frame instead of mutating the filtered one in\n",
"# place, and the guard makes the cell safe to re-run: once the source columns\n",
"# are gone the frame is left untouched instead of raising KeyError.\n",
"if {\"weight\", \"height\"}.issubset(df.columns):\n",
"    df = df.assign(bmi=df[\"weight\"] / (df[\"height\"] / 100) ** 2).drop(\n",
"        columns=[\"weight\", \"height\"]\n",
"    )\n",
"df"
]
},
{
"cell_type": "code",
"execution_count": 407,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"<Axes: >"
]
},
"execution_count": 407,
"metadata": {},
"output_type": "execute_result"
},
{
"data": {
"image/png": "",
"text/plain": [
"<Figure size 640x480 with 2 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"import seaborn as sns\n",
"\n",
"# Heatmap of the pairwise correlations between all columns of `df`.\n",
"ax = sns.heatmap(df.corr())\n",
"# Title the figure; the trailing semicolon keeps the return value's repr\n",
"# out of the cell output so only the figure is shown.\n",
"ax.set_title(\"Feature correlation matrix\");"
]
},
{
"cell_type": "code",
"execution_count": 411,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Optimization terminated successfully.\n",
" Current function value: 0.567793\n",
" Iterations 6\n"
]
},
{
"data": {
"text/html": [
"<table class=\"simpletable\">\n",
"<tr>\n",
" <td>Model:</td> <td>Logit</td> <td>Method:</td> <td>MLE</td> \n",
"</tr>\n",
"<tr>\n",
" <td>Dependent Variable:</td> <td>cardio</td> <td>Pseudo R-squared:</td> <td>0.181</td> \n",
"</tr>\n",
"<tr>\n",
" <td>Date:</td> <td>2025-02-21 23:59</td> <td>AIC:</td> <td>66609.4135</td>\n",
"</tr>\n",
"<tr>\n",
" <td>No. Observations:</td> <td>58637</td> <td>BIC:</td> <td>66708.1838</td>\n",
"</tr>\n",
"<tr>\n",
" <td>Df Model:</td> <td>10</td> <td>Log-Likelihood:</td> <td>-33294.</td> \n",
"</tr>\n",
"<tr>\n",
" <td>Df Residuals:</td> <td>58626</td> <td>LL-Null:</td> <td>-40641.</td> \n",
"</tr>\n",
"<tr>\n",
" <td>Converged:</td> <td>1.0000</td> <td>LLR p-value:</td> <td>0.0000</td> \n",
"</tr>\n",
"<tr>\n",
" <td>No. Iterations:</td> <td>6.0000</td> <td>Scale:</td> <td>1.0000</td> \n",
"</tr>\n",
"</table>\n",
"<table class=\"simpletable\">\n",
"<tr>\n",
" <td></td> <th>Coef.</th> <th>Std.Err.</th> <th>z</th> <th>P>|z|</th> <th>[0.025</th> <th>0.975]</th> \n",
"</tr>\n",
"<tr>\n",
" <th>const</th> <td>-11.5426</td> <td>0.1324</td> <td>-87.1794</td> <td>0.0000</td> <td>-11.8021</td> <td>-11.2831</td>\n",
"</tr>\n",
"<tr>\n",
" <th>age</th> <td>0.0523</td> <td>0.0014</td> <td>36.0898</td> <td>0.0000</td> <td>0.0494</td> <td>0.0551</td> \n",
"</tr>\n",
"<tr>\n",
" <th>gender</th> <td>0.0206</td> <td>0.0211</td> <td>0.9792</td> <td>0.3275</td> <td>-0.0207</td> <td>0.0619</td> \n",
"</tr>\n",
"<tr>\n",
" <th>ap_hi</th> <td>0.0469</td> <td>0.0009</td> <td>52.1945</td> <td>0.0000</td> <td>0.0451</td> <td>0.0486</td> \n",
"</tr>\n",
"<tr>\n",
" <th>ap_lo</th> <td>0.0232</td> <td>0.0014</td> <td>16.3321</td> <td>0.0000</td> <td>0.0204</td> <td>0.0259</td> \n",
"</tr>\n",
"<tr>\n",
" <th>cholesterol</th> <td>0.4859</td> <td>0.0168</td> <td>29.0062</td> <td>0.0000</td> <td>0.4531</td> <td>0.5187</td> \n",
"</tr>\n",
"<tr>\n",
" <th>gluc</th> <td>-0.1116</td> <td>0.0190</td> <td>-5.8803</td> <td>0.0000</td> <td>-0.1488</td> <td>-0.0744</td>\n",
"</tr>\n",
"<tr>\n",
" <th>smoke</th> <td>-0.1279</td> <td>0.0373</td> <td>-3.4313</td> <td>0.0006</td> <td>-0.2009</td> <td>-0.0548</td>\n",
"</tr>\n",
"<tr>\n",
" <th>alco</th> <td>-0.1782</td> <td>0.0452</td> <td>-3.9429</td> <td>0.0001</td> <td>-0.2668</td> <td>-0.0896</td>\n",
"</tr>\n",
"<tr>\n",
" <th>active</th> <td>-0.2377</td> <td>0.0235</td> <td>-10.1027</td> <td>0.0000</td> <td>-0.2838</td> <td>-0.1916</td>\n",
"</tr>\n",
"<tr>\n",
" <th>bmi</th> <td>0.0226</td> <td>0.0018</td> <td>12.4231</td> <td>0.0000</td> <td>0.0190</td> <td>0.0261</td> \n",
"</tr>\n",
"</table><br/>\n"
],
"text/latex": [
"\\begin{table}\n",
"\\caption{Results: Logit}\n",
"\\label{}\n",
"\\begin{center}\n",
"\\begin{tabular}{llll}\n",
"\\hline\n",
"Model: & Logit & Method: & MLE \\\\\n",
"Dependent Variable: & cardio & Pseudo R-squared: & 0.181 \\\\\n",
"Date: & 2025-02-21 23:59 & AIC: & 66609.4135 \\\\\n",
"No. Observations: & 58637 & BIC: & 66708.1838 \\\\\n",
"Df Model: & 10 & Log-Likelihood: & -33294. \\\\\n",
"Df Residuals: & 58626 & LL-Null: & -40641. \\\\\n",
"Converged: & 1.0000 & LLR p-value: & 0.0000 \\\\\n",
"No. Iterations: & 6.0000 & Scale: & 1.0000 \\\\\n",
"\\hline\n",
"\\end{tabular}\n",
"\\end{center}\n",
"\n",
"\\begin{center}\n",
"\\begin{tabular}{lrrrrrr}\n",
"\\hline\n",
" & Coef. & Std.Err. & z & P$> |$z$|$ & [0.025 & 0.975] \\\\\n",
"\\hline\n",
"const & -11.5426 & 0.1324 & -87.1794 & 0.0000 & -11.8021 & -11.2831 \\\\\n",
"age & 0.0523 & 0.0014 & 36.0898 & 0.0000 & 0.0494 & 0.0551 \\\\\n",
"gender & 0.0206 & 0.0211 & 0.9792 & 0.3275 & -0.0207 & 0.0619 \\\\\n",
"ap\\_hi & 0.0469 & 0.0009 & 52.1945 & 0.0000 & 0.0451 & 0.0486 \\\\\n",
"ap\\_lo & 0.0232 & 0.0014 & 16.3321 & 0.0000 & 0.0204 & 0.0259 \\\\\n",
"cholesterol & 0.4859 & 0.0168 & 29.0062 & 0.0000 & 0.4531 & 0.5187 \\\\\n",
"gluc & -0.1116 & 0.0190 & -5.8803 & 0.0000 & -0.1488 & -0.0744 \\\\\n",
"smoke & -0.1279 & 0.0373 & -3.4313 & 0.0006 & -0.2009 & -0.0548 \\\\\n",
"alco & -0.1782 & 0.0452 & -3.9429 & 0.0001 & -0.2668 & -0.0896 \\\\\n",
"active & -0.2377 & 0.0235 & -10.1027 & 0.0000 & -0.2838 & -0.1916 \\\\\n",
"bmi & 0.0226 & 0.0018 & 12.4231 & 0.0000 & 0.0190 & 0.0261 \\\\\n",
"\\hline\n",
"\\end{tabular}\n",
"\\end{center}\n",
"\\end{table}\n",
"\\bigskip\n"
],
"text/plain": [
"<class 'statsmodels.iolib.summary2.Summary'>\n",
"\"\"\"\n",
" Results: Logit\n",
"=================================================================\n",
"Model: Logit Method: MLE \n",
"Dependent Variable: cardio Pseudo R-squared: 0.181 \n",
"Date: 2025-02-21 23:59 AIC: 66609.4135\n",
"No. Observations: 58637 BIC: 66708.1838\n",
"Df Model: 10 Log-Likelihood: -33294. \n",
"Df Residuals: 58626 LL-Null: -40641. \n",
"Converged: 1.0000 LLR p-value: 0.0000 \n",
"No. Iterations: 6.0000 Scale: 1.0000 \n",
"-----------------------------------------------------------------\n",
" Coef. Std.Err. z P>|z| [0.025 0.975] \n",
"-----------------------------------------------------------------\n",
"const -11.5426 0.1324 -87.1794 0.0000 -11.8021 -11.2831\n",
"age 0.0523 0.0014 36.0898 0.0000 0.0494 0.0551\n",
"gender 0.0206 0.0211 0.9792 0.3275 -0.0207 0.0619\n",
"ap_hi 0.0469 0.0009 52.1945 0.0000 0.0451 0.0486\n",
"ap_lo 0.0232 0.0014 16.3321 0.0000 0.0204 0.0259\n",
"cholesterol 0.4859 0.0168 29.0062 0.0000 0.4531 0.5187\n",
"gluc -0.1116 0.0190 -5.8803 0.0000 -0.1488 -0.0744\n",
"smoke -0.1279 0.0373 -3.4313 0.0006 -0.2009 -0.0548\n",
"alco -0.1782 0.0452 -3.9429 0.0001 -0.2668 -0.0896\n",
"active -0.2377 0.0235 -10.1027 0.0000 -0.2838 -0.1916\n",
"bmi 0.0226 0.0018 12.4231 0.0000 0.0190 0.0261\n",
"=================================================================\n",
"\n",
"\"\"\""
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/plain": [
"cholesterol 1.625611\n",
"age 1.053639\n",
"ap_hi 1.047999\n",
"ap_lo 1.023425\n",
"bmi 1.022841\n",
"gender 1.020836\n",
"gluc 0.894398\n",
"smoke 0.879970\n",
"alco 0.836750\n",
"active 0.788470\n",
"const 0.000010\n",
"dtype: float64"
]
},
"execution_count": 411,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"from sklearn.model_selection import train_test_split\n",
"import statsmodels.api as sm\n",
"import numpy as np\n",
"\n",
"# Seed for the train/test split. It is passed by name below so that editing\n",
"# it here really changes the split (the literal 9 used to be duplicated).\n",
"random_state = 9\n",
"\n",
"# Target (`cardio`) and predictors (every other column) for the logit model.\n",
"stat_y = df[\"cardio\"]\n",
"stat_X = df.drop([\"cardio\"], axis=1).copy()\n",
"# Hold out 15% of the rows as the test part.\n",
"stat_X_train, stat_X_test, stat_y_train, stat_y_test = train_test_split(\n",
"    stat_X, stat_y, test_size=0.15, random_state=random_state\n",
")\n",
"\n",
"# add_constant adds the `const` (intercept) column to the design matrix.\n",
"log_model = sm.Logit(stat_y_train, sm.add_constant(stat_X_train))\n",
"log_result = log_model.fit()\n",
"display(log_result.summary2())\n",
"# Exponentiated coefficients, largest first.\n",
"np.exp(log_result.params).sort_values(ascending=False)"
]
},
{
"cell_type": "code",
"execution_count": 421,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>age</th>\n",
" <th>ap_hi</th>\n",
" <th>cholesterol</th>\n",
" <th>cardio</th>\n",
" <th>bmi</th>\n",
" </tr>\n",
" <tr>\n",
" <th>id</th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>50.358668</td>\n",
" <td>110</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>21.967120</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>55.382762</td>\n",
" <td>140</td>\n",
" <td>3</td>\n",
" <td>1</td>\n",
" <td>34.927679</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>51.629066</td>\n",
" <td>130</td>\n",
" <td>3</td>\n",
" <td>1</td>\n",
" <td>23.507805</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>48.250465</td>\n",
" <td>150</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>28.710479</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>47.842515</td>\n",
" <td>100</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>23.011177</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>99993</th>\n",
" <td>52.677691</td>\n",
" <td>120</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>26.927438</td>\n",
" </tr>\n",
" <tr>\n",
" <th>99995</th>\n",
" <td>61.879860</td>\n",
" <td>140</td>\n",
" <td>2</td>\n",
" <td>1</td>\n",
" <td>50.472681</td>\n",
" </tr>\n",
" <tr>\n",
" <th>99996</th>\n",
" <td>52.201292</td>\n",
" <td>180</td>\n",
" <td>3</td>\n",
" <td>1</td>\n",
" <td>31.353579</td>\n",
" </tr>\n",
" <tr>\n",
" <th>99998</th>\n",
" <td>61.414412</td>\n",
" <td>135</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>27.099251</td>\n",
" </tr>\n",
" <tr>\n",
" <th>99999</th>\n",
" <td>56.236995</td>\n",
" <td>120</td>\n",
" <td>2</td>\n",
" <td>0</td>\n",
" <td>24.913495</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>68985 rows × 5 columns</p>\n",
"</div>"
],
"text/plain": [
" age ap_hi cholesterol cardio bmi\n",
"id \n",
"0 50.358668 110 1 0 21.967120\n",
"1 55.382762 140 3 1 34.927679\n",
"2 51.629066 130 3 1 23.507805\n",
"3 48.250465 150 1 1 28.710479\n",
"4 47.842515 100 1 0 23.011177\n",
"... ... ... ... ... ...\n",
"99993 52.677691 120 1 0 26.927438\n",
"99995 61.879860 140 2 1 50.472681\n",
"99996 52.201292 180 3 1 31.353579\n",
"99998 61.414412 135 1 1 27.099251\n",
"99999 56.236995 120 2 0 24.913495\n",
"\n",
"[68985 rows x 5 columns]"
]
},
"execution_count": 421,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Columns left out of the reduced dataset; what remains is\n",
"# age, ap_hi, cholesterol, cardio and bmi.\n",
"dropped_features = [\"gluc\", \"smoke\", \"alco\", \"ap_lo\", \"gender\", \"active\"]\n",
"data = df.drop(columns=dropped_features)\n",
"data"
]
},
{
"cell_type": "code",
"execution_count": 431,
"metadata": {},
"outputs": [],
"source": [
"data.to_csv(\"data-cardio/cardio_clear.csv\")"
]
},
{
"cell_type": "code",
"execution_count": 422,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>age</th>\n",
" <th>ap_hi</th>\n",
" <th>cholesterol</th>\n",
" <th>bmi</th>\n",
" </tr>\n",
" <tr>\n",
" <th>id</th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>94960</th>\n",
" <td>62.014018</td>\n",
" <td>120</td>\n",
" <td>1</td>\n",
" <td>26.892323</td>\n",
" </tr>\n",
" <tr>\n",
" <th>30807</th>\n",
" <td>57.745592</td>\n",
" <td>120</td>\n",
" <td>1</td>\n",
" <td>28.393726</td>\n",
" </tr>\n",
" <tr>\n",
" <th>26485</th>\n",
" <td>59.670354</td>\n",
" <td>120</td>\n",
" <td>3</td>\n",
" <td>23.875115</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3868</th>\n",
" <td>49.715256</td>\n",
" <td>110</td>\n",
" <td>1</td>\n",
" <td>20.820940</td>\n",
" </tr>\n",
" <tr>\n",
" <th>45890</th>\n",
" <td>59.785347</td>\n",
" <td>160</td>\n",
" <td>1</td>\n",
" <td>23.529412</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>61975</th>\n",
" <td>62.558865</td>\n",
" <td>120</td>\n",
" <td>1</td>\n",
" <td>28.196921</td>\n",
" </tr>\n",
" <tr>\n",
" <th>32741</th>\n",
" <td>57.882488</td>\n",
" <td>120</td>\n",
" <td>1</td>\n",
" <td>29.043709</td>\n",
" </tr>\n",
" <tr>\n",
" <th>94833</th>\n",
" <td>51.371701</td>\n",
" <td>120</td>\n",
" <td>1</td>\n",
" <td>29.242109</td>\n",
" </tr>\n",
" <tr>\n",
" <th>95660</th>\n",
" <td>45.767167</td>\n",
" <td>120</td>\n",
" <td>1</td>\n",
" <td>24.977043</td>\n",
" </tr>\n",
" <tr>\n",
" <th>81002</th>\n",
" <td>55.544300</td>\n",
" <td>150</td>\n",
" <td>1</td>\n",
" <td>27.053803</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>55188 rows × 4 columns</p>\n",
"</div>"
],
"text/plain": [
" age ap_hi cholesterol bmi\n",
"id \n",
"94960 62.014018 120 1 26.892323\n",
"30807 57.745592 120 1 28.393726\n",
"26485 59.670354 120 3 23.875115\n",
"3868 49.715256 110 1 20.820940\n",
"45890 59.785347 160 1 23.529412\n",
"... ... ... ... ...\n",
"61975 62.558865 120 1 28.196921\n",
"32741 57.882488 120 1 29.043709\n",
"94833 51.371701 120 1 29.242109\n",
"95660 45.767167 120 1 24.977043\n",
"81002 55.544300 150 1 27.053803\n",
"\n",
"[55188 rows x 4 columns]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/plain": [
"id\n",
"94960 0\n",
"30807 0\n",
"26485 0\n",
"3868 1\n",
"45890 1\n",
" ..\n",
"61975 1\n",
"32741 0\n",
"94833 0\n",
"95660 0\n",
"81002 1\n",
"Name: cardio, Length: 55188, dtype: int64"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>age</th>\n",
" <th>ap_hi</th>\n",
" <th>cholesterol</th>\n",
" <th>bmi</th>\n",
" </tr>\n",
" <tr>\n",
" <th>id</th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>42270</th>\n",
" <td>60.078305</td>\n",
" <td>140</td>\n",
" <td>1</td>\n",
" <td>45.918367</td>\n",
" </tr>\n",
" <tr>\n",
" <th>10780</th>\n",
" <td>55.360859</td>\n",
" <td>120</td>\n",
" <td>2</td>\n",
" <td>24.998904</td>\n",
" </tr>\n",
" <tr>\n",
" <th>42436</th>\n",
" <td>48.198445</td>\n",
" <td>100</td>\n",
" <td>3</td>\n",
" <td>21.926126</td>\n",
" </tr>\n",
" <tr>\n",
" <th>88647</th>\n",
" <td>41.517906</td>\n",
" <td>130</td>\n",
" <td>2</td>\n",
" <td>27.764650</td>\n",
" </tr>\n",
" <tr>\n",
" <th>62336</th>\n",
" <td>51.692038</td>\n",
" <td>110</td>\n",
" <td>1</td>\n",
" <td>22.230987</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>30330</th>\n",
" <td>47.697404</td>\n",
" <td>100</td>\n",
" <td>1</td>\n",
" <td>22.724403</td>\n",
" </tr>\n",
" <tr>\n",
" <th>62907</th>\n",
" <td>58.597087</td>\n",
" <td>120</td>\n",
" <td>1</td>\n",
" <td>23.828125</td>\n",
" </tr>\n",
" <tr>\n",
" <th>98612</th>\n",
" <td>51.404556</td>\n",
" <td>110</td>\n",
" <td>1</td>\n",
" <td>22.589551</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5767</th>\n",
" <td>62.033184</td>\n",
" <td>120</td>\n",
" <td>1</td>\n",
" <td>23.875115</td>\n",
" </tr>\n",
" <tr>\n",
" <th>14769</th>\n",
" <td>41.506954</td>\n",
" <td>120</td>\n",
" <td>2</td>\n",
" <td>22.948116</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>13797 rows × 4 columns</p>\n",
"</div>"
],
"text/plain": [
" age ap_hi cholesterol bmi\n",
"id \n",
"42270 60.078305 140 1 45.918367\n",
"10780 55.360859 120 2 24.998904\n",
"42436 48.198445 100 3 21.926126\n",
"88647 41.517906 130 2 27.764650\n",
"62336 51.692038 110 1 22.230987\n",
"... ... ... ... ...\n",
"30330 47.697404 100 1 22.724403\n",
"62907 58.597087 120 1 23.828125\n",
"98612 51.404556 110 1 22.589551\n",
"5767 62.033184 120 1 23.875115\n",
"14769 41.506954 120 2 22.948116\n",
"\n",
"[13797 rows x 4 columns]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/plain": [
"id\n",
"42270 1\n",
"10780 0\n",
"42436 1\n",
"88647 1\n",
"62336 0\n",
" ..\n",
"30330 1\n",
"62907 0\n",
"98612 0\n",
"5767 0\n",
"14769 1\n",
"Name: cardio, Length: 13797, dtype: int64"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"y = data[\"cardio\"]\n",
"X = data.drop([\"cardio\"], axis=1).copy()\n",
"X_train, X_test, y_train, y_test = train_test_split(\n",
" X, y, test_size=0.2, random_state=random_state\n",
")\n",
"display(X_train, y_train, X_test, y_test)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/Users/user/Projects/python/fuzzy-rules-generator/.venv/lib/python3.12/site-packages/sklearn/base.py:486: UserWarning: X has feature names, but DecisionTreeClassifier was fitted without feature names\n",
" warnings.warn(\n",
"/Users/user/Projects/python/fuzzy-rules-generator/.venv/lib/python3.12/site-packages/sklearn/base.py:486: UserWarning: X has feature names, but DecisionTreeClassifier was fitted without feature names\n",
" warnings.warn(\n",
"/Users/user/Projects/python/fuzzy-rules-generator/.venv/lib/python3.12/site-packages/sklearn/base.py:486: UserWarning: X has feature names, but DecisionTreeClassifier was fitted without feature names\n",
" warnings.warn(\n"
]
},
{
"data": {
"text/plain": [
"'Precision_test'"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/plain": [
"np.float64(0.7368262116865468)"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/plain": [
"'Recall_test'"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/plain": [
"np.float64(0.7180694526191878)"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/plain": [
"'Accuracy_test'"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/plain": [
"0.7347974197289265"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/plain": [
"'ROC_AUC_test'"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/plain": [
"np.float64(0.7917285464726767)"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/plain": [
"'F1_test'"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/plain": [
"np.float64(0.7273269245100231)"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/plain": [
"'MCC_test'"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/plain": [
"np.float64(0.46942772902650703)"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/plain": [
"'Cohen_kappa_test'"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/plain": [
"np.float64(0.4692799184358021)"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/plain": [
"'Confusion_matrix'"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/plain": [
"array([[5258, 1743],\n",
" [1916, 4880]])"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"from src.utils import run_classification\n",
"from sklearn import tree\n",
"from sklearn import metrics\n",
"import numpy as np\n",
"\n",
"\n",
"model = tree.DecisionTreeClassifier(\n",
" random_state=random_state,\n",
" max_depth=6,\n",
").fit(X_train.values, y_train.values.ravel())\n",
"\n",
"\n",
"y_train_predict = model.predict(X_train)\n",
"y_test_probs = model.predict_proba(X_test)\n",
"y_test_predict = model.predict(X_test)\n",
"\n",
"\n",
"display(\"Precision_test\", metrics.precision_score(y_test, y_test_predict))\n",
"display(\"Recall_test\", metrics.recall_score(y_test, y_test_predict))\n",
"display(\"Accuracy_test\", metrics.accuracy_score(y_test, y_test_predict))\n",
"display(\"ROC_AUC_test\", metrics.roc_auc_score(y_test, y_test_probs[:, 1])) # type: ignore\n",
"display(\"F1_test\", metrics.f1_score(y_test, y_test_predict))\n",
"display(\"MCC_test\", metrics.matthews_corrcoef(y_test, y_test_predict))\n",
"display(\"Cohen_kappa_test\", metrics.cohen_kappa_score(y_test, y_test_predict))\n",
"display(\"Confusion_matrix\", metrics.confusion_matrix(y_test, y_test_predict))"
]
},
{
"cell_type": "code",
"execution_count": 429,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"|--- ap_hi <= 129.50\n",
"| |--- age <= 54.65\n",
"| | |--- cholesterol <= 2.50\n",
"| | | |--- age <= 43.79\n",
"| | | | |--- cholesterol <= 1.50\n",
"| | | | | |--- ap_hi <= 114.50\n",
"| | | | | | |--- class: 0\n",
"| | | | | |--- ap_hi > 114.50\n",
"| | | | | | |--- class: 0\n",
"| | | | |--- cholesterol > 1.50\n",
"| | | | | |--- bmi <= 28.87\n",
"| | | | | | |--- class: 0\n",
"| | | | | |--- bmi > 28.87\n",
"| | | | | | |--- class: 0\n",
"| | | |--- age > 43.79\n",
"| | | | |--- ap_hi <= 119.50\n",
"| | | | | |--- bmi <= 22.05\n",
"| | | | | | |--- class: 0\n",
"| | | | | |--- bmi > 22.05\n",
"| | | | | | |--- class: 0\n",
"| | | | |--- ap_hi > 119.50\n",
"| | | | | |--- bmi <= 27.71\n",
"| | | | | | |--- class: 0\n",
"| | | | | |--- bmi > 27.71\n",
"| | | | | | |--- class: 0\n",
"| | |--- cholesterol > 2.50\n",
"| | | |--- bmi <= 29.04\n",
"| | | | |--- age <= 41.60\n",
"| | | | | |--- ap_hi <= 115.00\n",
"| | | | | | |--- class: 0\n",
"| | | | | |--- ap_hi > 115.00\n",
"| | | | | | |--- class: 0\n",
"| | | | |--- age > 41.60\n",
"| | | | | |--- age <= 54.17\n",
"| | | | | | |--- class: 1\n",
"| | | | | |--- age > 54.17\n",
"| | | | | | |--- class: 0\n",
"| | | |--- bmi > 29.04\n",
"| | | | |--- age <= 54.01\n",
"| | | | | |--- age <= 39.75\n",
"| | | | | | |--- class: 0\n",
"| | | | | |--- age > 39.75\n",
"| | | | | | |--- class: 1\n",
"| | | | |--- age > 54.01\n",
"| | | | | |--- bmi <= 35.02\n",
"| | | | | | |--- class: 0\n",
"| | | | | |--- bmi > 35.02\n",
"| | | | | | |--- class: 1\n",
"| |--- age > 54.65\n",
"| | |--- cholesterol <= 2.50\n",
"| | | |--- age <= 60.71\n",
"| | | | |--- ap_hi <= 118.50\n",
"| | | | | |--- bmi <= 23.33\n",
"| | | | | | |--- class: 0\n",
"| | | | | |--- bmi > 23.33\n",
"| | | | | | |--- class: 0\n",
"| | | | |--- ap_hi > 118.50\n",
"| | | | | |--- bmi <= 32.89\n",
"| | | | | | |--- class: 0\n",
"| | | | | |--- bmi > 32.89\n",
"| | | | | | |--- class: 1\n",
"| | | |--- age > 60.71\n",
"| | | | |--- bmi <= 20.51\n",
"| | | | | |--- age <= 64.31\n",
"| | | | | | |--- class: 0\n",
"| | | | | |--- age > 64.31\n",
"| | | | | | |--- class: 1\n",
"| | | | |--- bmi > 20.51\n",
"| | | | | |--- ap_hi <= 115.50\n",
"| | | | | | |--- class: 0\n",
"| | | | | |--- ap_hi > 115.50\n",
"| | | | | | |--- class: 1\n",
"| | |--- cholesterol > 2.50\n",
"| | | |--- bmi <= 26.03\n",
"| | | | |--- age <= 60.89\n",
"| | | | | |--- age <= 60.48\n",
"| | | | | | |--- class: 1\n",
"| | | | | |--- age > 60.48\n",
"| | | | | | |--- class: 0\n",
"| | | | |--- age > 60.89\n",
"| | | | | |--- bmi <= 25.91\n",
"| | | | | | |--- class: 1\n",
"| | | | | |--- bmi > 25.91\n",
"| | | | | | |--- class: 0\n",
"| | | |--- bmi > 26.03\n",
"| | | | |--- age <= 59.39\n",
"| | | | | |--- bmi <= 35.93\n",
"| | | | | | |--- class: 1\n",
"| | | | | |--- bmi > 35.93\n",
"| | | | | | |--- class: 1\n",
"| | | | |--- age > 59.39\n",
"| | | | | |--- bmi <= 35.12\n",
"| | | | | | |--- class: 1\n",
"| | | | | |--- bmi > 35.12\n",
"| | | | | | |--- class: 1\n",
"|--- ap_hi > 129.50\n",
"| |--- ap_hi <= 138.50\n",
"| | |--- cholesterol <= 2.50\n",
"| | | |--- age <= 59.54\n",
"| | | | |--- bmi <= 21.64\n",
"| | | | | |--- bmi <= 17.30\n",
"| | | | | | |--- class: 1\n",
"| | | | | |--- bmi > 17.30\n",
"| | | | | | |--- class: 0\n",
"| | | | |--- bmi > 21.64\n",
"| | | | | |--- age <= 39.99\n",
"| | | | | | |--- class: 0\n",
"| | | | | |--- age > 39.99\n",
"| | | | | | |--- class: 1\n",
"| | | |--- age > 59.54\n",
"| | | | |--- age <= 62.46\n",
"| | | | | |--- bmi <= 20.61\n",
"| | | | | | |--- class: 0\n",
"| | | | | |--- bmi > 20.61\n",
"| | | | | | |--- class: 1\n",
"| | | | |--- age > 62.46\n",
"| | | | | |--- age <= 64.00\n",
"| | | | | | |--- class: 1\n",
"| | | | | |--- age > 64.00\n",
"| | | | | | |--- class: 1\n",
"| | |--- cholesterol > 2.50\n",
"| | | |--- bmi <= 30.74\n",
"| | | | |--- bmi <= 30.06\n",
"| | | | | |--- bmi <= 23.93\n",
"| | | | | | |--- class: 1\n",
"| | | | | |--- bmi > 23.93\n",
"| | | | | | |--- class: 1\n",
"| | | | |--- bmi > 30.06\n",
"| | | | | |--- bmi <= 30.69\n",
"| | | | | | |--- class: 1\n",
"| | | | | |--- bmi > 30.69\n",
"| | | | | | |--- class: 0\n",
"| | | |--- bmi > 30.74\n",
"| | | | |--- bmi <= 32.05\n",
"| | | | | |--- age <= 43.63\n",
"| | | | | | |--- class: 0\n",
"| | | | | |--- age > 43.63\n",
"| | | | | | |--- class: 1\n",
"| | | | |--- bmi > 32.05\n",
"| | | | | |--- bmi <= 32.34\n",
"| | | | | | |--- class: 1\n",
"| | | | | |--- bmi > 32.34\n",
"| | | | | | |--- class: 1\n",
"| |--- ap_hi > 138.50\n",
"| | |--- ap_hi <= 149.50\n",
"| | | |--- age <= 39.56\n",
"| | | | |--- bmi <= 38.19\n",
"| | | | | |--- age <= 39.54\n",
"| | | | | | |--- class: 1\n",
"| | | | | |--- age > 39.54\n",
"| | | | | | |--- class: 0\n",
"| | | | |--- bmi > 38.19\n",
"| | | | | |--- bmi <= 50.55\n",
"| | | | | | |--- class: 0\n",
"| | | | | |--- bmi > 50.55\n",
"| | | | | | |--- class: 1\n",
"| | | |--- age > 39.56\n",
"| | | | |--- age <= 47.57\n",
"| | | | | |--- bmi <= 19.23\n",
"| | | | | | |--- class: 0\n",
"| | | | | |--- bmi > 19.23\n",
"| | | | | | |--- class: 1\n",
"| | | | |--- age > 47.57\n",
"| | | | | |--- age <= 61.57\n",
"| | | | | | |--- class: 1\n",
"| | | | | |--- age > 61.57\n",
"| | | | | | |--- class: 1\n",
"| | |--- ap_hi > 149.50\n",
"| | | |--- bmi <= 20.48\n",
"| | | | |--- age <= 64.27\n",
"| | | | | |--- age <= 55.82\n",
"| | | | | | |--- class: 1\n",
"| | | | | |--- age > 55.82\n",
"| | | | | | |--- class: 1\n",
"| | | | |--- age > 64.27\n",
"| | | | | |--- class: 0\n",
"| | | |--- bmi > 20.48\n",
"| | | | |--- age <= 64.35\n",
"| | | | | |--- age <= 49.82\n",
"| | | | | | |--- class: 1\n",
"| | | | | |--- age > 49.82\n",
"| | | | | | |--- class: 1\n",
"| | | | |--- age > 64.35\n",
"| | | | | |--- bmi <= 36.80\n",
"| | | | | | |--- class: 1\n",
"| | | | | |--- bmi > 36.80\n",
"| | | | | | |--- class: 0\n",
"\n"
]
}
],
"source": [
"rules = tree.export_text(model, feature_names=X_train.columns.values.tolist())\n",
"print(rules)"
]
},
{
"cell_type": "code",
"execution_count": 430,
"metadata": {},
"outputs": [],
"source": [
"import pickle\n",
"\n",
"pickle.dump(model, open(\"data-cardio/cardio.model.sav\", \"wb\"))"
]
}
],
"metadata": {
"kernelspec": {
"display_name": ".venv",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.9"
}
},
"nbformat": 4,
"nbformat_minor": 2
}