106 KiB
106 KiB
In [403]:
import pandas as pd

# Load the cardio training set: the file is semicolon-separated and the "id"
# column becomes the row index.
df = pd.read_csv("data-cardio/cardio_train.csv", sep=";", index_col="id")
# Rescale "age" by the length of a year
# (presumably the raw values are in days — TODO confirm against the data dictionary).
df["age"] = df["age"] / 365.24
# DataFrame.info() prints its report and returns None, so it is called on its
# own instead of being passed to display() (which would render a stray "None").
df.info()
# Class balance of the target column, shown once.
display(df.cardio.value_counts())
df
Out[403]:
In [404]:
# Summary statistics with one row per column (transposed for readability).
df.describe().T
Out[404]:
In [405]:
# Keep only the rows where both ap_hi and ap_lo lie strictly between 0 and 370;
# everything outside that open interval is discarded.
df = df.loc[
    df["ap_hi"].gt(0)
    & df["ap_hi"].lt(370)
    & df["ap_lo"].gt(0)
    & df["ap_lo"].lt(370)
]
# Re-check the summary statistics after filtering.
df.describe().T
Out[405]:
In [406]:
# Replace "weight" and "height" with a single "bmi" column:
# weight / (height / 100) ** 2
# (the /100 presumably converts height from cm to m — TODO confirm units).
#
# assign()/drop() build a new frame instead of mutating `df` in place. `df` is
# the result of a row filter, so column assignment and inplace drops on it can
# raise pandas' SettingWithCopyWarning.
df = df.assign(bmi=df["weight"] / (df["height"] / 100) ** 2).drop(
    columns=["weight", "height"]
)
df
Out[406]:
In [407]:
import seaborn as sns

# Pairwise correlation of all columns in `df`.
# The colour scale is pinned to [-1, 1] and centred on 0 so that colour
# intensity is comparable across cells and the sign is readable at a glance;
# each cell is annotated with its coefficient.
corr_ax = sns.heatmap(
    df.corr(),
    vmin=-1,
    vmax=1,
    center=0,
    cmap="coolwarm",
    annot=True,
    fmt=".2f",
)
# Trailing semicolon suppresses the Text(...) repr in the cell output.
corr_ax.set_title("Feature correlation matrix");
Out[407]:
In [411]:
from sklearn.model_selection import train_test_split
import statsmodels.api as sm
import numpy as np

# Seed shared by the splits and models in this notebook. It is passed by name
# below (instead of repeating the literal 9) so changing it here is enough.
random_state = 9

# Target is the "cardio" column; every other column of `df` is a predictor.
stat_y = df["cardio"]
stat_X = df.drop(columns=["cardio"])
# Hold out 15% of the rows; only the training part is used for the fit below.
stat_X_train, stat_X_test, stat_y_train, stat_y_test = train_test_split(
    stat_X, stat_y, test_size=0.15, random_state=random_state
)
# add_constant() adds a constant column so the logit model has an intercept.
log_model = sm.Logit(stat_y_train, sm.add_constant(stat_X_train))
log_result = log_model.fit()
display(log_result.summary2())
# Exponentiated coefficients (odds ratios), largest first.
np.exp(log_result.params).sort_values(ascending=False)
Out[411]:
In [421]:
# Feature selection for the model below: remove the six columns listed here
# and keep every other column of `df` ("cholesterol" is intentionally kept).
data = df.drop(
    columns=["gluc", "smoke", "alco", "ap_lo", "gender", "active"],
)
data
Out[421]:
In [431]:
# Persist the reduced feature table. index=True is the pandas default and is
# spelled out here to make it explicit that the row index is written too.
data.to_csv("data-cardio/cardio_clear.csv", index=True)
In [422]:
# Split `data` into predictors (everything except "cardio") and the target,
# then hold out 20% of the rows as the test set using the shared seed.
X = data.drop(columns=["cardio"])
y = data["cardio"]
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=random_state,
)
display(X_train, y_train, X_test, y_test)
In [ ]:
from src.utils import run_classification
from sklearn import tree
from sklearn import metrics
import numpy as np

# Depth-limited decision tree, seeded for reproducibility.
# The tree is fitted on plain NumPy arrays, so it records no feature names.
model = tree.DecisionTreeClassifier(
    random_state=random_state,
    max_depth=6,
).fit(X_train.values, y_train.values.ravel())

# Predict on NumPy arrays as well. Passing the DataFrames here makes
# scikit-learn warn "X has feature names, but DecisionTreeClassifier was
# fitted without feature names" on every call.
y_train_predict = model.predict(X_train.values)  # training-set predictions (not reported below)
y_test_probs = model.predict_proba(X_test.values)
y_test_predict = model.predict(X_test.values)

# Test-set metrics, shown in insertion order as (label, value) pairs.
# ROC AUC is computed from column 1 of predict_proba, i.e. the probability of
# the second entry in model.classes_.
test_metrics = {
    "Precision_test": metrics.precision_score(y_test, y_test_predict),
    "Recall_test": metrics.recall_score(y_test, y_test_predict),
    "Accuracy_test": metrics.accuracy_score(y_test, y_test_predict),
    "ROC_AUC_test": metrics.roc_auc_score(y_test, y_test_probs[:, 1]),  # type: ignore
    "F1_test": metrics.f1_score(y_test, y_test_predict),
    "MCC_test": metrics.matthews_corrcoef(y_test, y_test_predict),
    "Cohen_kappa_test": metrics.cohen_kappa_score(y_test, y_test_predict),
    "Confusion_matrix": metrics.confusion_matrix(y_test, y_test_predict),
}
for metric_name, metric_value in test_metrics.items():
    display(metric_name, metric_value)
In [429]:
# Text rendering of the fitted tree's decision rules, with each split labelled
# by the corresponding training column name.
rules = tree.export_text(
    model,
    feature_names=X_train.columns.tolist(),
)
print(rules)
In [430]:
import pickle

# Serialize the fitted model to disk. The context manager guarantees the file
# handle is flushed and closed even if pickling fails; a bare open() passed
# straight to pickle.dump() is never explicitly closed.
# NOTE: only unpickle this file from a trusted location — pickle.load can
# execute arbitrary code.
with open("data-cardio/cardio.model.sav", "wb") as model_file:
    pickle.dump(model, model_file)