192 KiB
192 KiB
In [15]:
import pandas as pd
import numpy as np
import seaborn as sns
random_state = 9
df = pd.read_csv("data-distress/FinancialDistress.csv").drop(["Company", "Time"], axis=1)
corr = df.corr()
display(df)
sns.heatmap(corr)
Financial Distress | x1 | x2 | x3 | x4 | x5 | x6 | x7 | x8 | x9 | ... | x74 | x75 | x76 | x77 | x78 | x79 | x80 | x81 | x82 | x83 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0.010636 | 1.2810 | 0.022934 | 0.87454 | 1.21640 | 0.060940 | 0.188270 | 0.52510 | 0.018854 | 0.182790 | ... | 85.437 | 27.07 | 26.102 | 16.000 | 16.0 | 0.2 | 22 | 0.060390 | 30 | 49 |
1 | -0.455970 | 1.2700 | 0.006454 | 0.82067 | 1.00490 | -0.014080 | 0.181040 | 0.62288 | 0.006423 | 0.035991 | ... | 107.090 | 31.31 | 30.194 | 17.000 | 16.0 | 0.4 | 22 | 0.010636 | 31 | 50 |
2 | -0.325390 | 1.0529 | -0.059379 | 0.92242 | 0.72926 | 0.020476 | 0.044865 | 0.43292 | -0.081423 | -0.765400 | ... | 120.870 | 36.07 | 35.273 | 17.000 | 15.0 | -0.2 | 22 | -0.455970 | 32 | 51 |
3 | -0.566570 | 1.1131 | -0.015229 | 0.85888 | 0.80974 | 0.076037 | 0.091033 | 0.67546 | -0.018807 | -0.107910 | ... | 54.806 | 39.80 | 38.377 | 17.167 | 16.0 | 5.6 | 22 | -0.325390 | 33 | 52 |
4 | 1.357300 | 1.0623 | 0.107020 | 0.81460 | 0.83593 | 0.199960 | 0.047800 | 0.74200 | 0.128030 | 0.577250 | ... | 85.437 | 27.07 | 26.102 | 16.000 | 16.0 | 0.2 | 29 | 1.251000 | 7 | 27 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
3667 | 0.438020 | 2.2605 | 0.202890 | 0.16037 | 0.18588 | 0.175970 | 0.198400 | 2.22360 | 1.091500 | 0.241640 | ... | 100.000 | 100.00 | 100.000 | 17.125 | 14.5 | -7.0 | 37 | 0.436380 | 4 | 41 |
3668 | 0.482410 | 1.9615 | 0.216440 | 0.20095 | 0.21642 | 0.203590 | 0.189870 | 1.93820 | 1.000100 | 0.270870 | ... | 91.500 | 130.50 | 132.400 | 20.000 | 14.5 | -16.0 | 37 | 0.438020 | 5 | 42 |
3669 | 0.500770 | 1.7099 | 0.207970 | 0.26136 | 0.21399 | 0.193670 | 0.183890 | 1.68980 | 0.971860 | 0.281560 | ... | 87.100 | 175.90 | 178.100 | 20.000 | 14.5 | -20.2 | 37 | 0.482410 | 6 | 43 |
3670 | 0.611030 | 1.5590 | 0.185450 | 0.30728 | 0.19307 | 0.172140 | 0.170680 | 1.53890 | 0.960570 | 0.267720 | ... | 92.900 | 203.20 | 204.500 | 22.000 | 22.0 | 6.4 | 37 | 0.500770 | 7 | 44 |
3671 | 0.518650 | 1.6148 | 0.176760 | 0.36369 | 0.18442 | 0.169550 | 0.197860 | 1.58420 | 0.958450 | 0.277780 | ... | 91.700 | 227.50 | 214.500 | 21.000 | 20.5 | 8.6 | 37 | 0.611030 | 8 | 45 |
3672 rows × 84 columns
Out[15]:
<Axes: >
In [16]:
columns = np.full((corr.shape[0],), True, dtype=bool)
for i in range(corr.shape[0]):
for j in range(i + 1, corr.shape[0]):
if corr.iloc[i, j] >= 0.9 or corr.iloc[i, j] <= -0.9: # type: ignore
if columns[j]:
columns[j] = False
selected_columns = df.columns[columns]
selected_columns.shape
Out[16]:
(68,)
In [17]:
df = df[selected_columns]
df
Out[17]:
Financial Distress | x1 | x2 | x3 | x4 | x5 | x6 | x8 | x9 | x10 | ... | x69 | x70 | x71 | x72 | x73 | x74 | x78 | x80 | x82 | x83 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0.010636 | 1.2810 | 0.022934 | 0.87454 | 1.21640 | 0.060940 | 0.188270 | 0.018854 | 0.182790 | 0.006449 | ... | 364.9500 | 15.8 | 61.476 | 4.0 | 36.0 | 85.437 | 16.0 | 22 | 30 | 49 |
1 | -0.455970 | 1.2700 | 0.006454 | 0.82067 | 1.00490 | -0.014080 | 0.181040 | 0.006423 | 0.035991 | 0.001795 | ... | 0.1896 | 15.6 | 24.579 | 0.0 | 36.0 | 107.090 | 16.0 | 22 | 31 | 50 |
2 | -0.325390 | 1.0529 | -0.059379 | 0.92242 | 0.72926 | 0.020476 | 0.044865 | -0.081423 | -0.765400 | -0.054324 | ... | 11.9460 | 15.2 | 20.700 | 0.0 | 35.0 | 120.870 | 15.0 | 22 | 32 | 51 |
3 | -0.566570 | 1.1131 | -0.015229 | 0.85888 | 0.80974 | 0.076037 | 0.091033 | -0.018807 | -0.107910 | -0.065316 | ... | -18.7480 | 10.4 | 47.429 | 4.0 | 33.0 | 54.806 | 16.0 | 22 | 33 | 52 |
4 | 1.357300 | 1.0623 | 0.107020 | 0.81460 | 0.83593 | 0.199960 | 0.047800 | 0.128030 | 0.577250 | 0.094075 | ... | 364.9500 | 15.8 | 61.476 | 4.0 | 36.0 | 85.437 | 16.0 | 29 | 7 | 27 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
3667 | 0.438020 | 2.2605 | 0.202890 | 0.16037 | 0.18588 | 0.175970 | 0.198400 | 1.091500 | 0.241640 | 0.226860 | ... | 14.5290 | 21.5 | 33.768 | 2.0 | 22.0 | 100.000 | 14.5 | 37 | 4 | 41 |
3668 | 0.482410 | 1.9615 | 0.216440 | 0.20095 | 0.21642 | 0.203590 | 0.189870 | 1.000100 | 0.270870 | 0.213610 | ... | 3.8523 | 30.5 | -10.665 | 0.0 | 28.0 | 91.500 | 14.5 | 37 | 5 | 42 |
3669 | 0.500770 | 1.7099 | 0.207970 | 0.26136 | 0.21399 | 0.193670 | 0.183890 | 0.971860 | 0.281560 | 0.210970 | ... | -25.8410 | 34.7 | 36.030 | 2.0 | 32.0 | 87.100 | 14.5 | 37 | 6 | 43 |
3670 | 0.611030 | 1.5590 | 0.185450 | 0.30728 | 0.19307 | 0.172140 | 0.170680 | 0.960570 | 0.267720 | 0.203190 | ... | -58.1220 | 15.6 | 22.571 | 2.0 | 30.0 | 92.900 | 22.0 | 37 | 7 | 44 |
3671 | 0.518650 | 1.6148 | 0.176760 | 0.36369 | 0.18442 | 0.169550 | 0.197860 | 0.958450 | 0.277780 | 0.213850 | ... | -32.2090 | 11.9 | 13.871 | 1.0 | 29.0 | 91.700 | 20.5 | 37 | 8 | 45 |
3672 rows × 68 columns
In [18]:
import statsmodels.api as sm
def backwardElimination(x, Y, sl, columns):
numVars = len(x[0])
for i in range(0, numVars):
regressor_OLS = sm.OLS(Y, x).fit()
maxVar = max(regressor_OLS.pvalues).astype(float)
if maxVar > sl:
for j in range(0, numVars - i):
if regressor_OLS.pvalues[j].astype(float) == maxVar:
x = np.delete(x, j, 1)
columns = np.delete(columns, j)
regressor_OLS.summary()
return x, columns
selected_columns = selected_columns.drop(["Financial Distress"], errors='ignore')
selected_columns
Out[18]:
Index(['x1', 'x2', 'x3', 'x4', 'x5', 'x6', 'x8', 'x9', 'x10', 'x11', 'x12', 'x14', 'x15', 'x16', 'x17', 'x18', 'x19', 'x20', 'x21', 'x22', 'x23', 'x24', 'x25', 'x26', 'x27', 'x28', 'x29', 'x30', 'x31', 'x32', 'x35', 'x36', 'x37', 'x39', 'x40', 'x41', 'x42', 'x43', 'x44', 'x45', 'x46', 'x47', 'x51', 'x54', 'x55', 'x56', 'x57', 'x58', 'x59', 'x60', 'x61', 'x63', 'x64', 'x65', 'x66', 'x67', 'x68', 'x69', 'x70', 'x71', 'x72', 'x73', 'x74', 'x78', 'x80', 'x82', 'x83'], dtype='object')
In [19]:
SL = 0.05
new_data, new_columns = backwardElimination(
df.iloc[:, 1:].values, df.iloc[:, 0].values, SL, selected_columns
)
data = pd.DataFrame(data=new_data, columns=new_columns)
data["Distress"] = df["Financial Distress"]
data
Out[19]:
x3 | x4 | x5 | x10 | x14 | x18 | x23 | x24 | x25 | x29 | x37 | x41 | x46 | x54 | x63 | x70 | x73 | x80 | Distress | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0.87454 | 1.21640 | 0.060940 | 0.006449 | 6.97060 | 0.018265 | 0.148720 | 0.66995 | 214.760 | 0.204590 | 1.630700 | 9.69510 | 0.026224 | 209.87 | 3.27020 | 15.8 | 36.0 | 22.0 | 0.010636 |
1 | 0.82067 | 1.00490 | -0.014080 | 0.001795 | 4.57640 | 0.027558 | 0.056026 | 0.67048 | 38.242 | 0.150190 | 0.837540 | 5.60350 | 0.007864 | 250.14 | 14.32100 | 15.6 | 36.0 | 22.0 | -0.455970 |
2 | 0.92242 | 0.72926 | 0.020476 | -0.054324 | 11.89000 | 0.012595 | 0.065220 | 0.84827 | -498.390 | 0.074149 | 0.955790 | 9.40030 | -0.064373 | 280.55 | 1.15380 | 15.2 | 35.0 | 22.0 | -0.325390 |
3 | 0.85888 | 0.80974 | 0.076037 | -0.065316 | 6.08620 | 0.011601 | 0.125160 | 0.80478 | -75.867 | 0.054098 | 0.383350 | 5.73790 | -0.017731 | 413.74 | 2.04080 | 10.4 | 33.0 | 22.0 | -0.566570 |
4 | 0.81460 | 0.83593 | 0.199960 | 0.094075 | 4.39380 | 0.006814 | 0.266020 | 0.76770 | 1423.100 | 0.046907 | 0.253010 | 4.50880 | 0.131380 | 315.34 | 3.27020 | 15.8 | 36.0 | 29.0 | 1.357300 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
3667 | 0.16037 | 0.18588 | 0.175970 | 0.226860 | 0.19101 | 0.014077 | 0.994340 | 0.15740 | 390.260 | 0.002976 | 0.003544 | 0.22138 | 1.265100 | 16961.00 | -0.53449 | 21.5 | 22.0 | 37.0 | 0.438020 |
3668 | 0.20095 | 0.21642 | 0.203590 | 0.213610 | 0.25149 | 0.018249 | 0.992440 | 0.19747 | 443.840 | 0.003484 | 0.004359 | 0.27085 | 1.077100 | 20689.00 | -25.73600 | 30.5 | 28.0 | 37.0 | 0.482410 |
3669 | 0.26136 | 0.21399 | 0.193670 | 0.210970 | 0.35384 | 0.007451 | 0.982420 | 0.25902 | 475.560 | 0.002343 | 0.003172 | 0.28971 | 0.795720 | 34012.00 | -3.06590 | 34.7 | 32.0 | 37.0 | 0.500770 |
3670 | 0.30728 | 0.19307 | 0.172140 | 0.203190 | 0.44358 | 0.021239 | 0.985230 | 0.30533 | 457.060 | 0.001942 | 0.002803 | 0.27871 | 0.603540 | 35901.00 | 7.15620 | 15.6 | 30.0 | 37.0 | 0.611030 |
3671 | 0.36369 | 0.18442 | 0.169550 | 0.213850 | 0.57156 | 0.013783 | 0.994000 | 0.32184 | 505.040 | 0.041852 | 0.065773 | 0.28982 | 0.486010 | 28173.00 | 12.14500 | 11.9 | 29.0 | 37.0 | 0.518650 |
3672 rows × 19 columns
In [20]:
from src.utils import split_stratified_into_train_val_test
X_train, X_test, y_train, y_test = split_stratified_into_train_val_test(
data,
stratify_colname="Distress",
frac_train=0.8,
frac_val=0,
frac_test=0.2,
random_state=random_state,
)
display(X_train.head(3))
display(y_train.head(3))
display(X_test.head(3))
display(y_test.head(3))
x3 | x4 | x5 | x10 | x14 | x18 | x23 | x24 | x25 | x29 | x37 | x41 | x46 | x54 | x63 | x70 | x73 | x80 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
1156 | 0.71056 | 0.93446 | 0.14445 | 0.14572 | 2.45500 | 0.045089 | 0.19754 | 0.66553 | 625.41 | 0.045031 | 0.155580 | 3.22850 | 0.11500 | 874.69 | -3.0266 | 25.4 | 28.0 | 9.0 |
1993 | 0.21104 | 0.59523 | 0.30998 | 0.48288 | 0.26750 | 0.001754 | 0.56306 | 0.19858 | 1600.20 | 0.012465 | 0.015800 | 0.75445 | 2.10980 | 47173.00 | -3.0659 | 34.7 | 32.0 | 4.0 |
1924 | 0.46072 | 0.90327 | 0.28563 | 0.45008 | 0.85431 | 0.024656 | 0.43336 | 0.45475 | 4659.80 | 0.005962 | 0.011055 | 1.67490 | 0.81567 | 12851.00 | 7.1562 | 15.6 | 30.0 | 25.0 |
Distress | |
---|---|
1156 | 0.6382 |
1993 | 0.4402 |
1924 | 3.2629 |
x3 | x4 | x5 | x10 | x14 | x18 | x23 | x24 | x25 | x29 | x37 | x41 | x46 | x54 | x63 | x70 | x73 | x80 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
3379 | 0.62266 | 0.74377 | 0.13716 | 0.008050 | 1.65010 | 0.034872 | 0.20639 | 0.42211 | 734.24 | 0.20055 | 0.53147 | 1.9711 | 0.207370 | 620.53 | 7.7373 | 15.400 | 35.5 | 25.0 |
156 | 0.79108 | 0.68615 | 0.10943 | 0.011391 | 3.78650 | 0.002455 | 0.19456 | 0.56425 | 653.83 | 0.22683 | 1.08570 | 3.2842 | 0.061802 | 225.64 | 1.1538 | 15.200 | 35.0 | 12.0 |
2215 | 0.46538 | 0.54146 | 0.25140 | 0.187750 | 0.87049 | 0.027462 | 0.46916 | 0.22192 | 601.83 | 0.24346 | 0.45540 | 1.0128 | 0.431220 | 473.60 | 9.7164 | 15.683 | 36.0 | 15.0 |
Distress | |
---|---|
3379 | 0.121330 |
156 | 0.080083 |
2215 | 1.164000 |
In [21]:
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import PolynomialFeatures
from sklearn import linear_model, tree, neighbors, ensemble
models = {
"linear": {"model": linear_model.LinearRegression(n_jobs=-1)},
"linear_poly": {
"model": make_pipeline(
PolynomialFeatures(degree=2),
linear_model.LinearRegression(fit_intercept=False, n_jobs=-1),
)
},
"linear_interact": {
"model": make_pipeline(
PolynomialFeatures(interaction_only=True),
linear_model.LinearRegression(fit_intercept=False, n_jobs=-1),
)
},
"ridge": {"model": linear_model.RidgeCV()},
"decision_tree": {
"model": tree.DecisionTreeRegressor(max_depth=7, random_state=random_state)
},
"knn": {"model": neighbors.KNeighborsRegressor(n_neighbors=7, n_jobs=-1)},
"random_forest": {
"model": ensemble.RandomForestRegressor(
max_depth=7, random_state=random_state, n_jobs=-1
)
},
}
In [22]:
from src.utils import run_regression
for model_name in models.keys():
print(f"Model: {model_name}")
fitted_model = models[model_name]["model"].fit(
X_train.values, y_train.values.ravel()
)
models[model_name] = run_regression(
fitted_model, X_train=X_train, X_test=X_test, y_train=y_train, y_test=y_test
)
Model: linear Model: linear_poly Model: linear_interact Model: ridge Model: decision_tree Model: knn Model: random_forest
In [23]:
reg_metrics = pd.DataFrame.from_dict(models, "index")[
["RMSE_train", "RMSE_test", "RMAE_test", "R2_test"]
]
reg_metrics.sort_values(by="RMSE_test").style.background_gradient(
cmap="viridis", low=1, high=0.3, subset=["RMSE_train", "RMSE_test"]
).background_gradient(cmap="plasma", low=0.3, high=1, subset=["RMAE_test", "R2_test"])
Out[23]:
RMSE_train | RMSE_test | RMAE_test | R2_test | |
---|---|---|---|---|
random_forest | 1.394198 | 1.042729 | 0.778401 | 0.456952 |
ridge | 2.488097 | 1.198888 | 0.865585 | 0.282120 |
linear | 2.474171 | 1.228277 | 0.885807 | 0.246493 |
linear_poly | 0.981309 | 1.267218 | 0.833594 | 0.197957 |
linear_interact | 1.025112 | 1.466789 | 0.850850 | -0.074560 |
knn | 2.376262 | 1.541027 | 0.879611 | -0.186083 |
decision_tree | 0.872007 | 1.566888 | 0.850226 | -0.226227 |
In [24]:
from src.utils import run_classification
def get_class(row):
return 0 if row["Distress"] > -0.5 else 1
datac = data.copy()
datac["Distress"] = datac.apply(get_class, axis=1)
datac
Out[24]:
x3 | x4 | x5 | x10 | x14 | x18 | x23 | x24 | x25 | x29 | x37 | x41 | x46 | x54 | x63 | x70 | x73 | x80 | Distress | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0.87454 | 1.21640 | 0.060940 | 0.006449 | 6.97060 | 0.018265 | 0.148720 | 0.66995 | 214.760 | 0.204590 | 1.630700 | 9.69510 | 0.026224 | 209.87 | 3.27020 | 15.8 | 36.0 | 22.0 | 0 |
1 | 0.82067 | 1.00490 | -0.014080 | 0.001795 | 4.57640 | 0.027558 | 0.056026 | 0.67048 | 38.242 | 0.150190 | 0.837540 | 5.60350 | 0.007864 | 250.14 | 14.32100 | 15.6 | 36.0 | 22.0 | 0 |
2 | 0.92242 | 0.72926 | 0.020476 | -0.054324 | 11.89000 | 0.012595 | 0.065220 | 0.84827 | -498.390 | 0.074149 | 0.955790 | 9.40030 | -0.064373 | 280.55 | 1.15380 | 15.2 | 35.0 | 22.0 | 0 |
3 | 0.85888 | 0.80974 | 0.076037 | -0.065316 | 6.08620 | 0.011601 | 0.125160 | 0.80478 | -75.867 | 0.054098 | 0.383350 | 5.73790 | -0.017731 | 413.74 | 2.04080 | 10.4 | 33.0 | 22.0 | 1 |
4 | 0.81460 | 0.83593 | 0.199960 | 0.094075 | 4.39380 | 0.006814 | 0.266020 | 0.76770 | 1423.100 | 0.046907 | 0.253010 | 4.50880 | 0.131380 | 315.34 | 3.27020 | 15.8 | 36.0 | 29.0 | 0 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
3667 | 0.16037 | 0.18588 | 0.175970 | 0.226860 | 0.19101 | 0.014077 | 0.994340 | 0.15740 | 390.260 | 0.002976 | 0.003544 | 0.22138 | 1.265100 | 16961.00 | -0.53449 | 21.5 | 22.0 | 37.0 | 0 |
3668 | 0.20095 | 0.21642 | 0.203590 | 0.213610 | 0.25149 | 0.018249 | 0.992440 | 0.19747 | 443.840 | 0.003484 | 0.004359 | 0.27085 | 1.077100 | 20689.00 | -25.73600 | 30.5 | 28.0 | 37.0 | 0 |
3669 | 0.26136 | 0.21399 | 0.193670 | 0.210970 | 0.35384 | 0.007451 | 0.982420 | 0.25902 | 475.560 | 0.002343 | 0.003172 | 0.28971 | 0.795720 | 34012.00 | -3.06590 | 34.7 | 32.0 | 37.0 | 0 |
3670 | 0.30728 | 0.19307 | 0.172140 | 0.203190 | 0.44358 | 0.021239 | 0.985230 | 0.30533 | 457.060 | 0.001942 | 0.002803 | 0.27871 | 0.603540 | 35901.00 | 7.15620 | 15.6 | 30.0 | 37.0 | 0 |
3671 | 0.36369 | 0.18442 | 0.169550 | 0.213850 | 0.57156 | 0.013783 | 0.994000 | 0.32184 | 505.040 | 0.041852 | 0.065773 | 0.28982 | 0.486010 | 28173.00 | 12.14500 | 11.9 | 29.0 | 37.0 | 0 |
3672 rows × 19 columns
In [25]:
from imblearn.over_sampling import ADASYN
Xc_train, Xc_test, yc_train, yc_test = split_stratified_into_train_val_test(
datac,
stratify_colname="Distress",
frac_train=0.8,
frac_val=0,
frac_test=0.2,
random_state=random_state,
)
ada = ADASYN()
Xc_train, yc_train = ada.fit_resample(Xc_train, yc_train)
display(Xc_train.head(3))
display(yc_train.head(3))
display(Xc_test.head(3))
display(yc_test.head(3))
x3 | x4 | x5 | x10 | x14 | x18 | x23 | x24 | x25 | x29 | x37 | x41 | x46 | x54 | x63 | x70 | x73 | x80 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0.71056 | 0.93446 | 0.14445 | 0.14572 | 2.45500 | 0.045089 | 0.19754 | 0.66553 | 625.41 | 0.045031 | 0.155580 | 3.22850 | 0.11500 | 874.69 | -3.0266 | 25.4 | 28.0 | 9.0 |
1 | 0.21104 | 0.59523 | 0.30998 | 0.48288 | 0.26750 | 0.001754 | 0.56306 | 0.19858 | 1600.20 | 0.012465 | 0.015800 | 0.75445 | 2.10980 | 47173.00 | -3.0659 | 34.7 | 32.0 | 4.0 |
2 | 0.46072 | 0.90327 | 0.28563 | 0.45008 | 0.85431 | 0.024656 | 0.43336 | 0.45475 | 4659.80 | 0.005962 | 0.011055 | 1.67490 | 0.81567 | 12851.00 | 7.1562 | 15.6 | 30.0 | 25.0 |
Distress | |
---|---|
0 | 0 |
1 | 0 |
2 | 0 |
x3 | x4 | x5 | x10 | x14 | x18 | x23 | x24 | x25 | x29 | x37 | x41 | x46 | x54 | x63 | x70 | x73 | x80 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
3379 | 0.62266 | 0.74377 | 0.13716 | 0.008050 | 1.65010 | 0.034872 | 0.20639 | 0.42211 | 734.24 | 0.20055 | 0.53147 | 1.9711 | 0.207370 | 620.53 | 7.7373 | 15.400 | 35.5 | 25.0 |
156 | 0.79108 | 0.68615 | 0.10943 | 0.011391 | 3.78650 | 0.002455 | 0.19456 | 0.56425 | 653.83 | 0.22683 | 1.08570 | 3.2842 | 0.061802 | 225.64 | 1.1538 | 15.200 | 35.0 | 12.0 |
2215 | 0.46538 | 0.54146 | 0.25140 | 0.187750 | 0.87049 | 0.027462 | 0.46916 | 0.22192 | 601.83 | 0.24346 | 0.45540 | 1.0128 | 0.431220 | 473.60 | 9.7164 | 15.683 | 36.0 | 15.0 |
Distress | |
---|---|
3379 | 0 |
156 | 0 |
2215 | 0 |
In [26]:
from src.utils import run_classification
from sklearn import tree
fitted_model = tree.DecisionTreeClassifier(max_depth=7, random_state=random_state).fit(
Xc_train.values, yc_train.values.ravel()
)
result = run_classification(
fitted_model, X_train=Xc_train, X_test=Xc_test, y_train=yc_train, y_test=yc_test
)
result
c:\Users\user\Projects\python\fuzzy\.venv\Lib\site-packages\sklearn\base.py:486: UserWarning: X has feature names, but DecisionTreeClassifier was fitted without feature names warnings.warn( c:\Users\user\Projects\python\fuzzy\.venv\Lib\site-packages\sklearn\base.py:486: UserWarning: X has feature names, but DecisionTreeClassifier was fitted without feature names warnings.warn(
Out[26]:
{'pipeline': DecisionTreeClassifier(max_depth=7, random_state=9), 'probs': array([1. , 0.17698154, 1. , 0.04407713, 1. , 1. , 1. , 1. , 1. , 1. , 0.04407713, 1. , 1. , 1. , 1. , 1. , 0.08695652, 1. , 1. , 1. , 0.04407713, 1. , 1. , 1. , 1. , 1. , 1. , 1. , 0.04407713, 1. , 1. , 1. , 1. , 0.04407713, 1. , 0.04407713, 1. , 0.17698154, 1. , 1. , 1. , 0.17698154, 1. , 1. , 1. , 1. , 1. , 1. , 1. , 1. , 1. , 0.17698154, 1. , 1. , 1. , 1. , 1. , 0.04407713, 1. , 1. , 1. , 1. , 1. , 1. , 1. , 1. , 1. , 1. , 1. , 1. , 0.17698154, 1. , 1. , 0. , 1. , 1. , 1. , 1. , 1. , 1. , 1. , 1. , 1. , 1. , 1. , 0.04407713, 1. , 1. , 1. , 1. , 1. , 1. , 1. , 1. , 1. , 1. , 0.17698154, 0.04407713, 1. , 1. , 1. , 0.04407713, 1. , 1. , 0.94736842, 1. , 1. , 1. , 1. , 1. , 1. , 1. , 1. , 0.17698154, 1. , 0.17698154, 1. , 1. , 1. , 1. , 1. , 1. , 1. , 1. , 1. , 1. , 1. , 1. , 1. , 0.04407713, 1. , 1. , 1. , 1. , 1. , 1. , 0.04407713, 1. , 0.08695652, 1. , 1. , 1. , 1. , 1. , 1. , 1. , 0.17698154, 0.17698154, 0.17698154, 1. , 1. , 1. , 1. , 1. , 1. , 0.04407713, 1. , 1. , 1. , 1. , 1. , 1. , 0.04407713, 1. , 1. , 1. , 0.04407713, 1. , 0.04407713, 0.17698154, 1. , 1. , 1. , 1. , 1. , 1. , 1. , 1. , 1. , 1. , 1. , 1. , 1. , 1. , 1. , 1. , 1. , 0.04407713, 1. , 0.17698154, 1. , 0.04407713, 1. , 1. , 1. , 1. , 1. , 1. , 1. , 1. , 1. , 1. , 0.5 , 1. , 1. , 1. , 1. , 1. , 1. , 1. , 0.17698154, 1. , 1. , 1. , 1. , 1. , 0.17698154, 1. , 1. , 1. , 1. , 1. , 0.17698154, 0.13207547, 1. , 1. , 1. , 1. , 1. , 1. , 0.17698154, 1. , 1. , 1. , 1. , 1. , 1. , 1. , 1. , 1. , 0.13207547, 0.17698154, 1. , 1. , 1. , 1. , 1. , 0.17698154, 1. , 1. , 1. , 1. , 1. , 1. , 1. , 1. , 1. , 1. , 1. , 1. , 1. , 1. , 1. , 1. , 1. , 1. , 0.04407713, 1. , 1. , 1. , 0.04407713, 1. , 1. , 1. , 1. , 1. , 1. , 0.625 , 1. , 1. , 1. , 1. , 1. , 1. , 1. , 1. , 1. , 1. , 1. , 1. , 1. , 1. , 1. , 1. , 1. , 1. , 1. , 1. , 1. , 1. , 1. , 1. , 1. , 1. , 0.17698154, 1. , 1. , 1. , 1. , 1. , 1. , 1. , 1. , 1. , 1. , 0.04407713, 1. , 1. , 1. , 1. , 0.17698154, 1. , 0.13207547, 0.04407713, 1. , 1. , 1. , 1. , 1. , 1. , 1. , 1. , 1. , 0.17698154, 1. , 0.04407713, 1. , 0.04407713, 1. , 1. , 1. , 1. , 1. , 1. , 1. , 1. , 1. , 1. , 1. , 1. , 1. , 1. , 1. , 1. , 0.04407713, 1. , 1. , 1. , 1. , 1. , 1. , 1. , 1. , 1. , 1. , 1. , 1. , 0.17698154, 1. , 0.08695652, 0.04407713, 1. , 1. , 1. , 1. , 1. , 1. , 0. , 1. , 1. , 1. , 1. , 1. , 1. , 1. , 1. , 1. , 1. , 1. , 1. , 0.17698154, 1. , 0.17698154, 1. , 1. , 1. , 1. , 1. , 1. , 0.17698154, 1. , 1. , 1. , 1. , 1. , 1. , 1. , 1. , 1. , 1. , 1. , 0.04407713, 1. , 1. , 0.13207547, 1. , 1. , 0.04407713, 1. , 1. , 0.625 , 1. , 1. , 1. , 1. , 0.17698154, 1. , 1. , 0.13207547, 1. , 1. , 1. , 1. , 1. , 1. , 1. , 1. , 1. , 1. , 1. , 1. , 1. , 1. , 1. , 0.17698154, 1. , 1. , 0.17698154, 1. , 1. , 1. , 1. , 0.04407713, 1. , 1. , 1. , 1. , 0.17698154, 1. , 1. , 1. , 1. , 1. , 1. , 1. , 1. , 1. , 0.17698154, 1. , 1. , 0.17698154, 1. , 1. , 0.17698154, 0. , 1. , 1. , 1. , 1. , 1. , 1. , 1. , 1. , 1. , 1. , 1. , 1. , 1. , 0.04407713, 1. , 1. , 1. , 1. , 1. , 1. , 1. , 1. , 1. , 1. , 1. , 1. , 1. , 0.04407713, 1. , 1. , 0.04407713, 1. , 1. , 1. , 1. , 1. , 1. , 1. , 1. , 0.17698154, 1. , 0.17698154, 1. , 1. , 1. , 0.17698154, 1. , 1. , 0.17698154, 0.05454545, 0.04407713, 1. , 1. , 1. , 1. , 1. , 1. , 1. , 1. , 1. , 1. , 1. , 1. , 1. , 1. , 1. , 1. , 0.17698154, 1. , 1. , 1. , 1. , 0.17698154, 1. , 1. , 1. , 0.04407713, 1. , 1. , 0.04407713, 0.17698154, 1. , 1. , 1. , 0.04407713, 1. , 1. , 1. , 0.17698154, 1. , 1. , 1. , 1. , 1. , 1. , 1. , 1. , 1. , 1. , 1. , 1. , 1. , 1. , 1. , 1. , 1. , 0.17698154, 1. , 1. , 0.04407713, 1. , 1. , 1. , 1. , 1. , 1. , 1. , 1. , 1. , 0.17698154, 1. , 1. , 1. , 1. , 1. , 1. , 1. , 1. , 1. , 1. , 0.04407713, 1. , 1. , 1. , 1. , 0.17698154, 1. , 0.04407713, 1. , 1. , 1. , 1. , 1. , 0.04407713, 1. , 1. , 1. , 1. , 0.17698154, 1. , 1. , 0.04407713, 1. , 1. , 1. , 1. , 1. , 1. , 0.17698154, 1. , 1. , 1. , 1. , 1. , 1. , 1. , 1. , 0.04407713, 1. , 1. , 1. , 1. , 1. , 1. , 1. , 1. , 1. , 1. , 1. , 0.17698154, 1. , 1. , 0.04407713, 1. , 0.17698154, 1. , 1. , 1. , 0.04407713, 0.04407713, 1. , 1. , 1. , 1. , 1. , 1. , 0.04407713, 1. , 1. , 1. , 1. , 1. , 0. , 1. , 1. , 1. , 1. , 1. , 1. , 1. , 1. , 0. , 1. , 0.17698154, 1. , 1. , 1. , 0.04407713, 1. , 1. , 1. , 1. , 1. , 1. , 1. , 0.17698154, 1. , 1. , 1. , 1. , 1. , 1. , 0.17698154, 1. , 0. , 1. , 0.08695652, 1. , 1. , 1. , 1. , 1. , 1. , 1. , 1. , 1. , 1. , 1. , 1. , 1. , 1. , 0. , 1. , 1. , 0.17698154, 0.04407713, 1. , 1. , 1. , 1. ]), 'preds': array([1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1]), 'Precision_train': np.float64(0.9157792836398838), 'Precision_test': np.float64(0.011345218800648298), 'Recall_train': np.float64(0.9978902953586498), 'Recall_test': np.float64(0.3181818181818182), 'Accuracy_train': 0.9528851244044468, 'Accuracy_test': 0.14965986394557823, 'ROC_AUC_test': np.float64(0.21318373071528754), 'F1_train': np.float64(0.9550731953558809), 'F1_test': np.float64(0.02190923317683881), 'MCC_test': np.float64(-0.2494229220759723), 'Cohen_kappa_test': np.float64(-0.03809571157718228), 'Confusion_matrix': array([[103, 610], [ 15, 7]])}
In [27]:
rules = tree.export_text(
fitted_model,
feature_names=X_train.columns.values.tolist(),
)
print(rules)
|--- x46 <= 0.07 | |--- x10 <= -0.00 | | |--- x14 <= 1.65 | | | |--- x23 <= 0.16 | | | | |--- x54 <= 148.12 | | | | | |--- x70 <= 15.70 | | | | | | |--- x54 <= 40.29 | | | | | | | |--- class: 0 | | | | | | |--- x54 > 40.29 | | | | | | | |--- class: 1 | | | | | |--- x70 > 15.70 | | | | | | |--- class: 0 | | | | |--- x54 > 148.12 | | | | | |--- class: 0 | | | |--- x23 > 0.16 | | | | |--- x24 <= 0.31 | | | | | |--- class: 0 | | | | |--- x24 > 0.31 | | | | | |--- class: 1 | | |--- x14 > 1.65 | | | |--- x41 <= 0.27 | | | | |--- class: 0 | | | |--- x41 > 0.27 | | | | |--- x70 <= 10.43 | | | | | |--- x46 <= -0.02 | | | | | | |--- x37 <= 0.39 | | | | | | | |--- class: 1 | | | | | | |--- x37 > 0.39 | | | | | | | |--- class: 0 | | | | | |--- x46 > -0.02 | | | | | | |--- class: 0 | | | | |--- x70 > 10.43 | | | | | |--- x41 <= 24.10 | | | | | | |--- x73 <= 22.02 | | | | | | | |--- class: 0 | | | | | | |--- x73 > 22.02 | | | | | | | |--- class: 1 | | | | | |--- x41 > 24.10 | | | | | | |--- class: 0 | |--- x10 > -0.00 | | |--- x41 <= 2.26 | | | |--- x37 <= 0.37 | | | | |--- class: 0 | | | |--- x37 > 0.37 | | | | |--- x5 <= 0.03 | | | | | |--- x73 <= 29.22 | | | | | | |--- x14 <= 6.58 | | | | | | | |--- class: 0 | | | | | | |--- x14 > 6.58 | | | | | | | |--- class: 1 | | | | | |--- x73 > 29.22 | | | | | | |--- x4 <= 0.20 | | | | | | | |--- class: 0 | | | | | | |--- x4 > 0.20 | | | | | | | |--- class: 1 | | | | |--- x5 > 0.03 | | | | | |--- x29 <= 0.09 | | | | | | |--- class: 1 | | | | | |--- x29 > 0.09 | | | | | | |--- x41 <= 2.15 | | | | | | | |--- class: 0 | | | | | | |--- x41 > 2.15 | | | | | | | |--- class: 0 | | |--- x41 > 2.26 | | | |--- x37 <= 0.17 | | | | |--- x25 <= 112.70 | | | | | |--- x41 <= 3.17 | | | | | | |--- x23 <= 0.26 | | | | | | | |--- class: 0 | | | | | | |--- x23 > 0.26 | | | | | | | |--- class: 1 | | | | | |--- x41 > 3.17 | | | | | | |--- x4 <= 1.30 | | | | | | | |--- class: 1 | | | | | | |--- x4 > 1.30 | | | | | | | |--- class: 0 | | | | |--- x25 > 112.70 | | | | | |--- x3 <= 0.69 | | | | | | |--- x23 <= 0.16 | | | | | | | |--- class: 0 | | | | | | |--- x23 > 0.16 | | | | | | | |--- class: 1 | | | | | |--- x3 > 0.69 | | | | | | |--- x46 <= 0.07 | | | | | | | |--- class: 0 | | | | | | |--- x46 > 0.07 | | | | | | | |--- class: 1 | | | |--- x37 > 0.17 | | | | |--- x73 <= 27.06 | | | | | |--- x29 <= 0.03 | | | | | | |--- x80 <= 19.00 | | | | | | | |--- class: 1 | | | | | | |--- x80 > 19.00 | | | | | | | |--- class: 0 | | | | | |--- x29 > 0.03 | | | | | | |--- class: 0 | | | | |--- x73 > 27.06 | | | | | |--- x70 <= 30.33 | | | | | | |--- x80 <= 27.95 | | | | | | | |--- class: 1 | | | | | | |--- x80 > 27.95 | | | | | | | |--- class: 0 | | | | | |--- x70 > 30.33 | | | | | | |--- x37 <= 0.19 | | | | | | | |--- class: 1 | | | | | | |--- x37 > 0.19 | | | | | | | |--- class: 0 |--- x46 > 0.07 | |--- x14 <= 3.44 | | |--- x25 <= 152.01 | | | |--- x37 <= 0.25 | | | | |--- class: 0 | | | |--- x37 > 0.25 | | | | |--- x41 <= 1.66 | | | | | |--- class: 0 | | | | |--- x41 > 1.66 | | | | | |--- class: 1 | | |--- x25 > 152.01 | | | |--- x46 <= 0.10 | | | | |--- x41 <= 3.13 | | | | | |--- x25 <= 1069.03 | | | | | | |--- class: 0 | | | | | |--- x25 > 1069.03 | | | | | | |--- x23 <= 0.24 | | | | | | | |--- class: 1 | | | | | | |--- x23 > 0.24 | | | | | | | |--- class: 0 | | | | |--- x41 > 3.13 | | | | | |--- x4 <= 1.12 | | | | | | |--- x10 <= 0.04 | | | | | | | |--- class: 0 | | | | | | |--- x10 > 0.04 | | | | | | | |--- class: 1 | | | | | |--- x4 > 1.12 | | | | | | |--- class: 0 | | | |--- x46 > 0.10 | | | | |--- x25 <= 240.30 | | | | | |--- x41 <= 3.33 | | | | | | |--- class: 0 | | | | | |--- x41 > 3.33 | | | | | | |--- x23 <= 0.17 | | | | | | | |--- class: 0 | | | | | | |--- x23 > 0.17 | | | | | | | |--- class: 1 | | | | |--- x25 > 240.30 | | | | | |--- x5 <= 0.02 | | | | | | |--- x5 <= 0.02 | | | | | | | |--- class: 0 | | | | | | |--- x5 > 0.02 | | | | | | | |--- class: 1 | | | | | |--- x5 > 0.02 | | | | | | |--- class: 0 | |--- x14 > 3.44 | | |--- x5 <= 0.09 | | | |--- x54 <= 1165.84 | | | | |--- x37 <= 0.14 | | | | | |--- class: 0 | | | | |--- x37 > 0.14 | | | | | |--- x41 <= 2.36 | | | | | | |--- class: 0 | | | | | |--- x41 > 2.36 | | | | | | |--- x80 <= 10.34 | | | | | | | |--- class: 0 | | | | | | |--- x80 > 10.34 | | | | | | | |--- class: 1 | | | |--- x54 > 1165.84 | | | | |--- class: 0 | | |--- x5 > 0.09 | | | |--- x70 <= 16.37 | | | | |--- x23 <= 0.08 | | | | | |--- class: 1 | | | | |--- x23 > 0.08 | | | | | |--- x54 <= 150.56 | | | | | | |--- x10 <= 0.04 | | | | | | | |--- class: 0 | | | | | | |--- x10 > 0.04 | | | | | | | |--- class: 1 | | | | | |--- x54 > 150.56 | | | | | | |--- class: 0 | | | |--- x70 > 16.37 | | | | |--- x54 <= 911.20 | | | | | |--- x41 <= 4.52 | | | | | | |--- class: 1 | | | | | |--- x41 > 4.52 | | | | | | |--- class: 0 | | | | |--- x54 > 911.20 | | | | | |--- x25 <= 2874.98 | | | | | | |--- class: 0 | | | | | |--- x25 > 2874.98 | | | | | | |--- class: 1
In [28]:
import pickle
pickle.dump(fitted_model, open("data-distress/tree.model.sav", "wb"))