26 KiB
26 KiB
In [2]:
import pandas as pd

# The source CSVs use European formatting: ';' field separator, ',' decimal mark.
_csv_options = dict(sep=";", decimal=",")

train = pd.read_csv("data/viscosity_train.csv", **_csv_options)
test = pd.read_csv("data/viscosity_test.csv", **_csv_options)

display(train.head())
display(test.head())
In [3]:
# Target is the "T" column; every other column is a predictor.
_target = "T"

X_train, y_train = train.drop(columns=[_target]), train[_target]
display(X_train.head())
display(y_train.head())

X_test, y_test = test.drop(columns=[_target]), test[_target]
display(X_test.head())
display(y_test.head())
In [10]:
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import PolynomialFeatures
from sklearn import linear_model, tree, neighbors, ensemble

# Fixed seed so the tree-based models are reproducible across runs.
random_state = 9

# Candidate regressors keyed by a short name. Each entry starts with just the
# unfitted estimator; the evaluation loop later adds the fitted model and metrics.
models = {}
models["linear"] = {"model": linear_model.LinearRegression(n_jobs=-1)}
models["linear_poly"] = {
    "model": make_pipeline(
        PolynomialFeatures(degree=2),
        linear_model.LinearRegression(fit_intercept=False, n_jobs=-1),
    )
}
models["linear_interact"] = {
    "model": make_pipeline(
        PolynomialFeatures(interaction_only=True),
        linear_model.LinearRegression(fit_intercept=False, n_jobs=-1),
    )
}
models["ridge"] = {"model": linear_model.RidgeCV()}
models["decision_tree"] = {
    "model": tree.DecisionTreeRegressor(random_state=random_state, max_depth=6, criterion="absolute_error")
}
models["knn"] = {"model": neighbors.KNeighborsRegressor(n_neighbors=7, n_jobs=-1)}
models["random_forest"] = {
    "model": ensemble.RandomForestRegressor(
        max_depth=7, random_state=random_state, n_jobs=-1
    )
}
In [11]:
from sklearn import metrics

# Fit every candidate on the training set and record train/test MSE, MAE and
# R^2 next to the fitted estimator. (The previous `import math` was unused in
# this notebook and has been dropped; iterating .items() avoids re-looking up
# models[model_name] for every metric assignment.)
for model_name, entry in models.items():
    print(f"Model: {model_name}")
    fitted_model = entry["model"].fit(
        X_train.values, y_train.values.ravel()
    )
    y_train_pred = fitted_model.predict(X_train.values)
    y_test_pred = fitted_model.predict(X_test.values)

    entry["fitted"] = fitted_model
    entry["MSE_train"] = metrics.mean_squared_error(y_train, y_train_pred)
    entry["MSE_test"] = metrics.mean_squared_error(y_test, y_test_pred)
    entry["MAE_train"] = metrics.mean_absolute_error(y_train, y_train_pred)
    entry["MAE_test"] = metrics.mean_absolute_error(y_test, y_test_pred)
    entry["R2_train"] = metrics.r2_score(y_train, y_train_pred)
    entry["R2_test"] = metrics.r2_score(y_test, y_test_pred)
In [12]:
# Collect the per-model metrics into a comparison table, sorted by test-set MAE,
# with color gradients to make the best/worst scores easy to spot.
metric_columns = ["MSE_train", "MSE_test", "MAE_train", "MAE_test", "R2_train", "R2_test"]
reg_metrics = pd.DataFrame.from_dict(models, "index")[metric_columns]

styled = reg_metrics.sort_values(by="MAE_test").style
styled = styled.background_gradient(
    cmap="viridis", low=1, high=0.3, subset=["MSE_train", "MSE_test"]
)
styled.background_gradient(cmap="plasma", low=0.3, high=1, subset=["MAE_test", "R2_test"])
Out[12]:
In [13]:
# Dump the fitted decision tree's split rules as readable text.
# NOTE: keep the name `model` — the pickling cell below reads it.
model = models["decision_tree"]["fitted"]
tree_rules = tree.export_text(model, feature_names=list(X_train.columns))
print(tree_rules)
In [14]:
import pickle

# Persist the selected decision-tree model. Use a context manager so the file
# handle is flushed and closed even if pickle.dump raises (the original
# open(...) handle was never closed).
with open("data/temp_viscosity_tree.model.sav", "wb") as model_file:
    pickle.dump(model, model_file)