281 KiB
281 KiB
In [30]:
import pandas as pd

# Gas-turbine dataset; the first CSV column is used as the row index.
csv_path = "data-turbine/gt_full.csv"
data = pd.read_csv(csv_path, index_col=0)
data
Out[30]:
In [31]:
import matplotlib.pyplot as plt

# Per-column histograms for a quick look at the feature distributions.
axes = data.hist(bins=30, figsize=(10, 10))
plt.show()
In [32]:
# Summary statistics, transposed so each feature is a row.
data.describe().T
Out[32]:
In [33]:
import seaborn as sns

# Pairwise correlation matrix of all columns, rendered as a heatmap.
corr_before = data.corr()
sns.heatmap(corr_before, annot=True)
Out[33]:
In [34]:
# Keep only the columns used for the CO analysis.  Reassignment instead of
# inplace=True avoids hidden-state mutation pitfalls on notebook re-runs
# (and inplace has no performance benefit in pandas).
data = data.drop(columns=["AT", "AP", "AFDP", "GTEP", "TEY", "CDP", "NOX"])
data
Out[34]:
In [35]:
# Correlations again, after the column drop above.
corr_after = data.corr()
sns.heatmap(corr_after, annot=True)
Out[35]:
In [36]:
from sklearn.model_selection import train_test_split

random_state = 9
# Target is carbon-monoxide emission ("CO"); everything else is a feature.
target_col = "CO"
y = data[target_col]
X = data.drop(columns=[target_col]).copy()
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=random_state
)
display(X_train, y_train, X_test, y_test)
In [37]:
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import PolynomialFeatures
from sklearn import linear_model, tree, neighbors, ensemble

# Candidate regressors keyed by a short name.  Each entry holds the unfitted
# estimator under "model"; fit results and metrics are attached later.
_poly_lr = make_pipeline(
    PolynomialFeatures(degree=2),
    linear_model.LinearRegression(fit_intercept=False, n_jobs=-1),
)
_interact_lr = make_pipeline(
    PolynomialFeatures(interaction_only=True),
    linear_model.LinearRegression(fit_intercept=False, n_jobs=-1),
)
models = {
    "linear": {"model": linear_model.LinearRegression(n_jobs=-1)},
    "linear_poly": {"model": _poly_lr},
    "linear_interact": {"model": _interact_lr},
    "ridge": {"model": linear_model.RidgeCV()},
    "decision_tree": {
        "model": tree.DecisionTreeRegressor(max_depth=4, random_state=random_state)
    },
    "knn": {"model": neighbors.KNeighborsRegressor(n_neighbors=7, n_jobs=-1)},
    "random_forest": {
        "model": ensemble.RandomForestRegressor(
            max_depth=7, random_state=random_state, n_jobs=-1
        )
    },
}
In [38]:
import math
from sklearn import metrics

# Fit every candidate model, keep its predictions, and record test metrics:
# RMSE, RMAE (sqrt of MAE, as defined in this notebook) and R^2.
for model_name, entry in models.items():
    print(f"Model: {model_name}")
    fitted = entry["model"].fit(X_train.values, y_train.values.ravel())
    train_preds = fitted.predict(X_train.values)
    test_preds = fitted.predict(X_test.values)
    entry["fitted"] = fitted
    entry["train_preds"] = train_preds
    entry["preds"] = test_preds
    entry["RMSE_train"] = math.sqrt(metrics.mean_squared_error(y_train, train_preds))
    entry["RMSE_test"] = math.sqrt(metrics.mean_squared_error(y_test, test_preds))
    entry["RMAE_test"] = math.sqrt(metrics.mean_absolute_error(y_test, test_preds))
    entry["R2_test"] = metrics.r2_score(y_test, test_preds)
In [39]:
# Collect the per-model metrics into a frame and rank by test RMSE,
# colour-coding the error and fit-quality columns.
metric_cols = ["RMSE_train", "RMSE_test", "RMAE_test", "R2_test"]
reg_metrics = pd.DataFrame.from_dict(models, orient="index")[metric_cols]
ranked = reg_metrics.sort_values(by="RMSE_test")
ranked.style.background_gradient(
    cmap="viridis", low=1, high=0.3, subset=["RMSE_train", "RMSE_test"]
).background_gradient(cmap="plasma", low=0.3, high=1, subset=["RMAE_test", "R2_test"])
Out[39]:
In [40]:
import numpy as np
from sklearn import model_selection

# Hyper-parameter grid for the decision tree.  range() with a step of 2
# replaces the original numpy arange -> tolist -> [0::2] round-trip with
# the same odd/even sequences.
parameters = {
    "criterion": ["squared_error", "absolute_error", "friedman_mse", "poisson"],
    "max_depth": list(range(1, 21, 2)),          # 1, 3, ..., 19
    "min_samples_split": list(range(2, 11, 2)),  # 2, 4, ..., 10
}
grid = model_selection.GridSearchCV(
    tree.DecisionTreeRegressor(random_state=random_state),
    parameters,
    cv=5,
    n_jobs=-1,
    scoring="r2",
)
grid.fit(X_train, y_train)
grid.best_params_
Out[40]:
In [47]:
model = grid.best_estimator_
y_pred = model.predict(X_test)

# Baseline (untuned) decision-tree metrics recorded during the model sweep.
# MAE_test is computed here from the stored predictions so both dicts expose
# the same keys (the original omitted it from old_metrics only).
dt = models["decision_tree"]
old_metrics = {
    "RMSE_test": dt["RMSE_test"],
    "RMAE_test": dt["RMAE_test"],
    "MAE_test": float(metrics.mean_absolute_error(y_test, dt["preds"])),
    "R2_test": dt["R2_test"],
}
# Metrics for the grid-searched tree on the same test split.
new_metrics = {
    "RMSE_test": math.sqrt(metrics.mean_squared_error(y_test, y_pred)),
    "RMAE_test": math.sqrt(metrics.mean_absolute_error(y_test, y_pred)),
    "MAE_test": float(metrics.mean_absolute_error(y_test, y_pred)),
    "R2_test": metrics.r2_score(y_test, y_pred),
}
display(old_metrics)
display(new_metrics)
In [55]:
def rmse(row):
    """Per-row RMSE of one (Real, Inferred) pair.

    For a single observation sqrt(mean_squared_error([a], [b])) reduces
    algebraically to |a - b|, so the per-row sklearn call in the original
    was pure overhead; this computes the identical value directly.
    """
    return float(abs(row["Real"] - row["Inferred"]))
res = X_test.copy()
res["Real"] = y_test
res["Inferred"] = y_pred
# Vectorized |Real - Inferred|: identical to the per-row RMSE (the RMSE of a
# single pair is its absolute error) but without a Python-level .apply and a
# sklearn call per row.
res["RMSE"] = (res["Real"] - res["Inferred"]).abs()
res.sort_values(by="Real", ascending=False).head(30)
Out[55]:
In [42]:
# Human-readable dump of the tuned tree's decision rules.
feature_cols = X_train.columns.values.tolist()
rules = tree.export_text(model, feature_names=feature_cols)
print(rules)
In [46]:
import pickle

# Context manager ensures the file handle is closed even if dump() raises;
# the original open(...) was never closed.
with open("data-turbine/tree-gs.model.sav", "wb") as f:
    pickle.dump(model, f)
In [48]:
# Decision rules of the untuned baseline tree, for comparison with the
# grid-searched one.
baseline_tree = models["decision_tree"]["fitted"]
rules2 = tree.export_text(baseline_tree, feature_names=X_train.columns.values.tolist())
print(rules2)
In [50]:
import pickle

# Context manager ensures the file handle is closed even if dump() raises;
# the original open(...) was never closed.
with open("data-turbine/tree.model.sav", "wb") as f:
    pickle.dump(models["decision_tree"]["fitted"], f)
In [45]:
# Persist the reduced dataset for later use.
# NOTE(review): index=False discards the index that was loaded via
# index_col=0 in the first cell — confirm the original first column is not
# needed by downstream consumers of clear-data.csv.
data.to_csv("data-turbine/clear-data.csv", index=False)