Add regression tree implementation
This commit is contained in:
parent
d02ed03d8e
commit
1682e25dcb
@ -0,0 +1,22 @@
|
||||
from werkzeug.datastructures import FileStorage
|
||||
|
||||
from backend import api_bp, dataset_path, service
|
||||
from backend.dataset.dto import DatasetUploadDto
|
||||
from backend.dataset.model import DatasetParams
|
||||
from backend.dto import RegressionDto
|
||||
from backend.regression.dto import RegressionResultDto
|
||||
from backend.regression.model import RegressionTreeParams
|
||||
|
||||
|
||||
@api_bp.post("/regression")
@api_bp.input(DatasetUploadDto, location="files")
@api_bp.input(RegressionDto, location="query")
@api_bp.output(RegressionResultDto)
def upload_dataset(files_data, query_data):
    """Train a regression tree on an uploaded CSV and return metrics/rules.

    *files_data* carries the uploaded file under the "dataset" key;
    *query_data* carries the validated query-string options.
    """
    schema = RegressionDto()
    dataset_file: FileStorage = files_data["dataset"]
    return service.run_regression(
        dataset_path,
        dataset_file,
        schema.get_dataset_params(query_data),
        schema.get_tree_params(query_data),
    )
|
56
dt-cart/backend/dataset/__init__.py
Normal file
56
dt-cart/backend/dataset/__init__.py
Normal file
@ -0,0 +1,56 @@
|
||||
import os
|
||||
import uuid
|
||||
|
||||
import pandas as pd
|
||||
from pandas import DataFrame
|
||||
from sklearn.model_selection import train_test_split
|
||||
from werkzeug import utils
|
||||
|
||||
from backend.api import FileStorage
|
||||
from backend.dataset.model import DatasetParams, SplittedDataset
|
||||
|
||||
|
||||
class Dataset:
    """A CSV dataset persisted to disk for the lifetime of one request.

    The uploaded file is saved under a random UUID name inside *path* on
    construction and is expected to be deleted again via remove().
    """

    def __init__(self, path: str | None, file: FileStorage) -> None:
        if path is None:
            raise Exception("Dataset path is not defined")
        self.__path: str = path
        self.__file_name: str = self.__save(file)

    def __get_file_name(self, file: FileStorage) -> str:
        """Build the on-disk path for *file*; reject uploads with no filename."""
        if file.filename is None:
            raise Exception("Dataset upload error")
        # Store under a random UUID rather than the client-supplied name.
        safe_name: str = utils.secure_filename(str(uuid.uuid4()))
        return os.path.join(self.__path, safe_name)

    def __save(self, file: FileStorage) -> str:
        """Write the upload to disk and return its path."""
        target: str = self.__get_file_name(file=file)
        if os.path.exists(target):
            raise Exception(f"File with name '{target}' is already exists")
        # Rewind first: the stream may already have been read during parsing.
        file.stream.seek(0)
        file.save(target)
        return target

    def read(self, params: DatasetParams) -> DataFrame:
        """Load the saved CSV, keeping only input + target columns when given."""
        frame = pd.read_csv(self.__file_name, sep=params.sep, decimal=params.decimal)
        if params.input is None:
            return frame
        return frame[params.input + [params.target]]

    def split(
        self, data: DataFrame, params: DatasetParams, random_state: int
    ) -> SplittedDataset:
        """Partition *data* into train/test feature and target frames."""
        features = data.drop([params.target], axis=1)
        targets = data[[params.target]]
        X_train, X_test, y_train, y_test = train_test_split(
            features,
            targets,
            test_size=(1.0 - params.train_volume),
            random_state=random_state,
        )
        return SplittedDataset(X_train, X_test, y_train, y_test)

    def remove(self):
        """Delete the saved dataset file."""
        os.remove(self.__file_name)
|
14
dt-cart/backend/dataset/dto.py
Normal file
14
dt-cart/backend/dataset/dto.py
Normal file
@ -0,0 +1,14 @@
|
||||
from apiflask import Schema, fields
|
||||
from apiflask.validators import Range
|
||||
|
||||
|
||||
class DatasetUploadDto(Schema):
    """Multipart schema for the uploaded CSV file (files location)."""

    dataset = fields.File(attribute="dataset", required=True)
|
||||
|
||||
|
||||
class DatasetDto(Schema):
    """Query-string schema describing how to parse and split the CSV."""

    # Feature column names; None means "use every column except target".
    input = fields.List(fields.String(), load_default=None)
    target = fields.String(required=True)
    # CSV field separator and decimal mark (e.g. ";" and "," for EU files).
    sep = fields.String(load_default=",")
    decimal = fields.String(load_default=".")
    # Fraction of rows used for training; the rest goes to the test split.
    train_volume = fields.Float(load_default=0.8, validate=Range(min=0.1, max=0.9))
|
21
dt-cart/backend/dataset/model.py
Normal file
21
dt-cart/backend/dataset/model.py
Normal file
@ -0,0 +1,21 @@
|
||||
from dataclasses import dataclass, field
|
||||
from typing import List
|
||||
|
||||
from pandas import DataFrame
|
||||
|
||||
|
||||
@dataclass
class DatasetParams:
    """Options controlling CSV parsing and the train/test split."""

    # Name of the column to predict.
    target: str
    # CSV field separator (e.g. "," or ";").
    sep: str
    # Decimal mark used in numeric cells (e.g. "." or ",").
    decimal: str
    # Fraction of rows used for training (validated 0.1–0.9 by DatasetDto).
    train_volume: float
    # Feature column names. The DTO loads None when absent and Dataset.read()
    # checks for None, hence the Optional annotation despite the [] default.
    input: List[str] | None = field(default_factory=lambda: [])
|
||||
|
||||
|
||||
@dataclass
class SplittedDataset:
    """Train/test partitions produced by Dataset.split()."""

    # Feature frames for each split.
    X_train: DataFrame
    X_test: DataFrame
    # Single-column target frames for each split.
    y_train: DataFrame
    y_test: DataFrame
|
18
dt-cart/backend/dto.py
Normal file
18
dt-cart/backend/dto.py
Normal file
@ -0,0 +1,18 @@
|
||||
import dataclasses
|
||||
|
||||
from backend.dataset.dto import DatasetDto
|
||||
from backend.dataset.model import DatasetParams
|
||||
from backend.regression.dto import RegressionTreeDto
|
||||
from backend.regression.model import RegressionTreeParams
|
||||
|
||||
|
||||
class RegressionDto(DatasetDto, RegressionTreeDto):
    """Combined query schema: dataset options plus tree hyper-parameters.

    Provides helpers to slice the flat validated dict into the two
    parameter dataclasses.
    """

    def get_dataset_params(self, data) -> DatasetParams:
        """Build DatasetParams from the matching keys of *data*."""
        return DatasetParams(**self.__select_fields(DatasetParams, data))

    def get_tree_params(self, data) -> RegressionTreeParams:
        """Build RegressionTreeParams from the matching keys of *data*."""
        return RegressionTreeParams(**self.__select_fields(RegressionTreeParams, data))

    @staticmethod
    def __select_fields(params_cls, data):
        # Keep only the keys that name a field of the target dataclass.
        names = {f.name for f in dataclasses.fields(params_cls)}
        return {key: value for key, value in data.items() if key in names}
|
30
dt-cart/backend/metric/__init__.py
Normal file
30
dt-cart/backend/metric/__init__.py
Normal file
@ -0,0 +1,30 @@
|
||||
import math
|
||||
from typing import Callable
|
||||
|
||||
from sklearn import metrics
|
||||
|
||||
from backend.metric.model import MetricValue
|
||||
|
||||
|
||||
def mse(y, y_pred) -> float:
    """Mean squared error between targets *y* and predictions *y_pred*."""
    error = metrics.mean_squared_error(y, y_pred)
    return float(error)
|
||||
|
||||
|
||||
def rmse(y, y_pred) -> float:
    """Root mean squared error between *y* and *y_pred*."""
    error = metrics.mean_squared_error(y, y_pred)
    return float(math.sqrt(error))
|
||||
|
||||
|
||||
def mae(y, y_pred) -> float:
    """Mean absolute error between *y* and *y_pred*."""
    error = metrics.mean_absolute_error(y, y_pred)
    return float(error)
|
||||
|
||||
|
||||
def rmae(y, y_pred) -> float:
    """Square root of the mean absolute error between *y* and *y_pred*."""
    error = metrics.mean_absolute_error(y, y_pred)
    return float(math.sqrt(error))
|
||||
|
||||
|
||||
def r2(y, y_pred) -> float:
    """Coefficient of determination (R²) of *y_pred* against *y*."""
    score = metrics.r2_score(y, y_pred)
    return float(score)
|
||||
|
||||
|
||||
def get_metric(metric: Callable, y, y_pred) -> MetricValue:
    """Evaluate *metric* on the train and test splits.

    NOTE(review): despite the names, ``y`` is the (y_true, y_pred) pair for
    the TRAIN split and ``y_pred`` the pair for the TEST split — see the call
    sites in regression.learn_regression_model. Renaming would change the
    public signature, so the mismatch is only documented here.
    """
    return MetricValue(metric(y[0], y[1]), metric(y_pred[0], y_pred[1]))
|
6
dt-cart/backend/metric/dto.py
Normal file
6
dt-cart/backend/metric/dto.py
Normal file
@ -0,0 +1,6 @@
|
||||
from apiflask import Schema, fields
|
||||
|
||||
|
||||
class MetrciDto(Schema):
    """Serialized train/test values of a single metric."""

    # NOTE(review): class name misspells "Metric"; it is imported under this
    # name in regression/dto.py, so renaming must be coordinated there.
    train = fields.Float()
    test = fields.Float()
|
7
dt-cart/backend/metric/model.py
Normal file
7
dt-cart/backend/metric/model.py
Normal file
@ -0,0 +1,7 @@
|
||||
from dataclasses import dataclass
|
||||
|
||||
|
||||
@dataclass
class MetricValue:
    """A metric evaluated on both dataset splits."""

    # Metric value on the training split.
    train: float
    # Metric value on the test split.
    test: float
|
25
dt-cart/backend/regression/__init__.py
Normal file
25
dt-cart/backend/regression/__init__.py
Normal file
@ -0,0 +1,25 @@
|
||||
from sklearn import tree
|
||||
|
||||
from backend import metric
|
||||
from backend import tree as tree_helper
|
||||
from backend.dataset.model import SplittedDataset
|
||||
from backend.regression.model import RegressionResult, RegressionTreeParams
|
||||
|
||||
|
||||
def learn_regression_model(
    data: SplittedDataset,
    params: RegressionTreeParams,
) -> RegressionResult:
    """Fit a DecisionTreeRegressor and collect metrics, rules and tree nodes.

    *params* fields are forwarded verbatim as DecisionTreeRegressor keyword
    arguments via vars().
    """
    regressor = tree.DecisionTreeRegressor(**vars(params))
    fitted = regressor.fit(data.X_train.values, data.y_train.values.ravel())
    # Hoist the feature-name list: both tree helpers need the same columns.
    features = list(data.X_train.columns)
    # (y_true, predictions) pairs for each split, as get_metric expects.
    train_pair = (data.y_train, fitted.predict(data.X_train.values))
    test_pair = (data.y_test, fitted.predict(data.X_test.values))
    return RegressionResult(
        mse=metric.get_metric(metric.mse, train_pair, test_pair),
        mae=metric.get_metric(metric.mae, train_pair, test_pair),
        rmse=metric.get_metric(metric.rmse, train_pair, test_pair),
        rmae=metric.get_metric(metric.rmae, train_pair, test_pair),
        r2=metric.get_metric(metric.r2, train_pair, test_pair),
        rules=tree_helper.get_rules(fitted, features),
        tree=tree_helper.get_tree(fitted, features),
    )
|
36
dt-cart/backend/regression/dto.py
Normal file
36
dt-cart/backend/regression/dto.py
Normal file
@ -0,0 +1,36 @@
|
||||
from apiflask import Schema, fields
|
||||
from apiflask.validators import OneOf, Range
|
||||
|
||||
from backend.metric.dto import MetrciDto
|
||||
from backend.tree.dto import RuleDto, TreeNodeDto
|
||||
|
||||
|
||||
class RegressionTreeDto(Schema):
    """Query-string schema for regression-tree hyper-parameters.

    Field names match the sklearn DecisionTreeRegressor keyword arguments so
    the validated values can be forwarded as-is.
    """

    criterion = fields.String(
        load_default="squared_error",
        validate=OneOf(["squared_error", "friedman_mse", "absolute_error", "poisson"]),
    )
    splitter = fields.String(load_default="best", validate=OneOf(["best", "random"]))
    # None = no depth limit.
    max_depth = fields.Integer(load_default=None)
    min_samples_split = fields.Integer(load_default=2, validate=Range(min=2))
    min_samples_leaf = fields.Integer(load_default=1, validate=Range(min=1))
    min_weight_fraction_leaf = fields.Float(load_default=0.0)
    # TODO: Add float values support
    max_features = fields.String(
        load_default=None,
        validate=OneOf(["auto", "sqrt", "log2", None]),
    )
    random_state = fields.Integer(load_default=None)
    max_leaf_nodes = fields.Integer(load_default=None)
    min_impurity_decrease = fields.Float(load_default=0.0)
    ccp_alpha = fields.Float(load_default=0.0)
|
||||
|
||||
|
||||
class RegressionResultDto(Schema):
    """Response schema for the /regression endpoint: rules, tree and metrics."""

    rules = fields.List(fields.Nested(RuleDto()))
    tree = fields.List(fields.Nested(TreeNodeDto()))
    # Each metric carries a train and a test value.
    mse = fields.Nested(MetrciDto())
    mae = fields.Nested(MetrciDto())
    rmse = fields.Nested(MetrciDto())
    rmae = fields.Nested(MetrciDto())
    r2 = fields.Nested(MetrciDto())
|
31
dt-cart/backend/regression/model.py
Normal file
31
dt-cart/backend/regression/model.py
Normal file
@ -0,0 +1,31 @@
|
||||
from dataclasses import dataclass
|
||||
from typing import List
|
||||
|
||||
from backend.metric.model import MetricValue
|
||||
from backend.tree.model import Rule, TreeNode
|
||||
|
||||
|
||||
@dataclass
class RegressionTreeParams:
    """Hyper-parameters forwarded verbatim (via vars()) to sklearn's
    DecisionTreeRegressor, so field names must match its keyword arguments."""

    criterion: str
    splitter: str
    # The DTO loads None for the next fields when absent, hence Optional.
    max_depth: int | None
    min_samples_split: int
    min_samples_leaf: int
    min_weight_fraction_leaf: float
    # May be None per RegressionTreeDto's OneOf validator.
    max_features: str | None
    random_state: int | None
    max_leaf_nodes: int | None
    min_impurity_decrease: float
    ccp_alpha: float
|
||||
|
||||
|
||||
@dataclass
class RegressionResult:
    """Everything produced by one training run, shaped for RegressionResultDto."""

    # Flattened tree nodes for the frontend renderer.
    tree: List[TreeNode]
    # Human-readable decision rules extracted from the fitted tree.
    rules: List[Rule]
    # Train/test values for each evaluation metric.
    mse: MetricValue
    mae: MetricValue
    rmse: MetricValue
    rmae: MetricValue
    r2: MetricValue
|
29
dt-cart/backend/service.py
Normal file
29
dt-cart/backend/service.py
Normal file
@ -0,0 +1,29 @@
|
||||
from werkzeug.datastructures import FileStorage
|
||||
|
||||
from backend import regression
|
||||
from backend.dataset import Dataset
|
||||
from backend.dataset.model import DatasetParams, SplittedDataset
|
||||
from backend.regression.model import RegressionResult, RegressionTreeParams
|
||||
|
||||
|
||||
def run_regression(
    path: str | None,
    file: FileStorage,
    dataset_params: DatasetParams,
    tree_params: RegressionTreeParams,
) -> RegressionResult:
    """Persist the upload, train a regression tree, and always clean up.

    The temporary dataset file is removed whether or not training succeeds.
    Propagates any exception raised by Dataset or the learning step.
    """
    # BUG FIX: the dataset must be created *before* the try block. Previously
    # it was created inside it, so when the constructor raised (missing path,
    # duplicate file, failed save) the `finally` clause hit an unbound
    # `dataset` and raised NameError, masking the real error. Constructing it
    # here is safe: if __init__ fails, no file was kept, so there is nothing
    # to remove.
    dataset: Dataset = Dataset(path=path, file=file)
    try:
        data = dataset.read(dataset_params)
        splitted_dataset: SplittedDataset = dataset.split(
            data=data,
            params=dataset_params,
            random_state=tree_params.random_state,
        )
        result = regression.learn_regression_model(
            data=splitted_dataset,
            params=tree_params,
        )
    finally:
        # Remove the saved CSV even when reading/splitting/training fails.
        dataset.remove()
    return result
|
98
dt-cart/backend/tree/__init__.py
Normal file
98
dt-cart/backend/tree/__init__.py
Normal file
@ -0,0 +1,98 @@
|
||||
import uuid
|
||||
from typing import List
|
||||
|
||||
import numpy as np
|
||||
from sklearn import tree
|
||||
from sklearn.tree._tree import TREE_UNDEFINED # type: ignore
|
||||
|
||||
from backend.tree.model import ComparisonType, Rule, RuleAtom, TreeNode
|
||||
|
||||
|
||||
def get_rules(
    tree: tree.BaseDecisionTree, feature_names: List[str], classes=None
) -> List[Rule]:
    """Extract the decision rules of a fitted sklearn tree.

    Walks the tree depth-first; each root-to-leaf path becomes one Rule whose
    antecedent lists the comparisons taken and whose consequent is the leaf
    value (regression) or, when *classes* is given, the label at the argmax
    of the leaf's class counts. Rules come back sorted by antecedent length.

    Note: the parameter ``tree`` shadows the ``sklearn.tree`` module import;
    inside this function only the fitted estimator is reachable by that name.
    """
    tree_ = tree.tree_  # type: ignore
    # Per-node feature names; leaf nodes (TREE_UNDEFINED) get a placeholder.
    feature_name = [
        feature_names[i] if i != TREE_UNDEFINED else "undefined!" for i in tree_.feature
    ]

    rules: List[Rule] = []
    antecedent: List[RuleAtom] = []

    def recurse(node, antecedent, rules):

        if tree_.feature[node] != TREE_UNDEFINED:
            # Internal split node: branch into feature <= / > threshold.
            name = feature_name[node]
            threshold = tree_.threshold[node]
            # Copy the current path so the two branches don't share state.
            p1, p2 = list(antecedent), list(antecedent)
            p1.append(RuleAtom(name, ComparisonType.LESS.value, threshold))
            recurse(tree_.children_left[node], p1, rules)
            p2.append(RuleAtom(name, ComparisonType.GREATER.value, threshold))
            recurse(tree_.children_right[node], p2, rules)
        else:
            # Leaf node: close the rule with its consequent value.
            if classes is None:
                rules.append(Rule(antecedent, tree_.value[node][0][0]))  # type: ignore
            else:
                value = np.argmax(tree_.value[node][0])
                rules.append(Rule(antecedent, classes[value]))  # type: ignore

    recurse(0, antecedent, rules)

    # sort by antecedent length
    samples_count = [len(rule.antecedent) for rule in rules]
    sorted_index = list(np.argsort(samples_count))
    return [rules[index] for index in sorted_index]
|
||||
|
||||
|
||||
def get_tree(
    tree: tree.BaseDecisionTree, feature_names: List[str], classes=None
) -> List[TreeNode]:
    """Flatten a fitted sklearn tree into TreeNode records for the frontend.

    Every internal split yields two TreeNode entries (the "<=" and ">"
    branches, each with a fresh uuid name); every leaf yields one "result"
    node linked to its parent branch. When *classes* is given the leaf value
    is the label at the argmax of the class counts, otherwise the regression
    value.

    NOTE(review): the third TreeNode argument fills the ``level`` field with
    the sklearn node id (depth-first preorder), not the actual depth, so the
    final sort orders by node id — confirm this is the intended ordering.
    Note also that the parameter ``tree`` shadows the sklearn.tree module.
    """
    tree_ = tree.tree_  # type: ignore
    # Per-node feature names; leaf nodes (TREE_UNDEFINED) get a placeholder.
    feature_name = [
        feature_names[i] if i != TREE_UNDEFINED else "undefined!" for i in tree_.feature
    ]

    nodes: List[TreeNode] = []

    def recurse(node, parent_node, nodes):
        # The root's two branches have no parent name.
        parent: str | None = None if parent_node is None else parent_node.name
        if tree_.feature[node] != TREE_UNDEFINED:
            feature = feature_name[node]
            threshold = tree_.threshold[node]
            # "<=" branch; children on the left subtree hang off this node.
            p1 = TreeNode(
                parent,
                str(uuid.uuid4()),
                node,
                feature,
                ComparisonType.LESS.value,
                threshold,
            )
            recurse(tree_.children_left[node], p1, nodes)
            # ">" branch; children on the right subtree hang off this node.
            p2 = TreeNode(
                parent,
                str(uuid.uuid4()),
                node,
                feature,
                ComparisonType.GREATER.value,
                threshold,
            )
            nodes.append(p1)
            nodes.append(p2)
            recurse(tree_.children_right[node], p2, nodes)
        else:
            # Leaf: emit a nameless "result" node carrying the prediction.
            if classes is None:
                nodes.append(
                    TreeNode(parent, None, node, "result", "=", tree_.value[node][0][0])
                )
            else:
                value = np.argmax(tree_.value[node][0])
                nodes.append(
                    TreeNode(parent, None, node, "result", "=", classes[value])
                )

    recurse(0, None, nodes)

    # sort by node level
    levels = [node.level for node in nodes]
    sorted_index = list(np.argsort(levels))
    return [nodes[index] for index in sorted_index]
|
21
dt-cart/backend/tree/dto.py
Normal file
21
dt-cart/backend/tree/dto.py
Normal file
@ -0,0 +1,21 @@
|
||||
from apiflask import Schema, fields
|
||||
|
||||
|
||||
class RuleAtomDto(Schema):
    """One comparison of a rule antecedent (variable, operator, threshold)."""

    variable = fields.String()
    # Comparison operator, "<=" or ">" (ComparisonType values).
    type = fields.String()
    value = fields.Float()
|
||||
|
||||
|
||||
class RuleDto(Schema):
    """A decision rule: list of atoms implying a consequent value."""

    antecedent = fields.List(fields.Nested(RuleAtomDto()))
    # Float for regression trees, string label for classification.
    consequent = fields.Field()
|
||||
|
||||
|
||||
class TreeNodeDto(Schema):
    """One serialized node/branch of the flattened decision tree."""

    # Uuid name of the parent branch; absent for root-level branches.
    parent = fields.String()
    name = fields.String()
    level = fields.Integer()
    variable = fields.String()
    # "<=" / ">" for branches, "=" for leaf results.
    type = fields.String()
    # Float threshold or leaf value; string label for classification leaves.
    value = fields.Field()
|
42
dt-cart/backend/tree/model.py
Normal file
42
dt-cart/backend/tree/model.py
Normal file
@ -0,0 +1,42 @@
|
||||
import enum
|
||||
from dataclasses import dataclass
|
||||
from typing import List
|
||||
|
||||
import numpy as np
|
||||
|
||||
|
||||
class ComparisonType(enum.Enum):
    """Comparison operators used when rendering split conditions."""

    LESS = "<="
    GREATER = ">"
|
||||
|
||||
|
||||
@dataclass(repr=False)
|
||||
class RuleAtom:
|
||||
variable: str
|
||||
type: str
|
||||
value: float
|
||||
|
||||
def __repr__(self) -> str:
|
||||
return f"({self.variable} {self.type} {np.round(self.value, 3)})"
|
||||
|
||||
|
||||
@dataclass(repr=False)
|
||||
class Rule:
|
||||
antecedent: List[RuleAtom]
|
||||
consequent: float | str
|
||||
|
||||
def __repr__(self) -> str:
|
||||
consequent_value: float | str = str(self.consequent)
|
||||
if consequent_value.isnumeric():
|
||||
consequent_value = np.round(float(consequent_value), 3)
|
||||
return f"if {" and ".join([str(atom) for atom in self.antecedent])} -> {consequent_value}"
|
||||
|
||||
|
||||
@dataclass(repr=False)
class TreeNode:
    """One node/branch of the flattened tree exported to the frontend."""

    # Uuid name of the parent branch; None for the root's branches.
    parent: str | None
    # Fresh uuid for branch nodes; None for leaf ("result") nodes.
    name: str | None
    # NOTE(review): get_tree fills this with the sklearn node id (DFS
    # preorder), not the depth — confirm the field name is intentional.
    level: int
    variable: str
    # "<=" / ">" for branches, "=" for leaf results.
    type: str
    # Split threshold or leaf value; string label for classification leaves.
    value: float | str
|
56
dt-cart/data/density.csv
Normal file
56
dt-cart/data/density.csv
Normal file
@ -0,0 +1,56 @@
|
||||
T;Al2O3;TiO2;Density
|
||||
20;0;0;1,0625
|
||||
25;0;0;1,05979
|
||||
35;0;0;1,05404
|
||||
40;0;0;1,05103
|
||||
45;0;0;1,04794
|
||||
50;0;0;1,04477
|
||||
60;0;0;1,03826
|
||||
65;0;0;1,03484
|
||||
70;0;0;1,03182
|
||||
20;0,05;0;1,08755
|
||||
45;0,05;0;1,07105
|
||||
50;0,05;0;1,0676
|
||||
55;0,05;0;1,06409
|
||||
65;0,05;0;1,05691
|
||||
70;0,05;0;1,05291
|
||||
20;0,3;0;1,18861
|
||||
25;0,3;0;1,18389
|
||||
30;0,3;0;1,1792
|
||||
40;0,3;0;1,17017
|
||||
45;0,3;0;1,16572
|
||||
50;0,3;0;1,16138
|
||||
55;0,3;0;1,15668
|
||||
60;0,3;0;1,15233
|
||||
70;0,3;0;1,14414
|
||||
20;0;0,05;1,09098
|
||||
25;0;0,05;1,08775
|
||||
30;0;0,05;1,08443
|
||||
35;0;0,05;1,08108
|
||||
40;0;0,05;1,07768
|
||||
60;0;0,05;1,06362
|
||||
65;0;0,05;1,05999
|
||||
70;0;0,05;1,05601
|
||||
25;0;0,3;1,2186
|
||||
35;0;0,3;1,20776
|
||||
45;0;0,3;1,19759
|
||||
50;0;0,3;1,19268
|
||||
55;0;0,3;1,18746
|
||||
65;0;0,3;1,178
|
||||
30;0;0;1,05696
|
||||
55;0;0;1,04158
|
||||
25;0,05;0;1,08438
|
||||
30;0,05;0;1,08112
|
||||
35;0,05;0;1,07781
|
||||
40;0,05;0;1,07446
|
||||
60;0,05;0;1,06053
|
||||
35;0,3;0;1,17459
|
||||
65;0,3;0;1,14812
|
||||
45;0;0,05;1,07424
|
||||
50;0;0,05;1,07075
|
||||
55;0;0,05;1,06721
|
||||
20;0;0,3;1,22417
|
||||
30;0;0,3;1,2131
|
||||
40;0;0,3;1,20265
|
||||
60;0;0,3;1,18265
|
||||
70;0;0,3;1,17261
|
|
Loading…
x
Reference in New Issue
Block a user