Add regression tree implementation

This commit is contained in:
Aleksey Filippov 2025-03-11 20:38:54 +04:00
parent d02ed03d8e
commit 1682e25dcb
16 changed files with 512 additions and 0 deletions

View File

@ -0,0 +1,22 @@
from werkzeug.datastructures import FileStorage
from backend import api_bp, dataset_path, service
from backend.dataset.dto import DatasetUploadDto
from backend.dataset.model import DatasetParams
from backend.dto import RegressionDto
from backend.regression.dto import RegressionResultDto
from backend.regression.model import RegressionTreeParams
@api_bp.post("/regression")
@api_bp.input(DatasetUploadDto, location="files")
@api_bp.input(RegressionDto, location="query")
@api_bp.output(RegressionResultDto)
def upload_dataset(files_data, query_data):
    """Accept an uploaded CSV dataset and train a regression tree on it.

    The multipart body supplies the file; the query string supplies both the
    dataset-parsing options and the tree hyperparameters, which are split
    apart by the combined RegressionDto schema.
    """
    dto = RegressionDto()
    return service.run_regression(
        dataset_path,
        files_data["dataset"],
        dto.get_dataset_params(query_data),
        dto.get_tree_params(query_data),
    )

View File

@ -0,0 +1,56 @@
import os
import uuid
import pandas as pd
from pandas import DataFrame
from sklearn.model_selection import train_test_split
from werkzeug import utils
from backend.api import FileStorage
from backend.dataset.model import DatasetParams, SplittedDataset
class Dataset:
    """Persists an uploaded CSV to disk and exposes read/split/cleanup helpers.

    The file is stored under a random UUID name (never the client filename)
    and is expected to be removed via :meth:`remove` once processing is done.
    """

    def __init__(self, path: str | None, file: FileStorage) -> None:
        """Save *file* under directory *path*.

        Raises:
            Exception: if *path* is not configured or the upload is invalid.
        """
        if path is None:
            raise Exception("Dataset path is not defined")
        self.__path: str = path
        self.__file_name: str = self.__save(file)

    def __get_file_name(self, file: FileStorage) -> str:
        """Build a unique on-disk path for the upload.

        The client filename is only checked for presence; the stored name is a
        fresh UUID to avoid collisions and unsafe names.
        """
        if file.filename is None:
            raise Exception("Dataset upload error")
        file_uuid: str = str(uuid.uuid4())
        file_name: str = utils.secure_filename(file_uuid)
        return os.path.join(self.__path, file_name)

    def __save(self, file: FileStorage) -> str:
        """Write the upload to disk and return the stored file path."""
        file_name: str = self.__get_file_name(file=file)
        if os.path.exists(file_name):
            # A UUID collision is practically impossible, but fail loudly.
            # (Fixed grammar of the original message: "is already exists".)
            raise Exception(f"File with name '{file_name}' already exists")
        # Rewind in case an earlier consumer already read from the stream.
        file.stream.seek(0)
        file.save(file_name)
        return file_name

    def read(self, params: DatasetParams) -> DataFrame:
        """Load the stored CSV; restrict columns to input + target if given."""
        df = pd.read_csv(self.__file_name, sep=params.sep, decimal=params.decimal)
        if params.input is not None:
            return df[params.input + [params.target]]
        return df

    def split(
        self, data: DataFrame, params: DatasetParams, random_state: int
    ) -> SplittedDataset:
        """Split *data* into train/test parts according to train_volume."""
        X = data.drop([params.target], axis=1)
        y = data[[params.target]]
        X_train, X_test, y_train, y_test = train_test_split(
            X,
            y,
            test_size=(1.0 - params.train_volume),
            random_state=random_state,
        )
        return SplittedDataset(X_train, X_test, y_train, y_test)

    def remove(self):
        """Delete the stored file from disk."""
        os.remove(self.__file_name)

View File

@ -0,0 +1,14 @@
from apiflask import Schema, fields
from apiflask.validators import Range
class DatasetUploadDto(Schema):
    # Multipart-form schema: the uploaded CSV file under the "dataset" key.
    dataset = fields.File(attribute="dataset", required=True)
class DatasetDto(Schema):
    # Query-string schema for CSV parsing and train/test splitting.
    input = fields.List(fields.String(), load_default=None)  # None = all columns
    target = fields.String(required=True)  # column to predict
    sep = fields.String(load_default=",")  # CSV field separator
    decimal = fields.String(load_default=".")  # decimal mark (e.g. "," in EU CSVs)
    train_volume = fields.Float(load_default=0.8, validate=Range(min=0.1, max=0.9))

View File

@ -0,0 +1,21 @@
from dataclasses import dataclass, field
from typing import List
from pandas import DataFrame
@dataclass
class DatasetParams:
target: str
sep: str
decimal: str
train_volume: float
input: List[str] = field(default_factory=lambda: [])
@dataclass
class SplittedDataset:
    """Train/test partition of a dataset as produced by Dataset.split()."""

    X_train: DataFrame
    X_test: DataFrame
    y_train: DataFrame  # single-column frame holding the target
    y_test: DataFrame

18
dt-cart/backend/dto.py Normal file
View File

@ -0,0 +1,18 @@
import dataclasses
from backend.dataset.dto import DatasetDto
from backend.dataset.model import DatasetParams
from backend.regression.dto import RegressionTreeDto
from backend.regression.model import RegressionTreeParams
class RegressionDto(DatasetDto, RegressionTreeDto):
    """Combined query schema for dataset options and tree hyperparameters."""

    @staticmethod
    def _select(data, params_cls):
        # Keep only the keys matching the dataclass' declared fields.
        allowed = {f.name for f in dataclasses.fields(params_cls)}
        return {key: value for key, value in data.items() if key in allowed}

    def get_dataset_params(self, data) -> DatasetParams:
        """Extract the dataset-related subset of *data*."""
        return DatasetParams(**self._select(data, DatasetParams))

    def get_tree_params(self, data) -> RegressionTreeParams:
        """Extract the tree-hyperparameter subset of *data*."""
        return RegressionTreeParams(**self._select(data, RegressionTreeParams))

View File

@ -0,0 +1,30 @@
import math
from typing import Callable
from sklearn import metrics
from backend.metric.model import MetricValue
def mse(y, y_pred) -> float:
    """Mean squared error between true and predicted values."""
    return float(metrics.mean_squared_error(y, y_pred))
def rmse(y, y_pred) -> float:
    """Root mean squared error (sqrt of MSE)."""
    return float(math.sqrt(metrics.mean_squared_error(y, y_pred)))
def mae(y, y_pred) -> float:
    """Mean absolute error between true and predicted values."""
    return float(metrics.mean_absolute_error(y, y_pred))
def rmae(y, y_pred) -> float:
    """Square root of the mean absolute error (non-standard metric)."""
    return float(math.sqrt(metrics.mean_absolute_error(y, y_pred)))
def r2(y, y_pred) -> float:
    """Coefficient of determination (R^2)."""
    return float(metrics.r2_score(y, y_pred))
def get_metric(metric: Callable, y, y_pred) -> MetricValue:
    """Evaluate *metric* on both data splits.

    NOTE(review): despite the parameter names, each argument is a
    (y_true, y_predicted) tuple — *y* for the training split and *y_pred*
    for the test split (see the caller in the regression service).
    """
    train_true, train_predicted = y
    test_true, test_predicted = y_pred
    return MetricValue(
        train=metric(train_true, train_predicted),
        test=metric(test_true, test_predicted),
    )

View File

@ -0,0 +1,6 @@
from apiflask import Schema, fields
class MetrciDto(Schema):
    # Train/test values of a single quality metric.
    # NOTE(review): class name contains a typo ("Metrci" -> "Metric"); kept
    # as-is because other modules import it under this name.
    train = fields.Float()
    test = fields.Float()

View File

@ -0,0 +1,7 @@
from dataclasses import dataclass
@dataclass
class MetricValue:
    """A metric evaluated on both splits: training and test values."""

    train: float
    test: float

View File

@ -0,0 +1,25 @@
from sklearn import tree
from backend import metric
from backend import tree as tree_helper
from backend.dataset.model import SplittedDataset
from backend.regression.model import RegressionResult, RegressionTreeParams
def learn_regression_model(
    data: SplittedDataset,
    params: RegressionTreeParams,
) -> RegressionResult:
    """Fit a CART regressor and collect metrics, rules and the tree structure.

    All hyperparameters are forwarded verbatim to sklearn's
    DecisionTreeRegressor via vars(params).
    """
    regressor = tree.DecisionTreeRegressor(**vars(params))
    fitted = regressor.fit(data.X_train.values, data.y_train.values.ravel())
    # Each pair is (y_true, y_predicted): first for train, second for test.
    train_pair = (data.y_train, fitted.predict(data.X_train.values))
    test_pair = (data.y_test, fitted.predict(data.X_test.values))
    feature_names = list(data.X_train.columns)
    return RegressionResult(
        mse=metric.get_metric(metric.mse, train_pair, test_pair),
        mae=metric.get_metric(metric.mae, train_pair, test_pair),
        rmse=metric.get_metric(metric.rmse, train_pair, test_pair),
        rmae=metric.get_metric(metric.rmae, train_pair, test_pair),
        r2=metric.get_metric(metric.r2, train_pair, test_pair),
        rules=tree_helper.get_rules(fitted, feature_names),
        tree=tree_helper.get_tree(fitted, feature_names),
    )

View File

@ -0,0 +1,36 @@
from apiflask import Schema, fields
from apiflask.validators import OneOf, Range
from backend.metric.dto import MetrciDto
from backend.tree.dto import RuleDto, TreeNodeDto
class RegressionTreeDto(Schema):
    # Query-string schema mirroring DecisionTreeRegressor hyperparameters;
    # values are forwarded verbatim to sklearn by the service layer.
    criterion = fields.String(
        load_default="squared_error",
        validate=OneOf(["squared_error", "friedman_mse", "absolute_error", "poisson"]),
    )
    splitter = fields.String(load_default="best", validate=OneOf(["best", "random"]))
    max_depth = fields.Integer(load_default=None)  # None = unbounded depth
    min_samples_split = fields.Integer(load_default=2, validate=Range(min=2))
    min_samples_leaf = fields.Integer(load_default=1, validate=Range(min=1))
    min_weight_fraction_leaf = fields.Float(load_default=0.0)
    # TODO: Add float values support
    # NOTE(review): "auto" was removed from DecisionTreeRegressor.max_features
    # in newer scikit-learn releases -- confirm against the pinned version.
    max_features = fields.String(
        load_default=None,
        validate=OneOf(["auto", "sqrt", "log2", None]),
    )
    random_state = fields.Integer(load_default=None)
    max_leaf_nodes = fields.Integer(load_default=None)
    min_impurity_decrease = fields.Float(load_default=0.0)
    ccp_alpha = fields.Float(load_default=0.0)
class RegressionResultDto(Schema):
    # Response schema: extracted rules, flattened tree, and quality metrics
    # (each metric carries a train and a test value).
    rules = fields.List(fields.Nested(RuleDto()))
    tree = fields.List(fields.Nested(TreeNodeDto()))
    mse = fields.Nested(MetrciDto())
    mae = fields.Nested(MetrciDto())
    rmse = fields.Nested(MetrciDto())
    rmae = fields.Nested(MetrciDto())
    r2 = fields.Nested(MetrciDto())

View File

@ -0,0 +1,31 @@
from dataclasses import dataclass
from typing import List
from backend.metric.model import MetricValue
from backend.tree.model import Rule, TreeNode
@dataclass
class RegressionTreeParams:
    """Hyperparameters forwarded verbatim to sklearn's DecisionTreeRegressor
    (the service expands this dataclass with vars())."""

    criterion: str
    splitter: str
    max_depth: int | None  # DTO default is None (unbounded)
    min_samples_split: int
    min_samples_leaf: int
    min_weight_fraction_leaf: float
    max_features: str | None  # DTO default is None
    random_state: int | None  # DTO default is None
    max_leaf_nodes: int | None  # DTO default is None
    min_impurity_decrease: float
    ccp_alpha: float
@dataclass
class RegressionResult:
    """Full outcome of a training run: tree structure, rules and metrics."""

    tree: List[TreeNode]
    rules: List[Rule]
    mse: MetricValue
    mae: MetricValue
    rmse: MetricValue
    rmae: MetricValue
    r2: MetricValue

View File

@ -0,0 +1,29 @@
from werkzeug.datastructures import FileStorage
from backend import regression
from backend.dataset import Dataset
from backend.dataset.model import DatasetParams, SplittedDataset
from backend.regression.model import RegressionResult, RegressionTreeParams
def run_regression(
    path: str | None,
    file: FileStorage,
    dataset_params: DatasetParams,
    tree_params: RegressionTreeParams,
) -> RegressionResult:
    """Persist the uploaded dataset, train a regression tree, and clean up.

    The stored file is always removed afterwards, whether training succeeded
    or failed.

    Raises:
        Exception: if the dataset cannot be stored, read, or trained on.
    """
    # Construct the Dataset OUTSIDE the try block: if saving fails there is
    # nothing to remove, and the original exception must not be masked by an
    # UnboundLocalError on `dataset` inside the finally clause (which is what
    # the previous version did).
    dataset: Dataset = Dataset(path=path, file=file)
    try:
        data = dataset.read(dataset_params)
        splitted_dataset: SplittedDataset = dataset.split(
            data=data,
            params=dataset_params,
            random_state=tree_params.random_state,
        )
        result = regression.learn_regression_model(
            data=splitted_dataset,
            params=tree_params,
        )
    finally:
        # Best-effort cleanup of the temporary dataset file.
        dataset.remove()
    return result

View File

@ -0,0 +1,98 @@
import uuid
from typing import List
import numpy as np
from sklearn import tree
from sklearn.tree._tree import TREE_UNDEFINED # type: ignore
from backend.tree.model import ComparisonType, Rule, RuleAtom, TreeNode
def get_rules(
    tree: tree.BaseDecisionTree, feature_names: List[str], classes=None
) -> List[Rule]:
    """Convert a fitted sklearn decision tree into a flat list of rules.

    Each leaf yields one Rule whose antecedent is the path of threshold
    comparisons from the root. For regression (classes is None) the consequent
    is the leaf's stored value; otherwise it is the class label with the
    highest count. Rules are returned shortest-antecedent first.

    NOTE(review): the `tree` parameter shadows the imported sklearn `tree`
    module; only the fitted estimator's `tree_` attribute is used here.
    """
    tree_ = tree.tree_  # type: ignore
    # Map each node index to its split-feature name; leaves get a placeholder.
    feature_name = [
        feature_names[i] if i != TREE_UNDEFINED else "undefined!" for i in tree_.feature
    ]
    rules: List[Rule] = []
    antecedent: List[RuleAtom] = []

    def recurse(node, antecedent, rules):
        if tree_.feature[node] != TREE_UNDEFINED:
            # Internal node: branch into "<=" and ">" paths, each with its own
            # copy of the accumulated antecedent.
            name = feature_name[node]
            threshold = tree_.threshold[node]
            p1, p2 = list(antecedent), list(antecedent)
            p1.append(RuleAtom(name, ComparisonType.LESS.value, threshold))
            recurse(tree_.children_left[node], p1, rules)
            p2.append(RuleAtom(name, ComparisonType.GREATER.value, threshold))
            recurse(tree_.children_right[node], p2, rules)
        else:
            if classes is None:
                # Regression leaf: value[node][0][0] holds the prediction.
                rules.append(Rule(antecedent, tree_.value[node][0][0]))  # type: ignore
            else:
                # Classification leaf: take the majority class.
                value = np.argmax(tree_.value[node][0])
                rules.append(Rule(antecedent, classes[value]))  # type: ignore
    recurse(0, antecedent, rules)
    # sort by antecedent length
    samples_count = [len(rule.antecedent) for rule in rules]
    sorted_index = list(np.argsort(samples_count))
    return [rules[index] for index in sorted_index]
def get_tree(
    tree: tree.BaseDecisionTree, feature_names: List[str], classes=None
) -> List[TreeNode]:
    """Flatten a fitted sklearn decision tree into a list of TreeNode records.

    Each internal node produces two TreeNodes (the "<=" and ">" branches),
    linked to their parent by a generated UUID name; each leaf produces a
    single "result" node with name=None. For regression (classes is None) the
    leaf value is the stored prediction, otherwise the majority class label.

    NOTE(review): the `tree` parameter shadows the imported sklearn `tree`
    module; only the fitted estimator's `tree_` attribute is used here.
    """
    tree_ = tree.tree_  # type: ignore
    # Map each node index to its split-feature name; leaves get a placeholder.
    feature_name = [
        feature_names[i] if i != TREE_UNDEFINED else "undefined!" for i in tree_.feature
    ]
    nodes: List[TreeNode] = []

    def recurse(node, parent_node, nodes):
        # Children reference their parent by its generated UUID name.
        parent: str | None = None if parent_node is None else parent_node.name
        if tree_.feature[node] != TREE_UNDEFINED:
            feature = feature_name[node]
            threshold = tree_.threshold[node]
            p1 = TreeNode(
                parent,
                str(uuid.uuid4()),
                node,
                feature,
                ComparisonType.LESS.value,
                threshold,
            )
            recurse(tree_.children_left[node], p1, nodes)
            p2 = TreeNode(
                parent,
                str(uuid.uuid4()),
                node,
                feature,
                ComparisonType.GREATER.value,
                threshold,
            )
            nodes.append(p1)
            nodes.append(p2)
            recurse(tree_.children_right[node], p2, nodes)
        else:
            if classes is None:
                nodes.append(
                    TreeNode(parent, None, node, "result", "=", tree_.value[node][0][0])
                )
            else:
                value = np.argmax(tree_.value[node][0])
                nodes.append(
                    TreeNode(parent, None, node, "result", "=", classes[value])
                )
    recurse(0, None, nodes)
    # sort by node level
    # NOTE(review): `level` is populated with the sklearn node id (a DFS
    # preorder index), not the tree depth -- confirm clients expect this.
    levels = [node.level for node in nodes]
    sorted_index = list(np.argsort(levels))
    return [nodes[index] for index in sorted_index]

View File

@ -0,0 +1,21 @@
from apiflask import Schema, fields
class RuleAtomDto(Schema):
    # One condition of a rule antecedent: "<variable> <type> <value>".
    variable = fields.String()
    type = fields.String()  # comparison operator, e.g. "<=" or ">"
    value = fields.Float()  # split threshold
class RuleDto(Schema):
    # A conjunction of atoms implying an outcome.
    antecedent = fields.List(fields.Nested(RuleAtomDto()))
    # Raw Field: the consequent may be a float (regression) or a class label.
    consequent = fields.Field()
class TreeNodeDto(Schema):
    # Flattened decision-tree node as produced by the tree helper.
    parent = fields.String()  # parent node's generated name; None at the root
    name = fields.String()  # generated UUID; None for leaf ("result") nodes
    level = fields.Integer()  # sklearn node id (see tree helper note)
    variable = fields.String()
    type = fields.String()  # comparison operator, or "=" for leaf results
    value = fields.Field()  # threshold (float) or leaf result

View File

@ -0,0 +1,42 @@
import enum
from dataclasses import dataclass
from typing import List
import numpy as np
class ComparisonType(enum.Enum):
    """Operators used in rule atoms and tree-node conditions."""

    LESS = "<="
    GREATER = ">"
@dataclass(repr=False)
class RuleAtom:
    """One condition of a rule: a variable compared against a threshold."""

    variable: str
    type: str  # comparison operator, e.g. "<=" or ">"
    value: float

    def __repr__(self) -> str:
        # Round to 3 decimals for compact display.
        rounded = np.round(self.value, 3)
        return f"({self.variable} {self.type} {rounded})"
@dataclass(repr=False)
class Rule:
    """A decision rule: a conjunction of atoms implying a consequent value."""

    antecedent: List[RuleAtom]
    consequent: float | str

    def __repr__(self) -> str:
        consequent_value: float | str = self.consequent
        # The original used str.isnumeric(), which is False for values such as
        # "1.25" or "-3", so real-valued consequents were never rounded.
        # Attempt a float conversion instead; non-numeric labels pass through.
        try:
            consequent_value = np.round(float(consequent_value), 3)
        except (TypeError, ValueError):
            pass
        # Build the joined antecedent separately: nesting same-quote strings
        # inside an f-string requires Python 3.12+.
        conditions = " and ".join(str(atom) for atom in self.antecedent)
        return f"if {conditions} -> {consequent_value}"
@dataclass(repr=False)
class TreeNode:
    """Flattened representation of one decision-tree node for the API."""

    parent: str | None  # name of the parent node; None at the root
    name: str | None  # generated UUID; None for leaf ("result") nodes
    # NOTE(review): populated with the sklearn node id, not the tree depth.
    level: int
    variable: str
    type: str  # comparison operator, or "=" for leaf results
    value: float | str  # split threshold, prediction, or class label

56
dt-cart/data/density.csv Normal file
View File

@ -0,0 +1,56 @@
T;Al2O3;TiO2;Density
20;0;0;1,0625
25;0;0;1,05979
35;0;0;1,05404
40;0;0;1,05103
45;0;0;1,04794
50;0;0;1,04477
60;0;0;1,03826
65;0;0;1,03484
70;0;0;1,03182
20;0,05;0;1,08755
45;0,05;0;1,07105
50;0,05;0;1,0676
55;0,05;0;1,06409
65;0,05;0;1,05691
70;0,05;0;1,05291
20;0,3;0;1,18861
25;0,3;0;1,18389
30;0,3;0;1,1792
40;0,3;0;1,17017
45;0,3;0;1,16572
50;0,3;0;1,16138
55;0,3;0;1,15668
60;0,3;0;1,15233
70;0,3;0;1,14414
20;0;0,05;1,09098
25;0;0,05;1,08775
30;0;0,05;1,08443
35;0;0,05;1,08108
40;0;0,05;1,07768
60;0;0,05;1,06362
65;0;0,05;1,05999
70;0;0,05;1,05601
25;0;0,3;1,2186
35;0;0,3;1,20776
45;0;0,3;1,19759
50;0;0,3;1,19268
55;0;0,3;1,18746
65;0;0,3;1,178
30;0;0;1,05696
55;0;0;1,04158
25;0,05;0;1,08438
30;0,05;0;1,08112
35;0,05;0;1,07781
40;0,05;0;1,07446
60;0,05;0;1,06053
35;0,3;0;1,17459
65;0,3;0;1,14812
45;0;0,05;1,07424
50;0;0,05;1,07075
55;0;0,05;1,06721
20;0;0,3;1,22417
30;0;0,3;1,2131
40;0;0,3;1,20265
60;0;0,3;1,18265
70;0;0,3;1,17261
1 T Al2O3 TiO2 Density
2 20 0 0 1,0625
3 25 0 0 1,05979
4 35 0 0 1,05404
5 40 0 0 1,05103
6 45 0 0 1,04794
7 50 0 0 1,04477
8 60 0 0 1,03826
9 65 0 0 1,03484
10 70 0 0 1,03182
11 20 0,05 0 1,08755
12 45 0,05 0 1,07105
13 50 0,05 0 1,0676
14 55 0,05 0 1,06409
15 65 0,05 0 1,05691
16 70 0,05 0 1,05291
17 20 0,3 0 1,18861
18 25 0,3 0 1,18389
19 30 0,3 0 1,1792
20 40 0,3 0 1,17017
21 45 0,3 0 1,16572
22 50 0,3 0 1,16138
23 55 0,3 0 1,15668
24 60 0,3 0 1,15233
25 70 0,3 0 1,14414
26 20 0 0,05 1,09098
27 25 0 0,05 1,08775
28 30 0 0,05 1,08443
29 35 0 0,05 1,08108
30 40 0 0,05 1,07768
31 60 0 0,05 1,06362
32 65 0 0,05 1,05999
33 70 0 0,05 1,05601
34 25 0 0,3 1,2186
35 35 0 0,3 1,20776
36 45 0 0,3 1,19759
37 50 0 0,3 1,19268
38 55 0 0,3 1,18746
39 65 0 0,3 1,178
40 30 0 0 1,05696
41 55 0 0 1,04158
42 25 0,05 0 1,08438
43 30 0,05 0 1,08112
44 35 0,05 0 1,07781
45 40 0,05 0 1,07446
46 60 0,05 0 1,06053
47 35 0,3 0 1,17459
48 65 0,3 0 1,14812
49 45 0 0,05 1,07424
50 50 0 0,05 1,07075
51 55 0 0,05 1,06721
52 20 0 0,3 1,22417
53 30 0 0,3 1,2131
54 40 0 0,3 1,20265
55 60 0 0,3 1,18265
56 70 0 0,3 1,17261