diff --git a/dt-cart/backend/api.py b/dt-cart/backend/api.py index e69de29..1794c0f 100644 --- a/dt-cart/backend/api.py +++ b/dt-cart/backend/api.py @@ -0,0 +1,22 @@ +from werkzeug.datastructures import FileStorage + +from backend import api_bp, dataset_path, service +from backend.dataset.dto import DatasetUploadDto +from backend.dataset.model import DatasetParams +from backend.dto import RegressionDto +from backend.regression.dto import RegressionResultDto +from backend.regression.model import RegressionTreeParams + + +@api_bp.post("/regression") +@api_bp.input(DatasetUploadDto, location="files") +@api_bp.input(RegressionDto, location="query") +@api_bp.output(RegressionResultDto) +def upload_dataset(files_data, query_data): + uploaded_file: FileStorage = files_data["dataset"] + schema = RegressionDto() + dataset_params: DatasetParams = schema.get_dataset_params(query_data) + tree_params: RegressionTreeParams = schema.get_tree_params(query_data) + return service.run_regression( + dataset_path, uploaded_file, dataset_params, tree_params + ) diff --git a/dt-cart/backend/dataset/__init__.py b/dt-cart/backend/dataset/__init__.py new file mode 100644 index 0000000..4579aec --- /dev/null +++ b/dt-cart/backend/dataset/__init__.py @@ -0,0 +1,56 @@ +import os +import uuid + +import pandas as pd +from pandas import DataFrame +from sklearn.model_selection import train_test_split +from werkzeug import utils + +from backend.api import FileStorage +from backend.dataset.model import DatasetParams, SplittedDataset + + +class Dataset: + def __init__(self, path: str | None, file: FileStorage) -> None: + if path is None: + raise Exception("Dataset path is not defined") + self.__path: str = path + self.__file_name: str = self.__save(file) + + def __get_file_name(self, file: FileStorage) -> str: + if file.filename is None: + raise Exception("Dataset upload error") + file_uuid: str = str(uuid.uuid4()) + file_name: str = utils.secure_filename(file_uuid) + return os.path.join(self.__path, 
file_name) + + def __save(self, file: FileStorage) -> str: + file_name: str = self.__get_file_name(file=file) + if os.path.exists(file_name): + raise Exception(f"File with name '{file_name}' is already exists") + file.stream.seek(0) + file.save(file_name) + return file_name + + def read(self, params: DatasetParams) -> DataFrame: + df = pd.read_csv(self.__file_name, sep=params.sep, decimal=params.decimal) + if params.input is not None: + return df[params.input + [params.target]] + + return df + + def split( + self, data: DataFrame, params: DatasetParams, random_state: int + ) -> SplittedDataset: + X = data.drop([params.target], axis=1) + y = data[[params.target]] + X_train, X_test, y_train, y_test = train_test_split( + X, + y, + test_size=(1.0 - params.train_volume), + random_state=random_state, + ) + return SplittedDataset(X_train, X_test, y_train, y_test) + + def remove(self): + os.remove(self.__file_name) diff --git a/dt-cart/backend/dataset/dto.py b/dt-cart/backend/dataset/dto.py new file mode 100644 index 0000000..3927bf2 --- /dev/null +++ b/dt-cart/backend/dataset/dto.py @@ -0,0 +1,14 @@ +from apiflask import Schema, fields +from apiflask.validators import Range + + +class DatasetUploadDto(Schema): + dataset = fields.File(attribute="dataset", required=True) + + +class DatasetDto(Schema): + input = fields.List(fields.String(), load_default=None) + target = fields.String(required=True) + sep = fields.String(load_default=",") + decimal = fields.String(load_default=".") + train_volume = fields.Float(load_default=0.8, validate=Range(min=0.1, max=0.9)) diff --git a/dt-cart/backend/dataset/model.py b/dt-cart/backend/dataset/model.py new file mode 100644 index 0000000..8f4a4b1 --- /dev/null +++ b/dt-cart/backend/dataset/model.py @@ -0,0 +1,21 @@ +from dataclasses import dataclass, field +from typing import List + +from pandas import DataFrame + + +@dataclass +class DatasetParams: + target: str + sep: str + decimal: str + train_volume: float + input: List[str] = 
def get_metric(metric: Callable, y, y_pred) -> MetricValue:
    """Evaluate ``metric`` on two (actual, predicted) pairs.

    Despite their names, both ``y`` and ``y_pred`` are two-item sequences of
    the form ``(actual values, predicted values)``. The score of the first
    pair fills the first ``MetricValue`` field and the score of the second
    pair fills the second.

    Args:
        metric: Callable taking ``(actual, predicted)`` and returning a score.
        y: First ``(actual, predicted)`` pair.
        y_pred: Second ``(actual, predicted)`` pair.

    Returns:
        ``MetricValue`` built positionally from the two scores.
    """
    first_score = metric(y[0], y[1])
    second_score = metric(y_pred[0], y_pred[1])
    return MetricValue(first_score, second_score)
def learn_regression_model(
    data: SplittedDataset,
    params: RegressionTreeParams,
) -> RegressionResult:
    """Fit a decision-tree regressor and summarise its quality and structure.

    Args:
        data: Train/test features and targets.
        params: Its attributes are forwarded as keyword arguments to
            ``DecisionTreeRegressor``.

    Returns:
        ``RegressionResult`` with the five metrics (each computed on the
        train and the test part) plus the extracted rules and tree nodes.
    """
    regressor = tree.DecisionTreeRegressor(**vars(params))
    fitted = regressor.fit(data.X_train.values, data.y_train.values.ravel())

    # Each pair is (actual targets, predictions) for one partition.
    train_pair = (data.y_train, fitted.predict(data.X_train.values))
    test_pair = (data.y_test, fitted.predict(data.X_test.values))

    scores = {
        name: metric.get_metric(getattr(metric, name), train_pair, test_pair)
        for name in ("mse", "mae", "rmse", "rmae", "r2")
    }
    feature_names = list(data.X_train.columns)
    return RegressionResult(
        rules=tree_helper.get_rules(fitted, feature_names),
        tree=tree_helper.get_tree(fitted, list(feature_names)),
        **scores,
    )
@dataclass
class RegressionTreeParams:
    """Hyper-parameters for the decision-tree regressor.

    The field names mirror the fields of ``RegressionTreeDto``. Fields typed
    ``... | None`` are the ones that schema declares with
    ``load_default=None``, so ``None`` is a value they actually receive.
    """

    criterion: str
    splitter: str
    max_depth: int | None  # None when the request omits it
    min_samples_split: int
    min_samples_leaf: int
    min_weight_fraction_leaf: float
    max_features: str | None  # None when the request omits it
    random_state: int | None  # None when the request omits it
    max_leaf_nodes: int | None  # None when the request omits it
    min_impurity_decrease: float
    ccp_alpha: float
def run_regression(
    path: str | None,
    file: FileStorage,
    dataset_params: DatasetParams,
    tree_params: RegressionTreeParams,
) -> RegressionResult:
    """Store an uploaded dataset, train a regression tree on it, then clean up.

    Args:
        path: Directory in which the upload is stored temporarily.
        file: Uploaded dataset file.
        dataset_params: CSV parsing, column selection and split settings.
        tree_params: Regressor hyper-parameters; ``random_state`` is also
            used as the seed of the train/test split.

    Returns:
        The metrics, rules and tree produced by the fitted model.

    Raises:
        Exception: Whatever ``Dataset`` raises while storing the upload;
            errors from reading, splitting or training propagate unchanged
            after the stored file has been removed.
    """
    # Construct outside the ``try``: if this fails there is no stored file to
    # clean up, and touching ``dataset`` in ``finally`` would raise
    # UnboundLocalError and hide the real error.
    dataset: Dataset = Dataset(path=path, file=file)
    try:
        data = dataset.read(dataset_params)
        splitted_dataset: SplittedDataset = dataset.split(
            data=data,
            params=dataset_params,
            random_state=tree_params.random_state,
        )
        return regression.learn_regression_model(
            data=splitted_dataset,
            params=tree_params,
        )
    finally:
        # Always delete the stored upload, whether or not training succeeded.
        dataset.remove()
def get_tree(
    tree: tree.BaseDecisionTree, feature_names: List[str], classes=None
) -> List[TreeNode]:
    """Flatten a fitted decision tree into a list of ``TreeNode`` records.

    Every split node yields two records, one per branch (``<=`` and ``>``),
    each with its own UUID ``name``. Every leaf yields one ``"result"``
    record with ``name=None``. A record's ``parent`` is the ``name`` of the
    branch record it hangs under, or ``None`` at the root.

    Args:
        tree: Fitted estimator exposing ``tree_``.
        feature_names: Column names, indexed by the tree's feature indices.
        classes: Optional class labels. When given, a leaf's value is the
            label at the argmax of the leaf's value row; otherwise it is the
            first entry of that row.

    Returns:
        The records ordered by their ``level`` field. For a split node the
        ``<=`` record always comes before the ``>`` record.

    NOTE(review): ``level`` is filled with the node index from ``tree_``,
    not a computed depth. Confirm that consumers expect this.
    """
    tree_ = tree.tree_  # type: ignore
    feature_name = [
        feature_names[i] if i != TREE_UNDEFINED else "undefined!"
        for i in tree_.feature
    ]

    nodes: List[TreeNode] = []

    def leaf_value(node):
        # Value reported for a leaf: raw value, or the class label if known.
        if classes is None:
            return tree_.value[node][0][0]
        return classes[np.argmax(tree_.value[node][0])]

    def recurse(node, parent_node):
        parent: str | None = None if parent_node is None else parent_node.name
        if tree_.feature[node] == TREE_UNDEFINED:
            nodes.append(
                TreeNode(parent, None, node, "result", "=", leaf_value(node))
            )
            return

        feature = feature_name[node]
        threshold = tree_.threshold[node]
        p1 = TreeNode(
            parent,
            str(uuid.uuid4()),
            node,
            feature,
            ComparisonType.LESS.value,
            threshold,
        )
        p2 = TreeNode(
            parent,
            str(uuid.uuid4()),
            node,
            feature,
            ComparisonType.GREATER.value,
            threshold,
        )
        # Append "<=" before ">" so the stable sort below keeps that order.
        nodes.append(p1)
        nodes.append(p2)
        recurse(tree_.children_left[node], p1)
        recurse(tree_.children_right[node], p2)

    recurse(0, None)

    # sorted() is stable, unlike np.argsort with its default kind, so records
    # sharing a level (the two branches of a split) keep a fixed order.
    return sorted(nodes, key=lambda item: item.level)
np + + +class ComparisonType(enum.Enum): + LESS = "<=" + GREATER = ">" + + +@dataclass(repr=False) +class RuleAtom: + variable: str + type: str + value: float + + def __repr__(self) -> str: + return f"({self.variable} {self.type} {np.round(self.value, 3)})" + + +@dataclass(repr=False) +class Rule: + antecedent: List[RuleAtom] + consequent: float | str + + def __repr__(self) -> str: + consequent_value: float | str = str(self.consequent) + if consequent_value.isnumeric(): + consequent_value = np.round(float(consequent_value), 3) + return f"if {" and ".join([str(atom) for atom in self.antecedent])} -> {consequent_value}" + + +@dataclass(repr=False) +class TreeNode: + parent: str | None + name: str | None + level: int + variable: str + type: str + value: float | str diff --git a/dt-cart/data/density.csv b/dt-cart/data/density.csv new file mode 100644 index 0000000..628b828 --- /dev/null +++ b/dt-cart/data/density.csv @@ -0,0 +1,56 @@ +T;Al2O3;TiO2;Density +20;0;0;1,0625 +25;0;0;1,05979 +35;0;0;1,05404 +40;0;0;1,05103 +45;0;0;1,04794 +50;0;0;1,04477 +60;0;0;1,03826 +65;0;0;1,03484 +70;0;0;1,03182 +20;0,05;0;1,08755 +45;0,05;0;1,07105 +50;0,05;0;1,0676 +55;0,05;0;1,06409 +65;0,05;0;1,05691 +70;0,05;0;1,05291 +20;0,3;0;1,18861 +25;0,3;0;1,18389 +30;0,3;0;1,1792 +40;0,3;0;1,17017 +45;0,3;0;1,16572 +50;0,3;0;1,16138 +55;0,3;0;1,15668 +60;0,3;0;1,15233 +70;0,3;0;1,14414 +20;0;0,05;1,09098 +25;0;0,05;1,08775 +30;0;0,05;1,08443 +35;0;0,05;1,08108 +40;0;0,05;1,07768 +60;0;0,05;1,06362 +65;0;0,05;1,05999 +70;0;0,05;1,05601 +25;0;0,3;1,2186 +35;0;0,3;1,20776 +45;0;0,3;1,19759 +50;0;0,3;1,19268 +55;0;0,3;1,18746 +65;0;0,3;1,178 +30;0;0;1,05696 +55;0;0;1,04158 +25;0,05;0;1,08438 +30;0,05;0;1,08112 +35;0,05;0;1,07781 +40;0,05;0;1,07446 +60;0,05;0;1,06053 +35;0,3;0;1,17459 +65;0,3;0;1,14812 +45;0;0,05;1,07424 +50;0;0,05;1,07075 +55;0;0,05;1,06721 +20;0;0,3;1,22417 +30;0;0,3;1,2131 +40;0;0,3;1,20265 +60;0;0,3;1,18265 +70;0;0,3;1,17261 \ No newline at end of file