From 654ddec21841870ef721f152c0b3e3c054fb9dd6 Mon Sep 17 00:00:00 2001
From: Petrônio Cândido
Date: Mon, 16 Apr 2018 13:42:51 -0300
Subject: [PATCH] Improvements on benchmarks

---
 pyFTS/benchmarks/Measures.py   | 14 +++++----
 pyFTS/benchmarks/Util.py       | 41 +++++++++++++++++++++++++--
 pyFTS/benchmarks/benchmarks.py | 52 ++++++++++++++++++++--------------
 pyFTS/tests/general.py         | 19 ++++++++-----
 4 files changed, 90 insertions(+), 36 deletions(-)

diff --git a/pyFTS/benchmarks/Measures.py b/pyFTS/benchmarks/Measures.py
index bdcab26..c887a3e 100644
--- a/pyFTS/benchmarks/Measures.py
+++ b/pyFTS/benchmarks/Measures.py
@@ -297,16 +297,17 @@ def get_point_statistics(data, model, **kwargs):
         ret.append(np.round(smape(ndata, nforecasts), 2))
         ret.append(np.round(UStatistic(ndata, nforecasts), 2))
     else:
+        steps_ahead_sampler = kwargs.get('steps_ahead_sampler', 1)
         nforecasts = []
-        for k in np.arange(model.order, len(ndata)-steps_ahead):
+        for k in np.arange(model.order, len(ndata)-steps_ahead, steps_ahead_sampler):
             sample = ndata[k - model.order: k]
             tmp = model.forecast_ahead(sample, steps_ahead, **kwargs)
             nforecasts.append(tmp[-1])
 
         start = model.order + steps_ahead
-        ret.append(np.round(rmse(ndata[start:], nforecasts), 2))
-        ret.append(np.round(smape(ndata[start:], nforecasts), 2))
-        ret.append(np.round(UStatistic(ndata[start:], nforecasts), 2))
+        ret.append(np.round(rmse(ndata[start:-1:steps_ahead_sampler], nforecasts), 2))
+        ret.append(np.round(smape(ndata[start:-1:steps_ahead_sampler], nforecasts), 2))
+        ret.append(np.round(UStatistic(ndata[start:-1:steps_ahead_sampler], nforecasts), 2))
 
     return ret
 
@@ -371,16 +372,17 @@ def get_distribution_statistics(data, model, **kwargs):
         ret.append(round(crps(data, forecasts), 3))
         ret.append(round(_e1 - _s1, 3))
     else:
+        skip = kwargs.get('steps_ahead_sampler', 1)
         forecasts = []
         _s1 = time.time()
-        for k in np.arange(model.order, len(data) - steps_ahead):
+        for k in np.arange(model.order, len(data) - steps_ahead, skip):
             sample = data[k - model.order: k]
             tmp = model.forecast_ahead_distribution(sample, steps_ahead, **kwargs)
             forecasts.append(tmp[-1])
         _e1 = time.time()
 
         start = model.order + steps_ahead
-        ret.append(round(crps(data[start:], forecasts), 3))
+        ret.append(round(crps(data[start:-1:skip], forecasts), 3))
         ret.append(round(_e1 - _s1, 3))
 
     return ret
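The effect of the new steps_ahead_sampler keyword is easiest to see in isolation.
Below is a standalone sketch (illustrative sizes, not part of the patch) of the
origin thinning it introduces in the multi-step branches above:

    # With sampler > 1, forecast_ahead() runs from every sampler-th origin
    # instead of from every test sample, cutting the benchmark cost.
    import numpy as np

    order, steps_ahead, sampler = 2, 10, 10
    ndata = list(range(1000))  # stands in for the test series

    origins = np.arange(order, len(ndata) - steps_ahead, sampler)
    targets = ndata[order + steps_ahead:-1:sampler]  # slice passed to rmse/smape

    print(len(origins), "forecasts instead of", len(ndata) - steps_ahead - order)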
"Order", "Scheme", "Partitions", "Steps", "Measure", "Value"] + + +def base_dataframe_columns(): + return ["Model", "Order", "Scheme", "Partitions", "Size", "Steps", "Method"] def point_dataframe_synthetic_columns(): - return ["Model", "Order", "Scheme", "Partitions", "Size", "Steps", "Method", "RMSEAVG", "RMSESTD", - "SMAPEAVG", "SMAPESTD", "UAVG","USTD", "TIMEAVG", "TIMESTD"] + return base_dataframe_columns().extend(["RMSEAVG", "RMSESTD", + "SMAPEAVG", "SMAPESTD", "UAVG","USTD", "TIMEAVG", "TIMESTD"]) def point_dataframe_analytic_columns(experiments): diff --git a/pyFTS/benchmarks/benchmarks.py b/pyFTS/benchmarks/benchmarks.py index 31ef14a..f9437c2 100644 --- a/pyFTS/benchmarks/benchmarks.py +++ b/pyFTS/benchmarks/benchmarks.py @@ -50,30 +50,43 @@ def __pop(key, default, kwargs): def sliding_window_benchmarks(data, windowsize, train=0.8, **kwargs): """ - Sliding window benchmarks for FTS point forecasters + Sliding window benchmarks for FTS forecasters. + + For each data window, a train and test datasets will be splitted. For each train split, number of + partitions and partitioning method will be created a partitioner model. And for each partitioner, order, + steps ahead and FTS method a foreasting model will be trained. + + Then all trained models are benchmarked on the test data and the metrics are stored in a datafame for + posterior analysis. + + The number of experiments is determined by the windowsize and inc. + :param data: test data :param windowsize: size of sliding window :param train: percentual of sliding window data used to train the models :param kwargs: dict, optional arguments :keyword - models: FTS point forecasters - partitioners: Universe of Discourse partitioner - partitions: the max number of partitions on the Universe of Discourse - max_order: the max order of the models (for high order models) - type: the forecasting type, one of these values: point(default), interval or distribution. - steps_ahead: The forecasting horizon, i. e., the number of steps ahead to forecast - start: in the multi step forecasting, the index of the data where to start forecasting - transformation: data transformation - indexer: seasonal indexer - progress: If true a progress bar will be displayed during the benchmarks - distributed: boolean, indicate if the forecasting procedure will be distributed in a dispy cluster - nodes: a list with the dispy cluster nodes addresses - benchmark_methods: Non FTS models to benchmark - benchmark_methods_parameters: Non FTS models parameters - save: save results - file: file path to save the results - sintetic: if true only the average and standard deviation of the results + inc: a float on interval [0,1] indicating the percentage of the windowsize to move the window + models: a list with prebuilt FTS objects. The default is None. + methods: a list with FTS class names. The default depends on the forecasting type and contains the list of all FTS methods. + partitioners_models: a list with prebuilt Universe of Discourse partitioners objects. The default is None. + partitioners_methods: a list with Universe of Discourse partitioners class names. The default is [partitioners.Grid.GridPartitioner]. + partitions: a list with the numbers of partitions on the Universe of Discourse. The default is [10]. + orders: a list with orders of the models (for high order models). The default is [1,2,3]. + type: the forecasting type, one of these values: point(default), interval or distribution. . The default is point. 
diff --git a/pyFTS/benchmarks/benchmarks.py b/pyFTS/benchmarks/benchmarks.py
index 31ef14a..f9437c2 100644
--- a/pyFTS/benchmarks/benchmarks.py
+++ b/pyFTS/benchmarks/benchmarks.py
@@ -50,30 +50,43 @@ def __pop(key, default, kwargs):
 
 def sliding_window_benchmarks(data, windowsize, train=0.8, **kwargs):
     """
-    Sliding window benchmarks for FTS point forecasters
+    Sliding window benchmarks for FTS forecasters.
+
+    For each data window, train and test datasets are split. For each train split, a partitioner
+    model is built for each partitioning method and number of partitions. Then, for each partitioner,
+    order, steps-ahead value and FTS method, a forecasting model is trained.
+
+    All trained models are then benchmarked on the test data and the metrics are stored in a
+    DataFrame for later analysis.
+
+    The number of experiments is determined by windowsize and inc.
+
     :param data: test data
     :param windowsize: size of sliding window
     :param train: percentage of the sliding window data used to train the models
     :param kwargs: dict, optional arguments
 
     :keyword
-        models: FTS point forecasters
-        partitioners: Universe of Discourse partitioner
-        partitions: the max number of partitions on the Universe of Discourse
-        max_order: the max order of the models (for high order models)
-        type: the forecasting type, one of these values: point(default), interval or distribution.
-        steps_ahead: The forecasting horizon, i. e., the number of steps ahead to forecast
-        start: in the multi step forecasting, the index of the data where to start forecasting
-        transformation: data transformation
-        indexer: seasonal indexer
-        progress: If true a progress bar will be displayed during the benchmarks
-        distributed: boolean, indicate if the forecasting procedure will be distributed in a dispy cluster
-        nodes: a list with the dispy cluster nodes addresses
-        benchmark_methods: Non FTS models to benchmark
-        benchmark_methods_parameters: Non FTS models parameters
-        save: save results
-        file: file path to save the results
-        sintetic: if true only the average and standard deviation of the results
+        inc: a float in the interval [0,1] indicating the percentage of the windowsize by which to move the window
+        models: a list with prebuilt FTS objects. The default is None.
+        methods: a list with FTS class names. The default depends on the forecasting type and contains all FTS methods.
+        partitioners_models: a list with prebuilt Universe of Discourse partitioner objects. The default is None.
+        partitioners_methods: a list with Universe of Discourse partitioner class names. The default is [partitioners.Grid.GridPartitioner].
+        partitions: a list with the numbers of partitions on the Universe of Discourse. The default is [10].
+        orders: a list with the orders of the models (for high order models). The default is [1,2,3].
+        type: the forecasting type, one of these values: point, interval or distribution. The default is point.
+        steps_ahead: a list with the forecasting horizons, i.e., the number of steps ahead to forecast. The default is 1.
+        start: in multi step forecasting, the index of the data where forecasting starts. The default is 0.
+        transformation: data transformation. The default is None.
+        indexer: seasonal indexer. The default is None.
+        progress: if true, a progress bar will be displayed during the benchmarks. The default is False.
+        distributed: a boolean value indicating if the forecasting procedure will be distributed in a dispy cluster. The default is False.
+        nodes: a list with the dispy cluster node addresses. The default is ['127.0.0.1'].
+        benchmark_methods: a list with non-FTS models to benchmark. The default is None.
+        benchmark_methods_parameters: a list with non-FTS model parameters. The default is None.
+        save: save the results. The default is False.
+        file: file path to save the results. The default is None.
+        sintetic: if true, only the average and standard deviation of the results are reported. The default is False.
 
     :return: DataFrame with the benchmark results
     """
@@ -235,7 +248,6 @@ def sliding_window_benchmarks(data, windowsize, train=0.8, **kwargs):
             if job.status == dispy.DispyJob.Finished and job is not None:
                 tmp = job()
                 jobs2.append(tmp)
-                print(tmp)
             else:
                 print("status",job.status)
                 print("result",job.result)
@@ -249,8 +261,6 @@ def sliding_window_benchmarks(data, windowsize, train=0.8, **kwargs):
     file = kwargs.get('file', None)
     sintetic = kwargs.get('sintetic', False)
 
-    print(jobs)
-
     return synthesis_method(jobs, experiments, save, file, sintetic)

diff --git a/pyFTS/tests/general.py b/pyFTS/tests/general.py
index 47e80a3..a947c95 100644
--- a/pyFTS/tests/general.py
+++ b/pyFTS/tests/general.py
@@ -15,17 +15,22 @@
 from pyFTS.data import TAIEX
 
 dataset = TAIEX.get_data()
 
-from pyFTS.benchmarks import benchmarks as bchmk
+from pyFTS.benchmarks import benchmarks as bchmk, Util as bUtil
 
 from pyFTS.models import pwfts
 
-#'''
-bchmk.sliding_window_benchmarks(dataset[:2000], 1000, train=0.8, inc=0.2, methods=[pwfts.ProbabilisticWeightedFTS],
-                                benchmark_models=False, orders=[1,2,3], partitions=[30,50,70], #np.arange(10,100,2),
-                                progress=False, type='distribution', steps_ahead=[1,4,7,10],
+'''
+bchmk.sliding_window_benchmarks(dataset, 1000, train=0.8, inc=0.2, methods=[pwfts.ProbabilisticWeightedFTS],
+                                benchmark_models=False, orders=[1,2,3], partitions=np.arange(10,100,5),
+                                progress=False, type='point',
+                                #steps_ahead=[1,4,7,10],
+                                steps_ahead_sampler=10,
                                 distributed=True, nodes=['192.168.0.102','192.168.0.106','192.168.0.110'],
-                                save=True, file="pwfts_taiex_distribution.csv")
-#'''
+                                save=True, file="pwfts_taiex_partitioning.csv")
+'''
+
+dat = pd.read_csv('pwfts_taiex_partitioning.csv', sep=';')
+print(bUtil.analytic_tabular_dataframe(dat))
+#print(dat["Size"].values[0])
 '''
 train_split = 2000
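As a usage reference, a minimal local configuration of the benchmark driver
(illustrative values, not from the patch; no dispy cluster, so distributed
keeps its default of False):

    from pyFTS.data import TAIEX
    from pyFTS.benchmarks import benchmarks as bchmk
    from pyFTS.models import pwfts

    dataset = TAIEX.get_data()

    # sliding windows of 1000 points, 80/20 train/test split, each window
    # moved by half the window size; results land in local_results.csv
    bchmk.sliding_window_benchmarks(dataset[:3000], 1000, train=0.8, inc=0.5,
                                    methods=[pwfts.ProbabilisticWeightedFTS],
                                    orders=[1], partitions=[30],
                                    type='point', progress=True,
                                    save=True, file='local_results.csv')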