Improvements on benchmarks

Petrônio Cândido 2018-04-16 13:42:51 -03:00
parent ecfc9db862
commit 654ddec218
4 changed files with 90 additions and 36 deletions

View File

@@ -297,16 +297,17 @@ def get_point_statistics(data, model, **kwargs):
        ret.append(np.round(smape(ndata, nforecasts), 2))
        ret.append(np.round(UStatistic(ndata, nforecasts), 2))
    else:
        steps_ahead_sampler = kwargs.get('steps_ahead_sampler', 1)
        nforecasts = []
        for k in np.arange(model.order, len(ndata)-steps_ahead):
        for k in np.arange(model.order, len(ndata)-steps_ahead, steps_ahead_sampler):
            sample = ndata[k - model.order: k]
            tmp = model.forecast_ahead(sample, steps_ahead, **kwargs)
            nforecasts.append(tmp[-1])
        start = model.order + steps_ahead
        ret.append(np.round(rmse(ndata[start:], nforecasts), 2))
        ret.append(np.round(smape(ndata[start:], nforecasts), 2))
        ret.append(np.round(UStatistic(ndata[start:], nforecasts), 2))
        ret.append(np.round(rmse(ndata[start:-1:steps_ahead_sampler], nforecasts), 2))
        ret.append(np.round(smape(ndata[start:-1:steps_ahead_sampler], nforecasts), 2))
        ret.append(np.round(UStatistic(ndata[start:-1:steps_ahead_sampler], nforecasts), 2))
    return ret
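The new steps_ahead_sampler keyword thins out the forecasting origins, so a long multi-step benchmark no longer has to produce a forecast from every single point of the test data. A standalone sketch (toy values, not pyFTS code) of which origins get visited:

import numpy as np

# Toy values only; in get_point_statistics these come from the model and from kwargs.
order, steps_ahead, steps_ahead_sampler = 2, 4, 10
ndata = np.random.rand(200)

origins = np.arange(order, len(ndata) - steps_ahead, steps_ahead_sampler)
print(origins)       # every 10th forecasting origin instead of all of them
print(len(origins))  # roughly a tenth of the forecasts of the default sampler=1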
@@ -371,16 +372,17 @@ def get_distribution_statistics(data, model, **kwargs):
        ret.append(round(crps(data, forecasts), 3))
        ret.append(round(_e1 - _s1, 3))
    else:
        skip = kwargs.get('steps_ahead_sampler', 1)
        forecasts = []
        _s1 = time.time()
        for k in np.arange(model.order, len(data) - steps_ahead):
        for k in np.arange(model.order, len(data) - steps_ahead, skip):
            sample = data[k - model.order: k]
            tmp = model.forecast_ahead_distribution(sample, steps_ahead, **kwargs)
            forecasts.append(tmp[-1])
        _e1 = time.time()
        start = model.order + steps_ahead
        ret.append(round(crps(data[start:], forecasts), 3))
        ret.append(round(crps(data[start:-1:skip], forecasts), 3))
        ret.append(round(_e1 - _s1, 3))
    return ret
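The same sampler is read with kwargs.get, so existing callers that never pass steps_ahead_sampler keep the old behaviour of forecasting from every origin. A tiny standalone illustration of that fallback:

# Illustration only, not pyFTS code: kwargs.get('steps_ahead_sampler', 1) returns 1
# when the keyword is absent, so every forecasting origin is still used by default.
def sampler_from(**kwargs):
    return kwargs.get('steps_ahead_sampler', 1)

print(sampler_from())                        # 1  -> unchanged behaviour
print(sampler_from(steps_ahead_sampler=10))  # 10 -> one forecast every 10 origins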

View File

@@ -45,10 +45,47 @@ def find_best(dataframe, criteria, ascending):
    return ret

def analytic_tabular_dataframe(dataframe):
    experiments = len(dataframe.columns) - len(base_dataframe_columns()) - 1
    models = dataframe.Model.unique()
    orders = dataframe.Order.unique()
    schemes = dataframe.Scheme.unique()
    partitions = dataframe.Partitions.unique()
    steps = dataframe.Steps.unique()
    measures = dataframe.Measure.unique()
    data_columns = analytical_data_columns(experiments)
    ret = []
    for m in models:
        for o in orders:
            for s in schemes:
                for p in partitions:
                    for st in steps:
                        for ms in measures:
                            df = dataframe[(dataframe.Model == m) & (dataframe.Order == o)
                                           & (dataframe.Scheme == s) & (dataframe.Partitions == p)
                                           & (dataframe.Steps == st) & (dataframe.Measure == ms)]
                            if not df.empty:
                                for col in data_columns:
                                    mod = [m, o, s, p, st, ms, df[col].values[0]]
                                    ret.append(mod)
    dat = pd.DataFrame(ret, columns=tabular_dataframe_columns())
    return dat

def tabular_dataframe_columns():
    return ["Model", "Order", "Scheme", "Partitions", "Steps", "Measure", "Value"]

def base_dataframe_columns():
    return ["Model", "Order", "Scheme", "Partitions", "Size", "Steps", "Method"]

def point_dataframe_synthetic_columns():
    return ["Model", "Order", "Scheme", "Partitions", "Size", "Steps", "Method", "RMSEAVG", "RMSESTD",
            "SMAPEAVG", "SMAPESTD", "UAVG", "USTD", "TIMEAVG", "TIMESTD"]
    return base_dataframe_columns() + ["RMSEAVG", "RMSESTD",
            "SMAPEAVG", "SMAPESTD", "UAVG", "USTD", "TIMEAVG", "TIMESTD"]
def point_dataframe_analytic_columns(experiments):
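analytic_tabular_dataframe reshapes the analytic results CSV into a long table with one Value row per Model/Order/Scheme/Partitions/Steps/Measure combination, which makes the results easy to pivot. A sketch of that use, assuming the CSV written by the example script further below; the groupby/unstack summary is just one possible way to read it, not part of this commit:

import pandas as pd
from pyFTS.benchmarks import Util as bUtil

df = pd.read_csv('pwfts_taiex_partitioning.csv', sep=';')  # file name taken from the example script
tidy = bUtil.analytic_tabular_dataframe(df)

# Average each measure over the experiments and compare partition counts side by side.
summary = tidy.groupby(['Model', 'Order', 'Partitions', 'Measure'])['Value'].mean().unstack('Measure')
print(summary)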

View File

@@ -50,30 +50,43 @@ def __pop(key, default, kwargs):
def sliding_window_benchmarks(data, windowsize, train=0.8, **kwargs):
    """
    Sliding window benchmarks for FTS point forecasters
    Sliding window benchmarks for FTS forecasters.
    For each data window, a train and a test dataset are split. For each train split, a partitioner model is
    built for each combination of partitioning method and number of partitions. For each partitioner, order,
    steps-ahead value and FTS method, a forecasting model is trained.
    All trained models are then benchmarked on the test data and the metrics are stored in a dataframe for
    later analysis.
    The number of experiments is determined by windowsize and inc.
    :param data: the time series data to be benchmarked
    :param windowsize: size of the sliding window
    :param train: percentage of the sliding window data used to train the models
    :param kwargs: dict, optional arguments
    :keyword
    models: FTS point forecasters
    partitioners: Universe of Discourse partitioner
    partitions: the max number of partitions on the Universe of Discourse
    max_order: the max order of the models (for high order models)
    type: the forecasting type, one of these values: point(default), interval or distribution.
    steps_ahead: The forecasting horizon, i. e., the number of steps ahead to forecast
    start: in the multi step forecasting, the index of the data where to start forecasting
    transformation: data transformation
    indexer: seasonal indexer
    progress: If true a progress bar will be displayed during the benchmarks
    distributed: boolean, indicate if the forecasting procedure will be distributed in a dispy cluster
    nodes: a list with the dispy cluster nodes addresses
    benchmark_methods: Non FTS models to benchmark
    benchmark_methods_parameters: Non FTS models parameters
    save: save results
    file: file path to save the results
    sintetic: if true only the average and standard deviation of the results
    inc: a float on interval [0,1] indicating the percentage of the windowsize to move the window
    models: a list with prebuilt FTS objects. The default is None.
    methods: a list with FTS class names. The default depends on the forecasting type and contains the list of all FTS methods.
    partitioners_models: a list with prebuilt Universe of Discourse partitioner objects. The default is None.
    partitioners_methods: a list with Universe of Discourse partitioner class names. The default is [partitioners.Grid.GridPartitioner].
    partitions: a list with the numbers of partitions on the Universe of Discourse. The default is [10].
    orders: a list with orders of the models (for high order models). The default is [1,2,3].
    type: the forecasting type, one of these values: point, interval or distribution. The default is point.
    steps_ahead: a list with the forecasting horizons, i.e., the number of steps ahead to forecast. The default is 1.
    start: in multi-step forecasting, the index of the data where forecasting starts. The default is 0.
    transformation: data transformation. The default is None.
    indexer: seasonal indexer. The default is None.
    progress: if True, a progress bar will be displayed during the benchmarks. The default is False.
    distributed: a boolean value indicating if the forecasting procedure will be distributed in a dispy cluster. The default is False.
    nodes: a list with the dispy cluster node addresses. The default is ['127.0.0.1'].
    benchmark_methods: a list with non-FTS models to benchmark. The default is None.
    benchmark_methods_parameters: a list with non-FTS model parameters. The default is None.
    save: if True, save the results. The default is False.
    file: file path to save the results. The default is None.
    sintetic: if True, return only the average and standard deviation of the results. The default is False.
    :return: DataFrame with the benchmark results
    """
@@ -235,7 +248,6 @@ def sliding_window_benchmarks(data, windowsize, train=0.8, **kwargs):
            if job.status == dispy.DispyJob.Finished and job is not None:
                tmp = job()
                jobs2.append(tmp)
                print(tmp)
            else:
                print("status", job.status)
                print("result", job.result)
@@ -249,8 +261,6 @@ def sliding_window_benchmarks(data, windowsize, train=0.8, **kwargs):
    file = kwargs.get('file', None)
    sintetic = kwargs.get('sintetic', False)
    print(jobs)
    return synthesis_method(jobs, experiments, save, file, sintetic)

View File

@@ -15,17 +15,22 @@ from pyFTS.data import TAIEX
dataset = TAIEX.get_data()
from pyFTS.benchmarks import benchmarks as bchmk
from pyFTS.benchmarks import benchmarks as bchmk, Util as bUtil
from pyFTS.models import pwfts
#'''
bchmk.sliding_window_benchmarks(dataset[:2000], 1000, train=0.8, inc=0.2, methods=[pwfts.ProbabilisticWeightedFTS],
                                benchmark_models=False, orders=[1,2,3], partitions=[30,50,70], #np.arange(10,100,2),
                                progress=False, type='distribution', steps_ahead=[1,4,7,10],
'''
bchmk.sliding_window_benchmarks(dataset, 1000, train=0.8, inc=0.2, methods=[pwfts.ProbabilisticWeightedFTS],
                                benchmark_models=False, orders=[1,2,3], partitions=np.arange(10,100,5),
                                progress=False, type='point',
                                #steps_ahead=[1,4,7,10], steps_ahead_sampler=10,
                                distributed=True, nodes=['192.168.0.102','192.168.0.106','192.168.0.110'],
                                save=True, file="pwfts_taiex_distribution.csv")
#'''
                                save=True, file="pwfts_taiex_partitioning.csv")
'''
dat = pd.read_csv('pwfts_taiex_partitioning.csv', sep=';')
print(bUtil.analytic_tabular_dataframe(dat))
#print(dat["Size"].values[0])
'''
train_split = 2000