From 95c6e90bfb9d5b2a5b75d5b61045a95eed385781 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Petr=C3=B4nio=20C=C3=A2ndido?= <petronio.candido@gmail.com>
Date: Wed, 25 Apr 2018 18:47:51 -0300
Subject: [PATCH] Bugfixes in Measures and optimizations on benchmarks

---
 pyFTS/benchmarks/Measures.py      |  32 +++-----
 pyFTS/benchmarks/benchmarks.py    | 129 ++++++++++++++++--------------
 pyFTS/benchmarks/knn.py           |   2 +
 pyFTS/benchmarks/quantreg.py      |   4 +-
 pyFTS/models/ensemble/ensemble.py |   5 +-
 pyFTS/tests/general.py            |  30 ++++---
 6 files changed, 108 insertions(+), 94 deletions(-)

diff --git a/pyFTS/benchmarks/Measures.py b/pyFTS/benchmarks/Measures.py
index e4338e2..cf6697f 100644
--- a/pyFTS/benchmarks/Measures.py
+++ b/pyFTS/benchmarks/Measures.py
@@ -239,6 +239,9 @@ def pmf_to_cdf(density):
     return df
 
 
+def heavyside(bin, target):
+    return 1 if bin >= target else 0  # step function: 1 once bin reaches target, otherwise 0
+
 def heavyside_cdf(bins, targets):
     ret = []
     for t in targets:
@@ -255,24 +258,13 @@ def crps(targets, densities):
     :return: float
     '''
     _crps = float(0.0)
-    if isinstance(densities, pd.DataFrame):
-        l = len(densities.columns)
-        n = len(densities.index)
-        Ff = pmf_to_cdf(densities)
-        Fa = heavyside_cdf(densities.columns, targets)
-        for k in densities.index:
-            _crps += sum([ (Ff[col][k]-Fa[col][k])**2 for col in densities.columns])
-    elif isinstance(densities, ProbabilityDistribution.ProbabilityDistribution):
-        l = len(densities.bins)
-        n = 1
-        Fa = heavyside_cdf(densities.bins, targets)
-        _crps = sum([(densities.cummulative(val) - Fa[val][0]) ** 2 for val in densities.bins])
-    elif isinstance(densities, list):
-        l = len(densities[0].bins)
-        n = len(densities)
-        Fa = heavyside_cdf(densities[0].bins, targets)
-        for df in densities:
-            _crps += sum([(df.cummulative(val) - Fa[val][0]) ** 2 for val in df.bins])
+    if isinstance(densities, ProbabilityDistribution.ProbabilityDistribution):
+        densities = [densities]
+
+    l = len(densities[0].bins)
+    n = len(densities)
+    for ct, df in enumerate(densities):
+        _crps += sum([(df.cummulative(bin) - (1 if bin >= targets[ct] else 0)) ** 2 for bin in df.bins])
 
     return _crps / float(l * n)
 
@@ -387,8 +379,9 @@ def get_distribution_statistics(data, model, **kwargs):
         _s1 = time.time()
         forecasts = model.predict(data, **kwargs)
         _e1 = time.time()
-        ret.append(round(crps(data[model.order:], forecasts), 3))
+        ret.append(round(crps(data[model.order:], forecasts[:-1]), 3))
         ret.append(round(_e1 - _s1, 3))
+        ret.append(round(brier_score(data[model.order:], forecasts[:-1]), 3))
     else:
         skip = kwargs.get('steps_ahead_sampler', 1)
         forecasts = []
@@ -402,6 +395,7 @@ def get_distribution_statistics(data, model, **kwargs):
         start = model.order + steps_ahead
         ret.append(round(crps(data[start:-1:skip], forecasts), 3))
         ret.append(round(_e1 - _s1, 3))
+        ret.append(round(brier_score(data[start:-1:skip], forecasts), 3))
     return ret
 
 
diff --git a/pyFTS/benchmarks/benchmarks.py b/pyFTS/benchmarks/benchmarks.py
index bfe22fe..8e251c9 100644
--- a/pyFTS/benchmarks/benchmarks.py
+++ b/pyFTS/benchmarks/benchmarks.py
@@ -48,6 +48,38 @@ def __pop(key, default, kwargs):
         return default
 
 
+def get_benchmark_point_methods():
+    """Return the list of non-FTS model classes for point forecasting"""
+    return [naive.Naive, arima.ARIMA, quantreg.QuantileRegression]
+
+
+def get_point_methods():
+    """Return the list of FTS model classes for point forecasting"""
+    return [song.ConventionalFTS, chen.ConventionalFTS, yu.WeightedFTS, ismailefendi.ImprovedWeightedFTS,
+            cheng.TrendWeightedFTS, sadaei.ExponentialyWeightedFTS, hofts.HighOrderFTS, hwang.HighOrderFTS,
+            pwfts.ProbabilisticWeightedFTS]
+
+
+def get_benchmark_interval_methods():
+    """Return the list of non-FTS model classes for interval forecasting"""
+    return [ arima.ARIMA, quantreg.QuantileRegression]
+
+
+def get_interval_methods():
+    """Return the list of FTS model classes for interval forecasting"""
+    return [ifts.IntervalFTS, pwfts.ProbabilisticWeightedFTS]
+
+
+def get_probabilistic_methods():
+    """Return the list of FTS model classes for probabilistic forecasting"""
+    return [ensemble.AllMethodEnsembleFTS, pwfts.ProbabilisticWeightedFTS]
+
+
+def get_benchmark_probabilistic_methods():
+    """Return the list of non-FTS model classes for probabilistic forecasting"""
+    return [arima.ARIMA, quantreg.QuantileRegression, knn.KNearestNeighbors]
+
+
 def sliding_window_benchmarks(data, windowsize, train=0.8, **kwargs):
     """
     Sliding window benchmarks for FTS forecasters.
@@ -141,6 +173,8 @@ def sliding_window_benchmarks(data, windowsize, train=0.8, **kwargs):
     benchmark_methods = __pop("benchmark_methods", None, kwargs)
     benchmark_methods_parameters = __pop("benchmark_methods_parameters", None, kwargs)
 
+    benchmark_pool = list(benchmark_models) if isinstance(benchmark_models, list) else []  # None/False -> empty pool (False used to leak through and break the later `for model in benchmark_pool`); copy so the caller's list is not mutated by append
+
     if benchmark_models != False:
 
         if benchmark_models is None and benchmark_methods is None:
@@ -151,13 +185,13 @@ def sliding_window_benchmarks(data, windowsize, train=0.8, **kwargs):
             elif type == 'distribution':
                 benchmark_methods = get_benchmark_probabilistic_methods()
 
-        if isinstance(benchmark_models, list) :
-            pool.extend(benchmark_models)
-        elif benchmark_methods is not None:
-            for count, model in enumerate(benchmark_methods, start=0):
-                par = benchmark_methods_parameters[count]
-                mfts = model("", **par)
-                pool.append(mfts)
+        if benchmark_methods is not None:
+            for transformation in transformations:
+                for count, model in enumerate(benchmark_methods, start=0):
+                    par = benchmark_methods_parameters[count]
+                    mfts = model("", **par)
+                    mfts.append_transformation(transformation)
+                    benchmark_pool.append(mfts)
 
     if type == 'point':
         experiment_method = run_point
@@ -184,6 +218,10 @@ def sliding_window_benchmarks(data, windowsize, train=0.8, **kwargs):
 
     inc = __pop("inc", 0.1, kwargs)
 
+    file = kwargs.get('file', "benchmarks.db")
+
+    conn = bUtil.open_benchmark_db(file)
+
     for ct, train, test in cUtil.sliding_window(data, windowsize, train, inc=inc, **kwargs):
         experiments += 1
 
@@ -192,6 +230,18 @@ def sliding_window_benchmarks(data, windowsize, train=0.8, **kwargs):
 
         partitioners_pool = []
 
+        for model in benchmark_pool:
+            for step in steps_ahead:
+                kwargs['steps_ahead'] = step
+
+                if not distributed:
+                    job = experiment_method(deepcopy(model), None, train, test, **kwargs)
+                    synthesis_method(dataset, tag, job, conn)
+                else:
+                    job = cluster.submit(deepcopy(model), None, train, test, **kwargs)
+                    jobs.append(job)
+
+
         if partitioners_models is None:
 
             for transformation in transformations:
@@ -210,10 +260,6 @@ def sliding_window_benchmarks(data, windowsize, train=0.8, **kwargs):
         if progress:
             rng1 = tqdm(steps_ahead, desc="Steps")
 
-        file = kwargs.get('file', "benchmarks.db")
-
-        conn = bUtil.open_benchmark_db(file)
-
         for step in rng1:
             rng2 = partitioners_pool
 
@@ -267,36 +313,6 @@ def sliding_window_benchmarks(data, windowsize, train=0.8, **kwargs):
     conn.close()
 
 
-def get_benchmark_point_methods():
-    """Return all non FTS methods for point forecasting"""
-    return [naive.Naive, arima.ARIMA, quantreg.QuantileRegression]
-
-
-def get_point_methods():
-    """Return all FTS methods for point forecasting"""
-    return [song.ConventionalFTS, chen.ConventionalFTS, yu.WeightedFTS, ismailefendi.ImprovedWeightedFTS,
-            cheng.TrendWeightedFTS, sadaei.ExponentialyWeightedFTS, hofts.HighOrderFTS, hwang.HighOrderFTS,
-            pwfts.ProbabilisticWeightedFTS]
-
-
-def get_benchmark_interval_methods():
-    """Return all non FTS methods for point_to_interval forecasting"""
-    return [ arima.ARIMA, quantreg.QuantileRegression]
-
-
-def get_interval_methods():
-    """Return all FTS methods for point_to_interval forecasting"""
-    return [ifts.IntervalFTS, pwfts.ProbabilisticWeightedFTS]
-
-
-def get_probabilistic_methods():
-    """Return all FTS methods for probabilistic forecasting"""
-    return [ensemble.AllMethodEnsembleFTS, pwfts.ProbabilisticWeightedFTS]
-
-
-def get_benchmark_probabilistic_methods():
-    """Return all FTS methods for probabilistic forecasting"""
-    return [arima.ARIMA, quantreg.QuantileRegression, knn.KNearestNeighbors]
 
 
 def run_point(mfts, partitioner, train_data, test_data, window_key=None, **kwargs):
@@ -336,7 +352,6 @@ def run_point(mfts, partitioner, train_data, test_data, window_key=None, **kwarg
 
     if mfts.benchmark_only:
         _key = mfts.shortname + str(mfts.order if mfts.order is not None else "")
-        mfts.append_transformation(partitioner.transformation)
     else:
         pttr = str(partitioner.__module__).split('.')[-1]
         _key = mfts.shortname + " n = " + str(mfts.order) + " " + pttr + " q = " + str(partitioner.partitions)
@@ -347,7 +362,7 @@ def run_point(mfts, partitioner, train_data, test_data, window_key=None, **kwarg
     _key += str(method) if method is not None else ""
 
     _start = time.time()
-    mfts.fit(train_data, order=mfts.order, **kwargs)
+    mfts.fit(train_data, **kwargs)
     _end = time.time()
     times = _end - _start
 
@@ -392,7 +407,6 @@ def run_interval(mfts, partitioner, train_data, test_data, window_key=None, **kw
     method = kwargs.get('method', None)
 
     if mfts.benchmark_only:
-        mfts.append_transformation(partitioner.transformation)
         _key = mfts.shortname + str(mfts.order if mfts.order is not None else "") + str(mfts.alpha)
     else:
         pttr = str(partitioner.__module__).split('.')[-1]
@@ -404,7 +418,7 @@ def run_interval(mfts, partitioner, train_data, test_data, window_key=None, **kw
     _key += str(method) if method is not None else ""
 
     _start = time.time()
-    mfts.fit(train_data, order=mfts.order, **kwargs)
+    mfts.fit(train_data, **kwargs)
     _end = time.time()
     times = _end - _start
 
@@ -456,7 +470,6 @@ def run_probabilistic(mfts, partitioner, train_data, test_data, window_key=None,
 
     if mfts.benchmark_only:
         _key = mfts.shortname + str(mfts.order if mfts.order is not None else "") + str(mfts.alpha)
-        mfts.append_transformation(partitioner.transformation)
     else:
         pttr = str(partitioner.__module__).split('.')[-1]
         _key = mfts.shortname + " n = " + str(mfts.order) + " " + pttr + " q = " + str(partitioner.partitions)
@@ -469,20 +482,15 @@ def run_probabilistic(mfts, partitioner, train_data, test_data, window_key=None,
     if mfts.has_seasonality:
         mfts.indexer = indexer
 
-    try:
-        _start = time.time()
-        mfts.fit(train_data, order=mfts.order)
-        _end = time.time()
-        times = _end - _start
+    _start = time.time()
+    mfts.fit(train_data, **kwargs)
+    _end = time.time()
+    times = _end - _start
 
-        _crps1, _t1 = Measures.get_distribution_statistics(test_data, mfts, **kwargs)
-        _t1 += times
-    except Exception as e:
-        print(e)
-        _crps1 = np.nan
-        _t1 = np.nan
+    _crps1, _t1, _brier = Measures.get_distribution_statistics(test_data, mfts, **kwargs)
+    _t1 += times
 
-    ret = {'key': _key, 'obj': mfts, 'CRPS': _crps1, 'time': _t1, 'window': window_key,
+    ret = {'key': _key, 'obj': mfts, 'CRPS': _crps1, 'time': _t1, 'brier': _brier, 'window': window_key,
            'steps': steps_ahead, 'method': method}
 
     return ret
@@ -541,11 +549,14 @@ def process_probabilistic_jobs(dataset, tag,  job, conn):
     data = bUtil.process_common_data(dataset, tag,  'density', job)
 
     crps = deepcopy(data)
-    crps.extend(["CRPS",job["CRPS"]])
+    crps.extend(["crps",job["CRPS"]])
     bUtil.insert_benchmark(crps, conn)
     time = deepcopy(data)
     time.extend(["time", job["time"]])
     bUtil.insert_benchmark(time, conn)
+    brier = deepcopy(data)
+    brier.extend(["brier", job["brier"]])
+    bUtil.insert_benchmark(brier, conn)
 
 
 def print_point_statistics(data, models, externalmodels = None, externalforecasts = None, indexers=None):
diff --git a/pyFTS/benchmarks/knn.py b/pyFTS/benchmarks/knn.py
index c957f52..58814b6 100644
--- a/pyFTS/benchmarks/knn.py
+++ b/pyFTS/benchmarks/knn.py
@@ -6,6 +6,7 @@ from statsmodels.tsa.tsatools import lagmat
 from pyFTS.common import fts
 from pyFTS.probabilistic import ProbabilityDistribution
 
+
 class KNearestNeighbors(fts.FTS):
     """
     K-Nearest Neighbors
@@ -13,6 +14,7 @@ class KNearestNeighbors(fts.FTS):
     def __init__(self, name, **kwargs):
         super(KNearestNeighbors, self).__init__(1, "kNN"+name)
         self.name = "kNN"
+        self.shortname = "kNN"
         self.detail = "K-Nearest Neighbors"
         self.is_high_order = True
         self.has_point_forecasting = True
diff --git a/pyFTS/benchmarks/quantreg.py b/pyFTS/benchmarks/quantreg.py
index 83f184b..fe6e3f4 100644
--- a/pyFTS/benchmarks/quantreg.py
+++ b/pyFTS/benchmarks/quantreg.py
@@ -8,6 +8,7 @@ from statsmodels.tsa.tsatools import lagmat
 from pyFTS.common import SortedCollection, fts
 from pyFTS.probabilistic import ProbabilityDistribution
 
+
 class QuantileRegression(fts.FTS):
     """Façade for statsmodels.regression.quantile_regression"""
     def __init__(self, name, **kwargs):
@@ -26,10 +27,11 @@ class QuantileRegression(fts.FTS):
         self.mean_qt = None
         self.lower_qt = None
         self.dist_qt = None
+        self.order = kwargs.get('order', 1)
         self.shortname = "QAR("+str(self.order)+","+str(self.alpha)+")"
 
     def train(self, data, **kwargs):
-        if kwargs.get('order', None) is not None:
+        if 'order' in kwargs:
             self.order = kwargs.get('order', 1)
 
         if self.indexer is not None and isinstance(data, pd.DataFrame):
diff --git a/pyFTS/models/ensemble/ensemble.py b/pyFTS/models/ensemble/ensemble.py
index 8bac976..3282b7c 100644
--- a/pyFTS/models/ensemble/ensemble.py
+++ b/pyFTS/models/ensemble/ensemble.py
@@ -246,9 +246,10 @@ class EnsembleFTS(fts.FTS):
 
 
 class AllMethodEnsembleFTS(EnsembleFTS):
-    def __init__(self, **kwargs):
-        super(AllMethodEnsembleFTS, self).__init__(name="Ensemble FTS", **kwargs)
+    def __init__(self, name="", **kwargs):  # name defaults to "" so existing keyword-only callers (e.g. partitioner=...) keep working
+        super(AllMethodEnsembleFTS, self).__init__(name="Ensemble FTS"+name, **kwargs)
         self.min_order = 3
+        self.shortname ="Ensemble FTS"
 
     def set_transformations(self, model):
         for t in self.transformations:
diff --git a/pyFTS/tests/general.py b/pyFTS/tests/general.py
index eae55c7..bdb10f9 100644
--- a/pyFTS/tests/general.py
+++ b/pyFTS/tests/general.py
@@ -20,15 +20,17 @@ partitioner = Grid.GridPartitioner(data=dataset[:800], npart=10) #, transformati
 '''
 from pyFTS.benchmarks import benchmarks as bchmk, Util as bUtil, Measures, knn, quantreg, arima
 
-'''
+
 from pyFTS.models import pwfts, song, ifts
 from pyFTS.models.ensemble import ensemble
 
-model = ensemble.AllMethodEnsembleFTS(partitioner=partitioner)
+'''
+model = knn.KNearestNeighbors("")
 model.fit(dataset[:800])
-tmp = model.predict(dataset[800:1000], type='distribution')
-for tmp2 in tmp:
-    print(tmp2)
+Measures.get_distribution_statistics(dataset[800:1000], model)
+#tmp = model.predict(dataset[800:1000], type='distribution')
+#for tmp2 in tmp:
+#    print(tmp2)
 '''
 
 
@@ -49,11 +51,12 @@ print(Measures.get_distribution_statistics(dataset[800:1000], model, steps_ahead
 
 from pyFTS.benchmarks import arima, naive, quantreg
 
-bchmk.sliding_window_benchmarks(dataset, 1000, train=0.8, inc=0.2,
-                                #methods=[ifts.IntervalFTS], #[pwfts.ProbabilisticWeightedFTS],
-                                benchmark_models=False,
+bchmk.sliding_window_benchmarks(dataset[:1000], 1000, train=0.8, inc=0.2,
+                                #methods=[pwfts.ProbabilisticWeightedFTS],
+                                benchmark_models=[],
                                 benchmark_methods=[arima.ARIMA for k in range(4)]
-                                    + [quantreg.QuantileRegression for k in range(2)],
+                                    + [quantreg.QuantileRegression for k in range(2)]
+                                    + [knn.KNearestNeighbors],
                                 benchmark_methods_parameters=[
                                     {'order': (1, 0, 0)},
                                     {'order': (1, 0, 1)},
@@ -61,14 +64,15 @@ bchmk.sliding_window_benchmarks(dataset, 1000, train=0.8, inc=0.2,
                                     {'order': (2, 0, 2)},
                                     {'order': 1, 'dist': True},
                                     {'order': 2, 'dist': True},
+                                    {}
                                 ],
-                                #transformations=[None, tdiff],
-                                orders=[1, 2, 3],
+                                #transformations=[tdiff],
+                                orders=[1],
                                 partitions=np.arange(30, 80, 5),
                                 progress=False, type='distribution',
                                 #steps_ahead=[1,4,7,10], #steps_ahead=[1]
-                                distributed=True, nodes=['192.168.0.110', '192.168.0.107','192.168.0.106'],
-                                file="benchmarks.db", dataset="TAIEX", tag="comparisons")
+                                #distributed=True, nodes=['192.168.0.110', '192.168.0.107','192.168.0.106'],
+                                file="benchmarks.tmp", dataset="TAIEX", tag="comparisons")
 
 
 #'''