From 191ddf90d8086ee363f20b488766ab05bcfe4f13 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Petr=C3=B4nio=20C=C3=A2ndido?= <petronio.candido@gmail.com>
Date: Thu, 26 Apr 2018 11:53:53 -0300
Subject: [PATCH] Small bugfixes in benchmarks and benchmark models

---
 pyFTS/benchmarks/Measures.py                  | 12 +++--
 pyFTS/benchmarks/benchmarks.py                | 37 ++++++++--------
 pyFTS/benchmarks/knn.py                       |  9 ++--
 pyFTS/data/SP500.py                           | 10 ++++-
 pyFTS/data/TAIEX.py                           |  2 +-
 .../probabilistic/ProbabilityDistribution.py  |  3 +-
 pyFTS/tests/general.py                        | 44 ++++++++-----------
 7 files changed, 63 insertions(+), 54 deletions(-)

diff --git a/pyFTS/benchmarks/Measures.py b/pyFTS/benchmarks/Measures.py
index cf6697f..7bb4f52 100644
--- a/pyFTS/benchmarks/Measures.py
+++ b/pyFTS/benchmarks/Measures.py
@@ -219,10 +219,14 @@ def brier_score(targets, densities):
     '''Brier (1950). "Verification of Forecasts Expressed in Terms of Probability". Monthly Weather Review. 78: 1–3. '''
     ret = []
     for ct, d in enumerate(densities):
-        v = d.bin_index.find_ge(targets[ct])
-        score = sum([d.distribution[k] ** 2 for k in d.bins if k != v])
-        score += (d.distribution[v] - 1) ** 2
-        ret.append(score)
+        try:
+            v = d.bin_index.find_ge(targets[ct])
+
+            score = sum([d.distribution[k] ** 2 for k in d.bins if k != v])
+            score += (d.distribution[v] - 1) ** 2
+            ret.append(score)
+        except ValueError as ex:
+            ret.append(sum([d.distribution[k] ** 2 for k in d.bins]))
     return sum(ret)/len(ret)
 
 
diff --git a/pyFTS/benchmarks/benchmarks.py b/pyFTS/benchmarks/benchmarks.py
index 8e251c9..9c8852a 100644
--- a/pyFTS/benchmarks/benchmarks.py
+++ b/pyFTS/benchmarks/benchmarks.py
@@ -168,15 +168,16 @@ def sliding_window_benchmarks(data, windowsize, train=0.8, **kwargs):
                 mfts.order = 1
                 pool.append(mfts)
 
-    benchmark_models = __pop("benchmark_models", None, kwargs)
-
-    benchmark_methods = __pop("benchmark_methods", None, kwargs)
-    benchmark_methods_parameters = __pop("benchmark_methods_parameters", None, kwargs)
-
-    benchmark_pool = [] if benchmark_models is None else benchmark_models
+    benchmark_models = __pop("benchmark_models", False, kwargs)
 
     if benchmark_models != False:
 
+        benchmark_methods = __pop("benchmark_methods", None, kwargs)
+        benchmark_methods_parameters = __pop("benchmark_methods_parameters", None, kwargs)
+
+        benchmark_pool = [] if ( benchmark_models is None or not isinstance(benchmark_models, list)) \
+            else benchmark_models
+
         if benchmark_models is None and benchmark_methods is None:
             if type == 'point'or type  == 'partition':
                 benchmark_methods = get_benchmark_point_methods()
@@ -228,20 +229,20 @@ def sliding_window_benchmarks(data, windowsize, train=0.8, **kwargs):
         if progress:
             progressbar.update(windowsize * inc)
 
+        if benchmark_models != False:
+            for model in benchmark_pool:
+                for step in steps_ahead:
+                    kwargs['steps_ahead'] = step
+
+                    if not distributed:
+                        job = experiment_method(deepcopy(model), None, train, test, **kwargs)
+                        synthesis_method(dataset, tag, job, conn)
+                    else:
+                        job = cluster.submit(deepcopy(model), None, train, test, **kwargs)
+                        jobs.append(job)
+
         partitioners_pool = []
 
-        for model in benchmark_pool:
-            for step in steps_ahead:
-                kwargs['steps_ahead'] = step
-
-                if not distributed:
-                    job = experiment_method(deepcopy(model), None, train, test, **kwargs)
-                    synthesis_method(dataset, tag, job, conn)
-                else:
-                    job = cluster.submit(deepcopy(model), None, train, test, **kwargs)
-                    jobs.append(job)
-
-
         if partitioners_models is None:
 
             for transformation in transformations:
diff --git a/pyFTS/benchmarks/knn.py b/pyFTS/benchmarks/knn.py
index 58814b6..036e510 100644
--- a/pyFTS/benchmarks/knn.py
+++ b/pyFTS/benchmarks/knn.py
@@ -26,12 +26,15 @@ class KNearestNeighbors(fts.FTS):
         self.order = kwargs.get("order", 1)
         self.lag = None
         self.k = kwargs.get("k", 30)
+        self.uod = None
 
     def train(self, data, **kwargs):
         if kwargs.get('order', None) is not None:
             self.order = kwargs.get('order', 1)
 
-        self.data = data
+        self.data = np.array(data)
+        self.original_max = max(data)
+        self.original_min = min(data)
 
         #self.lagdata, = lagmat(data, maxlag=self.order, trim="both", original='sep')
 
@@ -47,8 +50,8 @@ class KNearestNeighbors(fts.FTS):
                 dist.append(sum([ (self.data[k - kk] - sample[kk])**2 for kk in range(self.order)]))
             ix = np.argsort(np.array(dist)) + self.order + 1
 
-        ix = np.clip(ix, 0, len(self.data)-1 )
-        return self.data[ix[:self.k]]
+        ix2 = np.clip(ix[:self.k], 0, len(self.data)-1)
+        return self.data[ix2]
 
     def forecast_distribution(self, data, **kwargs):
         ret = []
diff --git a/pyFTS/data/SP500.py b/pyFTS/data/SP500.py
index fed0f5a..2b105a6 100644
--- a/pyFTS/data/SP500.py
+++ b/pyFTS/data/SP500.py
@@ -3,6 +3,15 @@ import pandas as pd
 import numpy as np
 
 
+def get_data():
+    """
+    Get the univariate time series data (the "Avg" column of the SP500 dataset).
+    :return: numpy array
+    """
+    dat = get_dataframe()
+    return np.array(dat["Avg"])
+
+
 def get_dataframe():
     """
     Get the complete multivariate time series data.
@@ -11,6 +20,5 @@ def get_dataframe():
     dat = common.get_dataframe('SP500.csv.bz2',
                                'https://github.com/petroniocandido/pyFTS/raw/8f20f3634aa6a8f58083bdcd1bbf93795e6ed767/pyFTS/data/SP500.csv.bz2',
                                sep=",", compression='bz2')
-    dat = np.array(dat["Avg"])
     return dat
 
diff --git a/pyFTS/data/TAIEX.py b/pyFTS/data/TAIEX.py
index 07857b6..ff2099e 100644
--- a/pyFTS/data/TAIEX.py
+++ b/pyFTS/data/TAIEX.py
@@ -5,7 +5,7 @@ import numpy as np
 
 def get_data():
     """
-    :param field: the dataset field name to extract
+    Get the univariate time series data.
     :return: numpy array
     """
     dat = get_dataframe()
diff --git a/pyFTS/probabilistic/ProbabilityDistribution.py b/pyFTS/probabilistic/ProbabilityDistribution.py
index 97b579d..f1a31f8 100644
--- a/pyFTS/probabilistic/ProbabilityDistribution.py
+++ b/pyFTS/probabilistic/ProbabilityDistribution.py
@@ -88,7 +88,7 @@ class ProbabilityDistribution(object):
         for k in values:
             if self.type == "histogram":
                 v = self.bin_index.find_ge(k)
-                ret.append(self.distribution[v] / self.count)
+                ret.append(self.distribution[v] / (self.count + 1e-5))
             elif self.type == "KDE":
                 v = self.kde.probability(k, self.data)
                 ret.append(v)
@@ -119,7 +119,6 @@ class ProbabilityDistribution(object):
         self.cdf = None
         self.qtl = None
 
-
     def expected_value(self):
         return np.nansum([v * self.distribution[v] for v in self.bins])
 
diff --git a/pyFTS/tests/general.py b/pyFTS/tests/general.py
index bdb10f9..d934f6f 100644
--- a/pyFTS/tests/general.py
+++ b/pyFTS/tests/general.py
@@ -11,9 +11,11 @@ from pyFTS.common import Transformations
 
 tdiff = Transformations.Differential(1)
 
-from pyFTS.data import TAIEX
+from pyFTS.data import TAIEX, SP500
 
-dataset = TAIEX.get_data()
+#dataset = TAIEX.get_data()
+dataset = SP500.get_data()[11500:16000]
+#print(len(dataset))
 '''
 from pyFTS.partitioners import Grid, Util as pUtil
 partitioner = Grid.GridPartitioner(data=dataset[:800], npart=10) #, transformation=tdiff)
@@ -25,13 +27,17 @@ from pyFTS.models import pwfts, song, ifts
 from pyFTS.models.ensemble import ensemble
 
 '''
-model = knn.KNearestNeighbors("")
+#model = knn.KNearestNeighbors("")
+#model = ensemble.AllMethodEnsembleFTS("", partitioner=partitioner)
+#model = arima.ARIMA("", order=(2,0,2))
+#model = quantreg.QuantileRegression("", order=2, dist=True)
+model.append_transformation(tdiff)
 model.fit(dataset[:800])
 Measures.get_distribution_statistics(dataset[800:1000], model)
 #tmp = model.predict(dataset[800:1000], type='distribution')
 #for tmp2 in tmp:
 #    print(tmp2)
-'''
+#'''
 
 
 '''
@@ -51,28 +57,16 @@ print(Measures.get_distribution_statistics(dataset[800:1000], model, steps_ahead
 
 from pyFTS.benchmarks import arima, naive, quantreg
 
-bchmk.sliding_window_benchmarks(dataset[:1000], 1000, train=0.8, inc=0.2,
-                                #methods=[pwfts.ProbabilisticWeightedFTS],
-                                benchmark_models=[],
-                                benchmark_methods=[arima.ARIMA for k in range(4)]
-                                    + [quantreg.QuantileRegression for k in range(2)]
-                                    + [knn.KNearestNeighbors],
-                                benchmark_methods_parameters=[
-                                    {'order': (1, 0, 0)},
-                                    {'order': (1, 0, 1)},
-                                    {'order': (2, 0, 1)},
-                                    {'order': (2, 0, 2)},
-                                    {'order': 1, 'dist': True},
-                                    {'order': 2, 'dist': True},
-                                    {}
-                                ],
-                                #transformations=[tdiff],
-                                orders=[1],
-                                partitions=np.arange(30, 80, 5),
-                                progress=False, type='distribution',
+bchmk.sliding_window_benchmarks(dataset, 1000, train=0.8, inc=0.2,
+                                methods=[pwfts.ProbabilisticWeightedFTS],
+                                benchmark_models=False,
+                                transformations=[tdiff],
+                                orders=[1,2,3],
+                                partitions=np.arange(3, 50, 2),
+                                progress=False, type='point',
                                 #steps_ahead=[1,4,7,10], #steps_ahead=[1]
-                                #distributed=True, nodes=['192.168.0.110', '192.168.0.107','192.168.0.106'],
-                                file="benchmarks.tmp", dataset="TAIEX", tag="comparisons")
+                                distributed=True, nodes=['192.168.0.110', '192.168.0.107','192.168.0.106'],
+                                file="benchmarks.db", dataset="SP500", tag="partitioning")
 
 
 #'''