From 191ddf90d8086ee363f20b488766ab05bcfe4f13 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Petr=C3=B4nio=20C=C3=A2ndido?= Date: Thu, 26 Apr 2018 11:53:53 -0300 Subject: [PATCH] Small bugfixes in benchmarks and benchmark models --- pyFTS/benchmarks/Measures.py | 12 +++-- pyFTS/benchmarks/benchmarks.py | 37 ++++++++-------- pyFTS/benchmarks/knn.py | 9 ++-- pyFTS/data/SP500.py | 10 ++++- pyFTS/data/TAIEX.py | 2 +- .../probabilistic/ProbabilityDistribution.py | 3 +- pyFTS/tests/general.py | 44 ++++++++----------- 7 files changed, 63 insertions(+), 54 deletions(-) diff --git a/pyFTS/benchmarks/Measures.py b/pyFTS/benchmarks/Measures.py index cf6697f..7bb4f52 100644 --- a/pyFTS/benchmarks/Measures.py +++ b/pyFTS/benchmarks/Measures.py @@ -219,10 +219,14 @@ def brier_score(targets, densities): '''Brier (1950). "Verification of Forecasts Expressed in Terms of Probability". Monthly Weather Review. 78: 1–3. ''' ret = [] for ct, d in enumerate(densities): - v = d.bin_index.find_ge(targets[ct]) - score = sum([d.distribution[k] ** 2 for k in d.bins if k != v]) - score += (d.distribution[v] - 1) ** 2 - ret.append(score) + try: + v = d.bin_index.find_ge(targets[ct]) + + score = sum([d.distribution[k] ** 2 for k in d.bins if k != v]) + score += (d.distribution[v] - 1) ** 2 + ret.append(score) + except ValueError as ex: + ret.append(sum([d.distribution[k] ** 2 for k in d.bins])) return sum(ret)/len(ret) diff --git a/pyFTS/benchmarks/benchmarks.py b/pyFTS/benchmarks/benchmarks.py index 8e251c9..9c8852a 100644 --- a/pyFTS/benchmarks/benchmarks.py +++ b/pyFTS/benchmarks/benchmarks.py @@ -168,15 +168,16 @@ def sliding_window_benchmarks(data, windowsize, train=0.8, **kwargs): mfts.order = 1 pool.append(mfts) - benchmark_models = __pop("benchmark_models", None, kwargs) - - benchmark_methods = __pop("benchmark_methods", None, kwargs) - benchmark_methods_parameters = __pop("benchmark_methods_parameters", None, kwargs) - - benchmark_pool = [] if benchmark_models is None else benchmark_models + benchmark_models = __pop("benchmark_models", False, kwargs) if benchmark_models != False: + benchmark_methods = __pop("benchmark_methods", None, kwargs) + benchmark_methods_parameters = __pop("benchmark_methods_parameters", None, kwargs) + + benchmark_pool = [] if ( benchmark_models is None or not isinstance(benchmark_models, list)) \ + else benchmark_models + if benchmark_models is None and benchmark_methods is None: if type == 'point'or type == 'partition': benchmark_methods = get_benchmark_point_methods() @@ -228,20 +229,20 @@ def sliding_window_benchmarks(data, windowsize, train=0.8, **kwargs): if progress: progressbar.update(windowsize * inc) + if benchmark_models != False: + for model in benchmark_pool: + for step in steps_ahead: + kwargs['steps_ahead'] = step + + if not distributed: + job = experiment_method(deepcopy(model), None, train, test, **kwargs) + synthesis_method(dataset, tag, job, conn) + else: + job = cluster.submit(deepcopy(model), None, train, test, **kwargs) + jobs.append(job) + partitioners_pool = [] - for model in benchmark_pool: - for step in steps_ahead: - kwargs['steps_ahead'] = step - - if not distributed: - job = experiment_method(deepcopy(model), None, train, test, **kwargs) - synthesis_method(dataset, tag, job, conn) - else: - job = cluster.submit(deepcopy(model), None, train, test, **kwargs) - jobs.append(job) - - if partitioners_models is None: for transformation in transformations: diff --git a/pyFTS/benchmarks/knn.py b/pyFTS/benchmarks/knn.py index 58814b6..036e510 100644 --- a/pyFTS/benchmarks/knn.py +++ b/pyFTS/benchmarks/knn.py @@ -26,12 +26,15 @@ class KNearestNeighbors(fts.FTS): self.order = kwargs.get("order", 1) self.lag = None self.k = kwargs.get("k", 30) + self.uod = None def train(self, data, **kwargs): if kwargs.get('order', None) is not None: self.order = kwargs.get('order', 1) - self.data = data + self.data = np.array(data) + self.original_max = max(data) + self.original_min = min(data) #self.lagdata, = lagmat(data, maxlag=self.order, trim="both", original='sep') @@ -47,8 +50,8 @@ class KNearestNeighbors(fts.FTS): dist.append(sum([ (self.data[k - kk] - sample[kk])**2 for kk in range(self.order)])) ix = np.argsort(np.array(dist)) + self.order + 1 - ix = np.clip(ix, 0, len(self.data)-1 ) - return self.data[ix[:self.k]] + ix2 = np.clip(ix[:self.k], 0, len(self.data)-1) + return self.data[ix2] def forecast_distribution(self, data, **kwargs): ret = [] diff --git a/pyFTS/data/SP500.py b/pyFTS/data/SP500.py index fed0f5a..2b105a6 100644 --- a/pyFTS/data/SP500.py +++ b/pyFTS/data/SP500.py @@ -3,6 +3,15 @@ import pandas as pd import numpy as np +def get_data(): + """ + Get the univariate time series data. + :return: numpy array + """ + dat = get_dataframe() + return np.array(dat["Avg"]) + + def get_dataframe(): """ Get the complete multivariate time series data. @@ -11,6 +20,5 @@ def get_dataframe(): dat = common.get_dataframe('SP500.csv.bz2', 'https://github.com/petroniocandido/pyFTS/raw/8f20f3634aa6a8f58083bdcd1bbf93795e6ed767/pyFTS/data/SP500.csv.bz2', sep=",", compression='bz2') - dat = np.array(dat["Avg"]) return dat diff --git a/pyFTS/data/TAIEX.py b/pyFTS/data/TAIEX.py index 07857b6..ff2099e 100644 --- a/pyFTS/data/TAIEX.py +++ b/pyFTS/data/TAIEX.py @@ -5,7 +5,7 @@ import numpy as np def get_data(): """ - :param field: the dataset field name to extract + Get the univariate time series data. :return: numpy array """ dat = get_dataframe() diff --git a/pyFTS/probabilistic/ProbabilityDistribution.py b/pyFTS/probabilistic/ProbabilityDistribution.py index 97b579d..f1a31f8 100644 --- a/pyFTS/probabilistic/ProbabilityDistribution.py +++ b/pyFTS/probabilistic/ProbabilityDistribution.py @@ -88,7 +88,7 @@ class ProbabilityDistribution(object): for k in values: if self.type == "histogram": v = self.bin_index.find_ge(k) - ret.append(self.distribution[v] / self.count) + ret.append(self.distribution[v] / (self.count + 1e-5)) elif self.type == "KDE": v = self.kde.probability(k, self.data) ret.append(v) @@ -119,7 +119,6 @@ class ProbabilityDistribution(object): self.cdf = None self.qtl = None - def expected_value(self): return np.nansum([v * self.distribution[v] for v in self.bins]) diff --git a/pyFTS/tests/general.py b/pyFTS/tests/general.py index bdb10f9..d934f6f 100644 --- a/pyFTS/tests/general.py +++ b/pyFTS/tests/general.py @@ -11,9 +11,11 @@ from pyFTS.common import Transformations tdiff = Transformations.Differential(1) -from pyFTS.data import TAIEX +from pyFTS.data import TAIEX, SP500 -dataset = TAIEX.get_data() +#dataset = TAIEX.get_data() +dataset = SP500.get_data()[11500:16000] +#print(len(dataset)) ''' from pyFTS.partitioners import Grid, Util as pUtil partitioner = Grid.GridPartitioner(data=dataset[:800], npart=10) #, transformation=tdiff) @@ -25,13 +27,17 @@ from pyFTS.models import pwfts, song, ifts from pyFTS.models.ensemble import ensemble ''' -model = knn.KNearestNeighbors("") +#model = knn.KNearestNeighbors("") +#model = ensemble.AllMethodEnsembleFTS("", partitioner=partitioner) +#model = arima.ARIMA("", order=(2,0,2)) +#model = quantreg.QuantileRegression("", order=2, dist=True) +model.append_transformation(tdiff) model.fit(dataset[:800]) Measures.get_distribution_statistics(dataset[800:1000], model) #tmp = model.predict(dataset[800:1000], type='distribution') #for tmp2 in tmp: # print(tmp2) -''' +#''' ''' @@ -51,28 +57,16 @@ print(Measures.get_distribution_statistics(dataset[800:1000], model, steps_ahead from pyFTS.benchmarks import arima, naive, quantreg -bchmk.sliding_window_benchmarks(dataset[:1000], 1000, train=0.8, inc=0.2, - #methods=[pwfts.ProbabilisticWeightedFTS], - benchmark_models=[], - benchmark_methods=[arima.ARIMA for k in range(4)] - + [quantreg.QuantileRegression for k in range(2)] - + [knn.KNearestNeighbors], - benchmark_methods_parameters=[ - {'order': (1, 0, 0)}, - {'order': (1, 0, 1)}, - {'order': (2, 0, 1)}, - {'order': (2, 0, 2)}, - {'order': 1, 'dist': True}, - {'order': 2, 'dist': True}, - {} - ], - #transformations=[tdiff], - orders=[1], - partitions=np.arange(30, 80, 5), - progress=False, type='distribution', +bchmk.sliding_window_benchmarks(dataset, 1000, train=0.8, inc=0.2, + methods=[pwfts.ProbabilisticWeightedFTS], + benchmark_models=False, + transformations=[tdiff], + orders=[1,2,3], + partitions=np.arange(3, 50, 2), + progress=False, type='point', #steps_ahead=[1,4,7,10], #steps_ahead=[1] - #distributed=True, nodes=['192.168.0.110', '192.168.0.107','192.168.0.106'], - file="benchmarks.tmp", dataset="TAIEX", tag="comparisons") + distributed=True, nodes=['192.168.0.110', '192.168.0.107','192.168.0.106'], + file="benchmarks.db", dataset="SP500", tag="partitioning") #'''