From 95c6e90bfb9d5b2a5b75d5b61045a95eed385781 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Petr=C3=B4nio=20C=C3=A2ndido?= Date: Wed, 25 Apr 2018 18:47:51 -0300 Subject: [PATCH] Bugfixes in Measures and optimizations on benchmarks --- pyFTS/benchmarks/Measures.py | 32 +++----- pyFTS/benchmarks/benchmarks.py | 129 ++++++++++++++++-------------- pyFTS/benchmarks/knn.py | 2 + pyFTS/benchmarks/quantreg.py | 4 +- pyFTS/models/ensemble/ensemble.py | 5 +- pyFTS/tests/general.py | 30 ++++--- 6 files changed, 108 insertions(+), 94 deletions(-) diff --git a/pyFTS/benchmarks/Measures.py b/pyFTS/benchmarks/Measures.py index e4338e2..cf6697f 100644 --- a/pyFTS/benchmarks/Measures.py +++ b/pyFTS/benchmarks/Measures.py @@ -239,6 +239,9 @@ def pmf_to_cdf(density): return df +def heavyside(bin, target): + return 1 if bin >= target else 0 + def heavyside_cdf(bins, targets): ret = [] for t in targets: @@ -255,24 +258,13 @@ def crps(targets, densities): :return: float ''' _crps = float(0.0) - if isinstance(densities, pd.DataFrame): - l = len(densities.columns) - n = len(densities.index) - Ff = pmf_to_cdf(densities) - Fa = heavyside_cdf(densities.columns, targets) - for k in densities.index: - _crps += sum([ (Ff[col][k]-Fa[col][k])**2 for col in densities.columns]) - elif isinstance(densities, ProbabilityDistribution.ProbabilityDistribution): - l = len(densities.bins) - n = 1 - Fa = heavyside_cdf(densities.bins, targets) - _crps = sum([(densities.cummulative(val) - Fa[val][0]) ** 2 for val in densities.bins]) - elif isinstance(densities, list): - l = len(densities[0].bins) - n = len(densities) - Fa = heavyside_cdf(densities[0].bins, targets) - for df in densities: - _crps += sum([(df.cummulative(val) - Fa[val][0]) ** 2 for val in df.bins]) + if isinstance(densities, ProbabilityDistribution.ProbabilityDistribution): + densities = [densities] + + l = len(densities[0].bins) + n = len(densities) + for ct, df in enumerate(densities): + _crps += sum([(df.cummulative(bin) - (1 if bin >= targets[ct] else 0)) ** 2 for bin in df.bins]) return _crps / float(l * n) @@ -387,8 +379,9 @@ def get_distribution_statistics(data, model, **kwargs): _s1 = time.time() forecasts = model.predict(data, **kwargs) _e1 = time.time() - ret.append(round(crps(data[model.order:], forecasts), 3)) + ret.append(round(crps(data[model.order:], forecasts[:-1]), 3)) ret.append(round(_e1 - _s1, 3)) + ret.append(round(brier_score(data[model.order:], forecasts[:-1]), 3)) else: skip = kwargs.get('steps_ahead_sampler', 1) forecasts = [] @@ -402,6 +395,7 @@ def get_distribution_statistics(data, model, **kwargs): start = model.order + steps_ahead ret.append(round(crps(data[start:-1:skip], forecasts), 3)) ret.append(round(_e1 - _s1, 3)) + ret.append(round(brier_score(data[start:-1:skip], forecasts), 3)) return ret diff --git a/pyFTS/benchmarks/benchmarks.py b/pyFTS/benchmarks/benchmarks.py index bfe22fe..8e251c9 100644 --- a/pyFTS/benchmarks/benchmarks.py +++ b/pyFTS/benchmarks/benchmarks.py @@ -48,6 +48,38 @@ def __pop(key, default, kwargs): return default +def get_benchmark_point_methods(): + """Return all non FTS methods for point forecasting""" + return [naive.Naive, arima.ARIMA, quantreg.QuantileRegression] + + +def get_point_methods(): + """Return all FTS methods for point forecasting""" + return [song.ConventionalFTS, chen.ConventionalFTS, yu.WeightedFTS, ismailefendi.ImprovedWeightedFTS, + cheng.TrendWeightedFTS, sadaei.ExponentialyWeightedFTS, hofts.HighOrderFTS, hwang.HighOrderFTS, + pwfts.ProbabilisticWeightedFTS] + + +def get_benchmark_interval_methods(): + """Return all non FTS methods for point_to_interval forecasting""" + return [ arima.ARIMA, quantreg.QuantileRegression] + + +def get_interval_methods(): + """Return all FTS methods for point_to_interval forecasting""" + return [ifts.IntervalFTS, pwfts.ProbabilisticWeightedFTS] + + +def get_probabilistic_methods(): + """Return all FTS methods for probabilistic forecasting""" + return [ensemble.AllMethodEnsembleFTS, pwfts.ProbabilisticWeightedFTS] + + +def get_benchmark_probabilistic_methods(): + """Return all FTS methods for probabilistic forecasting""" + return [arima.ARIMA, quantreg.QuantileRegression, knn.KNearestNeighbors] + + def sliding_window_benchmarks(data, windowsize, train=0.8, **kwargs): """ Sliding window benchmarks for FTS forecasters. @@ -141,6 +173,8 @@ def sliding_window_benchmarks(data, windowsize, train=0.8, **kwargs): benchmark_methods = __pop("benchmark_methods", None, kwargs) benchmark_methods_parameters = __pop("benchmark_methods_parameters", None, kwargs) + benchmark_pool = [] if benchmark_models is None else benchmark_models + if benchmark_models != False: if benchmark_models is None and benchmark_methods is None: @@ -151,13 +185,13 @@ def sliding_window_benchmarks(data, windowsize, train=0.8, **kwargs): elif type == 'distribution': benchmark_methods = get_benchmark_probabilistic_methods() - if isinstance(benchmark_models, list) : - pool.extend(benchmark_models) - elif benchmark_methods is not None: - for count, model in enumerate(benchmark_methods, start=0): - par = benchmark_methods_parameters[count] - mfts = model("", **par) - pool.append(mfts) + if benchmark_methods is not None: + for transformation in transformations: + for count, model in enumerate(benchmark_methods, start=0): + par = benchmark_methods_parameters[count] + mfts = model("", **par) + mfts.append_transformation(transformation) + benchmark_pool.append(mfts) if type == 'point': experiment_method = run_point @@ -184,6 +218,10 @@ def sliding_window_benchmarks(data, windowsize, train=0.8, **kwargs): inc = __pop("inc", 0.1, kwargs) + file = kwargs.get('file', "benchmarks.db") + + conn = bUtil.open_benchmark_db(file) + for ct, train, test in cUtil.sliding_window(data, windowsize, train, inc=inc, **kwargs): experiments += 1 @@ -192,6 +230,18 @@ def sliding_window_benchmarks(data, windowsize, train=0.8, **kwargs): partitioners_pool = [] + for model in benchmark_pool: + for step in steps_ahead: + kwargs['steps_ahead'] = step + + if not distributed: + job = experiment_method(deepcopy(model), None, train, test, **kwargs) + synthesis_method(dataset, tag, job, conn) + else: + job = cluster.submit(deepcopy(model), None, train, test, **kwargs) + jobs.append(job) + + if partitioners_models is None: for transformation in transformations: @@ -210,10 +260,6 @@ def sliding_window_benchmarks(data, windowsize, train=0.8, **kwargs): if progress: rng1 = tqdm(steps_ahead, desc="Steps") - file = kwargs.get('file', "benchmarks.db") - - conn = bUtil.open_benchmark_db(file) - for step in rng1: rng2 = partitioners_pool @@ -267,36 +313,6 @@ def sliding_window_benchmarks(data, windowsize, train=0.8, **kwargs): conn.close() -def get_benchmark_point_methods(): - """Return all non FTS methods for point forecasting""" - return [naive.Naive, arima.ARIMA, quantreg.QuantileRegression] - - -def get_point_methods(): - """Return all FTS methods for point forecasting""" - return [song.ConventionalFTS, chen.ConventionalFTS, yu.WeightedFTS, ismailefendi.ImprovedWeightedFTS, - cheng.TrendWeightedFTS, sadaei.ExponentialyWeightedFTS, hofts.HighOrderFTS, hwang.HighOrderFTS, - pwfts.ProbabilisticWeightedFTS] - - -def get_benchmark_interval_methods(): - """Return all non FTS methods for point_to_interval forecasting""" - return [ arima.ARIMA, quantreg.QuantileRegression] - - -def get_interval_methods(): - """Return all FTS methods for point_to_interval forecasting""" - return [ifts.IntervalFTS, pwfts.ProbabilisticWeightedFTS] - - -def get_probabilistic_methods(): - """Return all FTS methods for probabilistic forecasting""" - return [ensemble.AllMethodEnsembleFTS, pwfts.ProbabilisticWeightedFTS] - - -def get_benchmark_probabilistic_methods(): - """Return all FTS methods for probabilistic forecasting""" - return [arima.ARIMA, quantreg.QuantileRegression, knn.KNearestNeighbors] def run_point(mfts, partitioner, train_data, test_data, window_key=None, **kwargs): @@ -336,7 +352,6 @@ def run_point(mfts, partitioner, train_data, test_data, window_key=None, **kwarg if mfts.benchmark_only: _key = mfts.shortname + str(mfts.order if mfts.order is not None else "") - mfts.append_transformation(partitioner.transformation) else: pttr = str(partitioner.__module__).split('.')[-1] _key = mfts.shortname + " n = " + str(mfts.order) + " " + pttr + " q = " + str(partitioner.partitions) @@ -347,7 +362,7 @@ def run_point(mfts, partitioner, train_data, test_data, window_key=None, **kwarg _key += str(method) if method is not None else "" _start = time.time() - mfts.fit(train_data, order=mfts.order, **kwargs) + mfts.fit(train_data, **kwargs) _end = time.time() times = _end - _start @@ -392,7 +407,6 @@ def run_interval(mfts, partitioner, train_data, test_data, window_key=None, **kw method = kwargs.get('method', None) if mfts.benchmark_only: - mfts.append_transformation(partitioner.transformation) _key = mfts.shortname + str(mfts.order if mfts.order is not None else "") + str(mfts.alpha) else: pttr = str(partitioner.__module__).split('.')[-1] @@ -404,7 +418,7 @@ def run_interval(mfts, partitioner, train_data, test_data, window_key=None, **kw _key += str(method) if method is not None else "" _start = time.time() - mfts.fit(train_data, order=mfts.order, **kwargs) + mfts.fit(train_data, **kwargs) _end = time.time() times = _end - _start @@ -456,7 +470,6 @@ def run_probabilistic(mfts, partitioner, train_data, test_data, window_key=None, if mfts.benchmark_only: _key = mfts.shortname + str(mfts.order if mfts.order is not None else "") + str(mfts.alpha) - mfts.append_transformation(partitioner.transformation) else: pttr = str(partitioner.__module__).split('.')[-1] _key = mfts.shortname + " n = " + str(mfts.order) + " " + pttr + " q = " + str(partitioner.partitions) @@ -469,20 +482,15 @@ def run_probabilistic(mfts, partitioner, train_data, test_data, window_key=None, if mfts.has_seasonality: mfts.indexer = indexer - try: - _start = time.time() - mfts.fit(train_data, order=mfts.order) - _end = time.time() - times = _end - _start + _start = time.time() + mfts.fit(train_data, **kwargs) + _end = time.time() + times = _end - _start - _crps1, _t1 = Measures.get_distribution_statistics(test_data, mfts, **kwargs) - _t1 += times - except Exception as e: - print(e) - _crps1 = np.nan - _t1 = np.nan + _crps1, _t1, _brier = Measures.get_distribution_statistics(test_data, mfts, **kwargs) + _t1 += times - ret = {'key': _key, 'obj': mfts, 'CRPS': _crps1, 'time': _t1, 'window': window_key, + ret = {'key': _key, 'obj': mfts, 'CRPS': _crps1, 'time': _t1, 'brier': _brier, 'window': window_key, 'steps': steps_ahead, 'method': method} return ret @@ -541,11 +549,14 @@ def process_probabilistic_jobs(dataset, tag, job, conn): data = bUtil.process_common_data(dataset, tag, 'density', job) crps = deepcopy(data) - crps.extend(["CRPS",job["CRPS"]]) + crps.extend(["crps",job["CRPS"]]) bUtil.insert_benchmark(crps, conn) time = deepcopy(data) time.extend(["time", job["time"]]) bUtil.insert_benchmark(time, conn) + brier = deepcopy(data) + brier.extend(["brier", job["brier"]]) + bUtil.insert_benchmark(brier, conn) def print_point_statistics(data, models, externalmodels = None, externalforecasts = None, indexers=None): diff --git a/pyFTS/benchmarks/knn.py b/pyFTS/benchmarks/knn.py index c957f52..58814b6 100644 --- a/pyFTS/benchmarks/knn.py +++ b/pyFTS/benchmarks/knn.py @@ -6,6 +6,7 @@ from statsmodels.tsa.tsatools import lagmat from pyFTS.common import fts from pyFTS.probabilistic import ProbabilityDistribution + class KNearestNeighbors(fts.FTS): """ K-Nearest Neighbors @@ -13,6 +14,7 @@ class KNearestNeighbors(fts.FTS): def __init__(self, name, **kwargs): super(KNearestNeighbors, self).__init__(1, "kNN"+name) self.name = "kNN" + self.shortname = "kNN" self.detail = "K-Nearest Neighbors" self.is_high_order = True self.has_point_forecasting = True diff --git a/pyFTS/benchmarks/quantreg.py b/pyFTS/benchmarks/quantreg.py index 83f184b..fe6e3f4 100644 --- a/pyFTS/benchmarks/quantreg.py +++ b/pyFTS/benchmarks/quantreg.py @@ -8,6 +8,7 @@ from statsmodels.tsa.tsatools import lagmat from pyFTS.common import SortedCollection, fts from pyFTS.probabilistic import ProbabilityDistribution + class QuantileRegression(fts.FTS): """Façade for statsmodels.regression.quantile_regression""" def __init__(self, name, **kwargs): @@ -26,10 +27,11 @@ class QuantileRegression(fts.FTS): self.mean_qt = None self.lower_qt = None self.dist_qt = None + self.order = kwargs.get('order', 1) self.shortname = "QAR("+str(self.order)+","+str(self.alpha)+")" def train(self, data, **kwargs): - if kwargs.get('order', None) is not None: + if 'order' in kwargs: self.order = kwargs.get('order', 1) if self.indexer is not None and isinstance(data, pd.DataFrame): diff --git a/pyFTS/models/ensemble/ensemble.py b/pyFTS/models/ensemble/ensemble.py index 8bac976..3282b7c 100644 --- a/pyFTS/models/ensemble/ensemble.py +++ b/pyFTS/models/ensemble/ensemble.py @@ -246,9 +246,10 @@ class EnsembleFTS(fts.FTS): class AllMethodEnsembleFTS(EnsembleFTS): - def __init__(self, **kwargs): - super(AllMethodEnsembleFTS, self).__init__(name="Ensemble FTS", **kwargs) + def __init__(self, name, **kwargs): + super(AllMethodEnsembleFTS, self).__init__(name="Ensemble FTS"+name, **kwargs) self.min_order = 3 + self.shortname ="Ensemble FTS" def set_transformations(self, model): for t in self.transformations: diff --git a/pyFTS/tests/general.py b/pyFTS/tests/general.py index eae55c7..bdb10f9 100644 --- a/pyFTS/tests/general.py +++ b/pyFTS/tests/general.py @@ -20,15 +20,17 @@ partitioner = Grid.GridPartitioner(data=dataset[:800], npart=10) #, transformati ''' from pyFTS.benchmarks import benchmarks as bchmk, Util as bUtil, Measures, knn, quantreg, arima -''' + from pyFTS.models import pwfts, song, ifts from pyFTS.models.ensemble import ensemble -model = ensemble.AllMethodEnsembleFTS(partitioner=partitioner) +''' +model = knn.KNearestNeighbors("") model.fit(dataset[:800]) -tmp = model.predict(dataset[800:1000], type='distribution') -for tmp2 in tmp: - print(tmp2) +Measures.get_distribution_statistics(dataset[800:1000], model) +#tmp = model.predict(dataset[800:1000], type='distribution') +#for tmp2 in tmp: +# print(tmp2) ''' @@ -49,11 +51,12 @@ print(Measures.get_distribution_statistics(dataset[800:1000], model, steps_ahead from pyFTS.benchmarks import arima, naive, quantreg -bchmk.sliding_window_benchmarks(dataset, 1000, train=0.8, inc=0.2, - #methods=[ifts.IntervalFTS], #[pwfts.ProbabilisticWeightedFTS], - benchmark_models=False, +bchmk.sliding_window_benchmarks(dataset[:1000], 1000, train=0.8, inc=0.2, + #methods=[pwfts.ProbabilisticWeightedFTS], + benchmark_models=[], benchmark_methods=[arima.ARIMA for k in range(4)] - + [quantreg.QuantileRegression for k in range(2)], + + [quantreg.QuantileRegression for k in range(2)] + + [knn.KNearestNeighbors], benchmark_methods_parameters=[ {'order': (1, 0, 0)}, {'order': (1, 0, 1)}, @@ -61,14 +64,15 @@ bchmk.sliding_window_benchmarks(dataset, 1000, train=0.8, inc=0.2, {'order': (2, 0, 2)}, {'order': 1, 'dist': True}, {'order': 2, 'dist': True}, + {} ], - #transformations=[None, tdiff], - orders=[1, 2, 3], + #transformations=[tdiff], + orders=[1], partitions=np.arange(30, 80, 5), progress=False, type='distribution', #steps_ahead=[1,4,7,10], #steps_ahead=[1] - distributed=True, nodes=['192.168.0.110', '192.168.0.107','192.168.0.106'], - file="benchmarks.db", dataset="TAIEX", tag="comparisons") + #distributed=True, nodes=['192.168.0.110', '192.168.0.107','192.168.0.106'], + file="benchmarks.tmp", dataset="TAIEX", tag="comparisons") #'''