From f4dec685bce0ac8bbf4fdb0781a914da5f3018c0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Petr=C3=B4nio=20C=C3=A2ndido?= Date: Sat, 1 Jun 2019 14:58:39 -0300 Subject: [PATCH] Improvements on benchmarks.knn --- pyFTS/benchmarks/benchmarks.py | 13 ++- pyFTS/benchmarks/knn.py | 144 ++++++++++++++++++++++++++---- pyFTS/common/fts.py | 6 +- pyFTS/models/ensemble/ensemble.py | 8 +- pyFTS/tests/general.py | 51 ++++++++--- 5 files changed, 178 insertions(+), 44 deletions(-) diff --git a/pyFTS/benchmarks/benchmarks.py b/pyFTS/benchmarks/benchmarks.py index 1b22417..f6dfeee 100644 --- a/pyFTS/benchmarks/benchmarks.py +++ b/pyFTS/benchmarks/benchmarks.py @@ -643,7 +643,7 @@ def run_point2(fts_method, order, partitioner_method, partitions, transformation import time from pyFTS.models import yu, chen, hofts, pwfts,ismailefendi,sadaei, song, cheng, hwang from pyFTS.partitioners import Grid, Entropy, FCM - from pyFTS.benchmarks import Measures, naive, arima, quantreg + from pyFTS.benchmarks import Measures, naive, arima, quantreg, benchmarks from pyFTS.common import Transformations tmp = [song.ConventionalFTS, chen.ConventionalFTS, yu.WeightedFTS, ismailefendi.ImprovedWeightedFTS, @@ -664,7 +664,7 @@ def run_point2(fts_method, order, partitioner_method, partitions, transformation method = kwargs.get('method', None) parameters = kwargs.get('parameters', {}) - mfts, pttr = __build_model(fts_method, order, parameters, partitioner_method, partitions, train_data, + mfts, pttr = benchmarks.__build_model(fts_method, order, parameters, partitioner_method, partitions, train_data, transformation) _start = time.time() @@ -691,7 +691,7 @@ def run_interval2(fts_method, order, partitioner_method, partitions, transformat import time from pyFTS.models import hofts,ifts,pwfts from pyFTS.partitioners import Grid, Entropy, FCM - from pyFTS.benchmarks import Measures, arima, quantreg, BSTS + from pyFTS.benchmarks import Measures, arima, quantreg, BSTS, benchmarks tmp = [hofts.HighOrderFTS, ifts.IntervalFTS, ifts.WeightedIntervalFTS, pwfts.ProbabilisticWeightedFTS] @@ -705,7 +705,7 @@ def run_interval2(fts_method, order, partitioner_method, partitions, transformat method = kwargs.get('method', None) parameters = kwargs.get('parameters',{}) - mfts, pttr = __build_model(fts_method, order, parameters, partitioner_method, partitions, train_data, + mfts, pttr = benchmarks.__build_model(fts_method, order, parameters, partitioner_method, partitions, train_data, transformation) _start = time.time() mfts.fit(train_data, **kwargs) @@ -735,7 +735,7 @@ def run_probabilistic2(fts_method, order, partitioner_method, partitions, transf from pyFTS.models import hofts, ifts, pwfts from pyFTS.models.ensemble import ensemble from pyFTS.partitioners import Grid, Entropy, FCM - from pyFTS.benchmarks import Measures, arima, quantreg, knn + from pyFTS.benchmarks import Measures, arima, quantreg, knn, benchmarks from pyFTS.models.seasonal import SeasonalIndexer tmp = [hofts.HighOrderFTS, ifts.IntervalFTS, pwfts.ProbabilisticWeightedFTS, arima.ARIMA, @@ -751,9 +751,8 @@ def run_probabilistic2(fts_method, order, partitioner_method, partitions, transf method = kwargs.get('method', None) parameters = kwargs.get('parameters', {}) - mfts, pttr = __build_model(fts_method, order, parameters, partitioner_method, partitions, train_data, + mfts, pttr = benchmarks.__build_model(fts_method, order, parameters, partitioner_method, partitions, train_data, transformation) - if mfts.has_seasonality: mfts.indexer = indexer diff --git a/pyFTS/benchmarks/knn.py b/pyFTS/benchmarks/knn.py index f26a3c8..ca3ac5c 100644 --- a/pyFTS/benchmarks/knn.py +++ b/pyFTS/benchmarks/knn.py @@ -5,17 +5,20 @@ import numpy as np from statsmodels.tsa.tsatools import lagmat from pyFTS.common import fts from pyFTS.probabilistic import ProbabilityDistribution - +from sklearn.neighbors import KDTree +from itertools import product +from pyFTS.models.ensemble.ensemble import sampler class KNearestNeighbors(fts.FTS): """ - K-Nearest Neighbors + A façade for sklearn.neighbors """ def __init__(self, **kwargs): super(KNearestNeighbors, self).__init__(**kwargs) self.name = "kNN" self.shortname = "kNN" self.detail = "K-Nearest Neighbors" + self.uod_clip = False self.is_high_order = True self.has_point_forecasting = True self.has_interval_forecasting = True @@ -26,30 +29,113 @@ class KNearestNeighbors(fts.FTS): self.lag = None self.k = kwargs.get("k", 30) self.uod = None + self.kdtree = None + self.values = None + + def _prepare_x(self, data): + l = len(data) + X = [] + + if l == self.order: + l += 1 + + for t in np.arange(self.order, l): + X.append([data[t - k - 1] for k in np.arange(self.order)]) + + return X + + def _prepare_xy(self, data): + l = len(data) + X = [] + Y = [] + + for t in np.arange(self.order, l): + X.append([data[t - k - 1] for k in np.arange(self.order)]) + Y.append(data[t]) + + return (X,Y) def train(self, data, **kwargs): - self.data = np.array(data) + X,Y = self._prepare_xy(data) + self.kdtree = KDTree(X) + self.values = Y def knn(self, sample): + X = self._prepare_x(sample) + _, ix = self.kdtree.query(X, self.k) - if self.order == 1: - dist = np.apply_along_axis(lambda x: (x - sample) ** 2, 0, self.data) - ix = np.argsort(dist) + 1 - else: - dist = [] - for k in np.arange(self.order, len(self.data)): - dist.append(sum([ (self.data[k - kk] - sample[kk])**2 for kk in range(self.order)])) - ix = np.argsort(np.array(dist)) + self.order + 1 + return [self.values[k] for k in ix.flatten() ] - ix2 = np.clip(ix[:self.k], 0, len(self.data)-1) - return self.data[ix2] + def forecast(self, data, **kwargs): + ret = [] + for k in np.arange(self.order, len(data)): + + sample = data[k-self.order : k] + + forecasts = self.knn(sample) + + ret.append(np.nanmean(forecasts)) + + return ret + + def forecast_ahead(self, data, steps, **kwargs): + start = kwargs.get('start', self.order) + + sample = [k for k in data[start - self.order: start]] + + for k in np.arange(self.order, steps + self.order): + tmp = self.forecast(sample[k-self.order:k]) + sample.append(tmp) + + return sample[-steps] + + def forecast_interval(self, data, **kwargs): + + alpha = kwargs.get('alpha',self.alpha) + + ret = [] + for k in np.arange(self.order, len(data)): + + sample = data[k-self.order : k] + + forecasts = self.knn(sample) + + i = np.percentile(forecasts, [alpha*100, (1-alpha)*100]).tolist() + ret.append(i) + + return ret + + def forecast_ahead_interval(self, data, steps, **kwargs): + alpha = kwargs.get('alpha', self.alpha) + + ret = [] + + start = kwargs.get('start', self.order) + + sample = [[k] for k in data[start - self.order: start]] + + for k in np.arange(self.order, steps + self.order): + forecasts = [] + + lags = [sample[k - i - 1] for i in np.arange(0, self.order)] + + # Trace the possible paths + for path in product(*lags): + forecasts.extend(self.knn(path)) + + sample.append(sampler(forecasts, np.arange(.1, 1, 0.1), bounds=True)) + + interval = np.percentile(forecasts, [alpha*100, (1-alpha)*100]).tolist() + + ret.append(interval) + + return ret def forecast_distribution(self, data, **kwargs): ret = [] - smooth = kwargs.get("smooth", "KDE") - alpha = kwargs.get("alpha", None) + smooth = kwargs.get("smooth", "histogram") uod = self.get_UoD() @@ -65,4 +151,32 @@ class KNearestNeighbors(fts.FTS): return ret + def forecast_ahead_distribution(self, data, steps, **kwargs): + smooth = kwargs.get("smooth", "histogram") + + ret = [] + + start = kwargs.get('start', self.order) + + uod = self.get_UoD() + + sample = [[k] for k in data[start - self.order: start]] + + for k in np.arange(self.order, steps + self.order): + forecasts = [] + + lags = [sample[k - i - 1] for i in np.arange(0, self.order)] + + # Trace the possible paths + for path in product(*lags): + forecasts.extend(self.knn(path)) + + dist = ProbabilityDistribution.ProbabilityDistribution(smooth, uod=uod, data=forecasts, + name="", **kwargs) + ret.append(dist) + + sample.append(sampler(forecasts, np.arange(.1, 1, 0.1), bounds=True)) + + return ret + diff --git a/pyFTS/common/fts.py b/pyFTS/common/fts.py index 371a6f0..f500641 100644 --- a/pyFTS/common/fts.py +++ b/pyFTS/common/fts.py @@ -519,8 +519,10 @@ class FTS(object): return data def get_UoD(self): - #return [self.original_min, self.original_max] - return [self.partitioner.min, self.partitioner.max] + if self.partitioner is not None: + return [self.partitioner.min, self.partitioner.max] + else: + return [self.original_min, self.original_max] def __str__(self): """String representation of the model""" diff --git a/pyFTS/models/ensemble/ensemble.py b/pyFTS/models/ensemble/ensemble.py index 6192592..95433cd 100644 --- a/pyFTS/models/ensemble/ensemble.py +++ b/pyFTS/models/ensemble/ensemble.py @@ -167,8 +167,7 @@ class EnsembleFTS(fts.FTS): if "method" in kwargs: self.interval_method = kwargs.get('method','quantile') - if 'alpha' in kwargs: - self.alpha = kwargs.get('alpha',0.05) + self.alpha = kwargs.get('alpha', self.alpha) l = len(data) @@ -189,15 +188,12 @@ class EnsembleFTS(fts.FTS): if 'method' in kwargs: self.interval_method = kwargs.get('method','quantile') - if 'alpha' in kwargs: - self.alpha = kwargs.get('alpha', self.alpha) + self.alpha = kwargs.get('alpha', self.alpha) ret = [] start = kwargs.get('start', self.order) - uod = self.get_UoD() - sample = [[k] for k in data[start - self.order: start]] for k in np.arange(self.order, steps + self.order): diff --git a/pyFTS/tests/general.py b/pyFTS/tests/general.py index 0f92096..a666f5f 100644 --- a/pyFTS/tests/general.py +++ b/pyFTS/tests/general.py @@ -13,29 +13,51 @@ from pyFTS.partitioners import Grid, Entropy, Util as pUtil, Simple from pyFTS.benchmarks import benchmarks as bchmk, Measures from pyFTS.models import chen, yu, cheng, ismailefendi, hofts, pwfts, tsaur, song, sadaei, ifts from pyFTS.models.ensemble import ensemble -from pyFTS.common import Transformations, Membership -from pyFTS.benchmarks import arima, quantreg, BSTS, gaussianproc +from pyFTS.common import Transformations, Membership, Util +from pyFTS.benchmarks import arima, quantreg, BSTS, gaussianproc, knn from pyFTS.fcm import fts, common, GA from pyFTS.data import TAIEX, NASDAQ, SP500 +train = TAIEX.get_data()[:800] +test = TAIEX.get_data()[800:1000] + +order = 2 +model = knn.KNearestNeighbors(order=order) +model.fit(train) + +horizon=7 + +intervals05 = model.predict(test[:10], type='interval', alpha=.05, steps_ahead=horizon) + +print(test[:10]) +print(intervals05) + +intervals25 = model.predict(test[:10], type='interval', alpha=.25, steps_ahead=horizon) +distributions = model.predict(test[:10], type='distribution', steps_ahead=horizon, smoothing=0.01, num_bins=100) + +fig, ax = plt.subplots(nrows=1, ncols=1,figsize=[15,5]) +ax.plot(test[:10], label='Original',color='black') +Util.plot_interval2(intervals05, test[:10], start_at=model.order, ax=ax, color='green', label='alpha=.05'.format(model.order)) +Util.plot_interval2(intervals25, test[:10], start_at=model.order, ax=ax, color='green', label='alpha=.25'.format(model.order)) +Util.plot_distribution2(distributions, test[:10], start_at=model.order, ax=ax, cmap="Blues") + +print("") +''' + datasets = {} datasets['TAIEX'] = TAIEX.get_data()[:5000] datasets['NASDAQ'] = NASDAQ.get_data()[:5000] datasets['SP500'] = SP500.get_data()[10000:15000] -methods = [ensemble.SimpleEnsembleFTS]*8 +methods = [ensemble.SimpleEnsembleFTS]*4 methods_parameters = [ - {'name': 'EnsembleFTS-HOFTS-10-.05', 'fts_method': hofts.HighOrderFTS, 'partitions': np.arange(20,50,10), 'alpha': .05}, - {'name': 'EnsembleFTS-HOFTS-5-.05', 'fts_method': hofts.HighOrderFTS, 'partitions': np.arange(20,50,5), 'alpha': .05}, - {'name': 'EnsembleFTS-HOFTS-10-.25', 'fts_method': hofts.HighOrderFTS, 'partitions': np.arange(20,50,10), 'alpha': .25}, - {'name': 'EnsembleFTS-HOFTS-5-.25', 'fts_method': hofts.HighOrderFTS, 'partitions': np.arange(20,50,5), 'alpha': .25}, - {'name': 'EnsembleFTS-WHOFTS-10-.05', 'fts_method': hofts.WeightedHighOrderFTS, 'partitions': np.arange(20,50,10), 'alpha': .05}, - {'name': 'EnsembleFTS-WHOFTS-5-.05', 'fts_method': hofts.WeightedHighOrderFTS, 'partitions': np.arange(20,50,5), 'alpha': .05}, - {'name': 'EnsembleFTS-WHOFTS-10-.25', 'fts_method': hofts.WeightedHighOrderFTS, 'partitions': np.arange(20,50,10), 'alpha': .25}, - {'name': 'EnsembleFTS-WHOFTS-5-.25', 'fts_method': hofts.WeightedHighOrderFTS, 'partitions': np.arange(20,50,5), 'alpha': .25}, + {'name': 'EnsembleFTS-HOFTS-10', 'fts_method': hofts.HighOrderFTS, 'partitions': np.arange(20,50,10)}, + {'name': 'EnsembleFTS-HOFTS-5', 'fts_method': hofts.HighOrderFTS, 'partitions': np.arange(20,50,5)}, + {'name': 'EnsembleFTS-WHOFTS-10', 'fts_method': hofts.WeightedHighOrderFTS, 'partitions': np.arange(20,50,10)}, + {'name': 'EnsembleFTS-WHOFTS-5', 'fts_method': hofts.WeightedHighOrderFTS, 'partitions': np.arange(20,50,5)} ] for dataset_name, dataset in datasets.items(): @@ -46,6 +68,7 @@ for dataset_name, dataset in datasets.items(): transformations=[None], orders=[3], partitions=[None], - type='interval', - #distributed=True, nodes=['192.168.0.110', '192.168.0.107','192.168.0.106'], - file="tmp.db", dataset=dataset_name, tag="gridsearch") + type='distribution', + distributed=True, nodes=['192.168.0.110', '192.168.0.107','192.168.0.106'], + file="experiments.db", dataset=dataset_name, tag="gridsearch") +''' \ No newline at end of file