Improvements on benchmarks.knn

This commit is contained in:
Petrônio Cândido 2019-06-01 14:58:39 -03:00
parent 0dc2fabdcc
commit f4dec685bc
5 changed files with 178 additions and 44 deletions
pyFTS
benchmarks
common
models/ensemble
tests

View File

@ -643,7 +643,7 @@ def run_point2(fts_method, order, partitioner_method, partitions, transformation
import time import time
from pyFTS.models import yu, chen, hofts, pwfts,ismailefendi,sadaei, song, cheng, hwang from pyFTS.models import yu, chen, hofts, pwfts,ismailefendi,sadaei, song, cheng, hwang
from pyFTS.partitioners import Grid, Entropy, FCM from pyFTS.partitioners import Grid, Entropy, FCM
from pyFTS.benchmarks import Measures, naive, arima, quantreg from pyFTS.benchmarks import Measures, naive, arima, quantreg, benchmarks
from pyFTS.common import Transformations from pyFTS.common import Transformations
tmp = [song.ConventionalFTS, chen.ConventionalFTS, yu.WeightedFTS, ismailefendi.ImprovedWeightedFTS, tmp = [song.ConventionalFTS, chen.ConventionalFTS, yu.WeightedFTS, ismailefendi.ImprovedWeightedFTS,
@ -664,7 +664,7 @@ def run_point2(fts_method, order, partitioner_method, partitions, transformation
method = kwargs.get('method', None) method = kwargs.get('method', None)
parameters = kwargs.get('parameters', {}) parameters = kwargs.get('parameters', {})
mfts, pttr = __build_model(fts_method, order, parameters, partitioner_method, partitions, train_data, mfts, pttr = benchmarks.__build_model(fts_method, order, parameters, partitioner_method, partitions, train_data,
transformation) transformation)
_start = time.time() _start = time.time()
@ -691,7 +691,7 @@ def run_interval2(fts_method, order, partitioner_method, partitions, transformat
import time import time
from pyFTS.models import hofts,ifts,pwfts from pyFTS.models import hofts,ifts,pwfts
from pyFTS.partitioners import Grid, Entropy, FCM from pyFTS.partitioners import Grid, Entropy, FCM
from pyFTS.benchmarks import Measures, arima, quantreg, BSTS from pyFTS.benchmarks import Measures, arima, quantreg, BSTS, benchmarks
tmp = [hofts.HighOrderFTS, ifts.IntervalFTS, ifts.WeightedIntervalFTS, pwfts.ProbabilisticWeightedFTS] tmp = [hofts.HighOrderFTS, ifts.IntervalFTS, ifts.WeightedIntervalFTS, pwfts.ProbabilisticWeightedFTS]
@ -705,7 +705,7 @@ def run_interval2(fts_method, order, partitioner_method, partitions, transformat
method = kwargs.get('method', None) method = kwargs.get('method', None)
parameters = kwargs.get('parameters',{}) parameters = kwargs.get('parameters',{})
mfts, pttr = __build_model(fts_method, order, parameters, partitioner_method, partitions, train_data, mfts, pttr = benchmarks.__build_model(fts_method, order, parameters, partitioner_method, partitions, train_data,
transformation) transformation)
_start = time.time() _start = time.time()
mfts.fit(train_data, **kwargs) mfts.fit(train_data, **kwargs)
@ -735,7 +735,7 @@ def run_probabilistic2(fts_method, order, partitioner_method, partitions, transf
from pyFTS.models import hofts, ifts, pwfts from pyFTS.models import hofts, ifts, pwfts
from pyFTS.models.ensemble import ensemble from pyFTS.models.ensemble import ensemble
from pyFTS.partitioners import Grid, Entropy, FCM from pyFTS.partitioners import Grid, Entropy, FCM
from pyFTS.benchmarks import Measures, arima, quantreg, knn from pyFTS.benchmarks import Measures, arima, quantreg, knn, benchmarks
from pyFTS.models.seasonal import SeasonalIndexer from pyFTS.models.seasonal import SeasonalIndexer
tmp = [hofts.HighOrderFTS, ifts.IntervalFTS, pwfts.ProbabilisticWeightedFTS, arima.ARIMA, tmp = [hofts.HighOrderFTS, ifts.IntervalFTS, pwfts.ProbabilisticWeightedFTS, arima.ARIMA,
@ -751,9 +751,8 @@ def run_probabilistic2(fts_method, order, partitioner_method, partitions, transf
method = kwargs.get('method', None) method = kwargs.get('method', None)
parameters = kwargs.get('parameters', {}) parameters = kwargs.get('parameters', {})
mfts, pttr = __build_model(fts_method, order, parameters, partitioner_method, partitions, train_data, mfts, pttr = benchmarks.__build_model(fts_method, order, parameters, partitioner_method, partitions, train_data,
transformation) transformation)
if mfts.has_seasonality: if mfts.has_seasonality:
mfts.indexer = indexer mfts.indexer = indexer

View File

@ -5,17 +5,20 @@ import numpy as np
from statsmodels.tsa.tsatools import lagmat from statsmodels.tsa.tsatools import lagmat
from pyFTS.common import fts from pyFTS.common import fts
from pyFTS.probabilistic import ProbabilityDistribution from pyFTS.probabilistic import ProbabilityDistribution
from sklearn.neighbors import KDTree
from itertools import product
from pyFTS.models.ensemble.ensemble import sampler
class KNearestNeighbors(fts.FTS): class KNearestNeighbors(fts.FTS):
""" """
K-Nearest Neighbors A façade for sklearn.neighbors
""" """
def __init__(self, **kwargs): def __init__(self, **kwargs):
super(KNearestNeighbors, self).__init__(**kwargs) super(KNearestNeighbors, self).__init__(**kwargs)
self.name = "kNN" self.name = "kNN"
self.shortname = "kNN" self.shortname = "kNN"
self.detail = "K-Nearest Neighbors" self.detail = "K-Nearest Neighbors"
self.uod_clip = False
self.is_high_order = True self.is_high_order = True
self.has_point_forecasting = True self.has_point_forecasting = True
self.has_interval_forecasting = True self.has_interval_forecasting = True
@ -26,30 +29,113 @@ class KNearestNeighbors(fts.FTS):
self.lag = None self.lag = None
self.k = kwargs.get("k", 30) self.k = kwargs.get("k", 30)
self.uod = None self.uod = None
self.kdtree = None
self.values = None
def _prepare_x(self, data):
    """
    Build the lagged feature rows used to query the neighbour index.

    Each row holds the ``self.order`` values that precede position ``t``,
    most recent first: ``[data[t-1], data[t-2], ..., data[t-order]]``.

    :param data: indexable sequence of observations
    :return: list of rows (each a list with ``self.order`` values); when
             ``data`` has exactly ``self.order`` items a single row is built
             from the whole input
    """
    order = self.order
    size = len(data)
    # A window of exactly `order` points still yields one row (t == order).
    stop = size + 1 if size == order else size
    return [[data[t - lag] for lag in range(1, order + 1)]
            for t in range(order, stop)]
def _prepare_xy(self, data):
    """
    Build the training pairs: lagged feature rows and their targets.

    For every position ``t`` from ``self.order`` to ``len(data) - 1`` the
    feature row is ``[data[t-1], ..., data[t-order]]`` (most recent first)
    and the target is ``data[t]``.

    :param data: indexable sequence of observations
    :return: tuple ``(X, Y)`` with the list of feature rows and the list of
             targets, both of length ``max(len(data) - self.order, 0)``
    """
    order = self.order
    positions = range(order, len(data))
    features = [[data[t - lag] for lag in range(1, order + 1)]
                for t in positions]
    targets = [data[t] for t in positions]
    return (features, targets)
def train(self, data, **kwargs): def train(self, data, **kwargs):
self.data = np.array(data) X,Y = self._prepare_xy(data)
self.kdtree = KDTree(X)
self.values = Y
def knn(self, sample): def knn(self, sample):
X = self._prepare_x(sample)
_, ix = self.kdtree.query(X, self.k)
if self.order == 1: return [self.values[k] for k in ix.flatten() ]
dist = np.apply_along_axis(lambda x: (x - sample) ** 2, 0, self.data)
ix = np.argsort(dist) + 1
else:
dist = []
for k in np.arange(self.order, len(self.data)):
dist.append(sum([ (self.data[k - kk] - sample[kk])**2 for kk in range(self.order)]))
ix = np.argsort(np.array(dist)) + self.order + 1
ix2 = np.clip(ix[:self.k], 0, len(self.data)-1) def forecast(self, data, **kwargs):
return self.data[ix2] ret = []
for k in np.arange(self.order, len(data)):
sample = data[k-self.order : k]
forecasts = self.knn(sample)
ret.append(np.nanmean(forecasts))
return ret
def forecast_ahead(self, data, steps, **kwargs):
    """
    Iterated multi-step point forecast.

    Starting from the ``self.order`` observations that end at ``start``,
    each step queries the neighbours of the latest window, takes the
    NaN-aware mean of their values as the forecast, and feeds that forecast
    back as the newest lag.

    :param data: indexable sequence of observations
    :param steps: number of steps to forecast ahead
    :keyword start: index in ``data`` right after the seed window
                    (default ``self.order``)
    :return: list with the ``steps`` forecasted values, oldest first
    """
    start = kwargs.get('start', self.order)

    sample = list(data[start - self.order: start])
    seed_size = len(sample)

    for k in range(self.order, steps + self.order):
        # Query the neighbours directly: the window has exactly `order`
        # points, and going through forecast() would hand back a list
        # (possibly empty) rather than a scalar to append.
        window = sample[k - self.order:k]
        sample.append(np.nanmean(self.knn(window)))

    # Return every forecast, not the single element sample[-steps].
    return sample[seed_size:]
def forecast_interval(self, data, **kwargs):
    """
    One-step-ahead interval forecasts from the neighbours' empirical
    percentiles.

    For every window ``data[k - order:k]`` with ``k`` from ``self.order`` to
    ``len(data) - 1`` the neighbours' values are collected and the interval
    is given by their ``alpha`` and ``1 - alpha`` percentiles.

    :param data: indexable sequence of observations
    :keyword alpha: tail fraction used for both bounds
                    (default ``self.alpha``)
    :return: list of ``[lower, upper]`` pairs, one per window
    """
    alpha = kwargs.get('alpha', self.alpha)
    bounds = [alpha*100, (1-alpha)*100]
    order = self.order

    return [
        np.percentile(self.knn(data[end - order:end]), bounds).tolist()
        for end in range(order, len(data))
    ]
def forecast_ahead_interval(self, data, steps, **kwargs):
    """
    Iterated multi-step interval forecast.

    Every lag is kept as a list of candidate values. At each step all
    combinations of candidates (one per lag) are queried against the
    neighbour index, the neighbours' values are pooled, and the interval is
    given by the ``alpha`` and ``1 - alpha`` percentiles of the pool. The
    pool is then summarised by ``sampler`` and becomes the candidate list of
    the newest lag.

    :param data: indexable sequence of observations
    :param steps: number of steps to forecast ahead
    :keyword alpha: tail fraction used for both bounds
                    (default ``self.alpha``)
    :keyword start: index in ``data`` right after the seed window
                    (default ``self.order``)
    :return: list with ``steps`` ``[lower, upper]`` pairs
    """
    alpha = kwargs.get('alpha', self.alpha)
    start = kwargs.get('start', self.order)

    percentiles = [alpha*100, (1-alpha)*100]
    quantile_grid = np.arange(.1, 1, 0.1)

    ret = []

    # Seed: each observed lag has a single candidate value.
    sample = [[k] for k in data[start - self.order: start]]

    for k in range(self.order, steps + self.order):
        forecasts = []

        # Keep the lags in chronological order (oldest first): knn() hands
        # the path to _prepare_x, which reverses it into the newest-first
        # layout used for the training rows. Collecting the lags
        # newest-first here would reverse them twice.
        lags = sample[k - self.order:k]

        # Trace the possible paths
        for path in product(*lags):
            forecasts.extend(self.knn(path))

        # presumably sampler returns a list of representative values
        # (quantiles plus bounds) - defined in pyFTS.models.ensemble.ensemble
        sample.append(sampler(forecasts, quantile_grid, bounds=True))

        ret.append(np.percentile(forecasts, percentiles).tolist())

    return ret
def forecast_distribution(self, data, **kwargs): def forecast_distribution(self, data, **kwargs):
ret = [] ret = []
smooth = kwargs.get("smooth", "KDE") smooth = kwargs.get("smooth", "histogram")
alpha = kwargs.get("alpha", None)
uod = self.get_UoD() uod = self.get_UoD()
@ -65,4 +151,32 @@ class KNearestNeighbors(fts.FTS):
return ret return ret
def forecast_ahead_distribution(self, data, steps, **kwargs):
    """
    Iterated multi-step probabilistic forecast.

    Every lag is kept as a list of candidate values. At each step all
    combinations of candidates (one per lag) are queried against the
    neighbour index and the pooled neighbours' values are wrapped in a
    ``ProbabilityDistribution``. The pool is then summarised by ``sampler``
    and becomes the candidate list of the newest lag.

    :param data: indexable sequence of observations
    :param steps: number of steps to forecast ahead
    :keyword smooth: first argument given to ``ProbabilityDistribution``
                     (default ``"histogram"``)
    :keyword start: index in ``data`` right after the seed window
                    (default ``self.order``)
    :return: list with ``steps`` ``ProbabilityDistribution`` objects

    All keyword arguments are also forwarded to ``ProbabilityDistribution``.
    """
    smooth = kwargs.get("smooth", "histogram")
    start = kwargs.get('start', self.order)

    uod = self.get_UoD()
    quantile_grid = np.arange(.1, 1, 0.1)

    ret = []

    # Seed: each observed lag has a single candidate value.
    sample = [[k] for k in data[start - self.order: start]]

    for k in range(self.order, steps + self.order):
        forecasts = []

        # Keep the lags in chronological order (oldest first): knn() hands
        # the path to _prepare_x, which reverses it into the newest-first
        # layout used for the training rows. Collecting the lags
        # newest-first here would reverse them twice.
        lags = sample[k - self.order:k]

        # Trace the possible paths
        for path in product(*lags):
            forecasts.extend(self.knn(path))

        dist = ProbabilityDistribution.ProbabilityDistribution(smooth, uod=uod, data=forecasts,
                                                               name="", **kwargs)
        ret.append(dist)

        # presumably sampler returns a list of representative values
        # (quantiles plus bounds) - defined in pyFTS.models.ensemble.ensemble
        sample.append(sampler(forecasts, quantile_grid, bounds=True))

    return ret

View File

@ -519,8 +519,10 @@ class FTS(object):
return data return data
def get_UoD(self): def get_UoD(self):
#return [self.original_min, self.original_max] if self.partitioner is not None:
return [self.partitioner.min, self.partitioner.max] return [self.partitioner.min, self.partitioner.max]
else:
return [self.original_min, self.original_max]
def __str__(self): def __str__(self):
"""String representation of the model""" """String representation of the model"""

View File

@ -167,8 +167,7 @@ class EnsembleFTS(fts.FTS):
if "method" in kwargs: if "method" in kwargs:
self.interval_method = kwargs.get('method','quantile') self.interval_method = kwargs.get('method','quantile')
if 'alpha' in kwargs: self.alpha = kwargs.get('alpha', self.alpha)
self.alpha = kwargs.get('alpha',0.05)
l = len(data) l = len(data)
@ -189,15 +188,12 @@ class EnsembleFTS(fts.FTS):
if 'method' in kwargs: if 'method' in kwargs:
self.interval_method = kwargs.get('method','quantile') self.interval_method = kwargs.get('method','quantile')
if 'alpha' in kwargs:
self.alpha = kwargs.get('alpha', self.alpha) self.alpha = kwargs.get('alpha', self.alpha)
ret = [] ret = []
start = kwargs.get('start', self.order) start = kwargs.get('start', self.order)
uod = self.get_UoD()
sample = [[k] for k in data[start - self.order: start]] sample = [[k] for k in data[start - self.order: start]]
for k in np.arange(self.order, steps + self.order): for k in np.arange(self.order, steps + self.order):

View File

@ -13,29 +13,51 @@ from pyFTS.partitioners import Grid, Entropy, Util as pUtil, Simple
from pyFTS.benchmarks import benchmarks as bchmk, Measures from pyFTS.benchmarks import benchmarks as bchmk, Measures
from pyFTS.models import chen, yu, cheng, ismailefendi, hofts, pwfts, tsaur, song, sadaei, ifts from pyFTS.models import chen, yu, cheng, ismailefendi, hofts, pwfts, tsaur, song, sadaei, ifts
from pyFTS.models.ensemble import ensemble from pyFTS.models.ensemble import ensemble
from pyFTS.common import Transformations, Membership from pyFTS.common import Transformations, Membership, Util
from pyFTS.benchmarks import arima, quantreg, BSTS, gaussianproc from pyFTS.benchmarks import arima, quantreg, BSTS, gaussianproc, knn
from pyFTS.fcm import fts, common, GA from pyFTS.fcm import fts, common, GA
from pyFTS.data import TAIEX, NASDAQ, SP500 from pyFTS.data import TAIEX, NASDAQ, SP500
# Scratch experiment: fit a kNN model on TAIEX and plot multi-step interval
# and distribution forecasts over the first ten test points.

# Load the series once and slice it, instead of reading the dataset twice.
series = TAIEX.get_data()
train = series[:800]
test = series[800:1000]

order = 2

model = knn.KNearestNeighbors(order=order)
model.fit(train)

horizon = 7

# The same ten-point window is used for every forecast and plot below.
window = test[:10]

intervals05 = model.predict(window, type='interval', alpha=.05, steps_ahead=horizon)

print(window)
print(intervals05)

intervals25 = model.predict(window, type='interval', alpha=.25, steps_ahead=horizon)
distributions = model.predict(window, type='distribution', steps_ahead=horizon, smoothing=0.01, num_bins=100)

fig, ax = plt.subplots(nrows=1, ncols=1, figsize=[15, 5])

ax.plot(window, label='Original', color='black')

# The labels carry no placeholders, so the former .format(model.order) calls
# were no-ops and have been dropped.
Util.plot_interval2(intervals05, window, start_at=model.order, ax=ax, color='green', label='alpha=.05')
Util.plot_interval2(intervals25, window, start_at=model.order, ax=ax, color='green', label='alpha=.25')
Util.plot_distribution2(distributions, window, start_at=model.order, ax=ax, cmap="Blues")

print("")
'''
datasets = {} datasets = {}
datasets['TAIEX'] = TAIEX.get_data()[:5000] datasets['TAIEX'] = TAIEX.get_data()[:5000]
datasets['NASDAQ'] = NASDAQ.get_data()[:5000] datasets['NASDAQ'] = NASDAQ.get_data()[:5000]
datasets['SP500'] = SP500.get_data()[10000:15000] datasets['SP500'] = SP500.get_data()[10000:15000]
methods = [ensemble.SimpleEnsembleFTS]*8 methods = [ensemble.SimpleEnsembleFTS]*4
methods_parameters = [ methods_parameters = [
{'name': 'EnsembleFTS-HOFTS-10-.05', 'fts_method': hofts.HighOrderFTS, 'partitions': np.arange(20,50,10), 'alpha': .05}, {'name': 'EnsembleFTS-HOFTS-10', 'fts_method': hofts.HighOrderFTS, 'partitions': np.arange(20,50,10)},
{'name': 'EnsembleFTS-HOFTS-5-.05', 'fts_method': hofts.HighOrderFTS, 'partitions': np.arange(20,50,5), 'alpha': .05}, {'name': 'EnsembleFTS-HOFTS-5', 'fts_method': hofts.HighOrderFTS, 'partitions': np.arange(20,50,5)},
{'name': 'EnsembleFTS-HOFTS-10-.25', 'fts_method': hofts.HighOrderFTS, 'partitions': np.arange(20,50,10), 'alpha': .25}, {'name': 'EnsembleFTS-WHOFTS-10', 'fts_method': hofts.WeightedHighOrderFTS, 'partitions': np.arange(20,50,10)},
{'name': 'EnsembleFTS-HOFTS-5-.25', 'fts_method': hofts.HighOrderFTS, 'partitions': np.arange(20,50,5), 'alpha': .25}, {'name': 'EnsembleFTS-WHOFTS-5', 'fts_method': hofts.WeightedHighOrderFTS, 'partitions': np.arange(20,50,5)}
{'name': 'EnsembleFTS-WHOFTS-10-.05', 'fts_method': hofts.WeightedHighOrderFTS, 'partitions': np.arange(20,50,10), 'alpha': .05},
{'name': 'EnsembleFTS-WHOFTS-5-.05', 'fts_method': hofts.WeightedHighOrderFTS, 'partitions': np.arange(20,50,5), 'alpha': .05},
{'name': 'EnsembleFTS-WHOFTS-10-.25', 'fts_method': hofts.WeightedHighOrderFTS, 'partitions': np.arange(20,50,10), 'alpha': .25},
{'name': 'EnsembleFTS-WHOFTS-5-.25', 'fts_method': hofts.WeightedHighOrderFTS, 'partitions': np.arange(20,50,5), 'alpha': .25},
] ]
for dataset_name, dataset in datasets.items(): for dataset_name, dataset in datasets.items():
@ -46,6 +68,7 @@ for dataset_name, dataset in datasets.items():
transformations=[None], transformations=[None],
orders=[3], orders=[3],
partitions=[None], partitions=[None],
type='interval', type='distribution',
#distributed=True, nodes=['192.168.0.110', '192.168.0.107','192.168.0.106'], distributed=True, nodes=['192.168.0.110', '192.168.0.107','192.168.0.106'],
file="tmp.db", dataset=dataset_name, tag="gridsearch") file="experiments.db", dataset=dataset_name, tag="gridsearch")
'''