k-Nearest Neighbors benchmark method

This commit is contained in:
Petrônio Cândido 2018-04-25 11:36:01 -03:00
parent 34995b72f8
commit abe9a45a47
11 changed files with 198 additions and 120 deletions

View File

@ -387,7 +387,7 @@ def get_distribution_statistics(data, model, **kwargs):
_s1 = time.time()
forecasts = model.predict(data, **kwargs)
_e1 = time.time()
ret.append(round(crps(data, forecasts), 3))
ret.append(round(crps(data[model.order:], forecasts), 3))
ret.append(round(_e1 - _s1, 3))
else:
skip = kwargs.get('steps_ahead_sampler', 1)

View File

@ -30,7 +30,7 @@ class ARIMA(fts.FTS):
self.benchmark_only = True
self.min_order = 1
self.alpha = kwargs.get("alpha", 0.05)
self.shortname += str(self.alpha)
self.order = kwargs.get("order", (1,0,0))
self._decompose_order(self.order)
def _decompose_order(self, order):
@ -43,6 +43,10 @@ class ARIMA(fts.FTS):
self.shortname = "ARIMA(" + str(self.p) + "," + str(self.d) + "," + str(self.q) + ") - " + str(self.alpha)
def train(self, data, **kwargs):
self.original_min = np.nanmin(data)
self.original_max = np.nanmax(data)
if kwargs.get('order', None) is not None:
order = kwargs.get('order', (1,0,0))
self._decompose_order(order)
@ -50,8 +54,6 @@ class ARIMA(fts.FTS):
if self.indexer is not None:
data = self.indexer.get_data(data)
#data = self.apply_transformations(data, updateUoD=True)
try:
self.model = stats_arima(data, order=(self.p, self.d, self.q))
self.model_fit = self.model.fit(disp=0)
@ -69,9 +71,6 @@ class ARIMA(fts.FTS):
if self.model_fit is None:
return np.nan
if self.indexer is not None and isinstance(ndata, pd.DataFrame):
data = self.indexer.get_data(ndata)
ndata = np.array(ndata)
l = len(ndata)
@ -101,8 +100,6 @@ class ARIMA(fts.FTS):
sigma = np.sqrt(self.model_fit.sigma2)
#ndata = np.array(self.apply_transformations(data))
l = len(data)
ret = []
@ -122,8 +119,6 @@ class ARIMA(fts.FTS):
ret.append(tmp)
#ret = self.apply_inverse_transformations(ret, params=[data[self.order - 1:]], point_to_interval=True)
return ret
def forecast_ahead_interval(self, ndata, steps, **kwargs):
@ -134,8 +129,6 @@ class ARIMA(fts.FTS):
sigma = np.sqrt(self.model_fit.sigma2)
#ndata = np.array(self.apply_transformations(data))
l = len(ndata)
nmeans = self.forecast_ahead(ndata, steps, **kwargs)
@ -152,15 +145,10 @@ class ARIMA(fts.FTS):
ret.append(tmp)
#ret = self.apply_inverse_transformations(ret, params=[[data[-1] for a in np.arange(0, steps)]], interval=True)
return ret
def forecast_distribution(self, data, **kwargs):
if self.indexer is not None and isinstance(data, pd.DataFrame):
data = self.indexer.get_data(data)
sigma = np.sqrt(self.model_fit.sigma2)
l = len(data)
@ -168,8 +156,6 @@ class ARIMA(fts.FTS):
ret = []
for k in np.arange(self.order, l + 1):
tmp = []
sample = [data[i] for i in np.arange(k - self.order, k)]
mean = self.forecast(sample)

View File

@ -19,7 +19,7 @@ from pyFTS.probabilistic import ProbabilityDistribution
from pyFTS.common import Transformations
from pyFTS.models import song, chen, yu, ismailefendi, sadaei, hofts, pwfts, ifts, cheng, hwang
from pyFTS.models.ensemble import ensemble
from pyFTS.benchmarks import Measures, naive, arima, ResidualAnalysis, quantreg
from pyFTS.benchmarks import Measures, naive, arima, ResidualAnalysis, quantreg, knn
from pyFTS.benchmarks import Util as bUtil
from pyFTS.common import Util as cUtil
# from sklearn.cross_validation import KFold
@ -156,8 +156,7 @@ def sliding_window_benchmarks(data, windowsize, train=0.8, **kwargs):
elif benchmark_methods is not None:
for count, model in enumerate(benchmark_methods, start=0):
par = benchmark_methods_parameters[count]
mfts = model(str(par if par is not None else ""))
mfts.order = par
mfts = model("", **par)
pool.append(mfts)
if type == 'point':
@ -244,7 +243,6 @@ def sliding_window_benchmarks(data, windowsize, train=0.8, **kwargs):
progressbar.close()
if distributed:
jobs2 = []
rng = jobs
@ -268,10 +266,6 @@ def sliding_window_benchmarks(data, windowsize, train=0.8, **kwargs):
conn.close()
sintetic = kwargs.get('sintetic', False)
#return synthesis_method(jobs, experiments, save, file, sintetic)
def get_benchmark_point_methods():
"""Return all non FTS methods for point forecasting"""
@ -287,7 +281,7 @@ def get_point_methods():
def get_benchmark_interval_methods():
"""Return all non FTS methods for point_to_interval forecasting"""
return [quantreg.QuantileRegression]
return [ arima.ARIMA, quantreg.QuantileRegression]
def get_interval_methods():
@ -302,7 +296,7 @@ def get_probabilistic_methods():
def get_benchmark_probabilistic_methods():
"""Return all FTS methods for probabilistic forecasting"""
return [arima.ARIMA, quantreg.QuantileRegression]
return [arima.ARIMA, quantreg.QuantileRegression, knn.KNearestNeighbors]
def run_point(mfts, partitioner, train_data, test_data, window_key=None, **kwargs):
@ -398,6 +392,7 @@ def run_interval(mfts, partitioner, train_data, test_data, window_key=None, **kw
method = kwargs.get('method', None)
if mfts.benchmark_only:
mfts.append_transformation(partitioner.transformation)
_key = mfts.shortname + str(mfts.order if mfts.order is not None else "") + str(mfts.alpha)
else:
pttr = str(partitioner.__module__).split('.')[-1]
@ -444,10 +439,11 @@ def run_probabilistic(mfts, partitioner, train_data, test_data, window_key=None,
from pyFTS.models import hofts, ifts, pwfts
from pyFTS.models.ensemble import ensemble
from pyFTS.partitioners import Grid, Entropy, FCM
from pyFTS.benchmarks import Measures, arima
from pyFTS.benchmarks import Measures, arima, quantreg, knn
from pyFTS.models.seasonal import SeasonalIndexer
tmp = [hofts.HighOrderFTS, ifts.IntervalFTS, pwfts.ProbabilisticWeightedFTS, arima.ARIMA, ensemble.AllMethodEnsembleFTS]
tmp = [hofts.HighOrderFTS, ifts.IntervalFTS, pwfts.ProbabilisticWeightedFTS, arima.ARIMA,
ensemble.AllMethodEnsembleFTS, knn.KNearestNeighbors]
tmp2 = [Grid.GridPartitioner, Entropy.EntropyPartitioner, FCM.FCMPartitioner]
@ -460,6 +456,7 @@ def run_probabilistic(mfts, partitioner, train_data, test_data, window_key=None,
if mfts.benchmark_only:
_key = mfts.shortname + str(mfts.order if mfts.order is not None else "") + str(mfts.alpha)
mfts.append_transformation(partitioner.transformation)
else:
pttr = str(partitioner.__module__).split('.')[-1]
_key = mfts.shortname + " n = " + str(mfts.order) + " " + pttr + " q = " + str(partitioner.partitions)

71
pyFTS/benchmarks/knn.py Normal file
View File

@ -0,0 +1,71 @@
#!/usr/bin/python
# -*- coding: utf8 -*-
import numpy as np
from statsmodels.tsa.tsatools import lagmat
from pyFTS.common import fts
from pyFTS.probabilistic import ProbabilityDistribution
class KNearestNeighbors(fts.FTS):
    """
    K-Nearest Neighbors benchmark method.

    A lazy, non-parametric forecaster: for each query lag window it finds,
    in the training series, the k windows with the smallest squared
    Euclidean distance and pools the values that immediately followed each
    of them into a probability distribution.
    """
    def __init__(self, name, **kwargs):
        super(KNearestNeighbors, self).__init__(1, "kNN" + name)
        self.name = "kNN"
        self.detail = "K-Nearest Neighbors"
        self.is_high_order = True
        self.has_point_forecasting = True
        self.has_interval_forecasting = True
        self.has_probability_forecasting = True
        # benchmark (non-FTS) model: no partitioner/fuzzy sets are used
        self.benchmark_only = True
        self.min_order = 1
        self.alpha = kwargs.get("alpha", 0.05)
        self.order = kwargs.get("order", 1)
        self.lag = None
        # number of nearest neighbors pooled per forecast
        self.k = kwargs.get("k", 30)

    def train(self, data, **kwargs):
        """Store the training series; kNN defers all work to query time."""
        if kwargs.get('order', None) is not None:
            self.order = kwargs.get('order', 1)
        self.data = data

    def knn(self, sample):
        """Return the values that followed the k windows nearest to *sample*.

        :param sample: lag window of length self.order (most recent value last)
        :returns: array of up to self.k successor values from the training data
        """
        if self.order == 1:
            dist = np.apply_along_axis(lambda x: (x - sample) ** 2, 0, self.data)
            # +1: forecast candidates are the values *following* each neighbor
            ix = np.argsort(dist) + 1
        else:
            dist = []
            for k in np.arange(self.order, len(self.data)):
                # data[k - kk] walks backwards from the newest point of the
                # candidate window, so it must be paired with the newest point
                # of the query window: sample[self.order - kk - 1].
                # (The original compared sample[kk], matching the windows in
                # reversed temporal order.)
                dist.append(sum([(self.data[k - kk] - sample[self.order - kk - 1]) ** 2
                                 for kk in range(self.order)]))
            # dist[j] corresponds to the window ending at index order + j,
            # whose successor is at index order + j + 1
            ix = np.argsort(np.array(dist)) + self.order + 1
        # guard: the nearest window may end at the last observation, whose
        # successor index would fall outside the series
        ix = np.clip(ix, 0, len(self.data) - 1)
        return self.data[ix[:self.k]]

    def forecast_distribution(self, data, **kwargs):
        """Forecast one probability distribution per lag window of *data*.

        :param data: time series with more than self.order points
        :keyword smooth: smoothing method for the distribution (default "KDE")
        :returns: list of ProbabilityDistribution objects
        """
        ret = []
        smooth = kwargs.get("smooth", "KDE")
        # NOTE(review): alpha is read but never used here — confirm intent
        alpha = kwargs.get("alpha", None)
        uod = self.get_UoD()
        for k in np.arange(self.order, len(data)):
            sample = data[k - self.order: k]
            forecasts = self.knn(sample)
            dist = ProbabilityDistribution.ProbabilityDistribution(smooth, uod=uod,
                                                                   data=forecasts,
                                                                   name="", **kwargs)
            ret.append(dist)
        return ret

View File

@ -19,7 +19,7 @@ class QuantileRegression(fts.FTS):
self.has_interval_forecasting = True
self.has_probability_forecasting = True
self.benchmark_only = True
self.minOrder = 1
self.min_order = 1
self.alpha = kwargs.get("alpha", 0.05)
self.dist = kwargs.get("dist", False)
self.upper_qt = None
@ -28,15 +28,14 @@ class QuantileRegression(fts.FTS):
self.dist_qt = None
self.shortname = "QAR("+str(self.order)+","+str(self.alpha)+")"
def train(self, data, sets, order=1, parameters=None):
self.order = order
def train(self, data, **kwargs):
if kwargs.get('order', None) is not None:
self.order = kwargs.get('order', 1)
if self.indexer is not None and isinstance(data, pd.DataFrame):
data = self.indexer.get_data(data)
tmp = np.array(self.apply_transformations(data, updateUoD=True))
lagdata, ndata = lagmat(tmp, maxlag=order, trim="both", original='sep')
lagdata, ndata = lagmat(data, maxlag=self.order, trim="both", original='sep')
mqt = QuantReg(ndata, lagdata).fit(0.5)
if self.alpha is not None:
@ -76,12 +75,8 @@ class QuantileRegression(fts.FTS):
up = self.linearmodel([k[1] for k in data], up_params)
return [lo, up]
def forecast(self, data, **kwargs):
def forecast(self, ndata, **kwargs):
if self.indexer is not None and isinstance(data, pd.DataFrame):
data = self.indexer.get_data(data)
ndata = np.array(self.apply_transformations(data))
l = len(ndata)
ret = []
@ -91,16 +86,9 @@ class QuantileRegression(fts.FTS):
ret.append(self.linearmodel(sample, self.mean_qt))
ret = self.apply_inverse_transformations(ret, params=[data[self.order - 1:]])
return ret
def forecast_interval(self, data, **kwargs):
if self.indexer is not None and isinstance(data, pd.DataFrame):
data = self.indexer.get_data(data)
ndata = np.array(self.apply_transformations(data))
def forecast_interval(self, ndata, **kwargs):
l = len(ndata)
@ -110,16 +98,9 @@ class QuantileRegression(fts.FTS):
sample = ndata[k - self.order: k]
ret.append(self.point_to_interval(sample, self.lower_qt, self.upper_qt))
ret = self.apply_inverse_transformations(ret, params=[data[self.order - 1:]], interval=True)
return ret
def forecast_ahead_interval(self, data, steps, **kwargs):
if self.indexer is not None and isinstance(data, pd.DataFrame):
data = self.indexer.get_data(data)
ndata = np.array(self.apply_transformations(data))
def forecast_ahead_interval(self, ndata, steps, **kwargs):
smoothing = kwargs.get("smoothing", 0.9)
@ -137,20 +118,13 @@ class QuantileRegression(fts.FTS):
ret.append([intl[0]*(1 + k*smoothing), intl[1]*(1 + k*smoothing)])
ret = self.apply_inverse_transformations(ret, params=[[data[-1] for a in np.arange(0, steps + self.order)]], interval=True)
return ret[-steps:]
def forecast_distribution(self, data, **kwargs):
if self.indexer is not None and isinstance(data, pd.DataFrame):
data = self.indexer.get_data(data)
ndata = np.array(self.apply_transformations(data))
def forecast_distribution(self, ndata, **kwargs):
ret = []
l = len(data)
l = len(ndata)
for k in np.arange(self.order, l + 1):
dist = ProbabilityDistribution.ProbabilityDistribution(type="histogram",
@ -167,12 +141,7 @@ class QuantileRegression(fts.FTS):
return ret
def forecast_ahead_distribution(self, data, steps, **kwargs):
if self.indexer is not None and isinstance(data, pd.DataFrame):
data = self.indexer.get_data(data)
ndata = np.array(self.apply_transformations(data))
def forecast_ahead_distribution(self, ndata, steps, **kwargs):
ret = []

View File

@ -42,7 +42,6 @@ class FLRG(object):
def get_membership(self, data, sets):
ret = 0.0
if isinstance(self.LHS, (list, set)):
assert len(self.LHS) == len(data)
ret = np.nanmin([sets[self.LHS[ct]].membership(dat) for ct, dat in enumerate(data)])
else:
ret = sets[self.LHS].membership(data)

View File

@ -5,6 +5,7 @@ import numpy as np
import pandas as pd
from pyFTS.common import SortedCollection, fts, tree
from pyFTS.models import chen, cheng, hofts, hwang, ismailefendi, sadaei, song, yu
from pyFTS.probabilistic import ProbabilityDistribution
import scipy.stats as st
@ -171,29 +172,52 @@ class EnsembleFTS(fts.FTS):
return ret
def empty_grid(self, resolution):
return self.get_empty_grid(-(self.original_max*2), self.original_max*2, resolution)
def forecast_distribution(self, data, **kwargs):
    """Forecast a probability distribution for each lag window of *data*.

    For every window the point forecasts of all ensemble members are pooled
    and smoothed into a single ProbabilityDistribution.

    :param data: time series with more than self.order points
    :keyword smooth: smoothing method for the distribution (default "KDE")
    :keyword alpha: if given, the pooled forecasts are first trimmed to
        their inter-quantile range via get_distribution_interquantile
    :returns: list of ProbabilityDistribution objects, one per window
    """
    ret = []
    smooth = kwargs.get("smooth", "KDE")
    alpha = kwargs.get("alpha", None)
    uod = self.get_UoD()
    for k in np.arange(self.order, len(data)):
        sample = data[k-self.order : k]
        # one forecast per ensemble member for this window
        forecasts = self.get_models_forecasts(sample)
        if alpha is None:
            forecasts = np.ravel(forecasts).tolist()
        else:
            # keep only the central mass of the member forecasts
            forecasts = self.get_distribution_interquantile(np.ravel(forecasts).tolist(), alpha)
        dist = ProbabilityDistribution.ProbabilityDistribution(smooth, uod=uod, data=forecasts,
                                                               name="", **kwargs)
        ret.append(dist)
    return ret
def forecast_ahead_distribution(self, data, steps, **kwargs):
if 'method' in kwargs:
self.point_method = kwargs.get('method','mean')
percentile_size = (self.original_max - self.original_min) / 100
resolution = kwargs.get('resolution', percentile_size)
grid = self.empty_grid(resolution)
index = SortedCollection.SortedCollection(iterable=grid.keys())
smooth = kwargs.get("smooth", "KDE")
alpha = kwargs.get("alpha", None)
ret = []
samples = [[k] for k in data[-self.order:]]
start = kwargs.get('start', self.order)
for k in np.arange(self.order, steps + self.order):
uod = self.get_UoD()
sample = data[start - self.order: start]
for k in np.arange(self.order, steps+self.order):
forecasts = []
lags = {}
for i in np.arange(0, self.order): lags[i] = samples[k - self.order + i]
for i in np.arange(0, self.order): lags[i] = sample[k-self.order]
# Build the tree with all possible paths
@ -206,17 +230,19 @@ class EnsembleFTS(fts.FTS):
forecasts.extend(self.get_models_forecasts(path))
samples.append(sampler(forecasts, np.arange(0.1, 1, 0.1)))
sample.append(sampler(forecasts, np.arange(0.1, 1, 0.1)))
grid = self.gridCountPoint(grid, resolution, index, forecasts)
if alpha is None:
forecasts = np.ravel(forecasts).tolist()
else:
forecasts = self.get_distribution_interquantile(np.ravel(forecasts).tolist(), alpha)
tmp = np.array([grid[i] for i in sorted(grid)])
dist = ProbabilityDistribution.ProbabilityDistribution(smooth, uod=uod, data=forecasts,
name="", **kwargs)
ret.append(tmp / sum(tmp))
ret.append(dist)
grid = self.empty_grid(resolution)
df = pd.DataFrame(ret, columns=sorted(grid))
return df
return ret
class AllMethodEnsembleFTS(EnsembleFTS):

View File

@ -26,6 +26,7 @@ class IntervalFTS(hofts.HighOrderFTS):
self.has_point_forecasting = False
self.has_interval_forecasting = True
self.is_high_order = True
self.min_order = 1
def get_upper(self, flrg):
if flrg.get_key() in self.flrgs:

View File

@ -105,6 +105,7 @@ class ProbabilisticWeightedFTS(ifts.IntervalFTS):
self.has_interval_forecasting = True
self.has_probability_forecasting = True
self.is_high_order = True
self.min_order = 1
self.auto_update = kwargs.get('update',False)

View File

@ -14,15 +14,25 @@ class ProbabilityDistribution(object):
def __init__(self, type = "KDE", **kwargs):
self.uod = kwargs.get("uod", None)
self.type = type
if self.type == "KDE":
self.kde = kde.KernelSmoothing(kwargs.get("h", 0.5), kwargs.get("kernel", "epanechnikov"))
self.data = []
self.nbins = kwargs.get("num_bins", 100)
self.type = type
self.bins = kwargs.get("bins", None)
self.labels = kwargs.get("bins_labels", None)
data = kwargs.get("data", None)
if self.type == "KDE":
self.kde = kde.KernelSmoothing(kwargs.get("h", 0.5), kwargs.get("kernel", "epanechnikov"))
_min = np.nanmin(data)
_min = _min * .7 if _min > 0 else _min * 1.3
_max = np.nanmax(data)
_max = _max * 1.3 if _max > 0 else _max * .7
self.uod = [_min, _max]
self.nbins = kwargs.get("num_bins", 100)
if self.bins is None:
self.bins = np.linspace(int(self.uod[0]), int(self.uod[1]), int(self.nbins)).tolist()
self.labels = [str(k) for k in self.bins]
@ -38,10 +48,6 @@ class ProbabilityDistribution(object):
self.count = 0
for k in self.bins: self.distribution[k] = 0
self.data = []
data = kwargs.get("data",None)
if data is not None:
self.append(data)
@ -228,10 +234,12 @@ class ProbabilityDistribution(object):
def __str__(self):
ret = ""
for k in sorted(self.distribution.keys()):
for k in sorted(self.bins):
ret += str(round(k,2)) + ':\t'
if self.type == "histogram":
ret += str(round(self.distribution[k] / self.count,3))
elif self.type == "KDE":
ret += str(round(self.density(k),3))
else:
ret += str(round(self.distribution[k], 6))
ret += '\n'

View File

@ -15,9 +15,18 @@ from pyFTS.data import TAIEX
dataset = TAIEX.get_data()
from pyFTS.benchmarks import benchmarks as bchmk, Util as bUtil, Measures
from pyFTS.benchmarks import benchmarks as bchmk, Util as bUtil, Measures, knn, quantreg, arima
from pyFTS.models import pwfts, song, ifts
model = arima.ARIMA("", order=(1,0,0))
model.fit(dataset[:800])
tmp = model.predict(dataset[800:1000], type='distribution')
for tmp2 in tmp:
print(tmp2)
from pyFTS.models import pwfts, song
'''
from pyFTS.partitioners import Grid, Util as pUtil
partitioner = Grid.GridPartitioner(data=dataset[:800], npart=10, transformation=tdiff)
@ -31,28 +40,39 @@ print(Measures.get_distribution_statistics(dataset[800:1000], model, steps_ahead
# print(tmp2)
'''
#'''
'''
from pyFTS.benchmarks import arima, naive, quantreg
bchmk.sliding_window_benchmarks(dataset[:1000], 1000, train=0.8, inc=0.2,
#methods=[song.ConventionalFTS], #[pwfts.ProbabilisticWeightedFTS],
bchmk.sliding_window_benchmarks(dataset, 1000, train=0.8, inc=0.2,
methods=[ifts.IntervalFTS], #[pwfts.ProbabilisticWeightedFTS],
benchmark_models=True,
benchmark_methods=[naive.Naive, arima.ARIMA,arima.ARIMA], #arima.ARIMA,arima.ARIMA],
#benchmark_methods=[arima.ARIMA],
benchmark_methods_parameters=[1,(1,0,0),(1,0,1)], #(2,0,1),(2,0,2)],
#benchmark_methods_parameters=[(1,0,0)],
benchmark_methods=[arima.ARIMA for k in range(8)]
+ [quantreg.QuantileRegression for k in range(4)],
benchmark_methods_parameters=[
{'order': (1, 0, 0), 'alpha': .05},
{'order': (1, 0, 0), 'alpha': .25},
{'order': (1, 0, 1), 'alpha': .05},
{'order': (1, 0, 1), 'alpha': .25},
{'order': (2, 0, 1), 'alpha': .05},
{'order': (2, 0, 1), 'alpha': .25},
{'order': (2, 0, 2), 'alpha': .05},
{'order': (2, 0, 2), 'alpha': .25},
{'order': 1, 'alpha': .05},
{'order': 1, 'alpha': .25},
{'order': 2, 'alpha': .05},
{'order': 2, 'alpha': .25},
],
transformations=[None, tdiff],
orders=[1, 2, 3],
partitions=[35], #np.arange(10, 100, 5),
progress=True, type='point',
orders=[1], #2, 3],
partitions=[3], #np.arange(3, 25, 2),
progress=False, type='interval',
#steps_ahead=[1,4,7,10], #steps_ahead=[1]
#distributed=True, nodes=['192.168.0.110', '192.168.0.105','192.168.0.106'],
file="benchmarks.tmp", dataset="TAIEX", tag="comparisons")
#save=True, file="tmp.db")
distributed=True, nodes=['192.168.0.110', '192.168.0.107','192.168.0.106'],
file="benchmarks.db", dataset="TAIEX", tag="comparisons")
#'''
'''
'''
dat = pd.read_csv('pwfts_taiex_partitioning.csv', sep=';')
print(bUtil.analytic_tabular_dataframe(dat))
@ -111,4 +131,4 @@ tmp[20].plot(ax[2][2], title='t=200')
f, ax = plt.subplots(1, 1, figsize=[20,15])
bchmk.plot_distribution(ax, 'blue', tmp, f, 0, reference_data=dataset[train_split:train_split+200])
'''
'''