From 67358fa000e86b201f7ff32811c2757a847583d3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Petr=C3=B4nio=20C=C3=A2ndido=20de=20Lima=20e=20Silva?= Date: Wed, 17 May 2017 10:45:10 -0300 Subject: [PATCH] - Improvementos of interval and distribution m-step ahead forecasts for arima and quantreg --- benchmarks/arima.py | 11 ++++--- benchmarks/benchmarks.py | 67 ++++++++++++++++++++++++++++++-------- benchmarks/quantreg.py | 16 ++++++--- common/SortedCollection.py | 5 +-- ensemble.py | 19 +++++++---- tests/general.py | 39 +++++++++++++--------- 6 files changed, 113 insertions(+), 44 deletions(-) diff --git a/benchmarks/arima.py b/benchmarks/arima.py index e50baf7..797146a 100644 --- a/benchmarks/arima.py +++ b/benchmarks/arima.py @@ -20,6 +20,7 @@ class ARIMA(fts.FTS): self.is_high_order = True self.has_point_forecasting = True self.has_interval_forecasting = True + self.has_probability_forecasting = True self.model = None self.model_fit = None self.trained_data = None @@ -44,7 +45,6 @@ class ARIMA(fts.FTS): try: self.model = stats_arima(data, order=(self.p, self.d, self.q)) self.model_fit = self.model.fit(disp=0) - print(np.sqrt(self.model_fit.sigma2)) except Exception as ex: print(ex) self.model_fit = None @@ -145,6 +145,9 @@ class ARIMA(fts.FTS): return ret + def empty_grid(self, resolution): + return self.get_empty_grid(-(self.original_max*2), self.original_max*2, resolution) + def forecastAheadDistribution(self, data, steps, **kwargs): smoothing = kwargs.get("smoothing", 0.5) @@ -158,7 +161,7 @@ class ARIMA(fts.FTS): resolution = kwargs.get('resolution', percentile_size) - grid = self.get_empty_grid(self.original_min, self.original_max, resolution) + grid = self.empty_grid(resolution) index = SortedCollection.SortedCollection(iterable=grid.keys()) @@ -167,7 +170,7 @@ class ARIMA(fts.FTS): nmeans = self.forecastAhead(ndata, steps, **kwargs) for k in np.arange(0, steps): - grid = self.get_empty_grid(self.original_min, self.original_max, resolution) + grid = self.empty_grid(resolution) for alpha in np.arange(0.05, 0.5, 0.05): tmp = [] @@ -182,6 +185,6 @@ class ARIMA(fts.FTS): ret.append(tmp / sum(tmp)) - grid = self.get_empty_grid(self.original_min, self.original_max, resolution) + grid = self.empty_grid(resolution) df = pd.DataFrame(ret, columns=sorted(grid)) return df \ No newline at end of file diff --git a/benchmarks/benchmarks.py b/benchmarks/benchmarks.py index 7e38e8c..098b22b 100644 --- a/benchmarks/benchmarks.py +++ b/benchmarks/benchmarks.py @@ -16,13 +16,19 @@ import numpy as np import pandas as pd from mpl_toolkits.mplot3d import Axes3D -from probabilistic import ProbabilityDistribution +from pyFTS.probabilistic import ProbabilityDistribution from pyFTS import song, chen, yu, ismailefendi, sadaei, hofts, pwfts, ifts, cheng, ensemble, hwang from pyFTS.benchmarks import Measures, naive, arima, ResidualAnalysis, quantreg from pyFTS.benchmarks import Util as bUtil from pyFTS.common import Transformations, Util # from sklearn.cross_validation import KFold from pyFTS.partitioners import Grid +from matplotlib import rc + +rc('font',**{'family':'sans-serif','sans-serif':['Helvetica']}) +## for Palatino and other serif fonts use: +#rc('font',**{'family':'serif','serif':['Palatino']}) +rc('text', usetex=True) colors = ['grey', 'rosybrown', 'maroon', 'red','orange', 'yellow', 'olive', 'green', 'cyan', 'blue', 'darkblue', 'purple', 'darkviolet'] @@ -472,6 +478,21 @@ def plot_distribution(dist): def plot_compared_series(original, models, colors, typeonlegend=False, save=False, file=None, tam=[20, 5], points=True, intervals=True, linewidth=1.5): + """ + Plot the forecasts of several one step ahead models, by point or by interval + :param original: Original time series data (list) + :param models: List of models to compare + :param colors: List of models colors + :param typeonlegend: Add the type of forecast (point / interval) on legend + :param save: Save the picture on file + :param file: Filename to save the picture + :param tam: Size of the picture + :param points: True to plot the point forecasts, False otherwise + :param intervals: True to plot the interval forecasts, False otherwise + :param linewidth: + :return: + """ + fig = plt.figure(figsize=tam) ax = fig.add_subplot(111) @@ -504,8 +525,12 @@ def plot_compared_series(original, models, colors, typeonlegend=False, save=Fals upper.insert(0, None) lbl = fts.shortname if typeonlegend: lbl += " (Interval)" - ax.plot(lower, color=colors[count], label=lbl, ls="--",linewidth=linewidth) - ax.plot(upper, color=colors[count], ls="--",linewidth=linewidth) + if not points and intervals: + ls = "-" + else: + ls = "--" + ax.plot(lower, color=colors[count], label=lbl, ls=ls,linewidth=linewidth) + ax.plot(upper, color=colors[count], ls=ls,linewidth=linewidth) handles0, labels0 = ax.get_legend_handles_labels() lgd = ax.legend(handles0, labels0, loc=2, bbox_to_anchor=(1, 1)) @@ -685,7 +710,24 @@ def print_distribution_statistics(original, models, steps, resolution): def plot_compared_intervals_ahead(original, models, colors, distributions, time_from, time_to, interpol=False, save=False, file=None, tam=[20, 5], resolution=None, - cmap='Blues',option=2): + cmap='Blues',option=2, linewidth=1.5): + """ + Plot the forecasts of several one step ahead models, by point or by interval + :param original: Original time series data (list) + :param models: List of models to compare + :param colors: List of models colors + :param distributions: True to plot a distribution + :param time_from: index of data poit to start the ahead forecasting + :param time_to: number of steps ahead to forecast + :param interpol: Fill space between distribution plots + :param save: Save the picture on file + :param file: Filename to save the picture + :param tam: Size of the picture + :param resolution: + :param cmap: Color map to be used on distribution plot + :param option: Distribution type to be passed for models + :return: + """ fig = plt.figure(figsize=tam) ax = fig.add_subplot(111) @@ -739,7 +781,6 @@ def plot_compared_intervals_ahead(original, models, colors, distributions, time_ cb.set_label('Density') - if fts.has_interval_forecasting: forecasts = fts.forecastAheadInterval(original[time_from - fts.order:time_from], time_to) lower = [kk[0] for kk in forecasts] @@ -749,8 +790,8 @@ def plot_compared_intervals_ahead(original, models, colors, distributions, time_ for k in np.arange(0, time_from - fts.order): lower.insert(0, None) upper.insert(0, None) - ax.plot(lower, color=colors[count], label=fts.shortname) - ax.plot(upper, color=colors[count]) + ax.plot(lower, color=colors[count], label=fts.shortname, linewidth=linewidth) + ax.plot(upper, color=colors[count], linewidth=linewidth*1.5) else: forecasts = fts.forecast(original) @@ -760,10 +801,12 @@ def plot_compared_intervals_ahead(original, models, colors, distributions, time_ forecasts.insert(0, None) ax.plot(forecasts, color=colors[count], label=fts.shortname) - ax.plot(original, color='black', label="Original") + ax.plot(original, color='black', label="Original", linewidth=linewidth*1.5) handles0, labels0 = ax.get_legend_handles_labels() - ax.legend(handles0, labels0, loc=2) - # ax.set_title(fts.name) + if True in distributions: + lgd = ax.legend(handles0, labels0, loc=2) + else: + lgd = ax.legend(handles0, labels0, loc=2, bbox_to_anchor=(1, 1)) _mi = min(mi) if _mi < 0: _mi *= 1.1 @@ -780,9 +823,7 @@ def plot_compared_intervals_ahead(original, models, colors, distributions, time_ ax.set_xlabel('T') ax.set_xlim([0, len(original)]) - #plt.colorbar() - - Util.showAndSaveImage(fig, file, save) + Util.showAndSaveImage(fig, file, save, lgd=lgd) def plotCompared(original, forecasts, labels, title): diff --git a/benchmarks/quantreg.py b/benchmarks/quantreg.py index 089129a..1cfe6ef 100644 --- a/benchmarks/quantreg.py +++ b/benchmarks/quantreg.py @@ -19,6 +19,7 @@ class QuantileRegression(fts.FTS): self.has_point_forecasting = True self.has_interval_forecasting = True self.has_probability_forecasting = True + self.has_probability_forecasting = True self.benchmark_only = True self.minOrder = 1 self.alpha = kwargs.get("alpha", 0.05) @@ -108,14 +109,21 @@ class QuantileRegression(fts.FTS): def forecastAheadInterval(self, data, steps, **kwargs): ndata = np.array(self.doTransformations(data)) + smoothing = kwargs.get("smoothing", 0.9) + l = len(ndata) - ret = [[k, k] for k in ndata[-self.order:]] + ret = [] - for k in np.arange(self.order, steps + self.order): - intl = self.interval_to_interval([ret[x] for x in np.arange(k - self.order, k)], self.lower_qt, self.upper_qt) + nmeans = self.forecastAhead(ndata, steps, **kwargs) - ret.append(intl) + for k in np.arange(0, self.order): + nmeans.insert(k,ndata[-(k+1)]) + + for k in np.arange(self.order, steps+self.order): + intl = self.point_to_interval(nmeans[k - self.order: k], self.lower_qt, self.upper_qt) + + ret.append([intl[0]*(1 + k*smoothing), intl[1]*(1 + k*smoothing)]) ret = self.doInverseTransformations(ret, params=[[data[-1] for a in np.arange(0, steps + self.order)]], interval=True) diff --git a/common/SortedCollection.py b/common/SortedCollection.py index b8c0103..5bb05dc 100644 --- a/common/SortedCollection.py +++ b/common/SortedCollection.py @@ -4,6 +4,7 @@ from bisect import bisect_left, bisect_right # Original Source Code: https://code.activestate.com/recipes/577197-sortedcollection/ # Author: RAYMOND HETTINGER + class SortedCollection(object): '''Sequence sorted by a key function. @@ -204,8 +205,8 @@ class SortedCollection(object): raise ValueError('No item found between keys : %r,%r' % (ge,le)) def inside(self, ge, le): - g = bisect_right(self._keys, ge) - l = bisect_left(self._keys, le) + l = bisect_right(self._keys, le) + g = bisect_left(self._keys, ge) if g != len(self) and l != len(self) and g != l: return self._items[g : l] elif g != len(self) and l != len(self) and g == l: diff --git a/ensemble.py b/ensemble.py index ddb116f..0b8b821 100644 --- a/ensemble.py +++ b/ensemble.py @@ -6,7 +6,9 @@ import pandas as pd import math from operator import itemgetter from pyFTS.common import FLR, FuzzySet, SortedCollection -from pyFTS import fts +from pyFTS import fts, chen, cheng, hofts, hwang, ismailefendi, sadaei, song, yu +from pyFTS.benchmarks import arima, quantreg +from pyFTS.common import Transformations class EnsembleFTS(fts.FTS): @@ -24,10 +26,14 @@ class EnsembleFTS(fts.FTS): def build(self, data, models, partitioners, partitions, max_order=3, transformation=None, indexer=None): - self.models = [] + methods = [song.ConventionalFTS, chen.ConventionalFTS, yu.WeightedFTS, cheng.TrendWeightedFTS, + ismailefendi.ImprovedWeightedFTS, sadaei.ExponentialyWeightedFTS, hwang.HighOrderFTS, + hofts.HighOrderFTS, arima.ARIMA, quantreg.QuantileRegression] - for count, model in enumerate(models, start=0): - mfts = model("") + transformations = [None, Transformations.Differential(1)] + + for count, method in enumerate(methods, start=0): + mfts = method("") if mfts.benchmark_only: if transformation is not None: mfts.appendTransformation(transformation) @@ -37,7 +43,7 @@ class EnsembleFTS(fts.FTS): for partition in partitions: for partitioner in partitioners: data_train_fs = partitioner(data, partition, transformation=transformation) - mfts = model("") + mfts = method("") mfts.partitioner = data_train_fs if not mfts.is_high_order: @@ -50,7 +56,7 @@ class EnsembleFTS(fts.FTS): else: for order in np.arange(1, max_order + 1): if order >= mfts.min_order: - mfts = model("") + mfts = method("") mfts.partitioner = data_train_fs if transformation is not None: @@ -60,6 +66,7 @@ class EnsembleFTS(fts.FTS): self.models.append(mfts) def train(self, data, sets, order=1,parameters=None): + #if self.models is None: pass def forecast(self, data, **kwargs): diff --git a/tests/general.py b/tests/general.py index 5eaf577..ee3a164 100644 --- a/tests/general.py +++ b/tests/general.py @@ -19,8 +19,8 @@ from numpy import random os.chdir("/home/petronio/dados/Dropbox/Doutorado/Codigos/") -#enrollments = pd.read_csv("DataSets/Enrollments.csv", sep=";") -#enrollments = np.array(enrollments["Enrollments"]) +enrollments = pd.read_csv("DataSets/Enrollments.csv", sep=";") +enrollments = np.array(enrollments["Enrollments"]) diff = Transformations.Differential(1) @@ -28,14 +28,20 @@ diff = Transformations.Differential(1) DATASETS """ +passengers = pd.read_csv("DataSets/AirPassengers.csv", sep=",") +passengers = np.array(passengers["Passengers"]) + +#sunspots = pd.read_csv("DataSets/sunspots.csv", sep=",") +#sunspots = np.array(sunspots["SUNACTIVITY"]) + #gauss = random.normal(0,1.0,5000) #gauss_teste = random.normal(0,1.0,400) #taiexpd = pd.read_csv("DataSets/TAIEX.csv", sep=",") #taiex = np.array(taiexpd["avg"][:5000]) -nasdaqpd = pd.read_csv("DataSets/NASDAQ_IXIC.csv", sep=",") -nasdaq = np.array(nasdaqpd["avg"][0:5000]) +#nasdaqpd = pd.read_csv("DataSets/NASDAQ_IXIC.csv", sep=",") +#nasdaq = np.array(nasdaqpd["avg"][0:5000]) #sp500pd = pd.read_csv("DataSets/S&P500.csv", sep=",") #sp500 = np.array(sp500pd["Avg"][11000:]) @@ -43,7 +49,7 @@ nasdaq = np.array(nasdaqpd["avg"][0:5000]) #sondapd = pd.read_csv("DataSets/SONDA_BSB_HOURLY_AVG.csv", sep=";") #sondapd = sondapd.dropna(axis=0, how='any') -#sonda = np.array(sondapd["ws_10m"]) +#sonda = np.array(sondapd["glo_avg"]) #del(sondapd) #bestpd = pd.read_csv("DataSets/BEST_TAVG.csv", sep=";") @@ -53,8 +59,8 @@ nasdaq = np.array(nasdaqpd["avg"][0:5000]) #print(lag) #print(a) -#from pyFTS.benchmarks import benchmarks as bchmk -from pyFTS.benchmarks import distributed_benchmarks as bchmk +from pyFTS.benchmarks import benchmarks as bchmk +#from pyFTS.benchmarks import distributed_benchmarks as bchmk #from pyFTS.benchmarks import parallel_benchmarks as bchmk from pyFTS.benchmarks import Util from pyFTS.benchmarks import arima, quantreg, Measures @@ -65,17 +71,20 @@ from pyFTS.benchmarks import arima, quantreg, Measures #""" tmp = arima.ARIMA("", alpha=0.25) #tmp.appendTransformation(diff) -tmp.train(nasdaq[:1600], None, order=(1,0,1)) -teste = tmp.forecastAheadDistribution(nasdaq[1600:1604], steps=5, resolution=100) +tmp.train(enrollments, None, order=(1,0,0)) +teste = tmp.forecastInterval(enrollments) -#tmp = quantreg.QuantileRegression("", dist=True) +#tmp = quantreg.QuantileRegression("", alpha=0.25, dist=True) #tmp.appendTransformation(diff) -#tmp.train(nasdaq[:1600], None, order=1) +#tmp.train(sunspots[:150], None, order=1) +#teste = tmp.forecastAheadInterval(sunspots[150:155], 5) #teste = tmp.forecastAheadDistribution(nasdaq[1600:1604], steps=5, resolution=50) -print(nasdaq[1600:1605]) -print(teste) +bchmk.plot_compared_series(enrollments,[tmp], ['blue','red'], points=False, intervals=True) + +#print(sunspots[150:155]) +#print(teste) #kk = Measures.get_interval_statistics(nasdaq[1600:1605], tmp) @@ -109,10 +118,10 @@ bchmk.interval_sliding_window(best, 5000, train=0.8, inc=0.8,#models=[yu.Weighte "_interval_analytic.csv", nodes=['192.168.0.103', '192.168.0.106', '192.168.0.108', '192.168.0.109']) #, depends=[hofts, ifts]) -bchmk.interval_sliding_window(best, 5000, train=0.8, inc=0.8, #models=[yu.WeightedFTS], # # +bchmk.interval_sliding_window(sp500, 2000, train=0.8, inc=0.2, #models=[yu.WeightedFTS], # # partitioners=[Grid.GridPartitioner], #Entropy.EntropyPartitioner], # FCM.FCMPartitioner, ], partitions= np.arange(3,20,step=2), transformation=diff, - dump=True, save=True, file="experiments/best_interval_analytic_diff.csv", + dump=True, save=True, file="experiments/sp500_analytic_diff.csv", nodes=['192.168.0.103', '192.168.0.106', '192.168.0.108', '192.168.0.109']) #, depends=[hofts, ifts]) #"""