From 9b2f286babfd5db7c6b683ad8fc450d39b5f758c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Petr=C3=B4nio=20C=C3=A2ndido?= Date: Thu, 30 May 2019 00:03:01 -0300 Subject: [PATCH] Bugfixes and improvements in benchmarks, arima and quantreg --- pyFTS/benchmarks/BSTS.py | 41 ++++-- pyFTS/benchmarks/Measures.py | 134 ++++++++++++++---- pyFTS/benchmarks/quantreg.py | 23 ++- pyFTS/common/Util.py | 29 ++-- pyFTS/common/fts.py | 2 +- pyFTS/models/ensemble/ensemble.py | 33 +++-- .../probabilistic/ProbabilityDistribution.py | 6 +- pyFTS/tests/ensemble.py | 43 +++++- 8 files changed, 228 insertions(+), 83 deletions(-) diff --git a/pyFTS/benchmarks/BSTS.py b/pyFTS/benchmarks/BSTS.py index acf1d9d..f0393ee 100644 --- a/pyFTS/benchmarks/BSTS.py +++ b/pyFTS/benchmarks/BSTS.py @@ -60,27 +60,48 @@ class ARIMA(fts.FTS): print(ex) self.model_fit = None - def forecast(self, ndata, **kwargs): + def inference(self, steps): + t_z = self.model.transform_z() + mu, Y = self.model._model(self.model.latent_variables.get_z_values()) + date_index = self.model.shift_dates(steps) + sim_vector = self.model._sim_prediction(mu, Y, steps, t_z, 1000) - return ret + return sim_vector + + def forecast(self, ndata, **kwargs): + raise NotImplementedError() def forecast_interval(self, data, **kwargs): - - - - return ret + raise NotImplementedError() def forecast_ahead_interval(self, ndata, steps, **kwargs): + sim_vector = self.inference(steps) + + if 'alpha' in kwargs: + alpha = kwargs.get('alpha') + else: + alpha = self.alpha + + ret = [] + + for ct, sample in enumerate(sim_vector): + i = [np.percentile(sample, qt) for qt in [alpha, 1-alpha]] + ret.append(i) return ret def forecast_distribution(self, data, **kwargs): - - - return ret + raise NotImplementedError() def forecast_ahead_distribution(self, data, steps, **kwargs): + sim_vector = self.inference(steps) - return ret \ No newline at end of file + ret = [] + + for ct, sample in enumerate(sim_vector): + pd = ProbabilityDistribution.ProbabilityDistribution(type='histogram', data=sample, nbins=500) + ret.append(pd) + + return ret diff --git a/pyFTS/benchmarks/Measures.py b/pyFTS/benchmarks/Measures.py index 0db7a47..2b925cf 100644 --- a/pyFTS/benchmarks/Measures.py +++ b/pyFTS/benchmarks/Measures.py @@ -225,12 +225,12 @@ def pinball_mean(tau, targets, forecasts): def winkler_score(tau, target, forecast): '''R. L. Winkler, A Decision-Theoretic Approach to Interval Estimation, J. Am. Stat. Assoc. 67 (337) (1972) 187–191. doi:10.2307/2284720. ''' delta = forecast[1] - forecast[0] - if forecast[0] < target and target < forecast[1]: + if forecast[0] <= target <= forecast[1]: return delta elif forecast[0] > target: - return delta + 2 * (forecast[0] - target) / tau + return delta + (2 * (forecast[0] - target)) / tau elif forecast[1] < target: - return delta + 2 * (target - forecast[1]) / tau + return delta + (2 * (target - forecast[1])) / tau def winkler_mean(tau, targets, forecasts): @@ -248,44 +248,52 @@ def winkler_mean(tau, targets, forecasts): def brier_score(targets, densities): - '''Brier (1950). "Verification of Forecasts Expressed in Terms of Probability". Monthly Weather Review. 78: 1–3. ''' + ''' + Brier Score for probabilistic forecasts. + Brier (1950). "Verification of Forecasts Expressed in Terms of Probability". Monthly Weather Review. 78: 1–3. + + :param targets: a list with the target values + :param densities: a list with pyFTS.probabil objectsistic.ProbabilityDistribution + :return: float + ''' + + if not isinstance(densities, list): + densities = [densities] + targets = [targets] + ret = [] + for ct, d in enumerate(densities): try: - v = d.bin_index.find_ge(targets[ct]) + v = d.bin_index.find_le(targets[ct]) - score = sum([d.distribution[k] ** 2 for k in d.bins if k != v]) - score += (d.distribution[v] - 1) ** 2 + score = sum([d.density(k) ** 2 for k in d.bins if k != v]) + score += (d.density(v) - 1) ** 2 ret.append(score) except ValueError as ex: - ret.append(sum([d.distribution[k] ** 2 for k in d.bins])) + ret.append(sum([d.density(k) ** 2 for k in d.bins])) return sum(ret) / len(ret) -def pmf_to_cdf(density): - ret = [] - for row in density.index: - tmp = [] - prev = 0 - for col in density.columns: - prev += density[col][row] if not np.isnan(density[col][row]) else 0 - tmp.append(prev) - ret.append(tmp) - df = pd.DataFrame(ret, columns=density.columns) - return df +def logarithm_score(targets, densities): + ''' + Logarithm Score for probabilistic forecasts. + Good IJ (1952). “Rational Decisions.”Journal of the Royal Statistical Society B,14(1),107–114. URLhttps://www.jstor.org/stable/2984087. + :param targets: a list with the target values + :param densities: a list with pyFTS.probabil objectsistic.ProbabilityDistribution + :return: float + ''' + _ls = float(0.0) + if isinstance(densities, ProbabilityDistribution.ProbabilityDistribution): + densities = [densities] + targets = [targets] -def heavyside(bin, target): - return 1 if bin >= target else 0 + n = len(densities) + for ct, df in enumerate(densities): + _ls += -np.log(df.density(targets[ct])) - -def heavyside_cdf(bins, targets): - ret = [] - for t in targets: - result = [1 if b >= t else 0 for b in bins] - ret.append(result) - df = pd.DataFrame(ret, columns=bins) - return df + return _ls / n def crps(targets, densities): @@ -299,13 +307,13 @@ def crps(targets, densities): _crps = float(0.0) if isinstance(densities, ProbabilityDistribution.ProbabilityDistribution): densities = [densities] + targets = [targets] - l = len(densities[0].bins) n = len(densities) for ct, df in enumerate(densities): _crps += sum([(df.cumulative(bin) - (1 if bin >= targets[ct] else 0)) ** 2 for bin in df.bins]) - return _crps / float(l * n) + return _crps / n def get_point_statistics(data, model, **kwargs): @@ -416,6 +424,41 @@ def get_interval_statistics(data, model, **kwargs): return ret +def get_interval_ahead_statistics(data, intervals, **kwargs): + """ + Condensate all measures for point interval forecasters + + :param data: test data + :param model: FTS model with interval forecasting capability + :param kwargs: + :return: a list with the sharpness, resolution, coverage, .05 pinball mean, + .25 pinball mean, .75 pinball mean and .95 pinball mean. + """ + + l = len(intervals) + + if len(data) != l: + raise Exception("Data and intervals have different lenghts!") + + lags = {} + + for lag in range(l): + ret = {} + datum = data[lag] + interval = intervals[lag] + ret['sharpness'] = round(interval[1] - interval[0], 2) + ret['coverage'] = 1 if interval[0] <= datum <= interval[1] else 0 + ret['pinball05'] = round(pinball(0.05, datum, interval[0]), 2) + ret['pinball25'] = round(pinball(0.25, datum, interval[0]), 2) + ret['pinball75'] = round(pinball(0.75, datum, interval[1]), 2) + ret['pinball95'] = round(pinball(0.95, datum, interval[1]), 2) + ret['winkler05'] = round(winkler_score(0.05, datum, interval), 2) + ret['winkler25'] = round(winkler_score(0.25, datum, interval), 2) + lags[lag] = ret + + return lags + + def get_distribution_statistics(data, model, **kwargs): """ Get CRPS statistic and time for a forecasting model @@ -452,3 +495,32 @@ def get_distribution_statistics(data, model, **kwargs): ret.append(round(_e1 - _s1, 3)) ret.append(round(brier_score(data[start:-1:skip], forecasts), 3)) return ret + + +def get_distribution_ahead_statistics(data, distributions): + """ + Get CRPS statistic and time for a forecasting model + + :param data: test data + :param model: FTS model with probabilistic forecasting capability + :param kwargs: + :return: a list with the CRPS and execution time + """ + + l = len(distributions) + + if len(data) != l: + raise Exception("Data and distributions have different lenghts!") + + lags = {} + + for lag in range(l): + ret = {} + datum = data[lag] + dist = distributions[lag] + ret['crps'] = round(crps(datum, dist), 3) + ret['brier'] = round(brier_score(datum, dist), 3) + ret['log'] = round(logarithm_score(datum, dist), 3) + lags[lag] = ret + + return lags diff --git a/pyFTS/benchmarks/quantreg.py b/pyFTS/benchmarks/quantreg.py index f87e409..e1c792e 100644 --- a/pyFTS/benchmarks/quantreg.py +++ b/pyFTS/benchmarks/quantreg.py @@ -13,7 +13,7 @@ class QuantileRegression(fts.FTS): """Façade for statsmodels.regression.quantile_regression""" def __init__(self, **kwargs): super(QuantileRegression, self).__init__(**kwargs) - self.name = "QR" + self.name = "QAR" self.detail = "Quantile Regression" self.is_high_order = True self.has_point_forecasting = True @@ -105,7 +105,7 @@ class QuantileRegression(fts.FTS): nmeans = self.forecast_ahead(ndata, steps, **kwargs) for k in np.arange(0, self.order): - nmeans.insert(k,ndata[-(k+1)]) + nmeans.insert(k,ndata[k]) for k in np.arange(self.order, steps+self.order): intl = self.point_to_interval(nmeans[k - self.order: k], self.lower_qt, self.upper_qt) @@ -136,18 +136,29 @@ class QuantileRegression(fts.FTS): return ret def forecast_ahead_distribution(self, ndata, steps, **kwargs): + smoothing = kwargs.get("smoothing", 0.9) + + l = len(ndata) ret = [] + nmeans = self.forecast_ahead(ndata, steps, **kwargs) + + for k in np.arange(0, self.order): + nmeans.insert(k, ndata[k]) + for k in np.arange(self.order, steps + self.order): dist = ProbabilityDistribution.ProbabilityDistribution(type="histogram", uod=[self.original_min, self.original_max]) - intervals = [[k, k] for k in ndata[-self.order:]] + + intervals = [[nmeans[self.order], nmeans[self.order]]] for qt in self.dist_qt: - intl = self.interval_to_interval([intervals[x] for x in np.arange(k - self.order, k)], qt[0], qt[1]) - intervals.append(intl) + intl1 = self.point_to_interval(nmeans[k - self.order: k], qt[0], qt[1]) + intl2 = [intl1[0] * (1 + k * smoothing), intl1[1] * (1 + k * smoothing)] + intervals.append(intl2) dist.append_interval(intervals) ret.append(dist) - return ret \ No newline at end of file + return ret + diff --git a/pyFTS/common/Util.py b/pyFTS/common/Util.py index 7666ec2..8bf054f 100644 --- a/pyFTS/common/Util.py +++ b/pyFTS/common/Util.py @@ -162,17 +162,23 @@ def plot_distribution(ax, cmap, probabilitydist, fig, time_from, reference_data= cb.set_label('Density') -def plot_distribution2(probabilitydist, data, **kwargs): #ax, cmap, probabilitydist, time_from, data=None): +def plot_distribution2(probabilitydist, data, **kwargs): ''' Plot distributions over the time (x-axis) - :param probabilitydist: - :param data: - :param kwargs: - :return: + + :param probabilitydist: the forecasted probability distributions to plot + :param data: the original test sample + :keyword start_at: the time index (inside of data) to start to plot the probability distributions + :keyword ax: a matplotlib axis. If no value was provided a new axis is created. + :keyword cmap: a matplotlib colormap name, the default value is 'Blues' + :keyword quantiles: the list of quantiles intervals to plot, e. g. [.05, .25, .75, .95] + :keyword median: a boolean value indicating if the median value will be plot. ''' import matplotlib.colorbar as cbar import matplotlib.cm as cm + order = kwargs.get('order', 1) + ax = kwargs.get('ax',None) if ax is None: fig, ax = plt.subplots(nrows=1, ncols=1, figsize=[15, 5]) @@ -182,7 +188,7 @@ def plot_distribution2(probabilitydist, data, **kwargs): #ax, cmap, probabilityd cmap = kwargs.get('cmap','Blues') cmap = plt.get_cmap(cmap) - start_at = kwargs.get('start_at',0) + start_at = kwargs.get('start_at',0) + order - 1 x = [k + start_at for k in range(l + 1)] @@ -207,12 +213,13 @@ def plot_distribution2(probabilitydist, data, **kwargs): #ax, cmap, probabilityd ax.fill_between(x, [k[0] for k in y], [k[1] for k in y], facecolor=scalarMap.to_rgba(ct / lq)) - y = [data[start_at]] - for pd in probabilitydist: - qts = pd.quantile(.5) - y.append(qts[0]) + if kwargs.get('median',True): + y = [data[start_at]] + for pd in probabilitydist: + qts = pd.quantile(.5) + y.append(qts[0]) - ax.plot(x, y, color='red') + ax.plot(x, y, color='red', label='Median') cax, _ = cbar.make_axes(ax) cb = cbar.ColorbarBase(cax, cmap=cmap, norm=normal) diff --git a/pyFTS/common/fts.py b/pyFTS/common/fts.py index 6620f20..371a6f0 100644 --- a/pyFTS/common/fts.py +++ b/pyFTS/common/fts.py @@ -232,7 +232,7 @@ class FTS(object): ret = [] for k in np.arange(0, steps): - tmp = self.forecast(data[-self.max_lag:], **kwargs) + tmp = self.forecast(data[k:self.max_lag+k], **kwargs) if isinstance(tmp,(list, np.ndarray)): tmp = tmp[-1] diff --git a/pyFTS/models/ensemble/ensemble.py b/pyFTS/models/ensemble/ensemble.py index 1a5a704..c4d5d62 100644 --- a/pyFTS/models/ensemble/ensemble.py +++ b/pyFTS/models/ensemble/ensemble.py @@ -16,10 +16,13 @@ import scipy.stats as st from itertools import product -def sampler(data, quantiles): +def sampler(data, quantiles, bounds=False): ret = [] for qt in quantiles: ret.append(np.nanpercentile(data, q=qt * 100)) + if bounds: + ret.insert(0, min(data)) + ret.append(max(data)) return ret @@ -190,25 +193,25 @@ class EnsembleFTS(fts.FTS): ret = [] - samples = [[k] for k in data[-self.order:]] + start = kwargs.get('start', self.order) + + uod = self.get_UoD() + + sample = [[k] for k in data[start - self.order: start]] for k in np.arange(self.order, steps + self.order): forecasts = [] - lags = {} - for i in np.arange(0, self.order): lags[i] = samples[k - self.order + i] - # Build the tree with all possible paths - - root = tree.FLRGTreeNode(None) - - tree.build_tree_without_order(root, lags, 0) - - for p in root.paths(): - path = list(reversed(list(filter(None.__ne__, p)))) + lags = [] + for i in np.arange(0, self.order): + lags.append(sample[i - self.order]) + # Trace the possible paths + for path in product(*lags): forecasts.extend(self.get_models_forecasts(path)) - samples.append(sampler(forecasts, np.arange(0.1, 1, 0.2))) + sample.append(sampler(forecasts, np.arange(.1, 1, 0.1), bounds=True)) + interval = self.get_interval(forecasts) if len(interval) == 1: @@ -248,7 +251,7 @@ class EnsembleFTS(fts.FTS): if 'method' in kwargs: self.point_method = kwargs.get('method','mean') - smooth = kwargs.get("smooth", "KDE") + smooth = kwargs.get("smooth", "histogram") alpha = kwargs.get("alpha", None) ret = [] @@ -270,7 +273,7 @@ class EnsembleFTS(fts.FTS): for path in product(*lags): forecasts.extend(self.get_models_forecasts(path)) - sample.append(forecasts) + sample.append(sampler(forecasts, np.arange(.1, 1, 0.1), bounds=True)) if alpha is None: forecasts = np.ravel(forecasts).tolist() diff --git a/pyFTS/probabilistic/ProbabilityDistribution.py b/pyFTS/probabilistic/ProbabilityDistribution.py index 9a855cd..0c35b7e 100644 --- a/pyFTS/probabilistic/ProbabilityDistribution.py +++ b/pyFTS/probabilistic/ProbabilityDistribution.py @@ -17,6 +17,8 @@ class ProbabilityDistribution(object): self.data = [] + data = kwargs.get("data", None) + self.type = type """ If type is histogram, the PDF is discrete @@ -28,12 +30,10 @@ class ProbabilityDistribution(object): self.labels = kwargs.get("bins_labels", None) """Bins labels on a discrete PDF""" - data = kwargs.get("data", None) - if self.type == "KDE": self.kde = kde.KernelSmoothing(h=kwargs.get("h", 0.5), kernel=kwargs.get("kernel", "epanechnikov")) - if self.data is not None: + if data is not None and self.uod is None: _min = np.nanmin(data) _min = _min * .7 if _min > 0 else _min * 1.3 _max = np.nanmax(data) diff --git a/pyFTS/tests/ensemble.py b/pyFTS/tests/ensemble.py index 8eb0e24..f55de00 100644 --- a/pyFTS/tests/ensemble.py +++ b/pyFTS/tests/ensemble.py @@ -5,6 +5,8 @@ import os import numpy as np import pandas as pd +import matplotlib.pyplot as plt +from pyFTS.common import Util from pyFTS.partitioners import Grid from pyFTS.common import Transformations from pyFTS.models import chen, hofts @@ -17,27 +19,56 @@ from pyFTS.models import hofts from pyFTS.data import TAIEX data = TAIEX.get_data() - +train = data[:800] +test = data[800:1000] +''' model = ensemble.EnsembleFTS() for k in [15, 25, 35]: for order in [1, 2, 3]: - fs = Grid.GridPartitioner(data=data, npart=k) + fs = Grid.GridPartitioner(data=train, npart=k) tmp = hofts.WeightedHighOrderFTS(partitioner=fs, order=order) - tmp.fit(data) + tmp.fit(train) model.append_model(tmp) +''' +#fig, ax = plt.subplots(nrows=1, ncols=1, figsize=[15, 5]) + +#ax.plot(data[:28], label='Original', color='black') + +from pyFTS.benchmarks import arima, quantreg, BSTS + +#model = arima.ARIMA(order=(2,0,0)) +#model = quantreg.QuantileRegression(order=1, dist=True) +model = BSTS.ARIMA(order=(2,0,0)) +model.fit(train) + +horizon = 5 + +intervals = model.predict(test[:10], type='interval', alpha=.25, steps_ahead=horizon) + +distributions = model.predict(test[:10], type='distribution', smooth='histogram', steps_ahead=horizon, num_bins=100) + +#Util.plot_distribution2(forecasts, data[:28], start_at=20, order=3, ax=ax, cmap="Blues") + +print('end') + +from pyFTS.benchmarks import Measures + +start = model.order+1 +end = start + horizon + +print(Measures.get_interval_ahead_statistics(test[start:end], intervals)) +print(Measures.get_distribution_ahead_statistics(test[start:end], distributions)) -forecasts = model.predict(data, type='distribution', smooth='histogram', steps_ahead=5) -from pyFTS.benchmarks import benchmarks as bchmk #f, ax = plt.subplots(1, 1, figsize=[20, 5]) #ax.plot(data) #bchmk.plot_interval(ax, forecasts, 3, "") -print(forecasts) +#print(forecasts) ''' mu_local = 5