Bugfixes and improvements in benchmarks, arima and quantreg

This commit is contained in:
Petrônio Cândido 2019-05-30 00:03:01 -03:00
parent 46b719464f
commit 9b2f286bab
8 changed files with 228 additions and 83 deletions

View File

@ -60,27 +60,48 @@ class ARIMA(fts.FTS):
print(ex)
self.model_fit = None
def forecast(self, ndata, **kwargs):
def inference(self, steps):
t_z = self.model.transform_z()
mu, Y = self.model._model(self.model.latent_variables.get_z_values())
date_index = self.model.shift_dates(steps)
sim_vector = self.model._sim_prediction(mu, Y, steps, t_z, 1000)
return ret
return sim_vector
def forecast(self, ndata, **kwargs):
raise NotImplementedError()
def forecast_interval(self, data, **kwargs):
return ret
raise NotImplementedError()
def forecast_ahead_interval(self, ndata, steps, **kwargs):
sim_vector = self.inference(steps)
if 'alpha' in kwargs:
alpha = kwargs.get('alpha')
else:
alpha = self.alpha
ret = []
for ct, sample in enumerate(sim_vector):
i = [np.percentile(sample, qt * 100) for qt in [alpha, 1 - alpha]]
ret.append(i)
return ret
def forecast_distribution(self, data, **kwargs):
return ret
raise NotImplementedError()
def forecast_ahead_distribution(self, data, steps, **kwargs):
sim_vector = self.inference(steps)
ret = []
for ct, sample in enumerate(sim_vector):
pd = ProbabilityDistribution.ProbabilityDistribution(type='histogram', data=sample, nbins=500)
ret.append(pd)
return ret
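A standalone sketch (not the pyFTS API) of the reduction that forecast_ahead_interval performs above, assuming inference() yields a matrix with one row of simulated draws per forecast step; names and data here are illustrative:
import numpy as np

def intervals_from_simulations(sim_vector, alpha=0.05):
    # np.percentile expects q in [0, 100], hence the * 100
    ret = []
    for sample in sim_vector:
        lo, hi = np.percentile(sample, [alpha * 100, (1 - alpha) * 100])
        ret.append([lo, hi])
    return ret

rng = np.random.default_rng(0)
sims = rng.normal(loc=10, scale=2, size=(5, 1000))  # 5 steps, 1000 draws each
print(intervals_from_simulations(sims, alpha=0.05))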

View File

@ -225,12 +225,12 @@ def pinball_mean(tau, targets, forecasts):
def winkler_score(tau, target, forecast):
'''R. L. Winkler, A Decision-Theoretic Approach to Interval Estimation, J. Am. Stat. Assoc. 67 (337) (1972) 187–191. doi:10.2307/2284720. '''
delta = forecast[1] - forecast[0]
if forecast[0] < target and target < forecast[1]:
if forecast[0] <= target <= forecast[1]:
return delta
elif forecast[0] > target:
return delta + 2 * (forecast[0] - target) / tau
return delta + (2 * (forecast[0] - target)) / tau
elif forecast[1] < target:
return delta + 2 * (target - forecast[1]) / tau
return delta + (2 * (target - forecast[1])) / tau
def winkler_mean(tau, targets, forecasts):
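A worked standalone sketch of the Winkler (1972) interval score computed above: covered targets cost only the interval width, while misses add a penalty proportional to the distance and inversely proportional to tau (values are illustrative):
def winkler(tau, target, interval):
    lower, upper = interval
    delta = upper - lower                       # interval width (sharpness)
    if lower <= target <= upper:
        return delta                            # covered: width only
    if target < lower:
        return delta + (2 * (lower - target)) / tau
    return delta + (2 * (target - upper)) / tau

print(winkler(0.05, 11.0, [10.0, 12.0]))  # covered: 2.0
print(winkler(0.05, 9.0, [10.0, 12.0]))   # miss below: 2 + 2*1/0.05 = 42.0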
@ -248,44 +248,52 @@ def winkler_mean(tau, targets, forecasts):
def brier_score(targets, densities):
'''Brier (1950). "Verification of Forecasts Expressed in Terms of Probability". Monthly Weather Review. 78: 1–3. '''
'''
Brier Score for probabilistic forecasts.
Brier (1950). "Verification of Forecasts Expressed in Terms of Probability". Monthly Weather Review. 78: 1–3.
:param targets: a list with the target values
:param densities: a list with pyFTS.probabilistic.ProbabilityDistribution objects
:return: float
'''
if not isinstance(densities, list):
densities = [densities]
targets = [targets]
ret = []
for ct, d in enumerate(densities):
try:
v = d.bin_index.find_ge(targets[ct])
v = d.bin_index.find_le(targets[ct])
score = sum([d.distribution[k] ** 2 for k in d.bins if k != v])
score += (d.distribution[v] - 1) ** 2
score = sum([d.density(k) ** 2 for k in d.bins if k != v])
score += (d.density(v) - 1) ** 2
ret.append(score)
except ValueError as ex:
ret.append(sum([d.distribution[k] ** 2 for k in d.bins]))
ret.append(sum([d.density(k) ** 2 for k in d.bins]))
return sum(ret) / len(ret)
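A worked standalone sketch of the discrete Brier score computed above, assuming a histogram density represented as a {bin: probability} dict and the target's bin already located:
def brier(pmf, target_bin):
    score = sum(p ** 2 for b, p in pmf.items() if b != target_bin)
    return score + (pmf[target_bin] - 1) ** 2

pmf = {0: 0.1, 1: 0.7, 2: 0.2}
print(brier(pmf, 1))  # 0.1**2 + 0.2**2 + (0.7 - 1)**2 = 0.14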
def pmf_to_cdf(density):
ret = []
for row in density.index:
tmp = []
prev = 0
for col in density.columns:
prev += density[col][row] if not np.isnan(density[col][row]) else 0
tmp.append(prev)
ret.append(tmp)
df = pd.DataFrame(ret, columns=density.columns)
return df
def logarithm_score(targets, densities):
'''
Logarithm Score for probabilistic forecasts.
Good IJ (1952). Rational Decisions. Journal of the Royal Statistical Society B, 14(1), 107–114. URL https://www.jstor.org/stable/2984087.
:param targets: a list with the target values
:param densities: a list with pyFTS.probabilistic.ProbabilityDistribution objects
:return: float
'''
_ls = float(0.0)
if isinstance(densities, ProbabilityDistribution.ProbabilityDistribution):
densities = [densities]
targets = [targets]
def heavyside(bin, target):
return 1 if bin >= target else 0
n = len(densities)
for ct, df in enumerate(densities):
_ls += -np.log(df.density(targets[ct]))
def heavyside_cdf(bins, targets):
ret = []
for t in targets:
result = [1 if b >= t else 0 for b in bins]
ret.append(result)
df = pd.DataFrame(ret, columns=bins)
return df
return _ls / n
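A worked standalone sketch of the logarithm score above: the average negative log-density of each target under its forecasted distribution (plain dict pmfs stand in for ProbabilityDistribution objects):
import numpy as np

def log_score(targets, pmfs):
    return sum(-np.log(pmf[t]) for t, pmf in zip(targets, pmfs)) / len(pmfs)

pmfs = [{0: 0.1, 1: 0.7, 2: 0.2}, {0: 0.5, 1: 0.3, 2: 0.2}]
print(log_score([1, 0], pmfs))  # (-ln 0.7 - ln 0.5) / 2 ≈ 0.525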
def crps(targets, densities):
@ -299,13 +307,13 @@ def crps(targets, densities):
_crps = float(0.0)
if isinstance(densities, ProbabilityDistribution.ProbabilityDistribution):
densities = [densities]
targets = [targets]
l = len(densities[0].bins)
n = len(densities)
for ct, df in enumerate(densities):
_crps += sum([(df.cumulative(bin) - (1 if bin >= targets[ct] else 0)) ** 2 for bin in df.bins])
return _crps / float(l * n)
return _crps / n
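A worked standalone sketch of the discrete CRPS above for a single target: the squared distance between the forecasted CDF and the step-function (Heaviside) CDF of the observation, summed over the bins:
def crps_one(bins, cdf, target):
    return sum((cdf[i] - (1.0 if b >= target else 0.0)) ** 2
               for i, b in enumerate(bins))

bins = [0, 1, 2, 3]
cdf = [0.1, 0.5, 0.9, 1.0]     # cumulative probability per bin
print(crps_one(bins, cdf, 2))  # 0.01 + 0.25 + 0.01 + 0.0 = 0.27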
def get_point_statistics(data, model, **kwargs):
@ -416,6 +424,41 @@ def get_interval_statistics(data, model, **kwargs):
return ret
def get_interval_ahead_statistics(data, intervals, **kwargs):
"""
Consolidate all interval forecasting measures, grouped by forecast step (lag)
:param data: test data
:param intervals: a list of forecasted intervals, one per forecast step
:param kwargs:
:return: a dict keyed by lag; each entry holds the sharpness, coverage,
the .05/.25/.75/.95 pinball scores and the .05/.25 Winkler scores
"""
l = len(intervals)
if len(data) != l:
raise Exception("Data and intervals have different lenghts!")
lags = {}
for lag in range(l):
ret = {}
datum = data[lag]
interval = intervals[lag]
ret['sharpness'] = round(interval[1] - interval[0], 2)
ret['coverage'] = 1 if interval[0] <= datum <= interval[1] else 0
ret['pinball05'] = round(pinball(0.05, datum, interval[0]), 2)
ret['pinball25'] = round(pinball(0.25, datum, interval[0]), 2)
ret['pinball75'] = round(pinball(0.75, datum, interval[1]), 2)
ret['pinball95'] = round(pinball(0.95, datum, interval[1]), 2)
ret['winkler05'] = round(winkler_score(0.05, datum, interval), 2)
ret['winkler25'] = round(winkler_score(0.25, datum, interval), 2)
lags[lag] = ret
return lags
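A hypothetical call with made-up data, showing the shape of the result (a dict keyed by lag, each entry a dict of measures for that forecast step):
data = [10.2, 10.8, 11.1]
intervals = [[9.5, 10.9], [10.0, 11.5], [10.5, 12.0]]
stats = get_interval_ahead_statistics(data, intervals)
for lag, measures in stats.items():
    print(lag, measures['sharpness'], measures['coverage'])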
def get_distribution_statistics(data, model, **kwargs):
"""
Get CRPS statistic and time for a forecasting model
@ -452,3 +495,32 @@ def get_distribution_statistics(data, model, **kwargs):
ret.append(round(_e1 - _s1, 3))
ret.append(round(brier_score(data[start:-1:skip], forecasts), 3))
return ret
def get_distribution_ahead_statistics(data, distributions):
"""
Get per-horizon CRPS, Brier and logarithm scores for a set of forecasted distributions
:param data: test data
:param distributions: a list of forecasted ProbabilityDistribution objects, one per forecast step
:return: a dict keyed by lag; each entry holds the CRPS, Brier and logarithm scores
"""
l = len(distributions)
if len(data) != l:
raise Exception("Data and distributions have different lenghts!")
lags = {}
for lag in range(l):
ret = {}
datum = data[lag]
dist = distributions[lag]
ret['crps'] = round(crps(datum, dist), 3)
ret['brier'] = round(brier_score(datum, dist), 3)
ret['log'] = round(logarithm_score(datum, dist), 3)
lags[lag] = ret
return lags

View File

@ -13,7 +13,7 @@ class QuantileRegression(fts.FTS):
"""Façade for statsmodels.regression.quantile_regression"""
def __init__(self, **kwargs):
super(QuantileRegression, self).__init__(**kwargs)
self.name = "QR"
self.name = "QAR"
self.detail = "Quantile Regression"
self.is_high_order = True
self.has_point_forecasting = True
@ -105,7 +105,7 @@ class QuantileRegression(fts.FTS):
nmeans = self.forecast_ahead(ndata, steps, **kwargs)
for k in np.arange(0, self.order):
nmeans.insert(k,ndata[-(k+1)])
nmeans.insert(k,ndata[k])
for k in np.arange(self.order, steps+self.order):
intl = self.point_to_interval(nmeans[k - self.order: k], self.lower_qt, self.upper_qt)
@ -136,18 +136,29 @@ class QuantileRegression(fts.FTS):
return ret
def forecast_ahead_distribution(self, ndata, steps, **kwargs):
smoothing = kwargs.get("smoothing", 0.9)
l = len(ndata)
ret = []
nmeans = self.forecast_ahead(ndata, steps, **kwargs)
for k in np.arange(0, self.order):
nmeans.insert(k, ndata[k])
for k in np.arange(self.order, steps + self.order):
dist = ProbabilityDistribution.ProbabilityDistribution(type="histogram",
uod=[self.original_min, self.original_max])
intervals = [[k, k] for k in ndata[-self.order:]]
intervals = [[nmeans[self.order], nmeans[self.order]]]
for qt in self.dist_qt:
intl = self.interval_to_interval([intervals[x] for x in np.arange(k - self.order, k)], qt[0], qt[1])
intervals.append(intl)
intl1 = self.point_to_interval(nmeans[k - self.order: k], qt[0], qt[1])
intl2 = [intl1[0] * (1 + k * smoothing), intl1[1] * (1 + k * smoothing)]
intervals.append(intl2)
dist.append_interval(intervals)
ret.append(dist)
return ret
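A standalone sketch of the horizon widening introduced above: at step k each quantile interval is inflated by the factor (1 + k * smoothing), so the forecasted histogram spreads out as the horizon grows (values are illustrative and assume positive data):
def widen(interval, k, smoothing=0.9):
    factor = 1 + k * smoothing
    return [interval[0] * factor, interval[1] * factor]

print(widen([10.0, 12.0], k=1))  # [19.0, 22.8]
print(widen([10.0, 12.0], k=3))  # [37.0, 44.4]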

View File

@ -162,17 +162,23 @@ def plot_distribution(ax, cmap, probabilitydist, fig, time_from, reference_data=
cb.set_label('Density')
def plot_distribution2(probabilitydist, data, **kwargs): #ax, cmap, probabilitydist, time_from, data=None):
def plot_distribution2(probabilitydist, data, **kwargs):
'''
Plot distributions over time (x-axis)
:param probabilitydist:
:param data:
:param kwargs:
:return:
:param probabilitydist: the forecasted probability distributions to plot
:param data: the original test sample
:keyword start_at: the time index (inside data) at which to start plotting the probability distributions
:keyword ax: a matplotlib axis. If none is provided, a new axis is created
:keyword cmap: a matplotlib colormap name; the default value is 'Blues'
:keyword quantiles: the list of quantile intervals to plot, e.g. [.05, .25, .75, .95]
:keyword median: a boolean value indicating whether the median value will be plotted
'''
import matplotlib.colorbar as cbar
import matplotlib.cm as cm
order = kwargs.get('order', 1)
ax = kwargs.get('ax',None)
if ax is None:
fig, ax = plt.subplots(nrows=1, ncols=1, figsize=[15, 5])
@ -182,7 +188,7 @@ def plot_distribution2(probabilitydist, data, **kwargs): #ax, cmap, probabilityd
cmap = kwargs.get('cmap','Blues')
cmap = plt.get_cmap(cmap)
start_at = kwargs.get('start_at',0)
start_at = kwargs.get('start_at',0) + order - 1
x = [k + start_at for k in range(l + 1)]
@ -207,12 +213,13 @@ def plot_distribution2(probabilitydist, data, **kwargs): #ax, cmap, probabilityd
ax.fill_between(x, [k[0] for k in y], [k[1] for k in y],
facecolor=scalarMap.to_rgba(ct / lq))
y = [data[start_at]]
for pd in probabilitydist:
qts = pd.quantile(.5)
y.append(qts[0])
if kwargs.get('median',True):
y = [data[start_at]]
for pd in probabilitydist:
qts = pd.quantile(.5)
y.append(qts[0])
ax.plot(x, y, color='red')
ax.plot(x, y, color='red', label='Median')
cax, _ = cbar.make_axes(ax)
cb = cbar.ColorbarBase(cax, cmap=cmap, norm=normal)
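A hypothetical usage of the keywords documented above, mirroring the commented-out call in the test script below; `distributions` would come from model.predict(..., type='distribution') and `test` is the test sample:
import matplotlib.pyplot as plt
from pyFTS.common import Util

fig, ax = plt.subplots(nrows=1, ncols=1, figsize=[15, 5])
ax.plot(test[:30], label='Original', color='black')
Util.plot_distribution2(distributions, test[:30], start_at=20, order=3,
                        ax=ax, cmap='Blues', median=True)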

View File

@ -232,7 +232,7 @@ class FTS(object):
ret = []
for k in np.arange(0, steps):
tmp = self.forecast(data[-self.max_lag:], **kwargs)
tmp = self.forecast(data[k:self.max_lag+k], **kwargs)
if isinstance(tmp,(list, np.ndarray)):
tmp = tmp[-1]
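A standalone sketch of the sliding-window recursion fixed above, assuming (as the surrounding method does) that each forecast is appended back onto the data, so that data[k : max_lag + k] is always the most recent max_lag values:
def forecast_ahead(forecast_one, data, steps, max_lag):
    data = list(data[-max_lag:])
    ret = []
    for k in range(steps):
        tmp = forecast_one(data[k:max_lag + k])  # window slides with k
        ret.append(tmp)
        data.append(tmp)
    return ret

# toy "model": the mean of the current window
print(forecast_ahead(lambda w: sum(w) / len(w), [1.0, 2.0, 3.0], 3, 3))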

View File

@ -16,10 +16,13 @@ import scipy.stats as st
from itertools import product
def sampler(data, quantiles):
def sampler(data, quantiles, bounds=False):
ret = []
for qt in quantiles:
ret.append(np.nanpercentile(data, q=qt * 100))
if bounds:
ret.insert(0, min(data))
ret.append(max(data))
return ret
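A hypothetical call showing the effect of the new bounds flag, which brackets the requested quantiles with the sample minimum and maximum:
print(sampler([1, 2, 3, 4, 5], [0.25, 0.5, 0.75], bounds=True))
# [1, 2.0, 3.0, 4.0, 5]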
@ -190,25 +193,25 @@ class EnsembleFTS(fts.FTS):
ret = []
samples = [[k] for k in data[-self.order:]]
start = kwargs.get('start', self.order)
uod = self.get_UoD()
sample = [[k] for k in data[start - self.order: start]]
for k in np.arange(self.order, steps + self.order):
forecasts = []
lags = {}
for i in np.arange(0, self.order): lags[i] = samples[k - self.order + i]
# Build the tree with all possible paths
root = tree.FLRGTreeNode(None)
tree.build_tree_without_order(root, lags, 0)
for p in root.paths():
path = list(reversed(list(filter(None.__ne__, p))))
lags = []
for i in np.arange(0, self.order):
lags.append(sample[i - self.order])
# Trace the possible paths
for path in product(*lags):
forecasts.extend(self.get_models_forecasts(path))
samples.append(sampler(forecasts, np.arange(0.1, 1, 0.2)))
sample.append(sampler(forecasts, np.arange(.1, 1, 0.1), bounds=True))
interval = self.get_interval(forecasts)
if len(interval) == 1:
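A standalone sketch of the path enumeration that replaced the FLRG tree above: with one candidate list per lag, itertools.product yields every possible combination of lagged values (data here is illustrative):
from itertools import product

lags = [[10, 11], [20], [30, 31]]  # candidates for lags t-3, t-2, t-1
for path in product(*lags):
    print(path)
# (10, 20, 30), (10, 20, 31), (11, 20, 30), (11, 20, 31)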
@ -248,7 +251,7 @@ class EnsembleFTS(fts.FTS):
if 'method' in kwargs:
self.point_method = kwargs.get('method','mean')
smooth = kwargs.get("smooth", "KDE")
smooth = kwargs.get("smooth", "histogram")
alpha = kwargs.get("alpha", None)
ret = []
@ -270,7 +273,7 @@ class EnsembleFTS(fts.FTS):
for path in product(*lags):
forecasts.extend(self.get_models_forecasts(path))
sample.append(forecasts)
sample.append(sampler(forecasts, np.arange(.1, 1, 0.1), bounds=True))
if alpha is None:
forecasts = np.ravel(forecasts).tolist()

View File

@ -17,6 +17,8 @@ class ProbabilityDistribution(object):
self.data = []
data = kwargs.get("data", None)
self.type = type
"""
If type is histogram, the PDF is discrete
@ -28,12 +30,10 @@ class ProbabilityDistribution(object):
self.labels = kwargs.get("bins_labels", None)
"""Bins labels on a discrete PDF"""
data = kwargs.get("data", None)
if self.type == "KDE":
self.kde = kde.KernelSmoothing(h=kwargs.get("h", 0.5), kernel=kwargs.get("kernel", "epanechnikov"))
if self.data is not None:
if data is not None and self.uod is None:
_min = np.nanmin(data)
_min = _min * .7 if _min > 0 else _min * 1.3
_max = np.nanmax(data)

View File

@ -5,6 +5,8 @@ import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from pyFTS.common import Util
from pyFTS.partitioners import Grid
from pyFTS.common import Transformations
from pyFTS.models import chen, hofts
@ -17,27 +19,56 @@ from pyFTS.models import hofts
from pyFTS.data import TAIEX
data = TAIEX.get_data()
train = data[:800]
test = data[800:1000]
'''
model = ensemble.EnsembleFTS()
for k in [15, 25, 35]:
for order in [1, 2, 3]:
fs = Grid.GridPartitioner(data=data, npart=k)
fs = Grid.GridPartitioner(data=train, npart=k)
tmp = hofts.WeightedHighOrderFTS(partitioner=fs, order=order)
tmp.fit(data)
tmp.fit(train)
model.append_model(tmp)
'''
#fig, ax = plt.subplots(nrows=1, ncols=1, figsize=[15, 5])
#ax.plot(data[:28], label='Original', color='black')
from pyFTS.benchmarks import arima, quantreg, BSTS
#model = arima.ARIMA(order=(2,0,0))
#model = quantreg.QuantileRegression(order=1, dist=True)
model = BSTS.ARIMA(order=(2,0,0))
model.fit(train)
horizon = 5
intervals = model.predict(test[:10], type='interval', alpha=.25, steps_ahead=horizon)
distributions = model.predict(test[:10], type='distribution', smooth='histogram', steps_ahead=horizon, num_bins=100)
#Util.plot_distribution2(forecasts, data[:28], start_at=20, order=3, ax=ax, cmap="Blues")
print('end')
from pyFTS.benchmarks import Measures
start = model.order+1
end = start + horizon
print(Measures.get_interval_ahead_statistics(test[start:end], intervals))
print(Measures.get_distribution_ahead_statistics(test[start:end], distributions))
forecasts = model.predict(data, type='distribution', smooth='histogram', steps_ahead=5)
from pyFTS.benchmarks import benchmarks as bchmk
#f, ax = plt.subplots(1, 1, figsize=[20, 5])
#ax.plot(data)
#bchmk.plot_interval(ax, forecasts, 3, "")
print(forecasts)
#print(forecasts)
'''
mu_local = 5