From 60785ac89f50abbe258b258c3d1f0ee46e08a94b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Petr=C3=B4nio=20C=C3=A2ndido?= Date: Tue, 28 May 2019 14:32:06 -0300 Subject: [PATCH] Bugfixes in ProbabilityDistribution --- pyFTS/benchmarks/BSTS.py | 86 +++++++++++++++++++ pyFTS/models/ensemble/ensemble.py | 1 - .../probabilistic/ProbabilityDistribution.py | 53 +++++------- pyFTS/probabilistic/kde.py | 22 +++-- pyFTS/tests/general.py | 21 +---- 5 files changed, 126 insertions(+), 57 deletions(-) create mode 100644 pyFTS/benchmarks/BSTS.py diff --git a/pyFTS/benchmarks/BSTS.py b/pyFTS/benchmarks/BSTS.py new file mode 100644 index 0000000..acf1d9d --- /dev/null +++ b/pyFTS/benchmarks/BSTS.py @@ -0,0 +1,86 @@ +#!/usr/bin/python +# -*- coding: utf8 -*- + +import numpy as np +import pandas as pd +import pyflux as pf +import scipy.stats as st +from pyFTS.common import SortedCollection, fts +from pyFTS.probabilistic import ProbabilityDistribution + + +class ARIMA(fts.FTS): + """ + Façade for statsmodels.tsa.arima_model + """ + def __init__(self, **kwargs): + super(ARIMA, self).__init__(**kwargs) + self.name = "BSTS" + self.detail = "Bayesian Structural Time Series" + self.is_high_order = True + self.has_point_forecasting = True + self.has_interval_forecasting = True + self.has_probability_forecasting = True + self.model = None + self.model_fit = None + self.trained_data = None + self.p = 1 + self.d = 0 + self.q = 0 + self.benchmark_only = True + self.min_order = 1 + self.alpha = kwargs.get("alpha", 0.05) + self.order = kwargs.get("order", (1,0,0)) + self._decompose_order(self.order) + self.model = None + + def _decompose_order(self, order): + if isinstance(order, (tuple, set, list)): + self.p = order[0] + self.d = order[1] + self.q = order[2] + self.order = self.p + self.q + (self.q - 1 if self.q > 0 else 0) + self.max_lag = self.order + self.d = len(self.transformations) + self.shortname = "ARIMA(" + str(self.p) + "," + str(self.d) + "," + str(self.q) + ") - " + str(self.alpha) + + def train(self, data, **kwargs): + + if 'order' in kwargs: + order = kwargs.pop('order') + self._decompose_order(order) + + if self.indexer is not None: + data = self.indexer.get_data(data) + + try: + self.model = pf.ARIMA(data=data, ar=self.p, ma=self.q, integ=self.d, family=pf.Normal()) + self.model_fit = self.model.fit('M-H', nsims=20000) + except Exception as ex: + print(ex) + self.model_fit = None + + def forecast(self, ndata, **kwargs): + + return ret + + def forecast_interval(self, data, **kwargs): + + + + return ret + + def forecast_ahead_interval(self, ndata, steps, **kwargs): + + return ret + + def forecast_distribution(self, data, **kwargs): + + + return ret + + + def forecast_ahead_distribution(self, data, steps, **kwargs): + + + return ret \ No newline at end of file diff --git a/pyFTS/models/ensemble/ensemble.py b/pyFTS/models/ensemble/ensemble.py index ea72920..c3188a2 100644 --- a/pyFTS/models/ensemble/ensemble.py +++ b/pyFTS/models/ensemble/ensemble.py @@ -239,7 +239,6 @@ class EnsembleFTS(fts.FTS): return ret - def forecast_ahead_distribution(self, data, steps, **kwargs): if 'method' in kwargs: self.point_method = kwargs.get('method','mean') diff --git a/pyFTS/probabilistic/ProbabilityDistribution.py b/pyFTS/probabilistic/ProbabilityDistribution.py index b8a249d..9a855cd 100644 --- a/pyFTS/probabilistic/ProbabilityDistribution.py +++ b/pyFTS/probabilistic/ProbabilityDistribution.py @@ -1,8 +1,7 @@ import numpy as np import pandas as pd import matplotlib.pyplot as plt -from pyFTS.common import FuzzySet -from scipy.spatial import KDTree +from pyFTS.common import FuzzySet,SortedCollection,tree from pyFTS.probabilistic import kde @@ -32,7 +31,9 @@ class ProbabilityDistribution(object): data = kwargs.get("data", None) if self.type == "KDE": - self.kde = kde.KernelSmoothing(kwargs.get("h", 0.5), kwargs.get("kernel", "epanechnikov")) + self.kde = kde.KernelSmoothing(h=kwargs.get("h", 0.5), kernel=kwargs.get("kernel", "epanechnikov")) + + if self.data is not None: _min = np.nanmin(data) _min = _min * .7 if _min > 0 else _min * 1.3 _max = np.nanmax(data) @@ -42,13 +43,13 @@ class ProbabilityDistribution(object): self.nbins = kwargs.get("num_bins", 100) if self.bins is None: - self.bins = np.linspace(self.uod[0], self.uod[1], int(self.nbins)).tolist() + self.bins = np.linspace(int(self.uod[0]), int(self.uod[1]), int(self.nbins)).tolist() self.labels = [str(k) for k in self.bins] if self.uod is not None: self.resolution = (self.uod[1] - self.uod[0]) / self.nbins - self.bin_index = KDTree(self.bins) + self.bin_index = SortedCollection.SortedCollection(iterable=sorted(self.bins)) self.quantile_index = None self.distribution = {} self.cdf = None @@ -68,8 +69,7 @@ class ProbabilityDistribution(object): :param value: A value in the universe of discourse from the distribution :param density: The probability density to assign to the value """ - _, ix = self.bin_index.query([value], 1) - k = self.labels[ix] + k = self.bin_index.find_ge(value) self.distribution[k] = density def append(self, values): @@ -80,8 +80,7 @@ class ProbabilityDistribution(object): """ if self.type == "histogram": for k in values: - _, ix = self.bin_index.query([k], 1) - v = self.labels[ix] + v = self.bin_index.find_ge(k) self.distribution[v] += 1 self.count += 1 else: @@ -99,12 +98,8 @@ class ProbabilityDistribution(object): """ if self.type == "histogram": for interval in intervals: - _, ix1 = self.bin_index.query([interval[0]], 1) - _, ix2 = self.bin_index.query([interval[1]], 1) - - for k in range(ix1, ix2): - v = self.labels[k] - self.distribution[v] += 1 + for k in self.bin_index.inside(interval[0], interval[1]): + self.distribution[k] += 1 self.count += 1 def density(self, values): @@ -123,15 +118,13 @@ class ProbabilityDistribution(object): for k in values: if self.type == "histogram": - _, ix = self.bin_index.query([k], 1) - v = self.labels[ix] + v = self.bin_index.find_ge(k) ret.append(self.distribution[v] / (self.count + 1e-5)) elif self.type == "KDE": - v = self.kde.probability(k, self.data) + v = self.kde.probability(k, data=self.data) ret.append(v) else: - _, ix = self.bin_index.query([k], 1) - v = self.labels[ix] + v = self.bin_index.find_ge(k) ret.append(self.distribution[v]) if scalar: @@ -158,7 +151,7 @@ class ProbabilityDistribution(object): self.distribution = dist self.labels = [str(k) for k in self.bins] - self.bin_index = KDTree(self.bins) + self.bin_index = SortedCollection.SortedCollection(iterable=sorted(self.bins)) self.quantile_index = None self.cdf = None self.qtl = None @@ -187,11 +180,11 @@ class ProbabilityDistribution(object): _keys = [float(k) for k in sorted(self.qtl.keys())] - self.quantile_index = KDTree(_keys) + self.quantile_index = SortedCollection.SortedCollection(iterable=_keys) def cumulative(self, values): """ - Return the cumulative probability densities for the input values, + Return the cumulative probability densities for the input values, such that F(x) = P(X <= x) :param values: A list of input values @@ -203,17 +196,15 @@ class ProbabilityDistribution(object): if isinstance(values, list): ret = [] for val in values: - _, ix = self.bin_index.query([val], 1) - k = self.labels[ix] + k = self.bin_index.find_ge(val) ret.append(self.cdf[k]) else: - _, ix = self.bin_index.query([values], 1) - k = self.labels[ix] + k = self.bin_index.find_ge(values) return self.cdf[values] def quantile(self, values): """ - Return the Universe of Discourse values in relation to the quantile input values, + Return the Universe of Discourse values in relation to the quantile input values, such that Q(tau) = min( {x | F(x) >= tau }) :param values: input values @@ -225,12 +216,10 @@ class ProbabilityDistribution(object): if isinstance(values, list): ret = [] for val in values: - _, ix = self.quantile_index.query([val], 1) - k = self.labels[ix] + k = self.quantile_index.find_ge(val) ret.append(self.qtl[str(k)][0]) else: - _, ix = self.quantile_index.query([values], 1) - k = self.labels[ix] + k = self.quantile_index.find_ge(values) ret = self.qtl[str(k)] return ret diff --git a/pyFTS/probabilistic/kde.py b/pyFTS/probabilistic/kde.py index b5c668f..a77b9a2 100644 --- a/pyFTS/probabilistic/kde.py +++ b/pyFTS/probabilistic/kde.py @@ -10,13 +10,17 @@ import numpy as np class KernelSmoothing(object): """Kernel Density Estimation""" - def __init__(self,h, kernel="epanechnikov"): - self.h = h + def __init__(self, **kwargs): + self.h = kwargs.get('h',.5) """Width parameter""" - self.kernel = kernel + self.kernel = kwargs.get("kernel", "epanechnikov") """Kernel function""" + self.data = kwargs.get("data",None) self.transf = Transformations.Scale(min=0,max=1) + if self.data is not None: + self.data = self.transf.apply(self.data) + def kernel_function(self, u): """ Apply the kernel @@ -45,7 +49,7 @@ class KernelSmoothing(object): elif self.kernel == "exponential": return 0.5 * np.exp(-np.abs(u)) - def probability(self, x, data): + def probability(self, x, **kwargs): """ Probability of the point x on data @@ -53,10 +57,14 @@ class KernelSmoothing(object): :param data: :return: """ - l = len(data) - ndata = self.transf.apply(data) + if self.data is None: + self.data = kwargs.get('data',None) + self.data = self.transf.apply(self.data) + + l = len(self.data) + nx = self.transf.apply(x) - p = sum([self.kernel_function((nx - k)/self.h) for k in ndata]) / l*self.h + p = sum([self.kernel_function((nx - k)/self.h) for k in self.data]) / l*self.h return p \ No newline at end of file diff --git a/pyFTS/tests/general.py b/pyFTS/tests/general.py index f50df95..c6c197a 100644 --- a/pyFTS/tests/general.py +++ b/pyFTS/tests/general.py @@ -21,25 +21,12 @@ from pyFTS.data import Enrollments, TAIEX x = [k for k in np.arange(-2*np.pi, 2*np.pi, 0.15)] y = [np.sin(k) for k in x] -from pyFTS.models import hofts -from pyFTS.partitioners import Grid, FCM, CMeans, Entropy -from pyFTS.benchmarks import Measures +from pyFTS.probabilistic import ProbabilityDistribution -metodos = [FCM.FCMPartitioner] - -k = 35 - -rows = [] - -for contador, metodo in enumerate(metodos): - print(metodo) - part = metodo(data=y, npart=k) - model = hofts.HighOrderFTS(order=2, partitioner=part) - model.fit(y) - forecasts = model.predict(y) - for o in range(model.order): - forecasts.insert(0, None) +pd = ProbabilityDistribution.ProbabilityDistribution(type='histogram', data=y, num_bins=30) +print(pd.quantile([.5])) +print(pd.cdf) ''' model = fts.FCM_FTS(partitioner=fs, order=1)