From cb12810e0a77bd6f785ffd83182172548d9ee9c2 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Petr=C3=B4nio=20C=C3=A2ndido=20de=20Lima=20e=20Silva?=
Date: Wed, 15 Feb 2017 18:16:13 -0200
Subject: [PATCH] Probability distributions

- Move the ad-hoc pdf(), pdf_fuzzysets() and entropy() helpers out of
  benchmarks/Measures.py into a new ProbabilityDistribution class
  (benchmarks/ProbabilityDistribution.py) with entropy and likelihood
  statistics and a plot() method.
- Add a distributions option to benchmarks.allPointForecasters, backed by
  getProbabilityDistributionStatistics() and plotProbabilityDistributions().
- Rename fts.FTS.generate_data() to inputoutputmapping(); it now returns
  only the simulations, delegating pdf estimation to the new class.
---
 benchmarks/Measures.py                | 41 ---------------
 benchmarks/ProbabilityDistribution.py | 73 +++++++++++++++++++++++++++
 benchmarks/benchmarks.py              | 46 +++++++++++++++--
 fts.py                                | 14 +----
 tests/pfts.py                         | 22 ++------
 5 files changed, 122 insertions(+), 74 deletions(-)
 create mode 100644 benchmarks/ProbabilityDistribution.py

diff --git a/benchmarks/Measures.py b/benchmarks/Measures.py
index a15d5c7..d0a7a6d 100644
--- a/benchmarks/Measures.py
+++ b/benchmarks/Measures.py
@@ -145,44 +145,3 @@ def crps(targets, densities):
         _crps += sum([ (Ff[col][k]-Fa[col][k])**2 for col in densities.columns])
 
     return _crps / float(l * n)
-
-
-def pdf(data, bins=100):
-    _mx = max(data)
-    _mn = min(data)
-    _pdf = {}
-    percentiles = np.linspace(_mn, _mx, bins).tolist()
-
-    print (percentiles)
-
-    index_percentiles = SortedCollection.SortedCollection(iterable=percentiles)
-
-    for k in percentiles: _pdf[k] = 0
-
-    for k in data:
-        v = index_percentiles.find_ge(k)
-        _pdf[v] += 1
-
-    norm = sum(list(_pdf.values()))
-    for k in _pdf: _pdf[k] /= norm
-
-    return _pdf
-
-
-def pdf_fuzzysets(data,sets):
-    _pdf = {}
-    for k in sets: _pdf[k.name] = 0
-    for k in data:
-        memberships = FuzzySet.fuzzyInstance(k, sets)
-        for c, fs in enumerate(sets, start=0):
-            _pdf[fs.name] += memberships[c]
-
-    norm = sum(list(_pdf.values()))
-    for k in _pdf: _pdf[k] /= norm
-
-    return _pdf
-
-
-def entropy(pdf):
-    h = -sum([pdf[k] * np.log(pdf[k]) for k in pdf])
-    return h
\ No newline at end of file
diff --git a/benchmarks/ProbabilityDistribution.py b/benchmarks/ProbabilityDistribution.py
new file mode 100644
index 0000000..ce7625f
--- /dev/null
+++ b/benchmarks/ProbabilityDistribution.py
@@ -0,0 +1,73 @@
+import numpy as np
+import pandas as pd
+import matplotlib.pyplot as plt
+from pyFTS.common import FuzzySet,SortedCollection
+
+
+class ProbabilityDistribution(object):
+    def __init__(self,name,nbins,uod,bins=None,labels=None, data=None):
+        self.name = name
+        self.nbins = nbins
+        self.uod = uod
+        if bins is None:
+            #range = (uod[1] - uod[0])/nbins
+            #self.bins = np.arange(uod[0],uod[1],range).tolist()
+            self.bins = np.linspace(uod[0], uod[1], nbins).tolist()
+            self.labels = [str(k) for k in self.bins]
+        else:
+            self.bins = bins
+            self.labels = labels
+
+        self.index = SortedCollection.SortedCollection(iterable=sorted(self.bins))
+        self.distribution = {}
+        self.count = 0
+        for k in self.bins: self.distribution[k] = 0
+
+        if data is not None: self.append(data)
+
+    def append(self, values):
+        for k in values:
+            v = self.index.find_ge(k)
+            self.distribution[v] += 1
+            self.count += 1
+
+    def density(self, values):
+        ret = []
+        for k in values:
+            v = self.index.find_ge(k)
+            ret.append(self.distribution[v] / self.count)
+        return ret
+
+    def entropy(self):
+        h = -sum([self.distribution[k] / self.count * np.log(self.distribution[k] / self.count)
+                  if self.distribution[k] > 0 else 0 for k in self.bins])
+        return h
+
+    def empiricalloglikelihood(self):
+        _s = 0
+        for k in self.bins:
+            if self.distribution[k] > 0:
+                _s += np.log(self.distribution[k])
+        return _s
+
+    def pseudologlikelihood(self, data):
+
+        densities = self.density(data)
+
+        _s = 0
+        for k in densities:
+            if k > 0:
+                _s += np.log(k)
+        return _s
+
+    def plot(self,axis=None,color="black",tam=[10, 6]):
+        if axis is None:
+            fig = plt.figure(figsize=tam)
+            axis = fig.add_subplot(111)
+
+        ys = [self.distribution[k]/self.count for k in self.bins]
+
+        axis.plot(self.bins, ys,c=color, label=self.name)
+
+        axis.set_xlabel('Universe of Discourse')
+        axis.set_ylabel('Probability')
diff --git a/benchmarks/benchmarks.py b/benchmarks/benchmarks.py
index 69c72bb..b056c21 100644
--- a/benchmarks/benchmarks.py
+++ b/benchmarks/benchmarks.py
@@ -9,7 +9,7 @@ import matplotlib.cm as cmx
 import matplotlib.pyplot as plt
 from mpl_toolkits.mplot3d import Axes3D
 # from sklearn.cross_validation import KFold
-from pyFTS.benchmarks import Measures, naive, ResidualAnalysis
+from pyFTS.benchmarks import Measures, naive, ResidualAnalysis, ProbabilityDistribution
 from pyFTS.partitioners import Grid
 from pyFTS.common import Membership, FuzzySet, FLR, Transformations, Util
 from pyFTS import fts, chen, yu, ismailefendi, sadaei, hofts, hwang, pfts, ifts
@@ -24,7 +24,8 @@ styles = ['-','--','-.',':','.']
 nsty = len(styles)
 
 def allPointForecasters(data_train, data_test, partitions, max_order=3, statistics=True, residuals=True,
-                        series=True, save=False, file=None, tam=[20, 5], models=None, transformation=None):
+                        series=True, save=False, file=None, tam=[20, 5], models=None, transformation=None,
+                        distributions=False):
 
     if models is None:
         models = [naive.Naive, chen.ConventionalFTS, yu.WeightedFTS, ismailefendi.ImprovedWeightedFTS,
@@ -58,7 +59,7 @@ def allPointForecasters(data_train, data_test, partitions, max_order=3, statisti
             mfts.appendTransformation(transformation)
             mfts.train(data_train, data_train_fs, order=order)
             objs.append(mfts)
-            lcolors.append(colors[count % ncol])
+            lcolors.append(colors[(count + order) % ncol])
 
     if statistics:
         print(getPointStatistics(data_test, objs))
@@ -71,6 +72,21 @@ def allPointForecasters(data_train, data_test, partitions, max_order=3, statisti
         plotComparedSeries(data_test, objs, lcolors, typeonlegend=False, save=save, file=file, tam=tam,
                            intervals=False)
 
+    if distributions:
+        lcolors.insert(0,'black')
+        pmfs = []
+        pmfs.append(
+            ProbabilityDistribution.ProbabilityDistribution("Original", 100, [min(data_train), max(data_train)], data=data_train) )
+
+        for m in objs:
+            forecasts = m.forecast(data_train)
+            pmfs.append(
+                ProbabilityDistribution.ProbabilityDistribution(m.shortname, 100, [min(data_train), max(data_train)],
+                                                                data=forecasts))
+        print(getProbabilityDistributionStatistics(pmfs,data_train))
+
+        plotProbabilityDistributions(pmfs, lcolors)
+
 
 def getPointStatistics(data, models, externalmodels = None, externalforecasts = None, indexers=None):
     ret = "Model & Order & RMSE & SMAPE & Theil's U \\\\ \n"
@@ -108,6 +124,17 @@ def getPointStatistics(data, models, externalmodels = None, externalforecasts =
     return ret
 
 
+def getProbabilityDistributionStatistics(pmfs, data):
+    ret = "Model & Entropy & Empirical Likelihood & Pseudo Likelihood \\\\ \n"
+    for k in pmfs:
+        ret += k.name + " & "
+        ret += str(k.entropy()) + " & "
+        ret += str(k.empiricalloglikelihood())+ " & "
+        ret += str(k.pseudologlikelihood(data))
+        ret += " \\\\ \n"
+    return ret
+
+
 def allIntervalForecasters(data_train, data_test, partitions, max_order=3,save=False, file=None, tam=[20, 5],
                            models=None, transformation=None):
     if models is None:
@@ -215,6 +242,18 @@ def plotComparedSeries(original, models, colors, typeonlegend=False, save=False,
 
     Util.showAndSaveImage(fig, file, save, lgd=legends)
 
+
+def plotProbabilityDistributions(pmfs,lcolors):
+    fig = plt.figure(figsize=[15, 7])
+    ax = fig.add_subplot(111)
+
+    for k,m in enumerate(pmfs,start=0):
+        m.plot(ax, color=lcolors[k])
+
+    handles0, labels0 = ax.get_legend_handles_labels()
+    ax.legend(handles0, labels0)
+
+
 def allAheadForecasters(data_train, data_test, partitions, start, steps, resolution = None, max_order=3,save=False, file=None, tam=[20, 5],
                         models=None, transformation=None, option=2):
     if models is None:
@@ -786,3 +825,4 @@ def pftsExploreOrderAndPartitions(data,save=False, file=None):
     plt.tight_layout()
 
     Util.showAndSaveImage(fig, file, save)
+
diff --git a/fts.py b/fts.py
index 15108e2..d315d9f 100644
--- a/fts.py
+++ b/fts.py
@@ -115,7 +115,7 @@ class FTS(object):
             for child in node.getChildren():
                 self.buildTreeWithoutOrder(child, lags, level + 1)
 
-    def generate_data(self,bins=100):
+    def inputoutputmapping(self,bins=100):
 
         dim_uod = tuple([bins for k in range(0,self.order)])
 
@@ -136,8 +136,6 @@ class FTS(object):
 
         for k in self.sets: pdf_fs[k.name] = 0
 
-        index_percentiles = SortedCollection.SortedCollection(iterable=percentiles)
-
         lags = {}
 
         for o in np.arange(0, self.order):
@@ -165,15 +163,7 @@ class FTS(object):
 
                 simulation_fs[index_fs] = forecast
 
-
-        pdf_fs = Measures.pdf_fuzzysets(np.ravel(simulation_fs),self.sets)
-
-        pdf_uod = Measures.pdf(np.ravel(simulation_fs), bins=bins)
-
-        #tmp_pdf_fs = pd.DataFrame( [[pdf_fs[k] for k in sorted(pdf_fs)]], columns=[k for k in sorted(pdf_fs)])
-        #tmp_pdf_uod = pd.DataFrame([[pdf_uod[k] for k in sorted(pdf_uod)]], columns=[k for k in sorted(pdf_uod)])
-
-        return [pdf_fs, pdf_uod, simulation_fs, simulation_uod ]
+        return [simulation_fs, simulation_uod ]
 
 
 
diff --git a/tests/pfts.py b/tests/pfts.py
index 32a36d4..ce6be1f 100644
--- a/tests/pfts.py
+++ b/tests/pfts.py
@@ -39,12 +39,12 @@ fs = Grid.GridPartitionerTrimf(gauss_treino,10)
 
 #tmp = chen.ConventionalFTS("")
 
-pfts1 = pfts.ProbabilisticFTS("1")
+pfts1 = pfts.ProbabilisticFTS("n = 1")
 #pfts1.appendTransformation(diff)
-pfts1.train(gauss_treino,fs,2)
-#pfts2 = pfts.ProbabilisticFTS("n = 2")
+pfts1.train(gauss_treino,fs,1)
+pfts2 = pfts.ProbabilisticFTS("n = 2")
 #pfts2.appendTransformation(diff)
-#pfts2.train(gauss_treino,fs,2)
+pfts2.train(gauss_treino,fs,2)
 
 #pfts3 = pfts.ProbabilisticFTS("n = 3")
 #pfts3.appendTransformation(diff)
@@ -54,20 +54,6 @@ pfts1.train(gauss_treino,fs,2)
 
 #print(bchmk.getDistributionStatistics(gauss_teste[:50], [pfts1,pfts2,pfts3], 20, 1.50))
 
-sim_fs, sim_uod =pfts1.generate_data(bins=10)
-
-#print(pdf_fs)
-
-#print(sim_fs)
-
-print(sim_uod)
-
-print(np.ravel(sim_uod))
-
-#print(sim_uod)
-
-#print (Measures.pdf(gauss_treino,bins=10))
-
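
Usage note: a minimal sketch of how the new ProbabilityDistribution class can
be exercised, assuming a synthetic Gaussian series (the sample data and
variable names below are illustrative only, not part of the patch):

    import numpy as np
    import matplotlib.pyplot as plt
    from pyFTS.benchmarks import ProbabilityDistribution

    data = np.random.normal(0, 1, 1000)   # hypothetical sample series
    pmf = ProbabilityDistribution.ProbabilityDistribution(
        "Original", 100, [min(data), max(data)], data=data)

    print(pmf.entropy())                  # Shannon entropy of the binned distribution
    print(pmf.empiricalloglikelihood())   # sum of log bin counts
    print(pmf.pseudologlikelihood(data))  # sum of log densities of the given sample
    pmf.plot()                            # bin probabilities over the universe of discourse
    plt.show()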