From 9bfd931e45e0039a145eb5591544e265419026d6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Petr=C3=B4nio=20C=C3=A2ndido=20de=20Lima=20e=20Silva?= Date: Sat, 1 Jul 2017 19:42:45 -0300 Subject: [PATCH] - Improvements on probability distributions and KDE - Seasonal Ensemble --- benchmarks/Util.py | 17 +++++- ensemble.py | 26 +++++++++ models/seasonal/SeasonalIndexer.py | 2 + partitioners/Util.py | 2 +- partitioners/partitioner.py | 13 ++++- probabilistic/ProbabilityDistribution.py | 67 ++++++++++++++-------- probabilistic/kde.py | 11 ++-- tests/general.py | 72 ++++++++++++++++++++---- 8 files changed, 162 insertions(+), 48 deletions(-) diff --git a/benchmarks/Util.py b/benchmarks/Util.py index cbd8061..7407fcd 100644 --- a/benchmarks/Util.py +++ b/benchmarks/Util.py @@ -209,6 +209,8 @@ def scale(data, params): ndata = [(k-params[0])/params[1] for k in data] return ndata +def stats(measure, data): + print(measure, np.nanmean(data), np.nanstd(data)) def unified_scaled_point(experiments, tam, save=False, file=None, sort_columns=['UAVG', 'RMSEAVG', 'USTD', 'RMSESTD'], @@ -259,7 +261,6 @@ def unified_scaled_point(experiments, tam, save=False, file=None, mdl[b]['times'] = [] best = bests[b] - print(best) tmp = dat_ana[(dat_ana.Model == best["Model"]) & (dat_ana.Order == best["Order"]) & (dat_ana.Scheme == best["Scheme"]) & (dat_ana.Partitions == best["Partitions"])] tmpl = extract_measure(tmp,'RMSE',data_columns) @@ -277,10 +278,13 @@ def unified_scaled_point(experiments, tam, save=False, file=None, models[b]['label'] = check_replace_list(best["Model"] + " " + str(best["Order"]), replace) - + print("GLOBAL") rmse_param = scale_params(rmse) + stats("rmse", rmse) smape_param = scale_params(smape) + stats("smape", smape) u_param = scale_params(u) + stats("u", u) times_param = scale_params(times) for key in sorted(models.keys()): @@ -295,9 +299,13 @@ def unified_scaled_point(experiments, tam, save=False, file=None, times = [] labels = [] for key in sorted(models.keys()): + print(key) rmse.append(models[key]['rmse']) + stats("rmse", models[key]['rmse']) smape.append(models[key]['smape']) + stats("smape", models[key]['smape']) u.append(models[key]['u']) + stats("u", models[key]['u']) times.append(models[key]['times']) labels.append(models[key]['label']) @@ -995,6 +1003,8 @@ def unified_scaled_ahead(experiments, tam, save=False, file=None, for experiment in experiments: + print(experiment) + mdl = {} dat_syn = pd.read_csv(experiment[0], sep=";", usecols=ahead_dataframe_synthetic_columns()) @@ -1023,6 +1033,9 @@ def unified_scaled_ahead(experiments, tam, save=False, file=None, mdl[b]['crps2'] = [] best = bests[b] + + print(best) + tmp = dat_ana[(dat_ana.Model == best["Model"]) & (dat_ana.Order == best["Order"]) & (dat_ana.Scheme == best["Scheme"]) & (dat_ana.Partitions == best["Partitions"])] tmpl = extract_measure(tmp, 'CRPS_Interval', data_columns) diff --git a/ensemble.py b/ensemble.py index 2895cb6..589fc1e 100644 --- a/ensemble.py +++ b/ensemble.py @@ -11,6 +11,7 @@ from pyFTS.benchmarks import arima, quantreg from pyFTS.common import Transformations import scipy.stats as st from pyFTS import tree +from pyFTS.models import msfts def sampler(data, quantiles): ret = [] @@ -241,3 +242,28 @@ class AllMethodEnsembleFTS(EnsembleFTS): self.appendModel(model) +class SeasonalEnsembleFTS(EnsembleFTS): + def __init__(self, name, **kwargs): + super(SeasonalEnsembleFTS, self).__init__(name="Seasonal Ensemble FTS", **kwargs) + self.min_order = 1 + self.indexers = [] + self.partitioners = [] + self.is_multivariate = 
True + self.has_seasonality = True + self.has_probability_forecasting = True + + def train(self, data, sets, order=1, parameters=None): + self.original_max = max(data) + self.original_min = min(data) + + for ix in self.indexers: + for pt in self.partitioners: + + model = msfts.MultiSeasonalFTS() + model.indexer = ix + model.appendTransformation(pt.transformation) + model.train(data,pt.sets,order=1) + + self.appendModel(model) + + diff --git a/models/seasonal/SeasonalIndexer.py b/models/seasonal/SeasonalIndexer.py index 59979db..184226f 100644 --- a/models/seasonal/SeasonalIndexer.py +++ b/models/seasonal/SeasonalIndexer.py @@ -1,6 +1,7 @@ import numpy as np from enum import Enum + class SeasonalIndexer(object): """ Seasonal Indexer. Responsible to find the seasonal index of a data point inside its data set @@ -117,6 +118,7 @@ class DataFrameSeasonalIndexer(SeasonalIndexer): data.loc[:,self.data_fields] = value return data + class DateTime(Enum): year = 1 month = 2 diff --git a/partitioners/Util.py b/partitioners/Util.py index a7e641a..3056f74 100644 --- a/partitioners/Util.py +++ b/partitioners/Util.py @@ -3,7 +3,7 @@ import pandas as pd import matplotlib as plt import matplotlib.colors as pltcolors import matplotlib.pyplot as plt -from mpl_toolkits.mplot3d import Axes3D +#from mpl_toolkits.mplot3d import Axes3D from pyFTS.common import Membership, Util from pyFTS.partitioners import Grid,Huarng,FCM,Entropy diff --git a/partitioners/partitioner.py b/partitioners/partitioner.py index 6e0d4fd..7ce2ecf 100644 --- a/partitioners/partitioner.py +++ b/partitioners/partitioner.py @@ -7,7 +7,7 @@ class Partitioner(object): Universe of Discourse partitioner. Split data on several fuzzy sets """ - def __init__(self, name, data, npart, func=Membership.trimf, names=None, prefix="A", transformation=None): + def __init__(self, name, data, npart, func=Membership.trimf, names=None, prefix="A", transformation=None, indexer=None): """ Universe of Discourse partitioner scheme. 
Split data on several fuzzy sets
         :param name: partitioner name
@@ -25,9 +25,14 @@ class Partitioner(object):
         self.setnames = names
         self.prefix = prefix
         self.transformation = transformation
+        self.indexer = indexer
+
+        # pre-process the data with the seasonal indexer, when one is given
+        if self.indexer is not None:
+            ndata = self.indexer.get_data(data)
+        else:
+            ndata = data
 
         if transformation is not None:
-            ndata = transformation.apply(data)
-        else:
-            ndata = data
+            ndata = transformation.apply(ndata)
 
@@ -42,8 +47,11 @@ class Partitioner(object):
             self.max = _max * 1.1
         else:
             self.max = _max * 0.9
+
         self.sets = self.build(ndata)
 
+        del ndata
+
     def build(self, data):
         """
         Perform the partitioning of the Universe of Discourse
diff --git a/probabilistic/ProbabilityDistribution.py b/probabilistic/ProbabilityDistribution.py
index 3d70dad..b238289 100644
--- a/probabilistic/ProbabilityDistribution.py
+++ b/probabilistic/ProbabilityDistribution.py
@@ -5,38 +5,56 @@ from pyFTS.common import FuzzySet,SortedCollection
 
 class ProbabilityDistribution(object):
-    def __init__(self,name,nbins,uod,bins=None,labels=None, data=None):
-        self.name = name
-        self.nbins = nbins
-        self.uod = uod
-        if bins is None:
-            #range = (uod[1] - uod[0])/nbins
-            #self.bins = np.arange(uod[0],uod[1],range).tolist()
-            self.bins = np.linspace(uod[0], uod[1], nbins).tolist()
-            self.labels = [str(k) for k in self.bins]
+    """
+    Represents a discrete or continuous probability distribution.
+    If type is "histogram" the PDF is discrete; if type is "KDE" it is continuous.
+    """
+    def __init__(self, type, **kwargs):
+        if type is None:
+            self.type = "KDE"
         else:
-            self.bins = bins
-            self.labels = labels
+            self.type = type
+        self.description = kwargs.get("description", None)
 
-        self.index = SortedCollection.SortedCollection(iterable=sorted(self.bins))
-        self.distribution = {}
-        self.count = 0
-        for k in self.bins: self.distribution[k] = 0
+        self.uod = kwargs.get("uod", None)
 
-        if data is not None: self.append(data)
+        if self.type == "histogram":
+            self.nbins = kwargs.get("num_bins", None)
+            self.bins = kwargs.get("bins", None)
+            self.labels = kwargs.get("bins_labels", None)
+
+            if self.bins is None:
+                self.bins = np.linspace(self.uod[0], self.uod[1], self.nbins).tolist()
+                self.labels = [str(k) for k in self.bins]
+
+            self.index = SortedCollection.SortedCollection(iterable=sorted(self.bins))
+            self.distribution = {}
+            self.count = 0
+            for k in self.bins: self.distribution[k] = 0
+
+        # default to an empty list so that append() can always extend it
+        self.data = kwargs.get("data", [])
 
     def append(self, values):
-        for k in values:
-            v = self.index.find_ge(k)
-            self.distribution[v] += 1
-            self.count += 1
+        if self.type == "histogram":
+            for k in values:
+                v = self.index.find_ge(k)
+                self.distribution[v] += 1
+                self.count += 1
+        else:
+            self.data.extend(values)
 
     def density(self, values):
-        ret = []
-        for k in values:
-            v = self.index.find_ge(k)
-            ret.append(self.distribution[v] / self.count)
-        return ret
+        if self.type == "histogram":
+            ret = []
+            for k in values:
+                v = self.index.find_ge(k)
+                ret.append(self.distribution[v] / self.count)
+            return ret
+        else:
+            # continuous (KDE) density is not implemented yet
+            pass
+
     def cummulative(self, values):
         pass
diff --git a/probabilistic/kde.py b/probabilistic/kde.py
index 724dbbb..82acdba 100644
--- a/probabilistic/kde.py
+++ b/probabilistic/kde.py
@@ -5,21 +5,23 @@ Kernel Density Estimation
 
 class KernelSmoothing(object):
     """Kernel Density Estimation"""
-    def __init__(self,h, data, method="epanechnikov"):
+    def __init__(self, h, method="epanechnikov"):
        self.h = h
-        self.data = data
        self.method = method
 
     def kernel(self, u):
         if self.method == "epanechnikov":
             return (3/4) * (1 - u**2)
-        elif 
self.method == "uniform":
-            return 0.5
+        elif self.method == "gaussian":
+            # standard normal kernel: exp(-u^2/2) / sqrt(2*pi)
+            import math
+            return math.exp(-0.5 * u ** 2) / math.sqrt(2 * math.pi)
         elif self.method == "uniform":
             return 0.5
 
-    def probability(self, x):
-        l = len(self.data)
-        p = sum([self.kernel((x - k)/self.h) for k in self.data]) / l*self.h
+    def probability(self, x, data):
+        l = len(data)
+        # KDE estimate: (1 / (n * h)) * sum(K((x - x_i) / h))
+        p = sum([self.kernel((x - k) / self.h) for k in data]) / (l * self.h)
         return p
\ No newline at end of file
diff --git a/tests/general.py b/tests/general.py
index f0b8c48..b5d3088 100644
--- a/tests/general.py
+++ b/tests/general.py
@@ -10,7 +10,7 @@ import matplotlib.pyplot as plt
 import pandas as pd
 
 from pyFTS.partitioners import Grid, Entropy, FCM, Huarng
-from pyFTS.common import FLR,FuzzySet,Membership,Transformations
+from pyFTS.common import FLR,FuzzySet,Membership,Transformations, Util as cUtil
 from pyFTS import fts,hofts,ifts,pwfts,tree, chen
 #from pyFTS.benchmarks import benchmarks as bchmk
 from pyFTS.benchmarks import naive, arima
@@ -20,8 +20,8 @@ from pyFTS.models.seasonal import SeasonalIndexer
 
 os.chdir("/home/petronio/dados/Dropbox/Doutorado/Codigos/")
 
-diff = Transformations.Differential(1)
-ix = SeasonalIndexer.LinearSeasonalIndexer([12, 24], [720, 1],[False, False])
+#diff = Transformations.Differential(1)
+#ix = SeasonalIndexer.LinearSeasonalIndexer([12, 24], [720, 1],[False, False])
 
 """
 DATASETS
@@ -63,6 +63,52 @@ DATASETS
 #print(lag)
 #print(a)
 
+sonda = pd.read_csv("DataSets/SONDA_BSB_MOD.csv", sep=";")
+
+sonda['data'] = pd.to_datetime(sonda['data'])
+
+sonda = sonda[:][527041:]
+
+sonda.index = np.arange(0,len(sonda.index))
+
+sonda_treino = sonda[:1051200]
+sonda_teste = sonda[1051201:]
+
+from pyFTS.models.seasonal import SeasonalIndexer
+
+ix_m15 = SeasonalIndexer.DateTimeSeasonalIndexer('data',[SeasonalIndexer.DateTime.minute],[15],'glo_avg')
+
+cUtil.persist_obj(ix_m15, "models/sonda_ix_m15.pkl")
+
+
+ix_Mh = SeasonalIndexer.DateTimeSeasonalIndexer('data',[SeasonalIndexer.DateTime.month,SeasonalIndexer.DateTime.hour],
+                                                [None, None],'glo_avg')
+
+cUtil.persist_obj(ix_Mh, "models/sonda_ix_Mh.pkl")
+
+ix_Mhm15 = SeasonalIndexer.DateTimeSeasonalIndexer('data',[SeasonalIndexer.DateTime.month,
+                                                   SeasonalIndexer.DateTime.hour, SeasonalIndexer.DateTime.minute],
+                                                   [None, None,15],'glo_avg')
+
+cUtil.persist_obj(ix_Mhm15, "models/sonda_ix_Mhm15.pkl")
+
+
+tmp = ix_Mh.get_data(sonda_treino)
+for max_part in [10, 20, 30, 40, 50]:
+
+    fs1 = Grid.GridPartitionerTrimf(tmp,max_part)
+
+    cUtil.persist_obj(fs1,"models/sonda_fs_grid_" + str(max_part) + ".pkl")
+
+    fs2 = FCM.FCMPartitionerTrimf(tmp, max_part)
+
+    cUtil.persist_obj(fs2, "models/sonda_fs_fcm_" + str(max_part) + ".pkl")
+
+    fs3 = Entropy.EntropyPartitionerTrimf(tmp, max_part)
+
+    cUtil.persist_obj(fs3, "models/sonda_fs_entropy_" + str(max_part) + ".pkl")
+
+
 from pyFTS.benchmarks import benchmarks as bchmk
 #from pyFTS.benchmarks import distributed_benchmarks as bchmk
 #from pyFTS.benchmarks import parallel_benchmarks as bchmk
@@ -189,7 +235,6 @@ experiments = [
 Util.unified_scaled_point(experiments,tam=[15,8],save=True,file="pictures/unified_experiments_point.png",
                    ignore=['ARIMA(1,0,0)','ARIMA(2,0,0)','ARIMA(2,0,1)','ARIMA(2,0,2)','QAR(2)'],
                    replace=[['ARIMA','ARIMA'],['QAR','QAR']])
-
 '''
 
 '''
@@ -215,13 +260,14 @@ Util.unified_scaled_interval_pinball(experiments,tam=[15,8],save=True,file="pict
 
 '''
 
+'''
 experiments = [
-    ["experiments/taiex_ahead_synthetic.csv","experiments/taiex_ahead_analytic.csv",16],
-    ["experiments/nasdaq_ahead_synthetic.csv","experiments/nasdaq_ahead_analytic.csv",11],
-    
["experiments/sp500_ahead_synthetic.csv","experiments/sp500_ahead_analytic.csv", 21], - ["experiments/best_ahead_synthetic.csv","experiments/best_ahead_analytic.csv", 24], - ["experiments/sondasun_ahead_synthetic.csv","experiments/sondasun_ahead_analytic.csv",13], - ["experiments/sondawind_ahead_synthetic.csv","experiments/sondawind_ahead_analytic.csv", 13], + ["experiments/taiex_ahead_synthetic_diff.csv","experiments/taiex_ahead_analytic_diff.csv",16], + ["experiments/nasdaq_ahead_synthetic_diff.csv","experiments/nasdaq_ahead_analytic_diff.csv",11], + ["experiments/sp500_ahead_synthetic_diff.csv","experiments/sp500_ahead_analytic_diff.csv", 21], + ["experiments/best_ahead_synthetic_diff.csv","experiments/best_ahead_analytic_diff.csv", 24], + ["experiments/sondasun_ahead_synthetic_diff.csv","experiments/sondasun_ahead_analytic_diff.csv",13], + ["experiments/sondawind_ahead_synthetic_diff.csv","experiments/sondawind_ahead_analytic_diff.csv", 13], ["experiments/gauss_ahead_synthetic_diff.csv","experiments/gauss_ahead_analytic_diff.csv",16] ] @@ -233,7 +279,9 @@ Util.unified_scaled_ahead(experiments,tam=[15,8],save=True,file="pictures/unifie -""" +''' + +''' from pyFTS.partitioners import Grid from pyFTS import sfts @@ -268,4 +316,4 @@ x = tmp.forecast(sonda[:1610]) #print(taiex[1600:1610]) print(x) -#""" \ No newline at end of file +''' \ No newline at end of file