- Improvements on probability distributions and KDE

- Seasonal Ensemble
Petrônio Cândido de Lima e Silva 2017-07-01 19:42:45 -03:00
parent ba71e08e76
commit 9bfd931e45
8 changed files with 162 additions and 48 deletions

View File

@@ -209,6 +209,8 @@ def scale(data, params):
     ndata = [(k-params[0])/params[1] for k in data]
     return ndata
 
+def stats(measure, data):
+    print(measure, np.nanmean(data), np.nanstd(data))
 
 def unified_scaled_point(experiments, tam, save=False, file=None,
                          sort_columns=['UAVG', 'RMSEAVG', 'USTD', 'RMSESTD'],
@@ -259,7 +261,6 @@ def unified_scaled_point(experiments, tam, save=False, file=None,
             mdl[b]['times'] = []
             best = bests[b]
-            print(best)
             tmp = dat_ana[(dat_ana.Model == best["Model"]) & (dat_ana.Order == best["Order"])
                           & (dat_ana.Scheme == best["Scheme"]) & (dat_ana.Partitions == best["Partitions"])]
             tmpl = extract_measure(tmp,'RMSE',data_columns)
@@ -277,10 +278,13 @@ def unified_scaled_point(experiments, tam, save=False, file=None,
             models[b]['label'] = check_replace_list(best["Model"] + " " + str(best["Order"]), replace)
 
+    print("GLOBAL")
     rmse_param = scale_params(rmse)
+    stats("rmse", rmse)
     smape_param = scale_params(smape)
+    stats("smape", smape)
     u_param = scale_params(u)
+    stats("u", u)
     times_param = scale_params(times)
 
     for key in sorted(models.keys()):
@@ -295,9 +299,13 @@ def unified_scaled_point(experiments, tam, save=False, file=None,
     times = []
     labels = []
 
     for key in sorted(models.keys()):
+        print(key)
         rmse.append(models[key]['rmse'])
+        stats("rmse", models[key]['rmse'])
         smape.append(models[key]['smape'])
+        stats("smape", models[key]['smape'])
         u.append(models[key]['u'])
+        stats("u", models[key]['u'])
         times.append(models[key]['times'])
         labels.append(models[key]['label'])
@@ -995,6 +1003,8 @@ def unified_scaled_ahead(experiments, tam, save=False, file=None,
     for experiment in experiments:
+        print(experiment)
         mdl = {}
 
         dat_syn = pd.read_csv(experiment[0], sep=";", usecols=ahead_dataframe_synthetic_columns())
@@ -1023,6 +1033,9 @@ def unified_scaled_ahead(experiments, tam, save=False, file=None,
             mdl[b]['crps2'] = []
             best = bests[b]
+            print(best)
             tmp = dat_ana[(dat_ana.Model == best["Model"]) & (dat_ana.Order == best["Order"])
                           & (dat_ana.Scheme == best["Scheme"]) & (dat_ana.Partitions == best["Partitions"])]
             tmpl = extract_measure(tmp, 'CRPS_Interval', data_columns)
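Note: scale normalizes each error measure with shift-and-scale parameters before plotting, and the new stats helper prints NaN-aware mean and standard deviation per measure. A minimal sketch of how these helpers fit together; the body of scale_params is an assumption here (it is not shown in this hunk) and the sample values are illustrative:

import numpy as np

def scale_params(data):
    # assumed implementation (not in this hunk): minimum and range of the measure
    vmin = np.nanmin(data)
    vlen = np.nanmax(data) - vmin
    return (vmin, vlen)

def scale(data, params):
    # shift-and-scale each value using the parameters above
    return [(k - params[0]) / params[1] for k in data]

def stats(measure, data):
    # helper added by this commit: NaN-aware mean and standard deviation
    print(measure, np.nanmean(data), np.nanstd(data))

rmse = [10.0, 12.5, np.nan, 9.8]   # illustrative measure values
stats("rmse", rmse)
print(scale(rmse, scale_params(rmse)))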

View File

@@ -11,6 +11,7 @@ from pyFTS.benchmarks import arima, quantreg
 from pyFTS.common import Transformations
 import scipy.stats as st
 from pyFTS import tree
+from pyFTS.models import msfts
 
 def sampler(data, quantiles):
     ret = []
@@ -241,3 +242,28 @@ class AllMethodEnsembleFTS(EnsembleFTS):
             self.appendModel(model)
 
+class SeasonalEnsembleFTS(EnsembleFTS):
+    def __init__(self, name, **kwargs):
+        super(SeasonalEnsembleFTS, self).__init__(name="Seasonal Ensemble FTS", **kwargs)
+        self.min_order = 1
+        self.indexers = []
+        self.partitioners = []
+        self.is_multivariate = True
+        self.has_seasonality = True
+        self.has_probability_forecasting = True
+
+    def train(self, data, sets, order=1, parameters=None):
+        self.original_max = max(data)
+        self.original_min = min(data)
+
+        for ix in self.indexers:
+            for pt in self.partitioners:
+                model = msfts.MultiSeasonalFTS()
+                model.indexer = ix
+                model.appendTransformation(pt.transformation)
+                model.train(data,pt.sets,order=1)
+                self.appendModel(model)
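A hedged usage sketch of the new class: the indexer, partitioner and training DataFrame below are placeholders (built elsewhere, for example as in the test script later in this commit), not part of the commit itself. train builds one msfts.MultiSeasonalFTS per (indexer, partitioner) pair and appends it to the ensemble:

# Illustrative sketch only; replace the placeholder objects with real ones.
model = SeasonalEnsembleFTS("")                # class defined above
model.indexers = [ix_m15, ix_Mh]               # placeholder DateTimeSeasonalIndexer objects
model.partitioners = [fs_grid_10, fs_fcm_10]   # placeholder partitioner objects
model.train(sonda_treino, None)                # one MultiSeasonalFTS per (indexer, partitioner) pair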

View File

@@ -1,6 +1,7 @@
 import numpy as np
 from enum import Enum
 
 class SeasonalIndexer(object):
     """
     Seasonal Indexer. Responsible to find the seasonal index of a data point inside its data set
@@ -117,6 +118,7 @@ class DataFrameSeasonalIndexer(SeasonalIndexer):
         data.loc[:,self.data_fields] = value
         return data
 
 class DateTime(Enum):
     year = 1
     month = 2

View File

@@ -3,7 +3,7 @@ import pandas as pd
 import matplotlib as plt
 import matplotlib.colors as pltcolors
 import matplotlib.pyplot as plt
-from mpl_toolkits.mplot3d import Axes3D
+#from mpl_toolkits.mplot3d import Axes3D
 from pyFTS.common import Membership, Util
 from pyFTS.partitioners import Grid,Huarng,FCM,Entropy

View File

@@ -7,7 +7,7 @@ class Partitioner(object):
     Universe of Discourse partitioner. Split data on several fuzzy sets
     """
 
-    def __init__(self, name, data, npart, func=Membership.trimf, names=None, prefix="A", transformation=None):
+    def __init__(self, name, data, npart, func=Membership.trimf, names=None, prefix="A", transformation=None, indexer=None):
         """
         Universe of Discourse partitioner scheme. Split data on several fuzzy sets
         :param name: partitioner name
@@ -25,9 +25,15 @@ class Partitioner(object):
         self.setnames = names
         self.prefix = prefix
         self.transformation = transformation
+        self.indexer = indexer
+
+        if self.indexer is not None:
+            ndata = self.indexer.get_data(data)
+        else:
+            ndata = data
 
         if transformation is not None:
-            ndata = transformation.apply(data)
+            ndata = transformation.apply(ndata)
         else:
             ndata = data
@@ -42,8 +48,11 @@ class Partitioner(object):
             self.max = _max * 1.1
         else:
             self.max = _max * 0.9
 
         self.sets = self.build(ndata)
+
+        del(ndata)
 
     def build(self, data):
         """
         Perform the partitioning of the Universe of Discourse
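The practical effect of the new indexer argument is that a partitioner can receive a DataFrame plus a seasonal indexer and extract the numeric series itself, instead of the caller doing ix.get_data(...) first. A hedged sketch of both styles; the DataFrame, its column names and the concrete subclass signatures are assumptions, not part of this hunk:

# Illustrative only: df is a placeholder DataFrame with 'data' (datetime) and
# 'glo_avg' (numeric) columns; concrete partitioner signatures may differ.
from pyFTS.partitioners import Grid
from pyFTS.models.seasonal import SeasonalIndexer

ix = SeasonalIndexer.DateTimeSeasonalIndexer('data', [SeasonalIndexer.DateTime.hour], [None], 'glo_avg')

# before this change: extract the series by hand, then partition
ndata = ix.get_data(df)
fs_old = Grid.GridPartitionerTrimf(ndata, 30)

# after this change: a partitioner built on this base class can do the extraction itself
# fs_new = Grid.GridPartitioner(df, 30, indexer=ix)   # assumes the subclass forwards indexer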

View File

@@ -5,38 +5,55 @@ from pyFTS.common import FuzzySet,SortedCollection
 
 class ProbabilityDistribution(object):
-    def __init__(self,name,nbins,uod,bins=None,labels=None, data=None):
-        self.name = name
-        self.nbins = nbins
-        self.uod = uod
-        if bins is None:
-            #range = (uod[1] - uod[0])/nbins
-            #self.bins = np.arange(uod[0],uod[1],range).tolist()
-            self.bins = np.linspace(uod[0], uod[1], nbins).tolist()
-            self.labels = [str(k) for k in self.bins]
-        else:
-            self.bins = bins
-            self.labels = labels
-        self.index = SortedCollection.SortedCollection(iterable=sorted(self.bins))
-        self.distribution = {}
-        self.count = 0
-        for k in self.bins: self.distribution[k] = 0
-        if data is not None: self.append(data)
+    """
+    Represents a discrete or continous probability distribution
+    If type is histogram, the PDF is discrete
+    If type is KDE the PDF is continuous
+    """
+    def __init__(self,type, **kwargs):
+        if type is None:
+            self.type = "KDE"
+        else:
+            self.type = type
+        self.description = kwargs.get("description", None)
+        self.uod = kwargs.get("uod", None)
+
+        if self.type == "histogram":
+            self.nbins = kwargs.get("num_bins", None)
+            self.bins = kwargs.get("bins", None)
+            self.labels = kwargs.get("bins_labels", None)
+            if self.bins is None:
+                self.bins = np.linspace(self.uod[0], self.uod[1], self.nbins).tolist()
+                self.labels = [str(k) for k in self.bins]
+            self.index = SortedCollection.SortedCollection(iterable=sorted(self.bins))
+            self.distribution = {}
+            self.count = 0
+            for k in self.bins: self.distribution[k] = 0
+
+        self.data = kwargs.get("data",None)
 
     def append(self, values):
-        for k in values:
-            v = self.index.find_ge(k)
-            self.distribution[v] += 1
-            self.count += 1
+        if self.type == "histogram":
+            for k in values:
+                v = self.index.find_ge(k)
+                self.distribution[v] += 1
+                self.count += 1
+        else:
+            self.data.extend(values)
 
     def density(self, values):
-        ret = []
-        for k in values:
-            v = self.index.find_ge(k)
-            ret.append(self.distribution[v] / self.count)
-        return ret
+        if self.type == "histogram":
+            ret = []
+            for k in values:
+                v = self.index.find_ge(k)
+                ret.append(self.distribution[v] / self.count)
+            return ret
+        else:
+            pass
 
     def cummulative(self, values):
         pass
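A hedged usage sketch of the refactored constructor, using the keyword names read above (uod, num_bins, bins, bins_labels, data); the import path and numeric values are illustrative assumptions:

import numpy as np
from pyFTS.probabilistic import ProbabilityDistribution   # module path assumed

# discrete PDF: fixed bins over the universe of discourse, counts per bin
hist = ProbabilityDistribution.ProbabilityDistribution("histogram", uod=[0, 100], num_bins=10)
hist.append(np.random.uniform(0, 100, 1000).tolist())
print(hist.density([5.0, 50.0, 95.0]))

# continuous PDF: in this commit the KDE branch only stores raw samples,
# and density() for type "KDE" is still a stub
kde = ProbabilityDistribution.ProbabilityDistribution("KDE", data=[])
kde.append(np.random.normal(50, 10, 1000).tolist())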

View File

@@ -5,21 +5,20 @@ Kernel Density Estimation
 
 class KernelSmoothing(object):
     """Kernel Density Estimation"""
-    def __init__(self,h, data, method="epanechnikov"):
+    def __init__(self,h, method="epanechnikov"):
         self.h = h
-        self.data = data
         self.method = method
 
     def kernel(self, u):
         if self.method == "epanechnikov":
             return (3/4) * (1 - u**2)
-        elif self.method == "uniform":
+        elif self.method == "gaussian":
             return 0.5
         elif self.method == "uniform":
             return 0.5
 
-    def probability(self, x):
-        l = len(self.data)
-        p = sum([self.kernel((x - k)/self.h) for k in self.data]) / l*self.h
+    def probability(self, x, data):
+        l = len(data)
+        p = sum([self.kernel((x - k)/self.h) for k in data]) / l*self.h
         return p
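With the data argument moved from the constructor to probability, the estimator becomes stateless and one KernelSmoothing instance can be reused across samples. A minimal sketch with illustrative data; note that the expression "/ l*self.h" parses as (sum / l) * h, while the textbook kernel density estimate is f(x) = 1/(n h) * sum K((x - x_i)/h), so callers may want to check which is intended:

ks = KernelSmoothing(h=0.5, method="epanechnikov")
sample = [0.8, 0.9, 1.0, 1.1, 1.2]   # illustrative data, now passed per call
print(ks.probability(1.0, sample))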

View File

@@ -10,7 +10,7 @@ import matplotlib.pyplot as plt
 import pandas as pd
 from pyFTS.partitioners import Grid, Entropy, FCM, Huarng
-from pyFTS.common import FLR,FuzzySet,Membership,Transformations
+from pyFTS.common import FLR,FuzzySet,Membership,Transformations, Util as cUtil
 from pyFTS import fts,hofts,ifts,pwfts,tree, chen
 #from pyFTS.benchmarks import benchmarks as bchmk
 from pyFTS.benchmarks import naive, arima
@@ -20,8 +20,8 @@ from pyFTS.models.seasonal import SeasonalIndexer
 os.chdir("/home/petronio/dados/Dropbox/Doutorado/Codigos/")
 
-diff = Transformations.Differential(1)
+#diff = Transformations.Differential(1)
 
-ix = SeasonalIndexer.LinearSeasonalIndexer([12, 24], [720, 1],[False, False])
+#ix = SeasonalIndexer.LinearSeasonalIndexer([12, 24], [720, 1],[False, False])
 
 """
 DATASETS
@@ -63,6 +63,52 @@ DATASETS
 #print(lag)
 #print(a)
 
+sonda = pd.read_csv("DataSets/SONDA_BSB_MOD.csv", sep=";")
+
+sonda['data'] = pd.to_datetime(sonda['data'])
+
+sonda = sonda[:][527041:]
+
+sonda.index = np.arange(0,len(sonda.index))
+
+sonda_treino = sonda[:1051200]
+sonda_teste = sonda[1051201:]
+
+from pyFTS.models.seasonal import SeasonalIndexer
+
+ix_m15 = SeasonalIndexer.DateTimeSeasonalIndexer('data',[SeasonalIndexer.DateTime.minute],[15],'glo_avg')
+
+cUtil.persist_obj(ix_m15, "models/sonda_ix_m15.pkl")
+
+ix_Mh = SeasonalIndexer.DateTimeSeasonalIndexer('data',[SeasonalIndexer.DateTime.month,SeasonalIndexer.DateTime.hour],
+                                                [None, None],'glo_avg')
+
+cUtil.persist_obj(ix_Mh, "models/sonda_ix_Mh.pkl")
+
+ix_Mhm15 = SeasonalIndexer.DateTimeSeasonalIndexer('data',[SeasonalIndexer.DateTime.month,
+                                                           SeasonalIndexer.DateTime.hour, SeasonalIndexer.DateTime.minute],
+                                                   [None, None,15],'glo_avg')
+
+cUtil.persist_obj(ix_Mhm15, "models/sonda_ix_Mhm15.pkl")
+
+tmp = ix_Mh.get_data(sonda_treino)
+
+for max_part in [10, 20, 30, 40, 50]:
+    fs1 = Grid.GridPartitionerTrimf(tmp,max_part)
+    cUtil.persist_obj(fs1,"models/sonda_fs_grid_" + str(max_part) + ".pkl")
+
+    fs2 = FCM.FCMPartitionerTrimf(tmp, max_part)
+    cUtil.persist_obj(fs2, "models/sonda_fs_fcm_" + str(max_part) + ".pkl")
+
+    fs3 = Entropy.EntropyPartitionerTrimf(tmp, max_part)
+    cUtil.persist_obj(fs3, "models/sonda_fs_entropy_" + str(max_part) + ".pkl")
+
 from pyFTS.benchmarks import benchmarks as bchmk
 #from pyFTS.benchmarks import distributed_benchmarks as bchmk
 #from pyFTS.benchmarks import parallel_benchmarks as bchmk
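The persisted indexers and partitioners can presumably be reloaded in later sessions instead of being rebuilt; this assumes pyFTS.common.Util exposes a load_obj counterpart to persist_obj:

# assumption: cUtil.load_obj is the counterpart of cUtil.persist_obj
ix_m15 = cUtil.load_obj("models/sonda_ix_m15.pkl")
fs_grid_10 = cUtil.load_obj("models/sonda_fs_grid_10.pkl")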
@@ -189,7 +235,6 @@ experiments = [
 
 Util.unified_scaled_point(experiments,tam=[15,8],save=True,file="pictures/unified_experiments_point.png",
                           ignore=['ARIMA(1,0,0)','ARIMA(2,0,0)','ARIMA(2,0,1)','ARIMA(2,0,2)','QAR(2)'],
                           replace=[['ARIMA','ARIMA'],['QAR','QAR']])
 '''
 
 '''
@@ -215,13 +260,14 @@ Util.unified_scaled_interval_pinball(experiments,tam=[15,8],save=True,file="pict
 '''
 
+'''
 experiments = [
-    ["experiments/taiex_ahead_synthetic.csv","experiments/taiex_ahead_analytic.csv",16],
-    ["experiments/nasdaq_ahead_synthetic.csv","experiments/nasdaq_ahead_analytic.csv",11],
-    ["experiments/sp500_ahead_synthetic.csv","experiments/sp500_ahead_analytic.csv", 21],
-    ["experiments/best_ahead_synthetic.csv","experiments/best_ahead_analytic.csv", 24],
-    ["experiments/sondasun_ahead_synthetic.csv","experiments/sondasun_ahead_analytic.csv",13],
-    ["experiments/sondawind_ahead_synthetic.csv","experiments/sondawind_ahead_analytic.csv", 13],
+    ["experiments/taiex_ahead_synthetic_diff.csv","experiments/taiex_ahead_analytic_diff.csv",16],
+    ["experiments/nasdaq_ahead_synthetic_diff.csv","experiments/nasdaq_ahead_analytic_diff.csv",11],
+    ["experiments/sp500_ahead_synthetic_diff.csv","experiments/sp500_ahead_analytic_diff.csv", 21],
+    ["experiments/best_ahead_synthetic_diff.csv","experiments/best_ahead_analytic_diff.csv", 24],
+    ["experiments/sondasun_ahead_synthetic_diff.csv","experiments/sondasun_ahead_analytic_diff.csv",13],
+    ["experiments/sondawind_ahead_synthetic_diff.csv","experiments/sondawind_ahead_analytic_diff.csv", 13],
     ["experiments/gauss_ahead_synthetic_diff.csv","experiments/gauss_ahead_analytic_diff.csv",16]
 ]
@@ -233,7 +279,9 @@ Util.unified_scaled_ahead(experiments,tam=[15,8],save=True,file="pictures/unifie
 
-"""
+'''
+
+'''
 
 from pyFTS.partitioners import Grid
 from pyFTS import sfts
@@ -268,4 +316,4 @@ x = tmp.forecast(sonda[:1610])
 #print(taiex[1600:1610])
 print(x)
-#"""
+'''