- Improvements to probability distributions and KDE

- Seasonal Ensemble
This commit is contained in:
Petrônio Cândido de Lima e Silva 2017-07-01 19:42:45 -03:00
parent ba71e08e76
commit 9bfd931e45
8 changed files with 162 additions and 48 deletions

View File

@ -209,6 +209,8 @@ def scale(data, params):
ndata = [(k-params[0])/params[1] for k in data]
return ndata
def stats(measure, data):
print(measure, np.nanmean(data), np.nanstd(data))
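For example, the new stats() helper just prints NaN-aware summary statistics for a measure (illustrative call, not part of the commit):
stats("rmse", [1.0, 2.0, np.nan])   # prints: rmse 1.5 0.5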
def unified_scaled_point(experiments, tam, save=False, file=None,
sort_columns=['UAVG', 'RMSEAVG', 'USTD', 'RMSESTD'],
@ -259,7 +261,6 @@ def unified_scaled_point(experiments, tam, save=False, file=None,
mdl[b]['times'] = []
best = bests[b]
print(best)
tmp = dat_ana[(dat_ana.Model == best["Model"]) & (dat_ana.Order == best["Order"])
& (dat_ana.Scheme == best["Scheme"]) & (dat_ana.Partitions == best["Partitions"])]
tmpl = extract_measure(tmp,'RMSE',data_columns)
@ -277,10 +278,13 @@ def unified_scaled_point(experiments, tam, save=False, file=None,
models[b]['label'] = check_replace_list(best["Model"] + " " + str(best["Order"]), replace)
print("GLOBAL")
rmse_param = scale_params(rmse)
stats("rmse", rmse)
smape_param = scale_params(smape)
stats("smape", smape)
u_param = scale_params(u)
stats("u", u)
times_param = scale_params(times)
for key in sorted(models.keys()):
@ -295,9 +299,13 @@ def unified_scaled_point(experiments, tam, save=False, file=None,
times = []
labels = []
for key in sorted(models.keys()):
print(key)
rmse.append(models[key]['rmse'])
stats("rmse", models[key]['rmse'])
smape.append(models[key]['smape'])
stats("smape", models[key]['smape'])
u.append(models[key]['u'])
stats("u", models[key]['u'])
times.append(models[key]['times'])
labels.append(models[key]['label'])
@ -995,6 +1003,8 @@ def unified_scaled_ahead(experiments, tam, save=False, file=None,
for experiment in experiments:
print(experiment)
mdl = {}
dat_syn = pd.read_csv(experiment[0], sep=";", usecols=ahead_dataframe_synthetic_columns())
@ -1023,6 +1033,9 @@ def unified_scaled_ahead(experiments, tam, save=False, file=None,
mdl[b]['crps2'] = []
best = bests[b]
print(best)
tmp = dat_ana[(dat_ana.Model == best["Model"]) & (dat_ana.Order == best["Order"])
& (dat_ana.Scheme == best["Scheme"]) & (dat_ana.Partitions == best["Partitions"])]
tmpl = extract_measure(tmp, 'CRPS_Interval', data_columns)

View File

@ -11,6 +11,7 @@ from pyFTS.benchmarks import arima, quantreg
from pyFTS.common import Transformations
import scipy.stats as st
from pyFTS import tree
from pyFTS.models import msfts
def sampler(data, quantiles):
ret = []
@ -241,3 +242,28 @@ class AllMethodEnsembleFTS(EnsembleFTS):
self.appendModel(model)
class SeasonalEnsembleFTS(EnsembleFTS):
def __init__(self, name, **kwargs):
super(SeasonalEnsembleFTS, self).__init__(name="Seasonal Ensemble FTS", **kwargs)
self.min_order = 1
self.indexers = []
self.partitioners = []
self.is_multivariate = True
self.has_seasonality = True
self.has_probability_forecasting = True
def train(self, data, sets, order=1, parameters=None):
self.original_max = max(data)
self.original_min = min(data)
for ix in self.indexers:
for pt in self.partitioners:
model = msfts.MultiSeasonalFTS()
model.indexer = ix
model.appendTransformation(pt.transformation)
model.train(data,pt.sets,order=1)
self.appendModel(model)
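A minimal usage sketch of the new SeasonalEnsembleFTS, assuming indexer and partitioner objects such as the ones built in the test script further below (all names here are illustrative, not part of the commit):
# one internal MultiSeasonalFTS model is trained per (indexer, partitioner) pair
model = SeasonalEnsembleFTS("")
model.indexers = [ix_Mh]                # e.g. a DateTimeSeasonalIndexer
model.partitioners = [fs_grid, fs_fcm]  # e.g. Grid / FCM partitioners built over the indexed data
model.train(sonda_treino, None)         # the fuzzy sets come from the partitioners, so sets=None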

View File

@ -1,6 +1,7 @@
import numpy as np
from enum import Enum
class SeasonalIndexer(object):
"""
Seasonal Indexer. Responsible for finding the seasonal index of a data point inside its data set
@ -117,6 +118,7 @@ class DataFrameSeasonalIndexer(SeasonalIndexer):
data.loc[:,self.data_fields] = value
return data
class DateTime(Enum):
year = 1
month = 2

View File

@ -3,7 +3,7 @@ import pandas as pd
import matplotlib as plt
import matplotlib.colors as pltcolors
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
#from mpl_toolkits.mplot3d import Axes3D
from pyFTS.common import Membership, Util
from pyFTS.partitioners import Grid,Huarng,FCM,Entropy

View File

@ -7,7 +7,7 @@ class Partitioner(object):
Universe of Discourse partitioner. Splits the data into several fuzzy sets
"""
def __init__(self, name, data, npart, func=Membership.trimf, names=None, prefix="A", transformation=None):
def __init__(self, name, data, npart, func=Membership.trimf, names=None, prefix="A", transformation=None, indexer=None):
"""
Universe of Discourse partitioner scheme. Splits the data into several fuzzy sets
:param name: partitioner name
@ -25,9 +25,15 @@ class Partitioner(object):
self.setnames = names
self.prefix = prefix
self.transformation = transformation
self.indexer = indexer
if self.indexer is not None:
ndata = self.indexer.get_data(data)
else:
ndata = data
if transformation is not None:
ndata = transformation.apply(data)
ndata = transformation.apply(ndata)
else:
ndata = data
@ -42,8 +48,11 @@ class Partitioner(object):
self.max = _max * 1.1
else:
self.max = _max * 0.9
self.sets = self.build(ndata)
del(ndata)
def build(self, data):
"""
Perform the partitioning of the Universe of Discourse

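To make the new indexer hook concrete, a small sketch of the intended data flow in Partitioner.__init__ (the DummyIndexer below is hypothetical; only the get_data() contract comes from the code above):
class DummyIndexer:
    # hypothetical indexer exposing the get_data() contract used by Partitioner
    def __init__(self, field):
        self.field = field
    def get_data(self, dataframe):
        # reduce the multivariate records to the single numeric series to be partitioned
        return dataframe[self.field].tolist()

# inside Partitioner.__init__, the added branch is then equivalent to:
#   ndata = indexer.get_data(data)        # when an indexer is given
#   ndata = transformation.apply(ndata)   # when a transformation is also given
#   self.sets = self.build(ndata)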
View File

@ -5,38 +5,55 @@ from pyFTS.common import FuzzySet,SortedCollection
class ProbabilityDistribution(object):
def __init__(self,name,nbins,uod,bins=None,labels=None, data=None):
self.name = name
self.nbins = nbins
self.uod = uod
if bins is None:
#range = (uod[1] - uod[0])/nbins
#self.bins = np.arange(uod[0],uod[1],range).tolist()
self.bins = np.linspace(uod[0], uod[1], nbins).tolist()
self.labels = [str(k) for k in self.bins]
"""
Represents a discrete or continuous probability distribution.
If type is "histogram", the PDF is discrete.
If type is "KDE", the PDF is continuous.
"""
def __init__(self,type, **kwargs):
if type is None:
self.type = "KDE"
else:
self.bins = bins
self.labels = labels
self.type = type
self.description = kwargs.get("description", None)
self.index = SortedCollection.SortedCollection(iterable=sorted(self.bins))
self.distribution = {}
self.count = 0
for k in self.bins: self.distribution[k] = 0
self.uod = kwargs.get("uod", None)
if data is not None: self.append(data)
if self.type == "histogram":
self.nbins = kwargs.get("num_bins", None)
self.bins = kwargs.get("bins", None)
self.labels = kwargs.get("bins_labels", None)
if self.bins is None:
self.bins = np.linspace(self.uod[0], self.uod[1], self.nbins).tolist()
self.labels = [str(k) for k in self.bins]
self.index = SortedCollection.SortedCollection(iterable=sorted(self.bins))
self.distribution = {}
self.count = 0
for k in self.bins: self.distribution[k] = 0
self.data = kwargs.get("data",None)
def append(self, values):
for k in values:
v = self.index.find_ge(k)
self.distribution[v] += 1
self.count += 1
if self.type == "histogram":
for k in values:
v = self.index.find_ge(k)
self.distribution[v] += 1
self.count += 1
else:
self.data.extend(values)
def density(self, values):
ret = []
for k in values:
v = self.index.find_ge(k)
ret.append(self.distribution[v] / self.count)
return ret
if self.type == "histogram":
ret = []
for k in values:
v = self.index.find_ge(k)
ret.append(self.distribution[v] / self.count)
return ret
else:
pass
def cummulative(self, values):
pass
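A short usage sketch of the reworked constructor, using only the keyword arguments read in the code above (values are illustrative):
# histogram type: discrete PDF over evenly spaced bins inside the UoD
dist = ProbabilityDistribution("histogram", uod=[0, 100], num_bins=10)
dist.append([5.0, 12.0, 12.0, 47.0, 80.0])
print(dist.density([12.0, 80.0]))   # relative frequency of the bins containing 12 and 80

# KDE type: raw samples are accumulated in self.data for a continuous estimate
kde_dist = ProbabilityDistribution("KDE", uod=[0, 100], data=[])
kde_dist.append([5.0, 12.0, 47.0])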

View File

@ -5,21 +5,20 @@ Kernel Density Estimation
class KernelSmoothing(object):
"""Kernel Density Estimation"""
def __init__(self,h, data, method="epanechnikov"):
def __init__(self,h, method="epanechnikov"):
self.h = h
self.data = data
self.method = method
def kernel(self, u):
if self.method == "epanechnikov":
return (3/4) * (1 - u**2) if abs(u) <= 1 else 0  # Epanechnikov kernel has support [-1, 1]
elif self.method == "uniform":
elif self.method == "gaussian":
return (1.0 / np.sqrt(2 * np.pi)) * np.exp(-0.5 * u**2)  # standard Gaussian kernel (assumes numpy imported as np)
elif self.method == "uniform":
return 0.5 if abs(u) <= 1 else 0  # uniform kernel has support [-1, 1]
def probability(self, x):
l = len(self.data)
p = sum([self.kernel((x - k)/self.h) for k in self.data]) / l*self.h
def probability(self, x, data):
l = len(data)
p = sum([self.kernel((x - k) / self.h) for k in data]) / (l * self.h)
return p
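A minimal usage sketch of the decoupled estimator: the data is now passed per call instead of being stored on the object (sample values are illustrative):
ks = KernelSmoothing(h=0.5, method="epanechnikov")
sample = [1.0, 1.2, 1.3, 2.1, 2.4]
print(ks.probability(1.5, sample))   # kernel density estimate at x = 1.5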

View File

@ -10,7 +10,7 @@ import matplotlib.pyplot as plt
import pandas as pd
from pyFTS.partitioners import Grid, Entropy, FCM, Huarng
from pyFTS.common import FLR,FuzzySet,Membership,Transformations
from pyFTS.common import FLR,FuzzySet,Membership,Transformations, Util as cUtil
from pyFTS import fts,hofts,ifts,pwfts,tree, chen
#from pyFTS.benchmarks import benchmarks as bchmk
from pyFTS.benchmarks import naive, arima
@ -20,8 +20,8 @@ from pyFTS.models.seasonal import SeasonalIndexer
os.chdir("/home/petronio/dados/Dropbox/Doutorado/Codigos/")
diff = Transformations.Differential(1)
ix = SeasonalIndexer.LinearSeasonalIndexer([12, 24], [720, 1],[False, False])
#diff = Transformations.Differential(1)
#ix = SeasonalIndexer.LinearSeasonalIndexer([12, 24], [720, 1],[False, False])
"""
DATASETS
@ -63,6 +63,52 @@ DATASETS
#print(lag)
#print(a)
sonda = pd.read_csv("DataSets/SONDA_BSB_MOD.csv", sep=";")
sonda['data'] = pd.to_datetime(sonda['data'])
sonda = sonda[:][527041:]
sonda.index = np.arange(0,len(sonda.index))
sonda_treino = sonda[:1051200]
sonda_teste = sonda[1051201:]
from pyFTS.models.seasonal import SeasonalIndexer
ix_m15 = SeasonalIndexer.DateTimeSeasonalIndexer('data',[SeasonalIndexer.DateTime.minute],[15],'glo_avg')
cUtil.persist_obj(ix_m15, "models/sonda_ix_m15.pkl")
ix_Mh = SeasonalIndexer.DateTimeSeasonalIndexer('data',[SeasonalIndexer.DateTime.month,SeasonalIndexer.DateTime.hour],
[None, None],'glo_avg')
cUtil.persist_obj(ix_Mh, "models/sonda_ix_Mh.pkl")
ix_Mhm15 = SeasonalIndexer.DateTimeSeasonalIndexer('data',[SeasonalIndexer.DateTime.month,
SeasonalIndexer.DateTime.hour, SeasonalIndexer.DateTime.minute],
[None, None,15],'glo_avg')
cUtil.persist_obj(ix_Mhm15, "models/sonda_ix_Mhm15.pkl")
tmp = ix_Mh.get_data(sonda_treino)
for max_part in [10, 20, 30, 40, 50]:
fs1 = Grid.GridPartitionerTrimf(tmp,max_part)
cUtil.persist_obj(fs1,"models/sonda_fs_grid_" + str(max_part) + ".pkl")
fs2 = FCM.FCMPartitionerTrimf(tmp, max_part)
cUtil.persist_obj(fs2, "models/sonda_fs_fcm_" + str(max_part) + ".pkl")
fs3 = Entropy.EntropyPartitionerTrimf(tmp, max_part)
cUtil.persist_obj(fs3, "models/sonda_fs_entropy_" + str(max_part) + ".pkl")
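The persisted indexers and partitioners can later be restored and plugged into the seasonal ensemble (a sketch assuming the load_obj counterpart of persist_obj in pyFTS.common.Util):
ix = cUtil.load_obj("models/sonda_ix_Mh.pkl")
fs = cUtil.load_obj("models/sonda_fs_grid_10.pkl")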
from pyFTS.benchmarks import benchmarks as bchmk
#from pyFTS.benchmarks import distributed_benchmarks as bchmk
#from pyFTS.benchmarks import parallel_benchmarks as bchmk
@ -189,7 +235,6 @@ experiments = [
Util.unified_scaled_point(experiments,tam=[15,8],save=True,file="pictures/unified_experiments_point.png",
ignore=['ARIMA(1,0,0)','ARIMA(2,0,0)','ARIMA(2,0,1)','ARIMA(2,0,2)','QAR(2)'],
replace=[['ARIMA','ARIMA'],['QAR','QAR']])
'''
'''
@ -215,13 +260,14 @@ Util.unified_scaled_interval_pinball(experiments,tam=[15,8],save=True,file="pict
'''
'''
experiments = [
["experiments/taiex_ahead_synthetic.csv","experiments/taiex_ahead_analytic.csv",16],
["experiments/nasdaq_ahead_synthetic.csv","experiments/nasdaq_ahead_analytic.csv",11],
["experiments/sp500_ahead_synthetic.csv","experiments/sp500_ahead_analytic.csv", 21],
["experiments/best_ahead_synthetic.csv","experiments/best_ahead_analytic.csv", 24],
["experiments/sondasun_ahead_synthetic.csv","experiments/sondasun_ahead_analytic.csv",13],
["experiments/sondawind_ahead_synthetic.csv","experiments/sondawind_ahead_analytic.csv", 13],
["experiments/taiex_ahead_synthetic_diff.csv","experiments/taiex_ahead_analytic_diff.csv",16],
["experiments/nasdaq_ahead_synthetic_diff.csv","experiments/nasdaq_ahead_analytic_diff.csv",11],
["experiments/sp500_ahead_synthetic_diff.csv","experiments/sp500_ahead_analytic_diff.csv", 21],
["experiments/best_ahead_synthetic_diff.csv","experiments/best_ahead_analytic_diff.csv", 24],
["experiments/sondasun_ahead_synthetic_diff.csv","experiments/sondasun_ahead_analytic_diff.csv",13],
["experiments/sondawind_ahead_synthetic_diff.csv","experiments/sondawind_ahead_analytic_diff.csv", 13],
["experiments/gauss_ahead_synthetic_diff.csv","experiments/gauss_ahead_analytic_diff.csv",16]
]
@ -233,7 +279,9 @@ Util.unified_scaled_ahead(experiments,tam=[15,8],save=True,file="pictures/unifie
"""
'''
'''
from pyFTS.partitioners import Grid
from pyFTS import sfts
@ -268,4 +316,4 @@ x = tmp.forecast(sonda[:1610])
#print(taiex[1600:1610])
print(x)
#"""
'''