Clustered MVFTS improvements

This commit is contained in:
Petrônio Cândido 2018-11-19 08:30:06 -02:00
parent a8c05563d4
commit 8c1fec482d
7 changed files with 165 additions and 35 deletions

View File

@ -16,8 +16,8 @@ class ClusteredMVFTS(mvfts.MVFTS):
"""The cluster method to be called when a new model is built""" """The cluster method to be called when a new model is built"""
self.cluster_params = kwargs.get('cluster_params', {}) self.cluster_params = kwargs.get('cluster_params', {})
"""The cluster method parameters""" """The cluster method parameters"""
self.cluster = None self.cluster = kwargs.get('cluster', None)
"""The most recent trained clusterer""" """The trained clusterer"""
self.fts_method = kwargs.get('fts_method', hofts.WeightedHighOrderFTS) self.fts_method = kwargs.get('fts_method', hofts.WeightedHighOrderFTS)
"""The FTS method to be called when a new model is built""" """The FTS method to be called when a new model is built"""
@ -38,17 +38,16 @@ class ClusteredMVFTS(mvfts.MVFTS):
def fuzzyfy(self,data): def fuzzyfy(self,data):
ndata = [] ndata = []
for ct in range(1, len(data.index)+1): for index, row in data.iterrows():
ix = data.index[ct - 1] data_point = self.format_data(row)
data_point = self.format_data(data.loc[ix])
ndata.append(common.fuzzyfy_instance_clustered(data_point, self.cluster, self.alpha_cut)) ndata.append(common.fuzzyfy_instance_clustered(data_point, self.cluster, self.alpha_cut))
return ndata return ndata
def train(self, data, **kwargs): def train(self, data, **kwargs):
self.cluster = self.cluster_method(data=data, mvfts=self, neighbors=self.knn) if self.cluster is None:
self.cluster = self.cluster_method(data=data, mvfts=self, neighbors=self.knn, **self.cluster_params)
self.model = self.fts_method(partitioner=self.cluster, **self.fts_params) self.model = self.fts_method(partitioner=self.cluster, **self.fts_params)
if self.model.is_high_order: if self.model.is_high_order:
@ -59,6 +58,8 @@ class ClusteredMVFTS(mvfts.MVFTS):
self.model.train(ndata, fuzzyfied=True) self.model.train(ndata, fuzzyfied=True)
self.cluster.prune()
def forecast(self, ndata, **kwargs): def forecast(self, ndata, **kwargs):
ndata = self.fuzzyfy(ndata) ndata = self.fuzzyfy(ndata)

View File

@ -5,6 +5,7 @@ from scipy.spatial import KDTree
import numpy as np import numpy as np
import pandas as pd import pandas as pd
class GridCluster(partitioner.Partitioner): class GridCluster(partitioner.Partitioner):
""" """
A cartesian product of all fuzzy sets of all variables A cartesian product of all fuzzy sets of all variables
@ -17,8 +18,12 @@ class GridCluster(partitioner.Partitioner):
self.sets = {} self.sets = {}
self.kdtree = None self.kdtree = None
self.index = {} self.index = {}
self.build(None)
self.neighbors = kwargs.get('neighbors', 2) self.neighbors = kwargs.get('neighbors', 2)
self.optmize = kwargs.get('optmize', False)
if self.optmize:
self.count = {}
data = kwargs.get('data', [None])
self.build(data)
def build(self, data): def build(self, data):
@ -26,7 +31,6 @@ class GridCluster(partitioner.Partitioner):
for k in self.mvfts.explanatory_variables] for k in self.mvfts.explanatory_variables]
midpoints = [] midpoints = []
index = {}
c = 0 c = 0
for k in product(*fsets): for k in product(*fsets):
@ -44,14 +48,59 @@ class GridCluster(partitioner.Partitioner):
self.index[c] = _key self.index[c] = _key
c += 1 c += 1
import sys
sys.setrecursionlimit(100000)
self.kdtree = KDTree(midpoints) self.kdtree = KDTree(midpoints)
sys.setrecursionlimit(1000)
def prune(self):
    """Drop the fuzzy sets that were never used and rebuild the search index.

    Does nothing unless ``self.optmize`` is true. Otherwise every key of
    ``self.sets`` that is absent from ``self.count`` is removed, then
    ``self.index`` (KD-tree position -> fuzzy set name) and ``self.kdtree``
    are rebuilt from the centroids of the remaining sets, one coordinate
    per explanatory variable of ``self.mvfts``.

    The interpreter recursion limit is raised while the KD-tree is built
    and is always put back to the value it had on entry, even when the
    construction fails.
    """
    if not self.optmize:
        return

    # Iterate over a snapshot of the keys: the dict shrinks inside the loop.
    for key in list(self.sets):
        if key not in self.count:
            del self.sets[key]

    variable_names = [k.name for k in self.mvfts.explanatory_variables]

    # Row ``ct`` of ``midpoints`` and ``self.index[ct]`` must describe the
    # same fuzzy set, so both are filled in the same pass.
    self.index = {}
    midpoints = []
    for ct, fset in enumerate(self.sets.values()):
        midpoints.append([fset.sets[name].centroid for name in variable_names])
        self.index[ct] = fset.name

    import sys

    # NOTE(review): presumably the limit is raised because building the
    # KD-tree can recurse deeply on large grids - confirm.
    previous_limit = sys.getrecursionlimit()
    # Never lower a limit the caller already raised above 100000.
    sys.setrecursionlimit(max(previous_limit, 100000))
    try:
        self.kdtree = KDTree(midpoints)
    finally:
        # Restore the caller's limit instead of forcing a hard-coded 1000.
        sys.setrecursionlimit(previous_limit)
def knn(self, data): def knn(self, data):
tmp = [data[k.name] for k in self.mvfts.explanatory_variables] tmp = [data[k.name]
tmp, ix = self.kdtree.query(tmp, self.neighbors ) for k in self.mvfts.explanatory_variables]
tmp, ix = self.kdtree.query(tmp, self.neighbors)
if not isinstance(ix, (list, np.ndarray)): if not isinstance(ix, (list, np.ndarray)):
ix = [ix] ix = [ix]
return [self.index[k] for k in ix] if self.optmize:
tmp = []
for k in ix:
tmp.append(self.index[k])
self.count[self.index[k]] = 1
return tmp
else:
return [self.index[k] for k in ix]

View File

@ -31,7 +31,8 @@ class MVFTS(fts.FTS):
def format_data(self, data): def format_data(self, data):
ndata = {} ndata = {}
for var in self.explanatory_variables: for var in self.explanatory_variables:
ndata[var.name] = data[var.data_label] #ndata[var.name] = data[var.data_label]
ndata[var.name] = var.partitioner.extractor(data[var.data_label])
return ndata return ndata
@ -109,9 +110,8 @@ class MVFTS(fts.FTS):
def forecast(self, data, **kwargs): def forecast(self, data, **kwargs):
ret = [] ret = []
ndata = self.apply_transformations(data) ndata = self.apply_transformations(data)
for ix in ndata.index: for index, row in ndata.iterrows():
data_point = ndata.loc[ix] flrs = self.generate_lhs_flrs(row)
flrs = self.generate_lhs_flrs(data_point)
mvs = [] mvs = []
mps = [] mps = []
for flr in flrs: for flr in flrs:
@ -120,7 +120,7 @@ class MVFTS(fts.FTS):
mvs.append(0.) mvs.append(0.)
mps.append(0.) mps.append(0.)
else: else:
mvs.append(self.flrgs[flrg.get_key()].get_membership(self.format_data(data_point), self.explanatory_variables)) mvs.append(self.flrgs[flrg.get_key()].get_membership(self.format_data(row), self.explanatory_variables))
mps.append(self.flrgs[flrg.get_key()].get_midpoint(self.target_variable.partitioner.sets)) mps.append(self.flrgs[flrg.get_key()].get_midpoint(self.target_variable.partitioner.sets))
mv = np.array(mvs) mv = np.array(mvs)

View File

@ -3,6 +3,8 @@ import pandas as pd
from enum import Enum from enum import Enum
from pyFTS.common import FuzzySet, Membership from pyFTS.common import FuzzySet, Membership
from pyFTS.partitioners import partitioner, Grid from pyFTS.partitioners import partitioner, Grid
from datetime import date as dt
class DateTime(Enum): class DateTime(Enum):
@ -94,7 +96,7 @@ class FuzzySet(FuzzySet.FuzzySet):
self.type = kwargs.get('type', 'seasonal') self.type = kwargs.get('type', 'seasonal')
def transform(self, x): def transform(self, x):
if self.type == 'seasonal': if self.type == 'seasonal' and isinstance(x, (dt, pd.Timestamp)):
dp = strip_datepart(x, self.datepart) dp = strip_datepart(x, self.datepart)
else: else:
dp = x dp = x

View File

@ -39,6 +39,8 @@ class TimeGridPartitioner(partitioner.Partitioner):
else: else:
self.ordered_sets = FS.set_ordered(self.sets) self.ordered_sets = FS.set_ordered(self.sets)
self.extractor = lambda x: strip_datepart(x, self.season)
def build(self, data): def build(self, data):
sets = {} sets = {}

View File

@ -30,6 +30,8 @@ class Partitioner(object):
"""In a multivariate context, the variable that contains this partitioner""" """In a multivariate context, the variable that contains this partitioner"""
self.type = kwargs.get('type', 'common') self.type = kwargs.get('type', 'common')
"""The type of fuzzy sets that are generated by this partitioner""" """The type of fuzzy sets that are generated by this partitioner"""
self.extractor = kwargs.get('extractor', lambda x: x)
"""Anonymous function used to extract a single primitive type from an object instance"""
self.ordered_sets = None self.ordered_sets = None
if kwargs.get('preprocess',True): if kwargs.get('preprocess',True):

View File

@ -17,37 +17,92 @@ from pyFTS.models.multivariate import common, variable, mvfts, cmvfts
from pyFTS.models.seasonal import partitioner as seasonal from pyFTS.models.seasonal import partitioner as seasonal
from pyFTS.models.seasonal.common import DateTime from pyFTS.models.seasonal.common import DateTime
from pyFTS.data import Malaysia
dataset = Malaysia.get_dataframe()
dataset["time"] = pd.to_datetime(dataset["time"], format='%m/%d/%y %I:%M %p')
data = dataset['load'].values
train_split = 8760
train_mv = dataset.iloc[:train_split]
test_mv = dataset.iloc[train_split:]
sp = {'seasonality': DateTime.month , #'type': 'common',
'names': ['Jan','Feb','Mar','Apr','May','Jun','Jul', 'Aug','Sep','Oct','Nov','Dec']}
vmonth = variable.Variable("Month", data_label="time", partitioner=seasonal.TimeGridPartitioner, npart=12,
data=train_mv, partitioner_specific=sp)
sp = {'seasonality': DateTime.day_of_week, #'type': 'common',
'names': ['Mon','Tue','Wed','Thu','Fri','Sat','Sun']}
vday = variable.Variable("Weekday", data_label="time", partitioner=seasonal.TimeGridPartitioner, npart=7,
data=train_mv, partitioner_specific=sp)
sp = {'seasonality': DateTime.hour_of_day} #, 'type': 'common'}
vhour = variable.Variable("Hour", data_label="time", partitioner=seasonal.TimeGridPartitioner, npart=24,
data=train_mv, partitioner_specific=sp)
vload = variable.Variable("load", data_label="load", partitioner=Grid.GridPartitioner, npart=10,
data=train_mv)
""" """
model = cmvfts.ClusteredMVFTS(order=2, knn=3, cluster_params={'optmize': True})
model.append_variable(vmonthp)
model.append_variable(vdayp)
model.append_variable(vhourp)
model.append_variable(vload)
model.target_variable = vload
model.fit(train_mv)
print(len(model.cluster.sets.keys()))
model.cluster.prune()
print(len(model.cluster.sets.keys()))
model.predict(test_mv)
"""
'''
from pyFTS.data import Malaysia from pyFTS.data import Malaysia
dataset = Malaysia.get_dataframe() dataset = Malaysia.get_dataframe()
dataset["date"] = pd.to_datetime(dataset["time"], format='%m/%d/%y %I:%M %p') dataset["date"] = pd.to_datetime(dataset["time"], format='%m/%d/%y %I:%M %p')
mv_train = dataset.iloc[:100000] train_mv = dataset.iloc[:10000]
test_mv = dataset.iloc[10000:]
sp = {'seasonality': DateTime.month , 'names': ['Jan','Feb','Mar','Apr','May','Jun','Jul', 'Aug','Sep','Oct','Nov','Dec']} sp = {'seasonality': DateTime.month , 'names': ['Jan','Feb','Mar','Apr','May','Jun','Jul', 'Aug','Sep','Oct','Nov','Dec']}
vmonth = variable.Variable("Month", data_label="date", partitioner=seasonal.TimeGridPartitioner, npart=12, vmonth = variable.Variable("Month", data_label="date", partitioner=seasonal.TimeGridPartitioner, npart=12,
data=mv_train, partitioner_specific=sp) data=train_mv, partitioner_specific=sp)
sp = {'seasonality': DateTime.day_of_week, 'names': ['Mon','Tue','Wed','Thu','Fri','Sat','Sun']} sp = {'seasonality': DateTime.day_of_week, 'names': ['Mon','Tue','Wed','Thu','Fri','Sat','Sun']}
vday = variable.Variable("Weekday", data_label="date", partitioner=seasonal.TimeGridPartitioner, npart=7, vday = variable.Variable("Weekday", data_label="date", partitioner=seasonal.TimeGridPartitioner, npart=7,
data=mv_train, partitioner_specific=sp) data=train_mv, partitioner_specific=sp)
sp = {'seasonality': DateTime.hour_of_day} sp = {'seasonality': DateTime.hour_of_day}
vhour = variable.Variable("Hour", data_label="date", partitioner=seasonal.TimeGridPartitioner, npart=24, vhour = variable.Variable("Hour", data_label="date", partitioner=seasonal.TimeGridPartitioner, npart=24,
data=mv_train, partitioner_specific=sp) data=train_mv, partitioner_specific=sp)
vload = variable.Variable("load", data_label="load", partitioner=Grid.GridPartitioner, npart=10, vload = variable.Variable("load", data_label="load", partitioner=Grid.GridPartitioner, npart=10,
data=mv_train) data=train_mv)
vtemperature = variable.Variable("temperature", data_label="temperature", partitioner=Grid.GridPartitioner, npart=10, vtemperature = variable.Variable("temperature", data_label="temperature", partitioner=Grid.GridPartitioner, npart=10,
data=mv_train) data=train_mv)
"""
variables = { variables = {
'month': vmonth, 'month': vmonth,
'day': vday, 'day': vday,
@ -74,7 +129,7 @@ for k in [itertools.combinations(var_list, r) for r in range(2,len(var_list))]:
models.append(model) models.append(model)
""" """
#""" """
dataset = pd.read_csv('/home/petronio/Downloads/priceHong') dataset = pd.read_csv('/home/petronio/Downloads/priceHong')
dataset['hour'] = dataset.index.values % 24 dataset['hour'] = dataset.index.values % 24
@ -107,27 +162,46 @@ data = [[1, 1.0], [2, 2.0]]
df = pd.DataFrame(data, columns=['hour','price']) df = pd.DataFrame(data, columns=['hour','price'])
forecasts = model.predict(df, steps_ahead=24, generators={'Hour': lambda x : (x+1)%24 }) forecasts = model.predict(df, steps_ahead=24, generators={'Hour': lambda x : (x+1)%24 })
"""
''' '''
params = [ params = [
{}, {},
{}, {},
{'order': 2, 'knn': 1}, {'order': 2, 'knn': 3, 'cluster_params': {'optmize': True}},
{'order': 2, 'knn': 2}, {'order': 2, 'knn': 2, 'cluster_params': {'optmize': True}},
{'order': 2, 'knn': 3} {'order': 2, 'knn': 1, 'cluster_params': {'optmize': True}}
] ]
from pyFTS.models.multivariate import grid
cluster = None
for ct, method in enumerate([mvfts.MVFTS, wmvfts.WeightedMVFTS, cmvfts.ClusteredMVFTS, cmvfts.ClusteredMVFTS, cmvfts.ClusteredMVFTS]): for ct, method in enumerate([mvfts.MVFTS, wmvfts.WeightedMVFTS, cmvfts.ClusteredMVFTS, cmvfts.ClusteredMVFTS, cmvfts.ClusteredMVFTS]):
model = method(**params[ct]) model = method(**params[ct])
model.append_variable(vmonth)
model.append_variable(vday)
model.append_variable(vhour) model.append_variable(vhour)
model.append_variable(vprice) model.append_variable(vload)
model.target_variable = vprice model.target_variable = vload
model.fit(train_mv) model.fit(train_mv)
print(model.shortname, params[ct], Measures.get_point_statistics(test_mv, model))
if method == cmvfts.ClusteredMVFTS:
model.cluster.prune()
try:
print(model.shortname, params[ct], Measures.get_point_statistics(test_mv, model))
except Exception as ex:
print(model.shortname, params[ct])
print(ex)
print("\n\n==============================================\n\n")
#print(model1) #print(model1)
#print(model1.predict(test_mv, steps_ahead=24, generators={'Hour': lambda x : (x+1)%24 })) #print(model1.predict(test_mv, steps_ahead=24, generators={'Hour': lambda x : (x+1)%24 }))
''' #'''