Clustered MVFTS improvements

Petrônio Cândido 2018-11-19 08:30:06 -02:00
parent a8c05563d4
commit 8c1fec482d
7 changed files with 165 additions and 35 deletions

View File

@@ -16,8 +16,8 @@ class ClusteredMVFTS(mvfts.MVFTS):
"""The cluster method to be called when a new model is build"""
self.cluster_params = kwargs.get('cluster_params', {})
"""The cluster method parameters"""
self.cluster = None
"""The most recent trained clusterer"""
self.cluster = kwargs.get('cluster', None)
"""The trained clusterer"""
self.fts_method = kwargs.get('fts_method', hofts.WeightedHighOrderFTS)
"""The FTS method to be called when a new model is build"""
@@ -38,17 +38,16 @@ class ClusteredMVFTS(mvfts.MVFTS):
def fuzzyfy(self,data):
ndata = []
for ct in range(1, len(data.index)+1):
ix = data.index[ct - 1]
data_point = self.format_data(data.loc[ix])
for index, row in data.iterrows():
data_point = self.format_data(row)
ndata.append(common.fuzzyfy_instance_clustered(data_point, self.cluster, self.alpha_cut))
return ndata
def train(self, data, **kwargs):
self.cluster = self.cluster_method(data=data, mvfts=self, neighbors=self.knn)
if self.cluster is None:
self.cluster = self.cluster_method(data=data, mvfts=self, neighbors=self.knn, **self.cluster_params)
self.model = self.fts_method(partitioner=self.cluster, **self.fts_params)
if self.model.is_high_order:
@@ -59,6 +58,8 @@ class ClusteredMVFTS(mvfts.MVFTS):
self.model.train(ndata, fuzzyfied=True)
self.cluster.prune()
def forecast(self, ndata, **kwargs):
ndata = self.fuzzyfy(ndata)
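
A minimal usage sketch of the new constructor options (not part of the commit): cluster_params is forwarded to the cluster method, train() only builds a clusterer when none was supplied through the new cluster kwarg, and the clusterer is pruned after training. The names vmonth, vhour, vload and train_mv are assumed to be defined as in the test script changed at the end of this commit.

from pyFTS.models.multivariate import cmvfts

model = cmvfts.ClusteredMVFTS(order=2, knn=2,
                              cluster_params={'optmize': True})  # forwarded to the clusterer via **self.cluster_params
model.append_variable(vmonth)
model.append_variable(vhour)
model.append_variable(vload)
model.target_variable = vload
model.fit(train_mv)   # builds the clusterer only because model.cluster is None, then prunes it

# a previously trained clusterer can now be injected instead of being rebuilt
reused = cmvfts.ClusteredMVFTS(order=2, knn=2, cluster=model.cluster)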

View File

@@ -5,6 +5,7 @@ from scipy.spatial import KDTree
import numpy as np
import pandas as pd
class GridCluster(partitioner.Partitioner):
"""
A Cartesian product of all fuzzy sets of all variables
@@ -17,8 +18,12 @@ class GridCluster(partitioner.Partitioner):
self.sets = {}
self.kdtree = None
self.index = {}
self.build(None)
self.neighbors = kwargs.get('neighbors', 2)
self.optmize = kwargs.get('optmize', False)
if self.optmize:
self.count = {}
data = kwargs.get('data', [None])
self.build(data)
def build(self, data):
@@ -26,7 +31,6 @@ class GridCluster(partitioner.Partitioner):
for k in self.mvfts.explanatory_variables]
midpoints = []
index = {}
c = 0
for k in product(*fsets):
@@ -44,14 +48,59 @@ class GridCluster(partitioner.Partitioner):
self.index[c] = _key
c += 1
import sys
sys.setrecursionlimit(100000)
self.kdtree = KDTree(midpoints)
sys.setrecursionlimit(1000)
def prune(self):
if not self.optmize:
return
for fset in [fs for fs in self.sets.keys()]:
if fset not in self.count:
fs = self.sets.pop(fset)
del (fs)
vars = [k.name for k in self.mvfts.explanatory_variables]
midpoints = []
self.index = {}
for ct, fset in enumerate(self.sets.values()):
mp = []
for vr in vars:
mp.append(fset.sets[vr].centroid)
midpoints.append(mp)
self.index[ct] = fset.name
import sys
sys.setrecursionlimit(100000)
self.kdtree = KDTree(midpoints)
sys.setrecursionlimit(1000)
def knn(self, data):
tmp = [data[k.name] for k in self.mvfts.explanatory_variables]
tmp, ix = self.kdtree.query(tmp, self.neighbors )
tmp = [data[k.name]
for k in self.mvfts.explanatory_variables]
tmp, ix = self.kdtree.query(tmp, self.neighbors)
if not isinstance(ix, (list, np.ndarray)):
ix = [ix]
if self.optmize:
tmp = []
for k in ix:
tmp.append(self.index[k])
self.count[self.index[k]] = 1
return tmp
else:
return [self.index[k] for k in ix]
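
A short sketch (not part of the commit) of what the usage-based pruning buys: build() still creates one multivariate set per element of the Cartesian product of all variables' fuzzy sets, but with cluster_params={'optmize': True} the knn() method records in self.count every set it actually returns, and prune() (now called by ClusteredMVFTS.train()) discards the never-visited sets and rebuilds the KDTree over the survivors. Assuming model is the fitted ClusteredMVFTS from the sketch above:

import numpy as np

# size of the full Cartesian product that GridCluster.build() starts from (e.g. 12 * 24 * 10 sets)
full = np.prod([len(v.partitioner.sets) for v in model.explanatory_variables])

# train() already called cluster.prune(), so only the sets that knn() actually
# returned (and therefore recorded in cluster.count) remain in the partitioner
print(full, len(model.cluster.sets))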

View File

@@ -31,7 +31,8 @@ class MVFTS(fts.FTS):
def format_data(self, data):
ndata = {}
for var in self.explanatory_variables:
ndata[var.name] = data[var.data_label]
#ndata[var.name] = data[var.data_label]
ndata[var.name] = var.partitioner.extractor(data[var.data_label])
return ndata
@@ -109,9 +110,8 @@ class MVFTS(fts.FTS):
def forecast(self, data, **kwargs):
ret = []
ndata = self.apply_transformations(data)
for ix in ndata.index:
data_point = ndata.loc[ix]
flrs = self.generate_lhs_flrs(data_point)
for index, row in ndata.iterrows():
flrs = self.generate_lhs_flrs(row)
mvs = []
mps = []
for flr in flrs:
@@ -120,7 +120,7 @@ class MVFTS(fts.FTS):
mvs.append(0.)
mps.append(0.)
else:
mvs.append(self.flrgs[flrg.get_key()].get_membership(self.format_data(data_point), self.explanatory_variables))
mvs.append(self.flrgs[flrg.get_key()].get_membership(self.format_data(row), self.explanatory_variables))
mps.append(self.flrgs[flrg.get_key()].get_midpoint(self.target_variable.partitioner.sets))
mv = np.array(mvs)

View File

@@ -3,6 +3,8 @@ import pandas as pd
from enum import Enum
from pyFTS.common import FuzzySet, Membership
from pyFTS.partitioners import partitioner, Grid
from datetime import date as dt
class DateTime(Enum):
@@ -94,7 +96,7 @@ class FuzzySet(FuzzySet.FuzzySet):
self.type = kwargs.get('type', 'seasonal')
def transform(self, x):
if self.type == 'seasonal':
if self.type == 'seasonal' and isinstance(x, (dt, pd.Timestamp)):
dp = strip_datepart(x, self.datepart)
else:
dp = x
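
A small sketch (not part of the commit) of why the isinstance guard in the seasonal FuzzySet.transform() matters: when the value has already been reduced to a numeric date part, for instance by the new partitioner extractor below, it must be used as-is instead of being stripped again. Here fs is a hypothetical seasonal fuzzy set built over DateTime.hour_of_day.

import pandas as pd

fs.transform(pd.Timestamp('2018-11-19 08:30'))  # a timestamp: strip_datepart() pulls the hour out first
fs.transform(8)                                  # already a numeric date part: dp = x, used as-is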

View File

@@ -39,6 +39,8 @@ class TimeGridPartitioner(partitioner.Partitioner):
else:
self.ordered_sets = FS.set_ordered(self.sets)
self.extractor = lambda x: strip_datepart(x, self.season)
def build(self, data):
sets = {}

View File

@@ -30,6 +30,8 @@ class Partitioner(object):
"""In a multivariate context, the variable that contains this partitioner"""
self.type = kwargs.get('type', 'common')
"""The type of fuzzy sets that are generated by this partitioner"""
self.extractor = kwargs.get('extractor', lambda x: x)
"""Anonymous function used to extract a single primitive type from an object instance"""
self.ordered_sets = None
if kwargs.get('preprocess',True):
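
A sketch (not part of the commit) of how the new extractor hook is wired across this change set: the base Partitioner defaults it to the identity, TimeGridPartitioner overrides it with strip_datepart, and MVFTS.format_data() now routes every raw value through its variable's partitioner. The vhour and vload variables are assumed to be defined as in the test script at the end of this commit.

import pandas as pd

ts = pd.Timestamp('2018-11-19 08:30')
vhour.partitioner.extractor(ts)     # strip_datepart(ts, DateTime.hour_of_day): the hour component of ts
vload.partitioner.extractor(42.0)   # identity lambda of the base Partitioner: returns 42.0 unchanged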

View File

@@ -17,37 +17,92 @@ from pyFTS.models.multivariate import common, variable, mvfts, cmvfts
from pyFTS.models.seasonal import partitioner as seasonal
from pyFTS.models.seasonal.common import DateTime
from pyFTS.data import Malaysia
dataset = Malaysia.get_dataframe()
dataset["time"] = pd.to_datetime(dataset["time"], format='%m/%d/%y %I:%M %p')
data = dataset['load'].values
train_split = 8760
train_mv = dataset.iloc[:train_split]
test_mv = dataset.iloc[train_split:]
sp = {'seasonality': DateTime.month , #'type': 'common',
'names': ['Jan','Feb','Mar','Apr','May','Jun','Jul', 'Aug','Sep','Oct','Nov','Dec']}
vmonth = variable.Variable("Month", data_label="time", partitioner=seasonal.TimeGridPartitioner, npart=12,
data=train_mv, partitioner_specific=sp)
sp = {'seasonality': DateTime.day_of_week, #'type': 'common',
'names': ['Mon','Tue','Wed','Thu','Fri','Sat','Sun']}
vday = variable.Variable("Weekday", data_label="time", partitioner=seasonal.TimeGridPartitioner, npart=7,
data=train_mv, partitioner_specific=sp)
sp = {'seasonality': DateTime.hour_of_day} #, 'type': 'common'}
vhour = variable.Variable("Hour", data_label="time", partitioner=seasonal.TimeGridPartitioner, npart=24,
data=train_mv, partitioner_specific=sp)
vload = variable.Variable("load", data_label="load", partitioner=Grid.GridPartitioner, npart=10,
data=train_mv)
"""
model = cmvfts.ClusteredMVFTS(order=2, knn=3, cluster_params={'optmize': True})
model.append_variable(vmonth)
model.append_variable(vday)
model.append_variable(vhour)
model.append_variable(vload)
model.target_variable = vload
model.fit(train_mv)
print(len(model.cluster.sets.keys()))
model.cluster.prune()
print(len(model.cluster.sets.keys()))
model.predict(test_mv)
"""
'''
from pyFTS.data import Malaysia
dataset = Malaysia.get_dataframe()
dataset["date"] = pd.to_datetime(dataset["time"], format='%m/%d/%y %I:%M %p')
mv_train = dataset.iloc[:100000]
train_mv = dataset.iloc[:10000]
test_mv = dataset.iloc[10000:]
sp = {'seasonality': DateTime.month , 'names': ['Jan','Feb','Mar','Apr','May','Jun','Jul', 'Aug','Sep','Oct','Nov','Dec']}
vmonth = variable.Variable("Month", data_label="date", partitioner=seasonal.TimeGridPartitioner, npart=12,
data=mv_train, partitioner_specific=sp)
data=train_mv, partitioner_specific=sp)
sp = {'seasonality': DateTime.day_of_week, 'names': ['Mon','Tue','Wed','Thu','Fri','Sat','Sun']}
vday = variable.Variable("Weekday", data_label="date", partitioner=seasonal.TimeGridPartitioner, npart=7,
data=mv_train, partitioner_specific=sp)
data=train_mv, partitioner_specific=sp)
sp = {'seasonality': DateTime.hour_of_day}
vhour = variable.Variable("Hour", data_label="date", partitioner=seasonal.TimeGridPartitioner, npart=24,
data=mv_train, partitioner_specific=sp)
data=train_mv, partitioner_specific=sp)
vload = variable.Variable("load", data_label="load", partitioner=Grid.GridPartitioner, npart=10,
data=mv_train)
data=train_mv)
vtemperature = variable.Variable("temperature", data_label="temperature", partitioner=Grid.GridPartitioner, npart=10,
data=mv_train)
data=train_mv)
"""
variables = {
'month': vmonth,
'day': vday,
@@ -74,7 +129,7 @@ for k in [itertools.combinations(var_list, r) for r in range(2,len(var_list))]:
models.append(model)
"""
#"""
"""
dataset = pd.read_csv('/home/petronio/Downloads/priceHong')
dataset['hour'] = dataset.index.values % 24
@@ -107,27 +162,46 @@ data = [[1, 1.0], [2, 2.0]]
df = pd.DataFrame(data, columns=['hour','price'])
forecasts = model.predict(df, steps_ahead=24, generators={'Hour': lambda x : (x+1)%24 })
"""
'''
params = [
{},
{},
{'order': 2, 'knn': 1},
{'order': 2, 'knn': 2},
{'order': 2, 'knn': 3}
{'order': 2, 'knn': 3, 'cluster_params': {'optmize': True}},
{'order': 2, 'knn': 2, 'cluster_params': {'optmize': True}},
{'order': 2, 'knn': 1, 'cluster_params': {'optmize': True}}
]
from pyFTS.models.multivariate import grid
cluster = None
for ct, method in enumerate([mvfts.MVFTS, wmvfts.WeightedMVFTS, cmvfts.ClusteredMVFTS, cmvfts.ClusteredMVFTS, cmvfts.ClusteredMVFTS]):
model = method(**params[ct])
model.append_variable(vmonth)
model.append_variable(vday)
model.append_variable(vhour)
model.append_variable(vprice)
model.target_variable = vprice
model.append_variable(vload)
model.target_variable = vload
model.fit(train_mv)
if method == cmvfts.ClusteredMVFTS:
model.cluster.prune()
try:
print(model.shortname, params[ct], Measures.get_point_statistics(test_mv, model))
except Exception as ex:
print(model.shortname, params[ct])
print(ex)
print("\n\n==============================================\n\n")
#print(model1)
#print(model1.predict(test_mv, steps_ahead=24, generators={'Hour': lambda x : (x+1)%24 }))
'''
#'''