From 8c1fec482da1e99a867464e45719d786308c33d0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Petr=C3=B4nio=20C=C3=A2ndido?= Date: Mon, 19 Nov 2018 08:30:06 -0200 Subject: [PATCH] Clustered MVFTS improvements --- pyFTS/models/multivariate/cmvfts.py | 15 ++-- pyFTS/models/multivariate/grid.py | 59 +++++++++++++-- pyFTS/models/multivariate/mvfts.py | 10 +-- pyFTS/models/seasonal/common.py | 4 +- pyFTS/models/seasonal/partitioner.py | 2 + pyFTS/partitioners/partitioner.py | 2 + pyFTS/tests/multivariate.py | 108 ++++++++++++++++++++++----- 7 files changed, 165 insertions(+), 35 deletions(-) diff --git a/pyFTS/models/multivariate/cmvfts.py b/pyFTS/models/multivariate/cmvfts.py index 8d94d51..b5b621f 100644 --- a/pyFTS/models/multivariate/cmvfts.py +++ b/pyFTS/models/multivariate/cmvfts.py @@ -16,8 +16,8 @@ class ClusteredMVFTS(mvfts.MVFTS): """The cluster method to be called when a new model is build""" self.cluster_params = kwargs.get('cluster_params', {}) """The cluster method parameters""" - self.cluster = None - """The most recent trained clusterer""" + self.cluster = kwargs.get('cluster', None) + """The trained clusterer""" self.fts_method = kwargs.get('fts_method', hofts.WeightedHighOrderFTS) """The FTS method to be called when a new model is build""" @@ -38,17 +38,16 @@ class ClusteredMVFTS(mvfts.MVFTS): def fuzzyfy(self,data): ndata = [] - for ct in range(1, len(data.index)+1): - ix = data.index[ct - 1] - data_point = self.format_data(data.loc[ix]) + for index, row in data.iterrows(): + data_point = self.format_data(row) ndata.append(common.fuzzyfy_instance_clustered(data_point, self.cluster, self.alpha_cut)) return ndata - def train(self, data, **kwargs): - self.cluster = self.cluster_method(data=data, mvfts=self, neighbors=self.knn) + if self.cluster is None: + self.cluster = self.cluster_method(data=data, mvfts=self, neighbors=self.knn, **self.cluster_params) self.model = self.fts_method(partitioner=self.cluster, **self.fts_params) if self.model.is_high_order: @@ -59,6 +58,8 @@ class ClusteredMVFTS(mvfts.MVFTS): self.model.train(ndata, fuzzyfied=True) + self.cluster.prune() + def forecast(self, ndata, **kwargs): ndata = self.fuzzyfy(ndata) diff --git a/pyFTS/models/multivariate/grid.py b/pyFTS/models/multivariate/grid.py index 83ee522..2fcc6e8 100644 --- a/pyFTS/models/multivariate/grid.py +++ b/pyFTS/models/multivariate/grid.py @@ -5,6 +5,7 @@ from scipy.spatial import KDTree import numpy as np import pandas as pd + class GridCluster(partitioner.Partitioner): """ A cartesian product of all fuzzy sets of all variables @@ -17,8 +18,12 @@ class GridCluster(partitioner.Partitioner): self.sets = {} self.kdtree = None self.index = {} - self.build(None) self.neighbors = kwargs.get('neighbors', 2) + self.optmize = kwargs.get('optmize', False) + if self.optmize: + self.count = {} + data = kwargs.get('data', [None]) + self.build(data) def build(self, data): @@ -26,7 +31,6 @@ class GridCluster(partitioner.Partitioner): for k in self.mvfts.explanatory_variables] midpoints = [] - index = {} c = 0 for k in product(*fsets): @@ -44,14 +48,59 @@ class GridCluster(partitioner.Partitioner): self.index[c] = _key c += 1 + import sys + sys.setrecursionlimit(100000) + self.kdtree = KDTree(midpoints) + sys.setrecursionlimit(1000) + + def prune(self): + + if not self.optmize: + return + + for fset in [fs for fs in self.sets.keys()]: + if fset not in self.count: + fs = self.sets.pop(fset) + del (fs) + + + vars = [k.name for k in self.mvfts.explanatory_variables] + + midpoints = [] + + self.index = {} + + for ct, fset in enumerate(self.sets.values()): + mp = [] + for vr in vars: + mp.append(fset.sets[vr].centroid) + midpoints.append(mp) + self.index[ct] = fset.name + + import sys + sys.setrecursionlimit(100000) + + self.kdtree = KDTree(midpoints) + + sys.setrecursionlimit(1000) + + def knn(self, data): - tmp = [data[k.name] for k in self.mvfts.explanatory_variables] - tmp, ix = self.kdtree.query(tmp, self.neighbors ) + tmp = [data[k.name] + for k in self.mvfts.explanatory_variables] + tmp, ix = self.kdtree.query(tmp, self.neighbors) if not isinstance(ix, (list, np.ndarray)): ix = [ix] - return [self.index[k] for k in ix] + if self.optmize: + tmp = [] + for k in ix: + tmp.append(self.index[k]) + self.count[self.index[k]] = 1 + return tmp + else: + return [self.index[k] for k in ix] diff --git a/pyFTS/models/multivariate/mvfts.py b/pyFTS/models/multivariate/mvfts.py index cf3ac12..40e1180 100644 --- a/pyFTS/models/multivariate/mvfts.py +++ b/pyFTS/models/multivariate/mvfts.py @@ -31,7 +31,8 @@ class MVFTS(fts.FTS): def format_data(self, data): ndata = {} for var in self.explanatory_variables: - ndata[var.name] = data[var.data_label] + #ndata[var.name] = data[var.data_label] + ndata[var.name] = var.partitioner.extractor(data[var.data_label]) return ndata @@ -109,9 +110,8 @@ class MVFTS(fts.FTS): def forecast(self, data, **kwargs): ret = [] ndata = self.apply_transformations(data) - for ix in ndata.index: - data_point = ndata.loc[ix] - flrs = self.generate_lhs_flrs(data_point) + for index, row in ndata.iterrows(): + flrs = self.generate_lhs_flrs(row) mvs = [] mps = [] for flr in flrs: @@ -120,7 +120,7 @@ class MVFTS(fts.FTS): mvs.append(0.) mps.append(0.) else: - mvs.append(self.flrgs[flrg.get_key()].get_membership(self.format_data(data_point), self.explanatory_variables)) + mvs.append(self.flrgs[flrg.get_key()].get_membership(self.format_data(row), self.explanatory_variables)) mps.append(self.flrgs[flrg.get_key()].get_midpoint(self.target_variable.partitioner.sets)) mv = np.array(mvs) diff --git a/pyFTS/models/seasonal/common.py b/pyFTS/models/seasonal/common.py index fd5d3db..3ccf7f4 100644 --- a/pyFTS/models/seasonal/common.py +++ b/pyFTS/models/seasonal/common.py @@ -3,6 +3,8 @@ import pandas as pd from enum import Enum from pyFTS.common import FuzzySet, Membership from pyFTS.partitioners import partitioner, Grid +from datetime import date as dt + class DateTime(Enum): @@ -94,7 +96,7 @@ class FuzzySet(FuzzySet.FuzzySet): self.type = kwargs.get('type', 'seasonal') def transform(self, x): - if self.type == 'seasonal': + if self.type == 'seasonal' and isinstance(x, (dt, pd.Timestamp)): dp = strip_datepart(x, self.datepart) else: dp = x diff --git a/pyFTS/models/seasonal/partitioner.py b/pyFTS/models/seasonal/partitioner.py index 2f3a6e8..a20d2f2 100644 --- a/pyFTS/models/seasonal/partitioner.py +++ b/pyFTS/models/seasonal/partitioner.py @@ -39,6 +39,8 @@ class TimeGridPartitioner(partitioner.Partitioner): else: self.ordered_sets = FS.set_ordered(self.sets) + self.extractor = lambda x: strip_datepart(x, self.season) + def build(self, data): sets = {} diff --git a/pyFTS/partitioners/partitioner.py b/pyFTS/partitioners/partitioner.py index 566cb65..820f6a8 100644 --- a/pyFTS/partitioners/partitioner.py +++ b/pyFTS/partitioners/partitioner.py @@ -30,6 +30,8 @@ class Partitioner(object): """In a multivariate context, the variable that contains this partitioner""" self.type = kwargs.get('type', 'common') """The type of fuzzy sets that are generated by this partitioner""" + self.extractor = kwargs.get('extractor', lambda x: x) + """Anonymous function used to extract a single primitive type from an object instance""" self.ordered_sets = None if kwargs.get('preprocess',True): diff --git a/pyFTS/tests/multivariate.py b/pyFTS/tests/multivariate.py index 3f0898b..5123ff8 100644 --- a/pyFTS/tests/multivariate.py +++ b/pyFTS/tests/multivariate.py @@ -17,37 +17,92 @@ from pyFTS.models.multivariate import common, variable, mvfts, cmvfts from pyFTS.models.seasonal import partitioner as seasonal from pyFTS.models.seasonal.common import DateTime + +from pyFTS.data import Malaysia + +dataset = Malaysia.get_dataframe() + +dataset["time"] = pd.to_datetime(dataset["time"], format='%m/%d/%y %I:%M %p') + + +data = dataset['load'].values + +train_split = 8760 + + +train_mv = dataset.iloc[:train_split] +test_mv = dataset.iloc[train_split:] + +sp = {'seasonality': DateTime.month , #'type': 'common', + 'names': ['Jan','Feb','Mar','Apr','May','Jun','Jul', 'Aug','Sep','Oct','Nov','Dec']} + +vmonth = variable.Variable("Month", data_label="time", partitioner=seasonal.TimeGridPartitioner, npart=12, + data=train_mv, partitioner_specific=sp) + +sp = {'seasonality': DateTime.day_of_week, #'type': 'common', + 'names': ['Mon','Tue','Wed','Thu','Fri','Sat','Sun']} + +vday = variable.Variable("Weekday", data_label="time", partitioner=seasonal.TimeGridPartitioner, npart=7, + data=train_mv, partitioner_specific=sp) + +sp = {'seasonality': DateTime.hour_of_day} #, 'type': 'common'} + +vhour = variable.Variable("Hour", data_label="time", partitioner=seasonal.TimeGridPartitioner, npart=24, + data=train_mv, partitioner_specific=sp) + +vload = variable.Variable("load", data_label="load", partitioner=Grid.GridPartitioner, npart=10, + data=train_mv) + """ +model = cmvfts.ClusteredMVFTS(order=2, knn=3, cluster_params={'optmize': True}) +model.append_variable(vmonthp) +model.append_variable(vdayp) +model.append_variable(vhourp) +model.append_variable(vload) +model.target_variable = vload +model.fit(train_mv) + +print(len(model.cluster.sets.keys())) + +model.cluster.prune() + +print(len(model.cluster.sets.keys())) + +model.predict(test_mv) +""" + +''' from pyFTS.data import Malaysia dataset = Malaysia.get_dataframe() dataset["date"] = pd.to_datetime(dataset["time"], format='%m/%d/%y %I:%M %p') -mv_train = dataset.iloc[:100000] +train_mv = dataset.iloc[:10000] +test_mv = dataset.iloc[10000:] sp = {'seasonality': DateTime.month , 'names': ['Jan','Feb','Mar','Apr','May','Jun','Jul', 'Aug','Sep','Oct','Nov','Dec']} vmonth = variable.Variable("Month", data_label="date", partitioner=seasonal.TimeGridPartitioner, npart=12, - data=mv_train, partitioner_specific=sp) + data=train_mv, partitioner_specific=sp) sp = {'seasonality': DateTime.day_of_week, 'names': ['Mon','Tue','Wed','Thu','Fri','Sat','Sun']} vday = variable.Variable("Weekday", data_label="date", partitioner=seasonal.TimeGridPartitioner, npart=7, - data=mv_train, partitioner_specific=sp) + data=train_mv, partitioner_specific=sp) sp = {'seasonality': DateTime.hour_of_day} vhour = variable.Variable("Hour", data_label="date", partitioner=seasonal.TimeGridPartitioner, npart=24, - data=mv_train, partitioner_specific=sp) + data=train_mv, partitioner_specific=sp) vload = variable.Variable("load", data_label="load", partitioner=Grid.GridPartitioner, npart=10, - data=mv_train) + data=train_mv) vtemperature = variable.Variable("temperature", data_label="temperature", partitioner=Grid.GridPartitioner, npart=10, - data=mv_train) - + data=train_mv) +""" variables = { 'month': vmonth, 'day': vday, @@ -74,7 +129,7 @@ for k in [itertools.combinations(var_list, r) for r in range(2,len(var_list))]: models.append(model) """ -#""" +""" dataset = pd.read_csv('/home/petronio/Downloads/priceHong') dataset['hour'] = dataset.index.values % 24 @@ -107,27 +162,46 @@ data = [[1, 1.0], [2, 2.0]] df = pd.DataFrame(data, columns=['hour','price']) forecasts = model.predict(df, steps_ahead=24, generators={'Hour': lambda x : (x+1)%24 }) - - +""" ''' + params = [ {}, {}, - {'order': 2, 'knn': 1}, - {'order': 2, 'knn': 2}, - {'order': 2, 'knn': 3} + {'order': 2, 'knn': 3, 'cluster_params': {'optmize': True}}, + {'order': 2, 'knn': 2, 'cluster_params': {'optmize': True}}, + {'order': 2, 'knn': 1, 'cluster_params': {'optmize': True}} ] +from pyFTS.models.multivariate import grid + +cluster = None + + for ct, method in enumerate([mvfts.MVFTS, wmvfts.WeightedMVFTS, cmvfts.ClusteredMVFTS, cmvfts.ClusteredMVFTS, cmvfts.ClusteredMVFTS]): + model = method(**params[ct]) + model.append_variable(vmonth) + model.append_variable(vday) model.append_variable(vhour) - model.append_variable(vprice) - model.target_variable = vprice + model.append_variable(vload) + model.target_variable = vload model.fit(train_mv) - print(model.shortname, params[ct], Measures.get_point_statistics(test_mv, model)) + + if method == cmvfts.ClusteredMVFTS: + model.cluster.prune() + + try: + + print(model.shortname, params[ct], Measures.get_point_statistics(test_mv, model)) + + except Exception as ex: + print(model.shortname, params[ct]) + print(ex) + print("\n\n==============================================\n\n") #print(model1) #print(model1.predict(test_mv, steps_ahead=24, generators={'Hour': lambda x : (x+1)%24 })) -''' \ No newline at end of file +#''' \ No newline at end of file