From 1fce1145cc619fd0efe34ae8e246b3e886ff9275 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Petr=C3=B4nio=20C=C3=A2ndido?=
Date: Mon, 28 Jan 2019 15:38:40 -0200
Subject: [PATCH] Bugfixes and improvements in cmvfts and benchmarks.Measures

---
 pyFTS/benchmarks/Measures.py        | 22 +++++++--------
 pyFTS/common/fts.py                 |  2 +-
 pyFTS/models/hofts.py               | 27 ++++++++++++------
 pyFTS/models/multivariate/cmvfts.py |  5 ++--
 pyFTS/tests/multivariate.py         | 43 ++++++++++++++++-------------
 5 files changed, 58 insertions(+), 41 deletions(-)

diff --git a/pyFTS/benchmarks/Measures.py b/pyFTS/benchmarks/Measures.py
index 94e55bb..0db7a47 100644
--- a/pyFTS/benchmarks/Measures.py
+++ b/pyFTS/benchmarks/Measures.py
@@ -19,7 +19,7 @@ def acf(data, k):
     :param k:
     :return:
     """
-    mu = np.mean(data)
+    mu = np.nanmean(data)
     sigma = np.var(data)
     n = len(data)
     s = 0
@@ -68,7 +68,7 @@ def mape(targets, forecasts):
         targets = np.array(targets)
     if isinstance(forecasts, list):
         forecasts = np.array(forecasts)
-    return np.mean(np.abs(np.divide((targets - forecasts), targets))) * 100
+    return np.nanmean(np.abs(np.divide(np.subtract(targets, forecasts), targets))) * 100


 def smape(targets, forecasts, type=2):
@@ -85,11 +85,11 @@ def smape(targets, forecasts, type=2):
     if isinstance(forecasts, list):
         forecasts = np.array(forecasts)
     if type == 1:
-        return np.mean(np.abs(forecasts - targets) / ((forecasts + targets) / 2))
+        return np.nanmean(np.abs(forecasts - targets) / ((forecasts + targets) / 2))
     elif type == 2:
-        return np.mean(np.abs(forecasts - targets) / (abs(forecasts) + abs(targets))) * 100
+        return np.nanmean(np.abs(forecasts - targets) / (np.abs(forecasts) + abs(targets))) * 100
     else:
-        return sum(np.abs(forecasts - targets)) / sum(forecasts + targets)
+        return np.sum(np.abs(forecasts - targets)) / np.sum(forecasts + targets)


 def mape_interval(targets, forecasts):
@@ -114,9 +114,9 @@ def UStatistic(targets, forecasts):
     naive = []
     y = []
     for k in np.arange(0, l - 1):
-        y.append((forecasts[k] - targets[k]) ** 2)
-        naive.append((targets[k + 1] - targets[k]) ** 2)
-    return np.sqrt(sum(y) / sum(naive))
+        y.append(np.subtract(forecasts[k], targets[k]) ** 2)
+        naive.append(np.subtract(targets[k + 1], targets[k]) ** 2)
+    return np.sqrt(np.divide(np.sum(y), np.sum(naive)))


 def TheilsInequality(targets, forecasts):
@@ -188,7 +188,7 @@ def coverage(targets, forecasts):
             preds.append(1)
         else:
             preds.append(0)
-    return np.mean(preds)
+    return np.nanmean(preds)


 def pinball(tau, target, forecast):
@@ -201,9 +201,9 @@
     :return: float, distance of forecast to the tau-quantile of the target
     """
     if target >= forecast:
-        return (target - forecast) * tau
+        return np.subtract(target, forecast) * tau
     else:
-        return (forecast - target) * (1 - tau)
+        return np.subtract(forecast, target) * (1 - tau)


 def pinball_mean(tau, targets, forecasts):
diff --git a/pyFTS/common/fts.py b/pyFTS/common/fts.py
index 4cc4852..77b9c96 100644
--- a/pyFTS/common/fts.py
+++ b/pyFTS/common/fts.py
@@ -526,7 +526,7 @@ class FTS(object):
             for r in sorted(self.flrgs, key=lambda key: self.flrgs[key].get_midpoint(self.partitioner.sets)):
                 tmp = "{0}{1}\n".format(tmp, str(self.flrgs[r]))
         else:
-            for r in self.model.flrgs:
+            for r in self.flrgs:
                 tmp = "{0}{1}\n".format(tmp, str(self.flrgs[r]))
         return tmp
diff --git a/pyFTS/models/hofts.py b/pyFTS/models/hofts.py
index 06df6bf..defdf09 100644
--- a/pyFTS/models/hofts.py
+++ b/pyFTS/models/hofts.py
@@ -9,6 +9,7 @@ import numpy as np
 from pyFTS.common import FuzzySet, FLR, fts, flrg
 from itertools import product

+
 class HighOrderFLRG(flrg.FLRG):
     """Conventional High Order Fuzzy Logical Relationship Group"""
     def __init__(self, order, **kwargs):
@@ -184,6 +185,8 @@ class HighOrderFTS(fts.FTS):

         explain = kwargs.get('explain', False)

+        fuzzyfied = kwargs.get('fuzzyfied', False)
+
         ret = []

         l = len(ndata) if not explain else self.max_lag + 1
@@ -191,26 +194,31 @@
         if l < self.max_lag:
             return ndata

-        for k in np.arange(self.max_lag, l+1):
+        for k in np.arange(self.max_lag, l):
+
+            sample = ndata[k - self.max_lag: k]

             if explain:
                 print("Fuzzyfication \n")

-            if not kwargs.get('fuzzyfied', False):
-                flrgs = self.generate_lhs_flrg(ndata[k - self.max_lag: k], explain)
+            if not fuzzyfied:
+                flrgs = self.generate_lhs_flrg(sample, explain)
             else:
-                flrgs = self.generate_lhs_flrg_fuzzyfied(ndata[k - self.max_lag: k], explain)
+                flrgs = self.generate_lhs_flrg_fuzzyfied(sample, explain)

             if explain:
                 print("Rules:\n")

-            tmp = []
+            midpoints = []
+            memberships = []
             for flrg in flrgs:

                 if flrg.get_key() not in self.flrgs:
                     if len(flrg.LHS) > 0:
                         mp = self.partitioner.sets[flrg.LHS[-1]].centroid
-                        tmp.append(mp)
+                        mv = self.partitioner.sets[flrg.LHS[-1]].membership(sample[-1]) if not fuzzyfied else None
+                        midpoints.append(mp)
+                        memberships.append(mv)

                         if explain:
                             print("\t {} -> {} (Naïve)\t Midpoint: {}\n".format(str(flrg.LHS), flrg.LHS[-1],
@@ -218,12 +226,15 @@
                 else:
                     flrg = self.flrgs[flrg.get_key()]
                     mp = flrg.get_midpoint(self.partitioner.sets)
-                    tmp.append(mp)
+                    mv = flrg.get_membership(sample, self.partitioner.sets) if not fuzzyfied else None
+                    midpoints.append(mp)
+                    memberships.append(mv)

                     if explain:
                         print("\t {} \t Midpoint: {}\n".format(str(flrg), mp))
+                        print("\t {} \t Membership: {}\n".format(str(flrg), mv))

-            final = np.nanmean(tmp)
+            final = np.dot(midpoints, memberships) if not fuzzyfied else np.nanmean(midpoints)

             ret.append(final)

             if explain:
diff --git a/pyFTS/models/multivariate/cmvfts.py b/pyFTS/models/multivariate/cmvfts.py
index 58b6ed9..b5ba0de 100644
--- a/pyFTS/models/multivariate/cmvfts.py
+++ b/pyFTS/models/multivariate/cmvfts.py
@@ -27,7 +27,7 @@ class ClusteredMVFTS(mvfts.MVFTS):
         self.order = kwargs.get("order", 2)
         self.lags = kwargs.get("lags", None)
-        self.alpha_cut = kwargs.get('alpha_cut', 0.25)
+        self.alpha_cut = kwargs.get('alpha_cut', 0.0)

         self.shortname = "ClusteredMVFTS"
         self.name = "Clustered Multivariate FTS"
@@ -38,7 +38,8 @@ class ClusteredMVFTS(mvfts.MVFTS):
         ndata = []
         for index, row in data.iterrows():
             data_point = self.format_data(row)
-            ndata.append(common.fuzzyfy_instance_clustered(data_point, self.partitioner, alpha_cut=self.alpha_cut))
+            ndata.append(common.fuzzyfy_instance_clustered(data_point, self.partitioner,
+                                                           alpha_cut=self.alpha_cut))

         return ndata
diff --git a/pyFTS/tests/multivariate.py b/pyFTS/tests/multivariate.py
index 3a2f190..8f9d197 100644
--- a/pyFTS/tests/multivariate.py
+++ b/pyFTS/tests/multivariate.py
@@ -14,14 +14,7 @@ import os
 os.environ['PYSPARK_PYTHON'] = '/usr/bin/python3'
 os.environ['PYSPARK_DRIVER_PYTHON'] = '/usr/bin/python3'

 #'''
-data = SONDA.get_dataframe()
-data = data[['datahora','glo_avg']]
-
-data = data[~(np.isnan(data['glo_avg']) | np.equal(data['glo_avg'], 0.0))]
-
-train = data.iloc[:1500000]
-test = data.iloc[1500000:]

 from pyFTS.models.multivariate import common, variable, wmvfts
 from pyFTS.models.seasonal import partitioner as seasonal
@@ -96,12 +89,14 @@ from pyFTS.models.multivariate import common, variable, mvfts, wmvfts, cmvfts, g
 from pyFTS.models.seasonal import partitioner as seasonal
 from pyFTS.models.seasonal.common import DateTime

-dataset = pd.read_csv('/home/petronio/Downloads/kalang.csv', sep=',')
+dataset = pd.read_csv('/home/petronio/Downloads/Klang-daily Max.csv', sep=',')

-dataset['date'] = pd.to_datetime(dataset["date"], format='%Y-%m-%d %H:%M:%S')
+dataset['date'] = pd.to_datetime(dataset["Day/Month/Year"], format='%m/%d/%Y')
+dataset['value'] = dataset['Daily-Max API']

-train_mv = dataset.iloc[:24505]
-test_mv = dataset.iloc[24505:]
+
+train_mv = dataset.iloc[:732]
+test_mv = dataset.iloc[732:]

 sp = {'seasonality': DateTime.day_of_week, 'names': ['mon','tue','wed','tur','fri','sat','sun']}

@@ -109,26 +104,36 @@ vday = variable.Variable("DayOfWeek", data_label="date", partitioner=seasonal.Ti
                          data=train_mv, partitioner_specific=sp)

-print(vday.partitioner)
+sp = {'seasonality': DateTime.day_of_year, 'names': ['jan','feb','mar','apr','may','jun','jul','aug','sep','oct','nov','dec']}

+vmonth = variable.Variable("Month", data_label="date", partitioner=seasonal.TimeGridPartitioner, npart=12,
+                           data=train_mv, partitioner_specific=sp)

-sp = {'seasonality': DateTime.minute_of_day, 'names': [str(k)+'hs' for k in range(0,24)]}

-vhour = variable.Variable("Hour", data_label="date", partitioner=seasonal.TimeGridPartitioner, npart=24,
-                          data=train_mv, partitioner_specific=sp, data_type=pd.datetime, mask='%Y-%m-%d %H:%M:%S')

 vvalue = variable.Variable("Pollution", data_label="value", alias='value',
-                           partitioner=Entropy.EntropyPartitioner, npart=35, data_type=np.float64,
+                           partitioner=Grid.GridPartitioner, npart=35,
                            data=train_mv)

-fs = grid.GridCluster(explanatory_variables=[vhour, vvalue], target_variable=vvalue)
+fs = grid.GridCluster(explanatory_variables=[vday, vmonth, vvalue], target_variable=vvalue)
+
+print(len(fs.sets))
+

 #model = wmvfts.WeightedMVFTS(explanatory_variables=[vhour, vvalue], target_variable=vvalue)
-model = cmvfts.ClusteredMVFTS(explanatory_variables=[vhour, vvalue], target_variable=vvalue,
-                              partitioner=fs)
+model = cmvfts.ClusteredMVFTS(explanatory_variables=[vday, vmonth, vvalue], target_variable=vvalue,
+                              partitioner=fs, knn=5, order=2)

 model.fit(train_mv) #, distributed='spark', url='spark://192.168.0.106:7077')

 #'''

 #print(model)
+print(len(fs.sets))
+
+
+from pyFTS.benchmarks import Measures
+print(Measures.get_point_statistics(test_mv, model))
+
+#print(model)
+
 '''
 def fun(x):
     return (x, x % 2)
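
A note on the Measures.py hunks above: swapping np.mean for np.nanmean makes the
point metrics tolerant to NaN entries, which is what a forecaster produces when it
cannot emit a value for some sample. A minimal sketch of the effect, with made-up
toy arrays (the numbers below are illustrative, not from the patch):

    import numpy as np

    targets = np.array([10.0, 11.0, 13.0])
    forecasts = np.array([10.5, np.nan, 12.4])  # one missing forecast

    # np.mean would propagate the NaN and report NaN for the whole series;
    # np.nanmean skips it and averages the errors of the valid points only.
    mape = np.nanmean(np.abs(np.divide(np.subtract(targets, forecasts), targets))) * 100
    print(mape)  # ~4.81 instead of nan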
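Likewise, the hofts.py hunks replace the plain average of rule midpoints with a
membership-weighted combination whenever the input is not already fuzzyfied. A
rough sketch of just that aggregation step, with hypothetical midpoints and
memberships (the values are illustrative, not taken from the patch):

    import numpy as np

    # midpoints of the matched high-order rules (FLRGs) ...
    midpoints = np.array([10.0, 14.0])
    # ... and the membership degree of the sample in each rule's LHS
    memberships = np.array([0.75, 0.25])

    # fuzzyfied=False: membership-weighted forecast, as in the new code
    final = np.dot(midpoints, memberships)  # 0.75*10 + 0.25*14 = 11.0

    # fuzzyfied=True: memberships are not computed, so fall back to the mean
    final = np.nanmean(midpoints)           # 12.0

Note that np.dot(midpoints, memberships) is not renormalized by the sum of the
memberships; that matches the patch as written and implicitly assumes the
membership degrees behave roughly like a partition of unity.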