From 048bb64927c2beaf9faf975c73806908a9cbd7fd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Petr=C3=B4nio=20C=C3=A2ndido?= Date: Tue, 7 May 2019 14:06:12 -0300 Subject: [PATCH] Improvement on FCM GA including both average and standard deviation on the learning optimization objective --- pyFTS/common/Util.py | 11 +- pyFTS/fcm/GA.py | 3 +- pyFTS/tests/multivariate.py | 283 +++++++++++++++--------------------- 3 files changed, 128 insertions(+), 169 deletions(-) diff --git a/pyFTS/common/Util.py b/pyFTS/common/Util.py index 842764f..e4e433c 100644 --- a/pyFTS/common/Util.py +++ b/pyFTS/common/Util.py @@ -6,6 +6,7 @@ import time import matplotlib.pyplot as plt import dill import numpy as np +import pandas as pd import matplotlib.cm as cmx import matplotlib.colors as pltcolors from pyFTS.probabilistic import ProbabilityDistribution @@ -340,7 +341,10 @@ def sliding_window(data, windowsize, train=0.8, inc=0.1, **kwargs): :param inc: percentual of data used for slide the window :return: window count, training set, test set """ - l = len(data) + + multivariate = True if isinstance(data, pd.DataFrame) else False + + l = len(data) if not multivariate else len(data.index) ttrain = int(round(windowsize * train, 0)) ic = int(round(windowsize * inc, 0)) @@ -357,7 +361,10 @@ def sliding_window(data, windowsize, train=0.8, inc=0.1, **kwargs): _end = l else: _end = count + windowsize - yield (count, data[count : count + ttrain], data[count + ttrain : _end] ) + if multivariate: + yield (count, data.iloc[count: count + ttrain], data.iloc[count + ttrain: _end]) + else: + yield (count, data[count : count + ttrain], data[count + ttrain : _end] ) def persist_obj(obj, file): diff --git a/pyFTS/fcm/GA.py b/pyFTS/fcm/GA.py index c40981f..a199e16 100644 --- a/pyFTS/fcm/GA.py +++ b/pyFTS/fcm/GA.py @@ -125,9 +125,10 @@ def evaluate(dataset, individual, **kwargs): errors.append(rmse) _rmse = np.nanmean(errors) + _std = np.nanstd(errors) #print("EVALUATION {}".format(individual)) - return {'rmse': _rmse} + return {'rmse': .6 * _rmse + .4 * _std} diff --git a/pyFTS/tests/multivariate.py b/pyFTS/tests/multivariate.py index 2a45476..bf253cb 100644 --- a/pyFTS/tests/multivariate.py +++ b/pyFTS/tests/multivariate.py @@ -6,193 +6,144 @@ from pyFTS.data import Enrollments, TAIEX, SONDA from pyFTS.partitioners import Grid, Simple, Entropy from pyFTS.common import Util -from pyspark import SparkConf -from pyspark import SparkContext - -import os -# make sure pyspark tells workers to use python3 not 2 if both are installed -os.environ['PYSPARK_PYTHON'] = '/usr/bin/python3' -os.environ['PYSPARK_DRIVER_PYTHON'] = '/usr/bin/python3' -#''' - - -from pyFTS.models.multivariate import common, variable, wmvfts -from pyFTS.models.seasonal import partitioner as seasonal -from pyFTS.models.seasonal.common import DateTime -from pyFTS.partitioners import Grid - -import matplotlib.pyplot as plt - -''' -#fig, ax = plt.subplots(nrows=3, ncols=1, figsize=[15,5]) - - -sp = {'seasonality': DateTime.day_of_year , 'names': ['Jan','Feb','Mar','Apr','May','Jun','Jul', 'Aug','Sep','Oct','Nov','Dec']} - -vmonth = variable.Variable("Month", data_label="datahora", partitioner=seasonal.TimeGridPartitioner, npart=12, alpha_cut=.25, - data=train, partitioner_specific=sp) - -#vmonth.partitioner.plot(ax[0]) - -sp = {'seasonality': DateTime.minute_of_day, 'names': [str(k) for k in range(0,24)]} - -vhour = variable.Variable("Hour", data_label="datahora", partitioner=seasonal.TimeGridPartitioner, npart=24, alpha_cut=.2, - data=train, partitioner_specific=sp) - -#vhour.partitioner.plot(ax[1]) - - -vavg = variable.Variable("Radiation", data_label="glo_avg", alias='R', - partitioner=Grid.GridPartitioner, npart=35, alpha_cut=.3, - data=train) - -#vavg.partitioner.plot(ax[2]) - -#plt.tight_layout() - -#Util.show_and_save_image(fig, 'variables', True) - -model = wmvfts.WeightedMVFTS(explanatory_variables=[vmonth,vhour,vavg], target_variable=vavg) - - -_s1 = time.time() -model.fit(train) -#model.fit(data, distributed='spark', url='spark://192.168.0.106:7077', num_batches=4) -_s2 = time.time() - -print(_s2-_s1) - -Util.persist_obj(model, 'sonda_wmvfts') -''' - -#model = Util.load_obj('sonda_wmvfts') - -''' +from pyFTS.models.multivariate import mvfts, wmvfts, cmvfts, grid, granular from pyFTS.benchmarks import Measures +from pyFTS.common import Util as cUtil -_s1 = time.time() -print(Measures.get_point_statistics(test, model)) -_s2 = time.time() - -print(_s2-_s1) -''' - -#print(len(model)) - - -# - -#model.fit(data, distributed='dispy', nodes=['192.168.0.110']) -#''' - -''' -from pyFTS.models.multivariate import common, variable, mvfts, wmvfts, cmvfts, grid from pyFTS.models.seasonal import partitioner as seasonal from pyFTS.models.seasonal.common import DateTime -dataset = pd.read_csv('/home/petronio/Downloads/gefcom12.csv') -dataset = dataset.dropna() - -train_mv = dataset.iloc[:15000] -test_mv = dataset.iloc[15000:] from pyFTS.models.multivariate import common, variable, mvfts -from pyFTS.models.seasonal import partitioner as seasonal -from pyFTS.models.seasonal.common import DateTime +from pyFTS.partitioners import Grid +from pyFTS.common import Membership -sp = {'seasonality': DateTime.minute_of_day, 'names': [str(k) for k in range(0,24)]} +import os + +''' +from pyFTS.data import lorentz +df = lorentz.get_dataframe(iterations=5000) + +train = df.iloc[:4000] +#test = df.iloc[4000:] + +npart=120 + + +import sys + + +vx = variable.Variable("x", data_label="x", alias='x', partitioner=Grid.GridPartitioner, + partitioner_specific={'mf': Membership.gaussmf}, npart=npart, data=train) +vy = variable.Variable("y", data_label="y", alias='y', partitioner=Grid.GridPartitioner, + partitioner_specific={'mf': Membership.gaussmf}, npart=int(npart*1.5), data=train) +vz = variable.Variable("z", data_label="z", alias='z', partitioner=Grid.GridPartitioner, + partitioner_specific={'mf': Membership.gaussmf}, npart=int(npart*1.2), data=train) + + + +rows = [] + +for ct, train, test in cUtil.sliding_window(df, windowsize=4100, train=.97, inc=.05): + print('Window {}'.format(ct)) + for order in [1, 2, 3]: + for knn in [1, 2, 3]: + model = granular.GranularWMVFTS(explanatory_variables=[vx, vy, vz], target_variable=vx, order=order, + knn=knn) + + model.fit(train) + + forecasts1 = model.predict(test, type='multivariate') + forecasts2 = model.predict(test, type='multivariate', steps_ahead=100) + + for var in ['x', 'y', 'z']: + row = [order, knn, var, len(model)] + for horizon in [1, 25, 50, 75, 100]: + if horizon == 1: + row.append( Measures.mape(test[var].values[model.order:model.order+10], + forecasts1[var].values[:10])) + else: + row.append( Measures.mape(test[var].values[:horizon], + forecasts2[var].values[:horizon])) + + print(row) + rows.append(row) + +columns = ['Order', 'knn', 'var', 'Rules'] +for horizon in [1, 25, 50, 75, 100]: + columns.append('h{}'.format(horizon)) +final = pd.DataFrame(rows, columns=columns) + +final.to_csv('gmvfts_lorentz1.csv',sep=';',index=False) +''' + +import pandas as pd +df = pd.read_csv('https://query.data.world/s/ftb7bzgobr6bsg6bsuxuqowja6ew4r') + +#df.dropna() + +mload = np.nanmean(df["load"].values) +df['load'] = np.where(pd.isna(df["load"]), mload, df["load"]) + +mtemp = np.nanmean(df["temperature"].values) +df['temperature'] = np.where(pd.isna(df["temperature"]), mtemp, df["temperature"]) + +df['date'] = pd.to_datetime(df['date'], format='%Y-%m-%d %H:%M:%S') + +df['hour'] = np.float64(df['date'].apply(lambda x: x.strftime('%H'))) +df['weekday'] = np.float64(df['date'].apply(lambda x: x.strftime('%w'))) +df['month'] = np.float64(df['date'].apply(lambda x: x.strftime('%m'))) + +train_mv = df.iloc[:31000] +test_mv = df.iloc[:31000:] + +sp = {'seasonality': DateTime.minute_of_day, 'names': [str(k)+'hs' for k in range(0,24)]} vhour = variable.Variable("Hour", data_label="date", partitioner=seasonal.TimeGridPartitioner, npart=24, - data=train_mv, partitioner_specific=sp) - -sp = {'seasonality': DateTime.day_of_week, 'names': ['mon','tue','wed','tur','fri','sat','sun']} - -vday = variable.Variable("DayOfWeek", data_label="date", partitioner=seasonal.TimeGridPartitioner, npart=7, - data=train_mv, partitioner_specific=sp) - -#sp = {'seasonality': DateTime.day_of_month, 'names': ['jan','feb','mar','apr','may','jun','jul','aug','sep','oct','nov','dec']} - -sp = {'seasonality': DateTime.quarter} - -vmonth = variable.Variable("Month", data_label="date", partitioner=seasonal.TimeGridPartitioner, npart=4, - data=train_mv, partitioner_specific=sp) + data=train_mv, partitioner_specific=sp, alpha_cut=.3) +vtemp = variable.Variable("Temperature", data_label="temperature", alias='temp', + partitioner=Grid.GridPartitioner, npart=15, func=Membership.gaussmf, + data=train_mv, alpha_cut=.3) vload = variable.Variable("Load", data_label="load", alias='load', - partitioner=Grid.GridPartitioner, npart=20, - data=train_mv) - -vtemp = variable.Variable("Temperature", data_label="temperature", alias='temperature', - partitioner=Grid.GridPartitioner, npart=20, - data=train_mv) - -from pyFTS.models.multivariate import mvfts, wmvfts, cmvfts, grid - -from pyFTS.models.multivariate import mvfts, wmvfts, cmvfts, grid - -mtemp = wmvfts.WeightedMVFTS(explanatory_variables=[vhour, vmonth, vtemp], target_variable=vtemp) -mtemp.fit(train_mv) - -Util.persist_obj(mtemp, 'mtemp') - -from pyFTS.models import hofts - -#mtemp = hofts.WeightedHighOrderFTS(order=2, partitioner=vtemp.partitioner) -#mtemp.fit(train_mv['temperature'].values) - -from pyFTS.models.multivariate import mvfts, wmvfts, cmvfts, grid - -mload = wmvfts.WeightedMVFTS(explanatory_variables=[vtemp, vload], target_variable=vload) -mload.fit(train_mv) - -Util.persist_obj(mload, 'mload') + partitioner=Grid.GridPartitioner, npart=20, func=Membership.gaussmf, + data=train_mv, alpha_cut=.3) +rows = [] time_generator = lambda x : pd.to_datetime(x) + pd.to_timedelta(1, unit='h') +for ct, train, test in cUtil.sliding_window(df, windowsize=32000, train=.98, inc=.05): + print('Window {}'.format(ct)) + for order in [1, 2, 3]: + for knn in [1, 2, 3]: + model = granular.GranularWMVFTS(explanatory_variables=[vhour, vtemp, vload], target_variable=vload, + order=order, knn=knn) -forecasts = mload.predict(test_mv.iloc[:1], steps_ahead=48, generators={'date': time_generator, - 'temperature': mtemp}) + model.fit(train) -''' + forecasts1 = model.predict(test, type='multivariate') + forecasts2 = model.predict(test, type='multivariate', generators={'date': time_generator}, + steps_ahead=100) + for var in ['temperature','load']: + row = [order, knn, var, len(model)] + for horizon in [1, 25, 50, 75, 100]: + if horizon == 1: + row.append(Measures.mape(test[var].values[model.order:model.order + 10], + forecasts1[var].values[:10])) + else: + row.append(Measures.mape(test[var].values[:horizon], + forecasts2[var].values[:horizon])) -data = pd.read_csv('https://query.data.world/s/6xfb5useuotbbgpsnm5b2l3wzhvw2i', sep=';') + print(row) + rows.append(row) -train = data.iloc[:9000] -test = data.iloc[9000:9200] +columns = ['Order', 'knn', 'var', 'Rules'] +for horizon in [1, 25, 50, 75, 100]: + columns.append('h{}'.format(horizon)) +final = pd.DataFrame(rows, columns=columns) -from pyFTS.models.multivariate import common, variable, mvfts -from pyFTS.models.seasonal import partitioner as seasonal -from pyFTS.models.seasonal.common import DateTime -from pyFTS.partitioners import Grid - -sp = {'seasonality': DateTime.day_of_year , 'names': ['Jan','Fev','Mar','Abr','Mai','Jun','Jul', 'Ago','Set','Out','Nov','Dez']} - -vmonth = variable.Variable("Month", data_label="data", partitioner=seasonal.TimeGridPartitioner, npart=12, - data=train, partitioner_specific=sp, alpha_cut=.5) - -sp = {'seasonality': DateTime.minute_of_day, 'names': [str(k) for k in range(0,24)]} - -vhour = variable.Variable("Hour", data_label="data", partitioner=seasonal.TimeGridPartitioner, npart=24, - data=train, partitioner_specific=sp, alpha_cut=.5) - -#print(vhour.partitioner) - -#print(vmonth.partitioner.fuzzyfy(180)) - -vavg = variable.Variable("Radiation", data_label="glo_avg", alias='rad', - partitioner=Grid.GridPartitioner, npart=25, alpha_cut=.3, - data=train) - -from pyFTS.models.multivariate import mvfts, wmvfts, cmvfts, grid, granular - -model = granular.GranularWMVFTS(explanatory_variables=[vmonth, vhour, vavg], target_variable=vavg, - order=2, knn=7) - -model.fit(train) - -print(model) - -#model.predict(test) +final.to_csv('gmvfts_gefcom12.csv', sep=';', index=False) \ No newline at end of file