From 9af879e1950e7f0d7612424473e11b479f3cf895 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Petr=C3=B4nio=20C=C3=A2ndido=20de=20Lima=20e=20Silva?= Date: Wed, 24 May 2017 00:31:05 -0300 Subject: [PATCH] - Bugfixes and improvements on benchmarks --- benchmarks/ResidualAnalysis.py | 59 ++--- benchmarks/Util.py | 309 +++++++++++++++++++++++++-- benchmarks/benchmarks.py | 131 +++++++----- benchmarks/distributed_benchmarks.py | 66 +++--- common/Util.py | 4 +- ifts.py | 2 +- pwfts.py | 8 +- sadaei.py | 2 +- tests/general.py | 74 ++++--- 9 files changed, 495 insertions(+), 160 deletions(-) diff --git a/benchmarks/ResidualAnalysis.py b/benchmarks/ResidualAnalysis.py index 9265619..1de63fe 100644 --- a/benchmarks/ResidualAnalysis.py +++ b/benchmarks/ResidualAnalysis.py @@ -65,27 +65,30 @@ def plotResiduals(targets, models, tam=[8, 8], save=False, file=None): :return: """ fig, axes = plt.subplots(nrows=len(models), ncols=3, figsize=tam) - c = 0 - for mfts in models: + for c, mfts in enumerate(models): + if len(models) > 1: + ax = axes[c] + else: + ax = axes forecasts = mfts.forecast(targets) res = residuals(targets,forecasts,mfts.order) mu = np.mean(res) sig = np.std(res) - axes[c][0].set_title("Residuals Mean=" + str(mu) + " STD = " + str(sig)) - axes[c][0].set_ylabel('E') - axes[c][0].set_xlabel('T') - axes[c][0].plot(res) + ax[0].set_title("Residuals Mean=" + str(mu) + " STD = " + str(sig)) + ax[0].set_ylabel('E') + ax[0].set_xlabel('T') + ax[0].plot(res) - axes[c][1].set_title("Residuals Autocorrelation") - axes[c][1].set_ylabel('ACS') - axes[c][1].set_xlabel('Lag') - axes[c][1].acorr(res) + ax[1].set_title("Residuals Autocorrelation") + ax[1].set_ylabel('ACS') + ax[1].set_xlabel('Lag') + ax[1].acorr(res) - axes[c][2].set_title("Residuals Histogram") - axes[c][2].set_ylabel('Freq') - axes[c][2].set_xlabel('Bins') - axes[c][2].hist(res) + ax[2].set_title("Residuals Histogram") + ax[2].set_ylabel('Freq') + ax[2].set_xlabel('Bins') + ax[2].hist(res) c += 1 @@ -98,25 +101,29 @@ def plot_residuals(targets, models, tam=[8, 8], save=False, file=None): fig, axes = plt.subplots(nrows=len(models), ncols=3, figsize=tam) for c, mfts in enumerate(models, start=0): + if len(models) > 1: + ax = axes[c] + else: + ax = axes forecasts = mfts.forecast(targets) res = residuals(targets, forecasts, mfts.order) mu = np.mean(res) sig = np.std(res) - if c == 0: axes[c][0].set_title("Residuals", size='large') - axes[c][0].set_ylabel(mfts.shortname, size='large') - axes[c][0].set_xlabel(' ') - axes[c][0].plot(res) + if c == 0: ax[0].set_title("Residuals", size='large') + ax[0].set_ylabel(mfts.shortname, size='large') + ax[0].set_xlabel(' ') + ax[0].plot(res) - if c == 0: axes[c][1].set_title("Residuals Autocorrelation", size='large') - axes[c][1].set_ylabel('ACS') - axes[c][1].set_xlabel('Lag') - axes[c][1].acorr(res) + if c == 0: ax[1].set_title("Residuals Autocorrelation", size='large') + ax[1].set_ylabel('ACS') + ax[1].set_xlabel('Lag') + ax[1].acorr(res) - if c == 0: axes[c][2].set_title("Residuals Histogram", size='large') - axes[c][2].set_ylabel('Freq') - axes[c][2].set_xlabel('Bins') - axes[c][2].hist(res) + if c == 0: ax[2].set_title("Residuals Histogram", size='large') + ax[2].set_ylabel('Freq') + ax[2].set_xlabel('Bins') + ax[2].hist(res) plt.tight_layout() diff --git a/benchmarks/Util.py b/benchmarks/Util.py index a92a114..e08feec 100644 --- a/benchmarks/Util.py +++ b/benchmarks/Util.py @@ -9,7 +9,7 @@ import matplotlib.pyplot as plt import numpy as np import pandas as pd from checkbox_support.parsers.tests.test_modinfo import testMultipleModinfoParser -from mpl_toolkits.mplot3d import Axes3D +#from mpl_toolkits.mplot3d import Axes3D import numpy as np @@ -20,8 +20,9 @@ from pyFTS.common import Util def extract_measure(dataframe,measure,data_columns): if not dataframe.empty: - tmp = dataframe[(dataframe.Measure == measure)][data_columns].to_dict(orient="records")[0] - ret = [k for k in tmp.values()] + df = dataframe[(dataframe.Measure == measure)][data_columns] + tmp = df.to_dict(orient="records")[0] + ret = [k for k in tmp.values() if not np.isnan(k)] return ret else: return None @@ -191,7 +192,7 @@ def cast_dataframe_to_synthetic_point(infile, outfile, experiments): ret.append(mod) dat = pd.DataFrame(ret, columns=point_dataframe_synthetic_columns()) - dat.to_csv(Util.uniquefilename(outfile), sep=";", index=False) + dat.to_csv(outfile, sep=";", index=False) def analytical_data_columns(experiments): @@ -199,23 +200,29 @@ def analytical_data_columns(experiments): return data_columns -def plot_dataframe_point(file_synthetic, file_analytic, experiments, tam): +def plot_dataframe_point(file_synthetic, file_analytic, experiments, tam, save=False, file=None, + sort_columns=['UAVG', 'RMSEAVG', 'USTD', 'RMSESTD'], + sort_ascend=[1, 1, 1, 1],save_best=False, + ignore=None,replace=None): - fig, axes = plt.subplots(nrows=4, ncols=1, figsize=tam) + fig, axes = plt.subplots(nrows=3, ncols=1, figsize=tam) axes[0].set_title('RMSE') axes[1].set_title('SMAPE') axes[2].set_title('U Statistic') - axes[3].set_title('Execution Time') dat_syn = pd.read_csv(file_synthetic, sep=";", usecols=point_dataframe_synthetic_columns()) - bests = find_best(dat_syn, ['UAVG','RMSEAVG','USTD','RMSESTD'], [1,1,1,1]) + bests = find_best(dat_syn, sort_columns, sort_ascend) dat_ana = pd.read_csv(file_analytic, sep=";", usecols=point_dataframe_analytic_columns(experiments)) data_columns = analytical_data_columns(experiments) + if save_best: + dat = pd.DataFrame.from_dict(bests, orient='index') + dat.to_csv(Util.uniquefilename(file_synthetic.replace("synthetic","best")), sep=";", index=False) + rmse = [] smape = [] u = [] @@ -223,6 +230,9 @@ def plot_dataframe_point(file_synthetic, file_analytic, experiments, tam): labels = [] for b in sorted(bests.keys()): + if check_ignore_list(b, ignore): + continue + best = bests[b] tmp = dat_ana[(dat_ana.Model == best["Model"]) & (dat_ana.Order == best["Order"]) & (dat_ana.Scheme == best["Scheme"]) & (dat_ana.Partitions == best["Partitions"])] @@ -230,14 +240,36 @@ def plot_dataframe_point(file_synthetic, file_analytic, experiments, tam): smape.append(extract_measure(tmp, 'SMAPE', data_columns)) u.append(extract_measure(tmp, 'U', data_columns)) times.append(extract_measure(tmp, 'TIME', data_columns)) - labels.append(best["Model"] + " " + str(best["Order"])) - axes[0].boxplot(rmse, labels=labels, showmeans=True) - axes[1].boxplot(smape, labels=labels, showmeans=True) - axes[2].boxplot(u, labels=labels, showmeans=True) - axes[3].boxplot(times, labels=labels, showmeans=True) + labels.append(check_replace_list(best["Model"] + " " + str(best["Order"]),replace)) - plt.show() + axes[0].boxplot(rmse, labels=labels, autorange=True, showmeans=True) + axes[0].set_title("RMSE") + axes[1].boxplot(smape, labels=labels, autorange=True, showmeans=True) + axes[1].set_title("SMAPE") + axes[2].boxplot(u, labels=labels, autorange=True, showmeans=True) + axes[2].set_title("U Statistic") + + plt.tight_layout() + + Util.showAndSaveImage(fig, file, save) + + +def check_replace_list(m, replace): + if replace is not None: + for r in replace: + if r[0] in m: + return r[1] + return m + + +def check_ignore_list(b, ignore): + flag = False + if ignore is not None: + for i in ignore: + if i in b: + flag = True + return flag def save_dataframe_interval(coverage, experiments, file, objs, resolution, save, sharpness, synthetic, times, q05, q25, q75, q95): @@ -336,10 +368,170 @@ def interval_dataframe_analytic_columns(experiments): def interval_dataframe_synthetic_columns(): columns = ["Model", "Order", "Scheme", "Partitions", "SHARPAVG", "SHARPSTD", "RESAVG", "RESSTD", "COVAVG", - "COVSTD", "TIMEAVG", "TIMESTD", "Q05AVG", "Q05STD", "Q25AVG", "Q25STD", "Q75AVG", "Q75STD", "Q95AVG", "Q95STD", "SIZE"] + "COVSTD", "TIMEAVG", "TIMESTD", "Q05AVG", "Q05STD", "Q25AVG", "Q25STD", "Q75AVG", "Q75STD", "Q95AVG", "Q95STD"] return columns +def cast_dataframe_to_synthetic_interval(infile, outfile, experiments): + columns = interval_dataframe_analytic_columns(experiments) + dat = pd.read_csv(infile, sep=";", usecols=columns) + models = dat.Model.unique() + orders = dat.Order.unique() + schemes = dat.Scheme.unique() + partitions = dat.Partitions.unique() + + data_columns = analytical_data_columns(experiments) + + ret = [] + + for m in models: + for o in orders: + for s in schemes: + for p in partitions: + mod = [] + df = dat[(dat.Model == m) & (dat.Order == o) & (dat.Scheme == s) & (dat.Partitions == p)] + if not df.empty: + sharpness = extract_measure(df, 'Sharpness', data_columns) + resolution = extract_measure(df, 'Resolution', data_columns) + coverage = extract_measure(df, 'Coverage', data_columns) + times = extract_measure(df, 'TIME', data_columns) + q05 = extract_measure(df, 'Q05', data_columns) + q25 = extract_measure(df, 'Q25', data_columns) + q75 = extract_measure(df, 'Q75', data_columns) + q95 = extract_measure(df, 'Q95', data_columns) + mod.append(m) + mod.append(o) + mod.append(s) + mod.append(p) + mod.append(np.round(np.nanmean(sharpness), 2)) + mod.append(np.round(np.nanstd(sharpness), 2)) + mod.append(np.round(np.nanmean(resolution), 2)) + mod.append(np.round(np.nanstd(resolution), 2)) + mod.append(np.round(np.nanmean(coverage), 2)) + mod.append(np.round(np.nanstd(coverage), 2)) + mod.append(np.round(np.nanmean(times), 4)) + mod.append(np.round(np.nanstd(times), 4)) + mod.append(np.round(np.nanmean(q05), 4)) + mod.append(np.round(np.nanstd(q05), 4)) + mod.append(np.round(np.nanmean(q25), 4)) + mod.append(np.round(np.nanstd(q25), 4)) + mod.append(np.round(np.nanmean(q75), 4)) + mod.append(np.round(np.nanstd(q75), 4)) + mod.append(np.round(np.nanmean(q95), 4)) + mod.append(np.round(np.nanstd(q95), 4)) + ret.append(mod) + + dat = pd.DataFrame(ret, columns=interval_dataframe_synthetic_columns()) + dat.to_csv(outfile, sep=";", index=False) + + +def plot_dataframe_interval(file_synthetic, file_analytic, experiments, tam, save=False, file=None, + sort_columns=['COVAVG', 'SHARPAVG', 'COVSTD', 'SHARPSTD'], + sort_ascend=[True, False, True, True],save_best=False, + ignore=None, replace=None): + + fig, axes = plt.subplots(nrows=3, ncols=1, figsize=tam) + + axes[0].set_title('Sharpness') + axes[1].set_title('Resolution') + axes[2].set_title('Coverage') + + dat_syn = pd.read_csv(file_synthetic, sep=";", usecols=interval_dataframe_synthetic_columns()) + + bests = find_best(dat_syn, sort_columns, sort_ascend) + + dat_ana = pd.read_csv(file_analytic, sep=";", usecols=interval_dataframe_analytic_columns(experiments)) + + data_columns = analytical_data_columns(experiments) + + if save_best: + dat = pd.DataFrame.from_dict(bests, orient='index') + dat.to_csv(Util.uniquefilename(file_synthetic.replace("synthetic","best")), sep=";", index=False) + + sharpness = [] + resolution = [] + coverage = [] + times = [] + labels = [] + bounds_shp = [] + + for b in sorted(bests.keys()): + if check_ignore_list(b, ignore): + continue + best = bests[b] + df = dat_ana[(dat_ana.Model == best["Model"]) & (dat_ana.Order == best["Order"]) + & (dat_ana.Scheme == best["Scheme"]) & (dat_ana.Partitions == best["Partitions"])] + sharpness.append( extract_measure(df,'Sharpness',data_columns) ) + resolution.append(extract_measure(df, 'Resolution', data_columns)) + coverage.append(extract_measure(df, 'Coverage', data_columns)) + times.append(extract_measure(df, 'TIME', data_columns)) + labels.append(check_replace_list(best["Model"] + " " + str(best["Order"]), replace)) + + axes[0].boxplot(sharpness, labels=labels, autorange=True, showmeans=True) + axes[0].set_title("Sharpness") + axes[1].boxplot(resolution, labels=labels, autorange=True, showmeans=True) + axes[1].set_title("Resolution") + axes[2].boxplot(coverage, labels=labels, autorange=True, showmeans=True) + axes[2].set_title("Coverage") + axes[2].set_ylim([0, 1.1]) + + plt.tight_layout() + + Util.showAndSaveImage(fig, file, save) + + + +def plot_dataframe_interval_pinball(file_synthetic, file_analytic, experiments, tam, save=False, file=None, + sort_columns=['COVAVG','SHARPAVG','COVSTD','SHARPSTD'], + sort_ascend=[True, False, True, True], save_best=False, + ignore=None, replace=None): + + fig, axes = plt.subplots(nrows=1, ncols=4, figsize=tam) + axes[0].set_title(r'$\tau=0.05$') + axes[1].set_title(r'$\tau=0.25$') + axes[2].set_title(r'$\tau=0.75$') + axes[3].set_title(r'$\tau=0.95$') + + dat_syn = pd.read_csv(file_synthetic, sep=";", usecols=interval_dataframe_synthetic_columns()) + + bests = find_best(dat_syn, sort_columns, sort_ascend) + + dat_ana = pd.read_csv(file_analytic, sep=";", usecols=interval_dataframe_analytic_columns(experiments)) + + data_columns = analytical_data_columns(experiments) + + if save_best: + dat = pd.DataFrame.from_dict(bests, orient='index') + dat.to_csv(Util.uniquefilename(file_synthetic.replace("synthetic","best")), sep=";", index=False) + + q05 = [] + q25 = [] + q75 = [] + q95 = [] + labels = [] + + for b in sorted(bests.keys()): + if check_ignore_list(b, ignore): + continue + best = bests[b] + df = dat_ana[(dat_ana.Model == best["Model"]) & (dat_ana.Order == best["Order"]) + & (dat_ana.Scheme == best["Scheme"]) & (dat_ana.Partitions == best["Partitions"])] + q05.append(extract_measure(df, 'Q05', data_columns)) + q25.append(extract_measure(df, 'Q25', data_columns)) + q75.append(extract_measure(df, 'Q75', data_columns)) + q95.append(extract_measure(df, 'Q95', data_columns)) + labels.append(check_replace_list(best["Model"] + " " + str(best["Order"]), replace)) + + axes[0].boxplot(q05, labels=labels, vert=False, autorange=True, showmeans=True) + axes[1].boxplot(q25, labels=labels, vert=False, autorange=True, showmeans=True) + axes[2].boxplot(q75, labels=labels, vert=False, autorange=True, showmeans=True) + axes[3].boxplot(q95, labels=labels, vert=False, autorange=True, showmeans=True) + + plt.tight_layout() + + Util.showAndSaveImage(fig, file, save) + + def save_dataframe_ahead(experiments, file, objs, crps_interval, crps_distr, times1, times2, save, synthetic): """ Save benchmark results for m-step ahead probabilistic forecasters @@ -438,5 +630,90 @@ def ahead_dataframe_analytic_columns(experiments): def ahead_dataframe_synthetic_columns(): columns = ["Model", "Order", "Scheme", "Partitions", "CRPS1AVG", "CRPS1STD", "CRPS2AVG", "CRPS2STD", - "SIZE", "TIME1AVG", "TIME2AVG"] + "TIME1AVG", "TIME1STD", "TIME2AVG", "TIME2STD"] return columns + + +def cast_dataframe_to_synthetic_ahead(infile, outfile, experiments): + columns = ahead_dataframe_analytic_columns(experiments) + dat = pd.read_csv(infile, sep=";", usecols=columns) + models = dat.Model.unique() + orders = dat.Order.unique() + schemes = dat.Scheme.unique() + partitions = dat.Partitions.unique() + + data_columns = analytical_data_columns(experiments) + + ret = [] + + for m in models: + for o in orders: + for s in schemes: + for p in partitions: + mod = [] + df = dat[(dat.Model == m) & (dat.Order == o) & (dat.Scheme == s) & (dat.Partitions == p)] + if not df.empty: + crps1 = extract_measure(df, 'CRPS_Interval', data_columns) + crps2 = extract_measure(df, 'CRPS_Distribution', data_columns) + times1 = extract_measure(df, 'TIME_Interval', data_columns) + times2 = extract_measure(df, 'TIME_Distribution', data_columns) + mod.append(m) + mod.append(o) + mod.append(s) + mod.append(p) + mod.append(np.round(np.nanmean(crps1), 2)) + mod.append(np.round(np.nanstd(crps1), 2)) + mod.append(np.round(np.nanmean(crps2), 2)) + mod.append(np.round(np.nanstd(crps2), 2)) + mod.append(np.round(np.nanmean(times1), 2)) + mod.append(np.round(np.nanstd(times1), 2)) + mod.append(np.round(np.nanmean(times2), 4)) + mod.append(np.round(np.nanstd(times2), 4)) + ret.append(mod) + + dat = pd.DataFrame(ret, columns=ahead_dataframe_synthetic_columns()) + dat.to_csv(outfile, sep=";", index=False) + + +def plot_dataframe_ahead(file_synthetic, file_analytic, experiments, tam, save=False, file=None, + sort_columns=['CRPS1AVG', 'CRPS2AVG', 'CRPS1STD', 'CRPS2STD'], + sort_ascend=[True, True, True, True],save_best=False, + ignore=None, replace=None): + + fig, axes = plt.subplots(nrows=2, ncols=1, figsize=tam) + + axes[0].set_title('CRPS Interval Ahead') + axes[1].set_title('CRPS Distribution Ahead') + + dat_syn = pd.read_csv(file_synthetic, sep=";", usecols=ahead_dataframe_synthetic_columns()) + + bests = find_best(dat_syn, sort_columns, sort_ascend) + + dat_ana = pd.read_csv(file_analytic, sep=";", usecols=ahead_dataframe_analytic_columns(experiments)) + + data_columns = analytical_data_columns(experiments) + + if save_best: + dat = pd.DataFrame.from_dict(bests, orient='index') + dat.to_csv(Util.uniquefilename(file_synthetic.replace("synthetic","best")), sep=";", index=False) + + crps1 = [] + crps2 = [] + labels = [] + + for b in sorted(bests.keys()): + if check_ignore_list(b, ignore): + continue + best = bests[b] + df = dat_ana[(dat_ana.Model == best["Model"]) & (dat_ana.Order == best["Order"]) + & (dat_ana.Scheme == best["Scheme"]) & (dat_ana.Partitions == best["Partitions"])] + crps1.append( extract_measure(df,'CRPS_Interval',data_columns) ) + crps2.append(extract_measure(df, 'CRPS_Distribution', data_columns)) + labels.append(check_replace_list(best["Model"] + " " + str(best["Order"]), replace)) + + axes[0].boxplot(crps1, labels=labels, autorange=True, showmeans=True) + axes[1].boxplot(crps2, labels=labels, autorange=True, showmeans=True) + + plt.tight_layout() + Util.showAndSaveImage(fig, file, save) + diff --git a/benchmarks/benchmarks.py b/benchmarks/benchmarks.py index aedb6fa..5c78989 100644 --- a/benchmarks/benchmarks.py +++ b/benchmarks/benchmarks.py @@ -14,7 +14,7 @@ import matplotlib.colors as pltcolors import matplotlib.pyplot as plt import numpy as np import pandas as pd -from mpl_toolkits.mplot3d import Axes3D +#from mpl_toolkits.mplot3d import Axes3D from pyFTS.probabilistic import ProbabilityDistribution from pyFTS import song, chen, yu, ismailefendi, sadaei, hofts, pwfts, ifts, cheng, ensemble, hwang @@ -213,10 +213,35 @@ def point_sliding_window(data, windowsize, train=0.8, models=None, partitioners= return bUtil.save_dataframe_point(experiments, file, objs, rmse, save, sintetic, smape, times, u) +def build_model_pool_point(models, max_order, benchmark_models, benchmark_models_parameters): + pool = [] + if models is None: + models = get_point_methods() + for model in models: + mfts = model("") + + if mfts.is_high_order: + for order in np.arange(1, max_order + 1): + if order >= mfts.min_order: + mfts = model("") + mfts.order = order + pool.append(mfts) + else: + mfts.order = 1 + pool.append(mfts) + + if benchmark_models is not None: + for count, model in enumerate(benchmark_models, start=0): + par = benchmark_models_parameters[count] + mfts = model(str(par if par is not None else "")) + mfts.order = par + pool.append(mfts) + return pool + def all_point_forecasters(data_train, data_test, partitions, max_order=3, statistics=True, residuals=True, series=True, save=False, file=None, tam=[20, 5], models=None, transformation=None, - distributions=False): + distributions=False, benchmark_models=None, benchmark_models_parameters=None): """ Fixed data benchmark for FTS point forecasters :param data_train: data used to train the models @@ -234,8 +259,7 @@ def all_point_forecasters(data_train, data_test, partitions, max_order=3, statis :param distributions: plot distributions :return: """ - if models is None: - models = get_point_methods() + models = build_model_pool_point(models, max_order, benchmark_models, benchmark_models_parameters) objs = [] @@ -247,22 +271,11 @@ def all_point_forecasters(data_train, data_test, partitions, max_order=3, statis for count, model in enumerate(models, start=0): #print(model) - mfts = model("") - if not mfts.is_high_order: - if transformation is not None: - mfts.appendTransformation(transformation) - mfts.train(data_train, data_train_fs.sets) - objs.append(mfts) - lcolors.append( colors[count % ncol] ) - else: - for order in np.arange(1,max_order+1): - if order >= mfts.min_order: - mfts = model(" n = " + str(order)) - if transformation is not None: - mfts.appendTransformation(transformation) - mfts.train(data_train, data_train_fs.sets, order=order) - objs.append(mfts) - lcolors.append(colors[(count + order) % ncol]) + if transformation is not None: + model.appendTransformation(transformation) + model.train(data_train, data_train_fs.sets, order=model.order) + objs.append(model) + lcolors.append( colors[count % ncol] ) if statistics: print_point_statistics(data_test, objs) @@ -421,38 +434,55 @@ def interval_sliding_window(data, windowsize, train=0.8, models=None, partitione return bUtil.save_dataframe_interval(coverage, experiments, file, objs, resolution, save, sharpness, synthetic, times) -def all_interval_forecasters(data_train, data_test, partitions, max_order=3,save=False, file=None, tam=[20, 5], - models=None, transformation=None): +def build_model_pool_interval(models, max_order, benchmark_models, benchmark_models_parameters): + pool = [] if models is None: models = get_interval_methods() + for model in models: + mfts = model("") - objs = [] + if mfts.is_high_order: + for order in np.arange(1, max_order + 1): + if order >= mfts.min_order: + mfts = model("") + mfts.order = order + pool.append(mfts) + else: + mfts.order = 1 + pool.append(mfts) + alphas = [0.05, 0.25] + if benchmark_models is not None: + for count, model in enumerate(benchmark_models, start=0): + par = benchmark_models_parameters[count] + for alpha in alphas: + mfts = model(str(alpha), alpha=alpha) + mfts.order = par + pool.append(mfts) + return pool - data_train_fs = Grid.GridPartitioner(data_train,partitions, transformation=transformation).sets + +def all_interval_forecasters(data_train, data_test, partitions, max_order=3,save=False, file=None, tam=[20, 5], + statistics=False, models=None, transformation=None, + benchmark_models=None, benchmark_models_parameters=None): + models = build_model_pool_interval(models, max_order, benchmark_models, benchmark_models_parameters) + + data_train_fs = Grid.GridPartitioner(data_train, partitions, transformation=transformation).sets lcolors = [] + objs = [] for count, model in Util.enumerate2(models, start=0, step=2): - mfts = model("") - if not mfts.is_high_order: - if transformation is not None: - mfts.appendTransformation(transformation) - mfts.train(data_train, data_train_fs) - objs.append(mfts) - lcolors.append( colors[count % ncol] ) - else: - for order in np.arange(1,max_order+1): - if order >= mfts.min_order: - mfts = model(" n = " + str(order)) - if transformation is not None: - mfts.appendTransformation(transformation) - mfts.train(data_train, data_train_fs, order=order) - objs.append(mfts) - lcolors.append(colors[count % ncol]) + if transformation is not None: + model.appendTransformation(transformation) + model.train(data_train, data_train_fs, order=model.order) + objs.append(model) + lcolors.append( colors[count % ncol] ) - print_interval_statistics(data_test, objs) + if statistics: + print_interval_statistics(data_test, objs) - plot_compared_series(data_test, objs, lcolors, typeonlegend=False, save=save, file=file, tam=tam, intervals=True) + plot_compared_series(data_test, objs, lcolors, typeonlegend=False, save=save, file=file, tam=tam, + points=False, intervals=True) def print_interval_statistics(original, models): @@ -467,15 +497,6 @@ def print_interval_statistics(original, models): print(ret) -def plot_distribution(dist): - for k in dist.index: - alpha = np.array([dist[x][k] for x in dist]) * 100 - x = [k for x in np.arange(0, len(alpha))] - y = dist.columns - plt.scatter(x, y, c=alpha, marker='s', linewidths=0, cmap='Oranges', norm=pltcolors.Normalize(vmin=0, vmax=1), - vmin=0, vmax=1, edgecolors=None) - - def plot_compared_series(original, models, colors, typeonlegend=False, save=False, file=None, tam=[20, 5], points=True, intervals=True, linewidth=1.5): """ @@ -506,11 +527,13 @@ def plot_compared_series(original, models, colors, typeonlegend=False, save=Fals for count, fts in enumerate(models, start=0): if fts.has_point_forecasting and points: forecasted = fts.forecast(original) + if isinstance(forecasted, np.ndarray): + forecasted = forecasted.tolist() mi.append(min(forecasted) * 0.95) ma.append(max(forecasted) * 1.05) for k in np.arange(0, fts.order): forecasted.insert(0, None) - lbl = fts.shortname + lbl = fts.shortname + str(fts.order if fts.is_high_order and not fts.benchmark_only else "") if typeonlegend: lbl += " (Point)" ax.plot(forecasted, color=colors[count], label=lbl, ls="-",linewidth=linewidth) @@ -523,7 +546,7 @@ def plot_compared_series(original, models, colors, typeonlegend=False, save=Fals for k in np.arange(0, fts.order): lower.insert(0, None) upper.insert(0, None) - lbl = fts.shortname + lbl = fts.shortname + " " + str(fts.order if fts.is_high_order and not fts.benchmark_only else "") if typeonlegend: lbl += " (Interval)" if not points and intervals: ls = "-" @@ -556,8 +579,6 @@ def plot_probability_distributions(pmfs, lcolors, tam=[15, 7]): ax.legend(handles0, labels0) - - def ahead_sliding_window(data, windowsize, train, steps, models=None, resolution = None, partitioners=[Grid.GridPartitioner], partitions=[10], max_order=3, transformation=None, indexer=None, dump=False, save=False, file=None, synthetic=False): diff --git a/benchmarks/distributed_benchmarks.py b/benchmarks/distributed_benchmarks.py index 8f49986..46a3702 100644 --- a/benchmarks/distributed_benchmarks.py +++ b/benchmarks/distributed_benchmarks.py @@ -101,13 +101,6 @@ def point_sliding_window(data, windowsize, train=0.8, inc=0.1, models=None, part :return: DataFrame with the results """ - if benchmark_models is None and models is None: - benchmark_models = [naive.Naive, arima.ARIMA, arima.ARIMA, arima.ARIMA, arima.ARIMA, - quantreg.QuantileRegression, quantreg.QuantileRegression] - - if benchmark_models_parameters is None: - benchmark_models_parameters = [1, (1, 0, 0), (1, 0, 1), (2, 0, 1), (2, 0, 2), 1, 2] - cluster = dispy.JobCluster(run_point, nodes=nodes) #, depends=dependencies) http_server = dispy.httpd.DispyHTTPServer(cluster) @@ -116,7 +109,7 @@ def point_sliding_window(data, windowsize, train=0.8, inc=0.1, models=None, part print("Process Start: {0: %H:%M:%S}".format(datetime.datetime.now())) - pool = [] + jobs = [] objs = {} rmse = {} @@ -124,28 +117,7 @@ def point_sliding_window(data, windowsize, train=0.8, inc=0.1, models=None, part u = {} times = {} - if models is None: - models = benchmarks.get_point_methods() - - for model in models: - mfts = model("") - - if mfts.is_high_order: - for order in np.arange(1, max_order + 1): - if order >= mfts.min_order: - mfts = model("") - mfts.order = order - pool.append(mfts) - else: - mfts.order = 1 - pool.append(mfts) - - if benchmark_models is not None: - for count, model in enumerate(benchmark_models, start=0): - par = benchmark_models_parameters[count] - mfts = model(str(par if par is not None else "")) - mfts.order = par - pool.append(mfts) + pool = build_model_pool_point(models, max_order, benchmark_models, benchmark_models_parameters) experiments = 0 for ct, train, test in Util.sliding_window(data, windowsize, train, inc): @@ -204,6 +176,40 @@ def point_sliding_window(data, windowsize, train=0.8, inc=0.1, models=None, part return bUtil.save_dataframe_point(experiments, file, objs, rmse, save, sintetic, smape, times, u) +def build_model_pool_point(models, max_order, benchmark_models, benchmark_models_parameters): + pool = [] + + if benchmark_models is None and models is None: + benchmark_models = [arima.ARIMA, arima.ARIMA, arima.ARIMA, arima.ARIMA, + quantreg.QuantileRegression, quantreg.QuantileRegression] + + if benchmark_models_parameters is None: + benchmark_models_parameters = [(1, 0, 0), (1, 0, 1), (2, 0, 1), (2, 0, 2), 1, 2] + + if models is None: + models = benchmarks.get_point_methods() + for model in models: + mfts = model("") + + if mfts.is_high_order: + for order in np.arange(1, max_order + 1): + if order >= mfts.min_order: + mfts = model("") + mfts.order = order + pool.append(mfts) + else: + mfts.order = 1 + pool.append(mfts) + + if benchmark_models is not None: + for count, model in enumerate(benchmark_models, start=0): + par = benchmark_models_parameters[count] + mfts = model(str(par if par is not None else "")) + mfts.order = par + pool.append(mfts) + return pool + + def run_interval(mfts, partitioner, train_data, test_data, window_key=None, transformation=None, indexer=None): """ Interval forecast benchmark function to be executed on cluster nodes diff --git a/common/Util.py b/common/Util.py index af71d9f..ef8d1c7 100644 --- a/common/Util.py +++ b/common/Util.py @@ -26,9 +26,9 @@ def showAndSaveImage(fig,file,flag,lgd=None): if flag: plt.show() if lgd is not None: - fig.savefig(uniquefilename(file), additional_artists=lgd,bbox_inches='tight') #bbox_extra_artists=(lgd,), ) + fig.savefig(file, additional_artists=lgd,bbox_inches='tight') #bbox_extra_artists=(lgd,), ) else: - fig.savefig(uniquefilename(file)) + fig.savefig(file) plt.close(fig) diff --git a/ifts.py b/ifts.py index f04b239..822b1eb 100644 --- a/ifts.py +++ b/ifts.py @@ -15,7 +15,7 @@ class IntervalFTS(hofts.HighOrderFTS): self.detail = "Silva, P.; GuimarĂ£es, F.; Sadaei, H. (2016)" self.flrgs = {} self.has_point_forecasting = False - self.has_point_forecasting = True + self.has_interval_forecasting = True self.is_high_order = True def getUpper(self, flrg): diff --git a/pwfts.py b/pwfts.py index f88e997..184cbca 100644 --- a/pwfts.py +++ b/pwfts.py @@ -537,9 +537,11 @@ class ProbabilisticWeightedFTS(ifts.IntervalFTS): [intervals[x][0] + (intervals[x][1] - intervals[x][0]) / 2 for x in np.arange(k - self.order, k)]) grid = self.gridCount(grid, resolution, index, np.ravel(qtle_mid)) - tmp = np.array([grid[k] for k in sorted(grid)]) - - ret.append(tmp / sum(tmp)) + tmp = np.array([grid[k] for k in sorted(grid) if not np.isnan(grid[k])]) + try: + ret.append(tmp / sum(tmp)) + except Exception as ex: + ret.append(0) else: ret = [] diff --git a/sadaei.py b/sadaei.py index 438761b..f6befa1 100644 --- a/sadaei.py +++ b/sadaei.py @@ -62,7 +62,7 @@ class ExponentialyWeightedFTS(fts.FTS): flrgs[flr.LHS.name].append(flr.RHS) return (flrgs) - def train(self, data, sets,order=1,parameters=2): + def train(self, data, sets,order=1,parameters=1.05): self.c = parameters self.sets = sets ndata = self.doTransformations(data) diff --git a/tests/general.py b/tests/general.py index dd496b2..681c986 100644 --- a/tests/general.py +++ b/tests/general.py @@ -6,7 +6,7 @@ import numpy as np import pandas as pd import matplotlib as plt import matplotlib.pyplot as plt -from mpl_toolkits.mplot3d import Axes3D +#from mpl_toolkits.mplot3d import Axes3D import pandas as pd from pyFTS.partitioners import Grid, Entropy, FCM, Huarng @@ -41,28 +41,30 @@ DATASETS #taiexpd = pd.read_csv("DataSets/TAIEX.csv", sep=",") #taiex = np.array(taiexpd["avg"][:5000]) +#del(taiexpd) #nasdaqpd = pd.read_csv("DataSets/NASDAQ_IXIC.csv", sep=",") #nasdaq = np.array(nasdaqpd["avg"][0:5000]) +#del(nasdaqpd) #sp500pd = pd.read_csv("DataSets/S&P500.csv", sep=",") #sp500 = np.array(sp500pd["Avg"][11000:]) #del(sp500pd) -sondapd = pd.read_csv("DataSets/SONDA_BSB_HOURLY_AVG.csv", sep=";") -sondapd = sondapd.dropna(axis=0, how='any') -sonda = np.array(sondapd["glo_avg"]) -del(sondapd) +#sondapd = pd.read_csv("DataSets/SONDA_BSB_HOURLY_AVG.csv", sep=";") +#sondapd = sondapd.dropna(axis=0, how='any') +#sonda = np.array(sondapd["glo_avg"]) +#del(sondapd) -#bestpd = pd.read_csv("DataSets/BEST_TAVG.csv", sep=";") -#best = np.array(bestpd["Anomaly"]) -#del(bestpd) +bestpd = pd.read_csv("DataSets/BEST_TAVG.csv", sep=";") +best = np.array(bestpd["Anomaly"]) +del(bestpd) #print(lag) #print(a) -#from pyFTS.benchmarks import benchmarks as bchmk -from pyFTS.benchmarks import distributed_benchmarks as bchmk +from pyFTS.benchmarks import benchmarks as bchmk +#from pyFTS.benchmarks import distributed_benchmarks as bchmk #from pyFTS.benchmarks import parallel_benchmarks as bchmk from pyFTS.benchmarks import Util from pyFTS.benchmarks import arima, quantreg, Measures @@ -102,7 +104,7 @@ bchmk.plot_compared_series(enrollments,[tmp], ['blue','red'], points=False, inte #kk = Measures.get_interval_statistics(nasdaq[1600:1605], tmp) #print(kk) -#""" +""" """ @@ -120,9 +122,9 @@ bchmk.point_sliding_window(sonda, 9000, train=0.8, inc=0.4, #models=[yu.Weighted dump=True, save=True, file="experiments/sondaws_point_analytic_diff.csv", nodes=['192.168.0.103', '192.168.0.106', '192.168.0.108', '192.168.0.109']) #, depends=[hofts, ifts]) -""" -""" + + bchmk.interval_sliding_window(best, 5000, train=0.8, inc=0.8,#models=[yu.WeightedFTS], # # partitioners=[Grid.GridPartitioner], #Entropy.EntropyPartitioner], # FCM.FCMPartitioner, ], @@ -131,28 +133,48 @@ bchmk.interval_sliding_window(best, 5000, train=0.8, inc=0.8,#models=[yu.Weighte "_interval_analytic.csv", nodes=['192.168.0.103', '192.168.0.106', '192.168.0.108', '192.168.0.109']) #, depends=[hofts, ifts]) -bchmk.interval_sliding_window(sp500, 2000, train=0.8, inc=0.2, #models=[yu.WeightedFTS], # # + + +bchmk.interval_sliding_window(taiex, 2000, train=0.8, inc=0.1, #models=[yu.WeightedFTS], # # partitioners=[Grid.GridPartitioner], #Entropy.EntropyPartitioner], # FCM.FCMPartitioner, ], partitions= np.arange(3,20,step=2), transformation=diff, - dump=True, save=True, file="experiments/sp500_analytic_diff.csv", + dump=True, save=True, file="experiments/taiex_interval_analytic_diff.csv", nodes=['192.168.0.103', '192.168.0.106', '192.168.0.108', '192.168.0.109']) #, depends=[hofts, ifts]) + + + + +bchmk.ahead_sliding_window(sonda, 10000, steps=10, resolution=10, train=0.2, inc=0.2, + partitioners=[Grid.GridPartitioner], + partitions= np.arange(10,200,step=10), indexer=ix, + dump=True, save=True, file="experiments/sondawind_ahead_analytic.csv", + nodes=['192.168.0.106', '192.168.0.108', '192.168.0.109']) #, depends=[hofts, ifts]) + + +bchmk.ahead_sliding_window(sonda, 10000, steps=10, resolution=10, train=0.2, inc=0.2, + partitioners=[Grid.GridPartitioner], + partitions= np.arange(3,20,step=2), transformation=diff, indexer=ix, + dump=True, save=True, file="experiments/sondawind_ahead_analytic_diff.csv", + nodes=['192.168.0.106', '192.168.0.108', '192.168.0.109']) #, depends=[hofts, ifts]) + """ -#""" +from pyFTS import pwfts +from pyFTS.common import Transformations +from pyFTS.partitioners import Grid -bchmk.ahead_sliding_window(sonda, 10000, steps=10, resolution=10, train=0.2, inc=0.5, - partitioners=[Grid.GridPartitioner], - partitions= np.arange(10,200,step=10), indexer=ix, - dump=True, save=True, file="experiments/sondasolar_ahead_analytic.csv", - nodes=['192.168.0.106', '192.168.0.108', '192.168.0.109']) #, depends=[hofts, ifts]) +diff = Transformations.Differential(1) +fs = Grid.GridPartitioner(best, 190) #, transformation=diff) -bchmk.ahead_sliding_window(sonda, 10000, steps=10, resolution=10, train=0.2, inc=0.5, - partitioners=[Grid.GridPartitioner], - partitions= np.arange(3,20,step=2), transformation=diff, indexer=ix, - dump=True, save=True, file="experiments/sondasolar_ahead_analytic_diff.csv", - nodes=['192.168.0.106', '192.168.0.108', '192.168.0.109']) #, depends=[hofts, ifts]) +model = pwfts.ProbabilisticWeightedFTS("FTS 1") +#model.appendTransformation(diff) +model.train(best[0:1600],fs.sets, order=3) + +bchmk.plot_compared_intervals_ahead(best[1600:1700],[model], ['blue','red'], + distributions=[True], save=True, file="pictures/best_ahead_forecasts", + time_from=40, time_to=60, resolution=100) """ from pyFTS.partitioners import Grid