Source code for pyFTS.common.Util

"""
Common facilities for pyFTS
"""

import time
import matplotlib.pyplot as plt
import dill
import numpy as np


def plot_rules(model, size=[5, 5], axis=None, rules_by_axis=None, columns=1):
    """
    Plot the FLRG rule base of a FTS model on a matplotlib axis

    :param model: the trained FTS model
    :param size: figure size
    :param axis: matplotlib axis to plot on; if None a new figure is created
    :param rules_by_axis: number of rules plotted per axis
    :param columns: number of axis columns
    """
    if axis is None and rules_by_axis is None:
        rows = 1
    elif axis is None and rules_by_axis is not None:
        rows = (((len(model.flrgs.keys()) // rules_by_axis)) // columns) + 1

    fig, axis = plt.subplots(nrows=rows, ncols=columns, figsize=size)

    if rules_by_axis is None:
        draw_sets_on_axis(axis, model, size)

    _lhs = model.partitioner.ordered_sets if not model.is_high_order else model.flrgs.keys()

    for ct, key in enumerate(_lhs):

        xticks = []
        xtickslabels = []

        if rules_by_axis is None:
            ax = axis
        else:
            colcount = (ct // rules_by_axis) % columns
            rowcount = (ct // rules_by_axis) // columns

            if rows > 1 and columns > 1:
                ax = axis[rowcount, colcount]
            elif columns > 1:
                ax = axis[rowcount]
            else:
                ax = axis

            if ct % rules_by_axis == 0:
                draw_sets_on_axis(ax, model, size)

        if not model.is_high_order:
            if key in model.flrgs:
                x = (ct % rules_by_axis) + 1
                flrg = model.flrgs[key]
                y = model.sets[key].centroid
                ax.plot([x], [y], 'o')

                xticks.append(x)
                xtickslabels.append(key)

                for rhs in flrg.RHS:
                    dest = model.sets[rhs].centroid
                    ax.arrow(x + .1, y, 0.8, dest - y,  # length_includes_head=True,
                             head_width=0.1, head_length=0.1, shape='full', overhang=0,
                             fc='k', ec='k')
        else:
            flrg = model.flrgs[key]
            x = (ct % rules_by_axis) * model.order + 1

            for ct2, lhs in enumerate(flrg.LHS):
                y = model.sets[lhs].centroid
                ax.plot([x + ct2], [y], 'o')

                xticks.append(x + ct2)
                xtickslabels.append(lhs)

            for ct2 in range(1, model.order):
                fs1 = flrg.LHS[ct2 - 1]
                fs2 = flrg.LHS[ct2]
                y = model.sets[fs1].centroid
                dest = model.sets[fs2].centroid

                ax.plot([x + ct2 - 1, x + ct2], [y, dest], '-')

            y = model.sets[flrg.LHS[-1]].centroid

            for rhs in flrg.RHS:
                dest = model.sets[rhs].centroid
                ax.arrow(x + model.order - 1 + .1, y, 0.8, dest - y,  # length_includes_head=True,
                         head_width=0.1, head_length=0.1, shape='full', overhang=0,
                         fc='k', ec='k')

        ax.set_xticks(xticks)
        ax.set_xticklabels(xtickslabels)
        ax.set_xlim([0, rules_by_axis * model.order + 1])

    plt.tight_layout()
    plt.show()

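# Usage sketch (illustrative, not part of this module): plotting the rule base of a
# trained first-order model. The training data, partitioner settings and model class
# below are assumptions for the example.
#
#   from pyFTS.partitioners import Grid
#   from pyFTS.models import chen
#
#   part = Grid.GridPartitioner(data=train_data, npart=10)
#   model = chen.ConventionalFTS(partitioner=part)
#   model.fit(train_data)
#   plot_rules(model, size=[10, 6], rules_by_axis=10, columns=1)
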
def draw_sets_on_axis(axis, model, size):
    """
    Plot the fuzzy sets of the model's partitioner on a matplotlib axis

    :param axis: matplotlib axis to plot on; if None a new figure is created
    :param model: the FTS model
    :param size: figure size, used only when a new figure is created
    """
    if axis is None:
        fig, axis = plt.subplots(nrows=1, ncols=1, figsize=size)

    for ct, key in enumerate(model.partitioner.ordered_sets):
        fs = model.sets[key]
        axis.plot([0, 1, 0], fs.parameters, label=fs.name)
        axis.axhline(fs.centroid, c="lightgray", alpha=0.5)

    axis.set_xlim([0, len(model.partitioner.ordered_sets)])
    axis.set_xticks(range(0, len(model.partitioner.ordered_sets)))
    tmp = ['']
    tmp.extend(model.partitioner.ordered_sets)
    axis.set_xticklabels(tmp)
    axis.set_ylim([model.partitioner.min, model.partitioner.max])
    axis.set_yticks([model.sets[k].centroid for k in model.partitioner.ordered_sets])
    axis.set_yticklabels([str(round(model.sets[k].centroid, 1)) + " - " + k
                          for k in model.partitioner.ordered_sets])

current_milli_time = lambda: int(round(time.time() * 1000))
def uniquefilename(name):
    if '.' in name:
        tmp = name.split('.')
        return tmp[0] + str(current_milli_time()) + '.' + tmp[1]
    else:
        return name + str(current_milli_time())

def show_and_save_image(fig, file, flag, lgd=None):
    """
    Show an image and save it to a file

    :param fig: Matplotlib Figure object
    :param file: filename to save the picture
    :param flag: if True the image will be shown and saved
    :param lgd: legend
    """
    if flag:
        plt.show()
        if lgd is not None:
            fig.savefig(file, bbox_extra_artists=(lgd,), bbox_inches='tight')
        else:
            fig.savefig(file)
        plt.close(fig)

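# Usage sketch (illustrative): creating a figure and delegating display and saving to
# show_and_save_image. The file name is an arbitrary example.
#
#   fig, ax = plt.subplots(figsize=[6, 4])
#   ax.plot([1, 2, 3], [2, 4, 8])
#   lgd = ax.legend(['series'])
#   show_and_save_image(fig, 'example_plot.png', True, lgd=lgd)
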
def enumerate2(xs, start=0, step=1):
    for x in xs:
        yield (start, x)
        start += step

def sliding_window(data, windowsize, train=0.8, inc=0.1, **kwargs):
    """
    Sliding window method of cross validation for time series

    :param data: the entire dataset
    :param windowsize: window size
    :param train: percentage of the window size used for training the models
    :param inc: percentage of the window size used as the increment to slide the window
    :return: window count, training set, test set
    """
    l = len(data)
    ttrain = int(round(windowsize * train, 0))
    ic = int(round(windowsize * inc, 0))

    progressbar = kwargs.get('progress', None)

    rng = np.arange(0, l - windowsize + ic, ic)

    if progressbar:
        from tqdm import tqdm
        rng = tqdm(rng)

    for count in rng:
        if count + windowsize > l:
            _end = l
        else:
            _end = count + windowsize
        yield (count, data[count: count + ttrain], data[count + ttrain: _end])

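# Usage sketch (illustrative): iterating over the windows produced by sliding_window
# on a synthetic series. With windowsize=100, train=0.8 and inc=0.1, each window yields
# 80 training points and up to 20 test points, sliding forward 10 points at a time.
#
#   data = np.random.normal(0, 1, 1000)
#   for count, train_set, test_set in sliding_window(data, windowsize=100, train=0.8, inc=0.1):
#       print(count, len(train_set), len(test_set))
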
def persist_obj(obj, file):
    """
    Persist an object to the filesystem. This function depends on the dill package

    :param obj: object in memory
    :param file: file name to store the object
    """
    with open(file, 'wb') as _file:
        dill.dump(obj, _file)

def load_obj(file):
    """
    Load into memory an object stored on the filesystem. This function depends on the dill package

    :param file: file name where the object is stored
    :return: the loaded object
    """
    with open(file, 'rb') as _file:
        obj = dill.load(_file)
    return obj

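# Usage sketch (illustrative): round-tripping a trained model through the filesystem.
# The file name and the model/test data are arbitrary examples.
#
#   persist_obj(model, 'model.pkl')
#   ...
#   model = load_obj('model.pkl')
#   forecasts = model.predict(test_data)
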
def persist_env(file):
    """
    Persist an entire environment to a file. This function depends on the dill package

    :param file: file name to store the environment
    """
    dill.dump_session(file)

def load_env(file):
    """
    Load an environment previously saved with persist_env. This function depends on the dill package

    :param file: file name where the environment is stored
    """
    dill.load_session(file)

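# Usage sketch (illustrative): saving and later restoring a whole working session,
# e.g. to resume long experiments. The file name is an arbitrary example.
#
#   persist_env('session.pkl')
#   ...
#   load_env('session.pkl')
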
def start_dispy_cluster(method, nodes):
    import dispy, dispy.httpd, logging

    cluster = dispy.JobCluster(method, nodes=nodes, loglevel=logging.DEBUG, ping_interval=1000)

    http_server = dispy.httpd.DispyHTTPServer(cluster)

    return cluster, http_server

def stop_dispy_cluster(cluster, http_server):
    cluster.wait()  # wait for all jobs to finish

    cluster.print_status()

    http_server.shutdown()  # this waits until browser gets all updates
    cluster.close()

def simple_model_train(model, data, parameters):
    model.train(data, **parameters)
    return model

def distributed_train(model, train_method, nodes, fts_method, data, num_batches=10,
                      train_parameters={}, **kwargs):
    import dispy, dispy.httpd, datetime

    batch_save = kwargs.get('batch_save', False)  # save model between batches

    batch_save_interval = kwargs.get('batch_save_interval', 1)

    file_path = kwargs.get('file_path', None)

    cluster, http_server = start_dispy_cluster(train_method, nodes)

    print("[{0: %H:%M:%S}] Distributed Train Started".format(datetime.datetime.now()))

    jobs = []
    n = len(data)
    batch_size = int(n / num_batches)
    bcount = 1
    for ct in range(model.order, n, batch_size):
        if model.is_multivariate:
            ndata = data.iloc[ct - model.order:ct + batch_size]
        else:
            ndata = data[ct - model.order: ct + batch_size]

        tmp_model = fts_method(str(bcount))

        tmp_model.clone_parameters(model)

        job = cluster.submit(tmp_model, ndata, train_parameters)
        job.id = bcount  # associate an ID to identify jobs (if needed later)
        jobs.append(job)

        bcount += 1

    for job in jobs:
        print("[{0: %H:%M:%S}] Processing batch ".format(datetime.datetime.now()) + str(job.id))
        tmp = job()
        if job.status == dispy.DispyJob.Finished and tmp is not None:
            model.merge(tmp)

            if batch_save and (job.id % batch_save_interval) == 0:
                persist_obj(model, file_path)

        else:
            print(job.exception)
            print(job.stdout)

        print("[{0: %H:%M:%S}] Finished batch ".format(datetime.datetime.now()) + str(job.id))

    print("[{0: %H:%M:%S}] Distributed Train Finished".format(datetime.datetime.now()))

    stop_dispy_cluster(cluster, http_server)

    return model

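# Usage sketch (illustrative): training a model in parallel over a dispy cluster.
# The node addresses, model class, partitioner and dataset below are assumptions
# for the example, not part of this module.
#
#   from pyFTS.models import hofts
#
#   model = hofts.HighOrderFTS(partitioner=part, order=2)
#   model = distributed_train(model, simple_model_train,
#                             ['192.168.0.110', '192.168.0.111'],
#                             hofts.HighOrderFTS, train_data, num_batches=20,
#                             batch_save=True, file_path='model.pkl')
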
def simple_model_predict(model, data, parameters):
    return model.predict(data, **parameters)

def distributed_predict(model, parameters, nodes, data, num_batches):
    import dispy, dispy.httpd

    cluster, http_server = start_dispy_cluster(simple_model_predict, nodes)

    jobs = []
    n = len(data)
    batch_size = int(n / num_batches)
    bcount = 1
    for ct in range(model.order, n, batch_size):
        if model.is_multivariate:
            ndata = data.iloc[ct - model.order:ct + batch_size]
        else:
            ndata = data[ct - model.order: ct + batch_size]

        job = cluster.submit(model, ndata, parameters)
        job.id = bcount  # associate an ID to identify jobs (if needed later)
        jobs.append(job)
        bcount += 1

    ret = []

    for job in jobs:
        tmp = job()
        if job.status == dispy.DispyJob.Finished and tmp is not None:
            if job.id < batch_size:
                ret.extend(tmp[:-1])
            else:
                ret.extend(tmp)
        else:
            print(job.exception)
            print(job.stdout)

    stop_dispy_cluster(cluster, http_server)

    return ret

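# Usage sketch (illustrative): producing forecasts in parallel on the same cluster
# nodes used for training. The node addresses and test data are assumptions for the
# example; the empty dict means no extra parameters are forwarded to model.predict.
#
#   forecasts = distributed_predict(model, {},
#                                   ['192.168.0.110', '192.168.0.111'],
#                                   test_data, num_batches=20)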