From 8f2d2c8bcd47ee61ddf0312b57ab7c7c521491cf Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Petr=C3=B4nio=20C=C3=A2ndido=20de=20Lima=20e=20Silva?=
Date: Fri, 13 Jan 2017 21:42:00 -0200
Subject: [PATCH] Optimize forecastAheadDistribution into a more reliable
 greedy method
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 benchmarks/Measures.py     |   3 +
 benchmarks/benchmarks.py   |  48 ++++++---
 common/SortedCollection.py | 211 +++++++++++++++++++++++++++++++++++++
 ifts.py                    |   3 +
 partitioners/Grid.py       |   2 -
 pfts.py                    | 129 +++++++++++++++++------
 tests/__init__.py          |   0
 tests/pfts.py              |  38 +++++++
 8 files changed, 386 insertions(+), 48 deletions(-)
 create mode 100644 common/SortedCollection.py
 create mode 100644 tests/__init__.py
 create mode 100644 tests/pfts.py

diff --git a/benchmarks/Measures.py b/benchmarks/Measures.py
index 510af99..7343403 100644
--- a/benchmarks/Measures.py
+++ b/benchmarks/Measures.py
@@ -1,3 +1,6 @@
+#!/usr/bin/python
+# -*- coding: utf8 -*-
+
 import numpy as np
 import pandas as pd

diff --git a/benchmarks/benchmarks.py b/benchmarks/benchmarks.py
index bc424a6..91df83f 100644
--- a/benchmarks/benchmarks.py
+++ b/benchmarks/benchmarks.py
@@ -1,14 +1,19 @@
+#!/usr/bin/python
+# -*- coding: utf8 -*-
+
 import numpy as np
 import pandas as pd
 import matplotlib as plt
 import matplotlib.colors as pltcolors
 import matplotlib.pyplot as plt
 from mpl_toolkits.mplot3d import Axes3D
-from sklearn.cross_validation import KFold
+#from sklearn.cross_validation import KFold
 from pyFTS.benchmarks import Measures
 from pyFTS.partitioners import Grid
 from pyFTS.common import Membership, FuzzySet, FLR, Transformations
+import time

+current_milli_time = lambda: int(round(time.time() * 1000))

 def getIntervalStatistics(original, models):
     ret = "Model & RMSE & MAPE & Sharpness & Resolution & Coverage \\ \n"
@@ -32,6 +37,15 @@ def plotDistribution(dist):
                vmin=0, vmax=1, edgecolors=None)


+def uniquefilename(name):
+    if '.' in name:
+        tmp = name.split('.')
+        return tmp[0] + str(current_milli_time()) + '.' + tmp[1]
+    else:
+        return name + str(current_milli_time())
+
+
 def plotComparedSeries(original, models, colors, typeonlegend=False, save=False, file=None,tam=[20, 5]):
     fig = plt.figure(figsize=tam)
     ax = fig.add_subplot(111)
@@ -76,15 +90,17 @@ def plotComparedSeries(original, models, colors, typeonlegend=False, save=False,
     ax.set_xlim([0, len(original)])

     if save:
-        fig.savefig(file)
+        plt.show()
+        fig.savefig(uniquefilename(file))
         plt.close(fig)


-def plotComparedIntervalsAhead(original, models, colors, distributions, time_from, time_to, interpol=False, save=False, file=None,tam=[20, 5]):
+def plotComparedIntervalsAhead(original, models, colors, distributions, time_from, time_to,
+                               interpol=False, save=False, file=None,tam=[20, 5], resolution=None):
     fig = plt.figure(figsize=tam)
     ax = fig.add_subplot(111)

-    percentile = (max(original) - min(original))/100
+    if resolution is None: resolution = (max(original) - min(original))/100

     mi = []
     ma = []
@@ -92,26 +108,27 @@ def plotComparedIntervalsAhead(original, models, colors, distributions, time_fro
     count = 0
     for fts in models:
         if fts.hasDistributionForecasting and distributions[count]:
-            density = fts.forecastAheadDistribution(original[time_from - fts.order:time_from], time_to, percentile)
+            density = fts.forecastAheadDistribution2(original[time_from - fts.order:time_from], time_to, resolution)

             y = density.columns
             t = len(y)

             # interpol between time_from and time_from+1
-            if interpol:
-                diffs = [density[q][0] / 50 for q in density]
-                for p in np.arange(0, 50):
-                    xx = [(time_from - 1) + 0.02 * p for q in np.arange(0, t)]
-                    alpha2 = np.array([diffs[q] * p for q in np.arange(0, t)]) * 100
-                    ax.scatter(xx, y, c=alpha2, marker='s', linewidths=0, cmap='Oranges',
-                               norm=pltcolors.Normalize(vmin=0, vmax=1), vmin=0, vmax=1, edgecolors=None)
+            #if interpol:
+            #    diffs = [density[q][0] / 50 for q in density]
+            #    for p in np.arange(0, 50):
+            #        xx = [(time_from - 1) + 0.02 * p for q in np.arange(0, t)]
+            #        alpha2 = np.array([diffs[q] * p for q in np.arange(0, t)]) * 100
+            #        ax.scatter(xx, y, c=alpha2, marker='s', linewidths=0, cmap='Oranges',
+            #                   norm=pltcolors.Normalize(vmin=0, vmax=1), vmin=0, vmax=1, edgecolors=None)

             for k in density.index:
                 alpha = np.array([density[q][k] for q in density]) * 100

                 x = [time_from + k for x in np.arange(0, t)]
-                ax.scatter(x, y, c=alpha, marker='s', linewidths=0, cmap='Oranges',
-                           norm=pltcolors.Normalize(vmin=0, vmax=1), vmin=0, vmax=1, edgecolors=None)
+                for cc in np.arange(0,resolution,5):
+                    ax.scatter(x, y+cc, c=alpha, marker='s', linewidths=0, cmap='Oranges',
+                               norm=pltcolors.Normalize(vmin=0, vmax=1), vmin=0, vmax=1, edgecolors=None)

                 if interpol and k < max(density.index):
                     diffs = [(density[q][k + 1] - density[q][k])/50 for q in density]
                     for p in np.arange(0,50):
@@ -151,7 +168,8 @@ def plotComparedIntervalsAhead(original, models, colors, distributions, time_fro
     ax.set_xlim([0, len(original)])

     if save:
-        fig.savefig(file)
+        plt.show()
+        fig.savefig(uniquefilename(file))
         plt.close(fig)


diff --git a/common/SortedCollection.py b/common/SortedCollection.py
new file mode 100644
index 0000000..f2dd8de
--- /dev/null
+++ b/common/SortedCollection.py
@@ -0,0 +1,211 @@
+from bisect import bisect_left, bisect_right
+
+#
+# Original Source Code: https://code.activestate.com/recipes/577197-sortedcollection/
+# Author: RAYMOND HETTINGER
+
+class SortedCollection(object):
+    '''Sequence sorted by a key function.
+
+    SortedCollection() is much easier to work with than using bisect() directly.
+    It supports key functions like those used in sorted(), min(), and max().
+    The result of the key function call is saved so that keys can be searched
+    efficiently.
+
+    Instead of returning an insertion-point which can be hard to interpret, the
+    five find-methods return a specific item in the sequence. They can scan for
+    exact matches, the last item less-than-or-equal to a key, or the first item
+    greater-than-or-equal to a key.
+
+    Once found, an item's ordinal position can be located with the index() method.
+    New items can be added with the insert() and insert_right() methods.
+    Old items can be deleted with the remove() method.
+
+    The usual sequence methods are provided to support indexing, slicing,
+    length lookup, clearing, copying, forward and reverse iteration, contains
+    checking, item counts, item removal, and a nice looking repr.
+
+    Finding and indexing are O(log n) operations while iteration and insertion
+    are O(n). The initial sort is O(n log n).
+
+    The key function is stored in the 'key' attribute for easy introspection or
+    so that you can assign a new key function (triggering an automatic re-sort).
+
+    In short, the class was designed to handle all of the common use cases for
+    bisect but with a simpler API and support for key functions.
+
+    >>> from pprint import pprint
+    >>> from operator import itemgetter
+
+    >>> s = SortedCollection(key=itemgetter(2))
+    >>> for record in [
+    ...         ('roger', 'young', 30),
+    ...         ('angela', 'jones', 28),
+    ...         ('bill', 'smith', 22),
+    ...         ('david', 'thomas', 32)]:
+    ...     s.insert(record)
+
+    >>> pprint(list(s))         # show records sorted by age
+    [('bill', 'smith', 22),
+     ('angela', 'jones', 28),
+     ('roger', 'young', 30),
+     ('david', 'thomas', 32)]
+
+    >>> s.find_le(29)           # find oldest person aged 29 or younger
+    ('angela', 'jones', 28)
+    >>> s.find_lt(28)           # find oldest person under 28
+    ('bill', 'smith', 22)
+    >>> s.find_gt(28)           # find youngest person over 28
+    ('roger', 'young', 30)
+
+    >>> r = s.find_ge(32)       # find youngest person aged 32 or older
+    >>> s.index(r)              # get the index of their record
+    3
+    >>> s[3]                    # fetch the record at that index
+    ('david', 'thomas', 32)
+
+    >>> s.key = itemgetter(0)   # now sort by first name
+    >>> pprint(list(s))
+    [('angela', 'jones', 28),
+     ('bill', 'smith', 22),
+     ('david', 'thomas', 32),
+     ('roger', 'young', 30)]
+
+    '''
+
+    def __init__(self, iterable=(), key=None):
+        self._given_key = key
+        key = (lambda x: x) if key is None else key
+        decorated = sorted((key(item), item) for item in iterable)
+        self._keys = [k for k, item in decorated]
+        self._items = [item for k, item in decorated]
+        self._key = key
+
+    def _getkey(self):
+        return self._key
+
+    def _setkey(self, key):
+        if key is not self._key:
+            self.__init__(self._items, key=key)
+
+    def _delkey(self):
+        self._setkey(None)
+
+    key = property(_getkey, _setkey, _delkey, 'key function')
+
+    def clear(self):
+        self.__init__([], self._key)
+
+    def copy(self):
+        return self.__class__(self, self._key)
+
+    def __len__(self):
+        return len(self._items)
+
+    def __getitem__(self, i):
+        return self._items[i]
+
+    def __iter__(self):
+        return iter(self._items)
+
+    def __reversed__(self):
+        return reversed(self._items)
+
+    def __repr__(self):
+        return '%s(%r, key=%s)' % (
+            self.__class__.__name__,
+            self._items,
+            getattr(self._given_key, '__name__', repr(self._given_key))
+        )
+
+    def __reduce__(self):
+        return self.__class__, (self._items, self._given_key)
+
+    def __contains__(self, item):
+        k = self._key(item)
+        i = bisect_left(self._keys, k)
+        j = bisect_right(self._keys, k)
+        return item in self._items[i:j]
+
+    def index(self, item):
+        'Find the position of an item. Raise ValueError if not found.'
+        k = self._key(item)
+        i = bisect_left(self._keys, k)
+        j = bisect_right(self._keys, k)
+        return self._items[i:j].index(item) + i
+
+    def count(self, item):
+        'Return number of occurrences of item'
+        k = self._key(item)
+        i = bisect_left(self._keys, k)
+        j = bisect_right(self._keys, k)
+        return self._items[i:j].count(item)
+
+    def insert(self, item):
+        'Insert a new item. If equal keys are found, add to the left'
+        k = self._key(item)
+        i = bisect_left(self._keys, k)
+        self._keys.insert(i, k)
+        self._items.insert(i, item)
+
+    def insert_right(self, item):
+        'Insert a new item. If equal keys are found, add to the right'
+        k = self._key(item)
+        i = bisect_right(self._keys, k)
+        self._keys.insert(i, k)
+        self._items.insert(i, item)
+
+    def remove(self, item):
+        'Remove first occurrence of item. Raise ValueError if not found'
+        i = self.index(item)
+        del self._keys[i]
+        del self._items[i]
+
+    def find(self, k):
+        'Return first item with a key == k. Raise ValueError if not found.'
+        i = bisect_left(self._keys, k)
+        if i != len(self) and self._keys[i] == k:
+            return self._items[i]
+        raise ValueError('No item found with key equal to: %r' % (k,))
+
+    def find_le(self, k):
+        'Return last item with a key <= k. Raise ValueError if not found.'
+        i = bisect_right(self._keys, k)
+        if i:
+            return self._items[i-1]
+        raise ValueError('No item found with key at or below: %r' % (k,))
+
+    def find_lt(self, k):
+        'Return last item with a key < k. Raise ValueError if not found.'
+        i = bisect_left(self._keys, k)
+        if i:
+            return self._items[i-1]
+        raise ValueError('No item found with key below: %r' % (k,))
+
+    def find_ge(self, k):
+        'Return first item with a key >= k. Raise ValueError if not found'
+        i = bisect_left(self._keys, k)
+        if i != len(self):
+            return self._items[i]
+        raise ValueError('No item found with key at or above: %r' % (k,))
+
+    def find_gt(self, k):
+        'Return first item with a key > k. Raise ValueError if not found'
+        i = bisect_right(self._keys, k)
+        if i != len(self):
+            return self._items[i]
+        raise ValueError('No item found with key above: %r' % (k,))
+
+    def between(self, ge, le):
+        g = bisect_left(self._keys, ge)
+        l = bisect_right(self._keys, le)
+        if g != len(self) and l != len(self):
+            return self._items[g : l]
+        raise ValueError('No item found between keys: %r and %r' % (ge, le))
+
+    def inside(self, ge, le):
+        g = bisect_right(self._keys, ge)
+        l = bisect_left(self._keys, le)
+        if g != len(self) and l != len(self):
+            return self._items[g : l]
+        raise ValueError('No item found between keys: %r and %r' % (ge, le))
\ No newline at end of file
diff --git a/ifts.py b/ifts.py
index b6f98a4..bb257ae 100644
--- a/ifts.py
+++ b/ifts.py
@@ -1,3 +1,6 @@
+#!/usr/bin/python
+# -*- coding: utf8 -*-
+
 import numpy as np
 from pyFTS.common import FuzzySet,FLR
 from pyFTS import hofts, fts, tree
diff --git a/partitioners/Grid.py b/partitioners/Grid.py
index 91bd9af..70361b9 100644
--- a/partitioners/Grid.py
+++ b/partitioners/Grid.py
@@ -11,10 +11,8 @@ def GridPartitionerTrimf(data, npart, names=None, prefix="A"):
     sets = []
     dmax = max(data)
     dmax += dmax * 0.1
-    print(dmax)
     dmin = min(data)
     dmin -= dmin * 0.1
-    print(dmin)
     dlen = dmax - dmin
     partlen = math.ceil(dlen / npart)
     #p2 = partlen / 2
diff --git a/pfts.py b/pfts.py
index de95f58..40256b4 100644
--- a/pfts.py
+++ b/pfts.py
@@ -1,10 +1,15 @@
+#!/usr/bin/python
+# -*- coding: utf8 -*-
+
 import numpy as np
 import pandas as pd
 import math
-from pyFTS.common import FuzzySet, FLR
+from operator import itemgetter
+from pyFTS.common import FuzzySet, FLR, SortedCollection
 from pyFTS import hofts, ifts, tree

+
 class ProbabilisticFLRG(hofts.HighOrderFLRG):
     def __init__(self, order):
         super(ProbabilisticFLRG, self).__init__(order)
@@ -173,9 +178,9 @@ class ProbabilisticFTS(ifts.IntervalFTS):
             idx = np.ravel(tmp)  # flatten the array

             if idx.size == 0:  # the element is out of the bounds of the Universe of Discourse
-                if math.ceil(ndata[k]) <= self.sets[0].lower:
+                if ndata[k] <= self.sets[0].lower:
                     idx = [0]
-                elif math.ceil(ndata[k]) >= self.sets[-1].upper:
+                elif ndata[k] >= self.sets[-1].upper:
                     idx = [len(self.sets) - 1]
                 else:
                     raise Exception(ndata[k])
@@ -349,53 +354,111 @@ class ProbabilisticFTS(ifts.IntervalFTS):
             grid[sbin] = grid[sbin] + 1
         return grid

-    def forecastDistributionAhead2(self, data, steps, resolution):
+    def gridCountIndexed(self, grid, resolution, index, interval):
+        #print(interval)
+        for k in index.inside(interval[0],interval[1]):
+            #print(k)
+            grid[k] += 1
+        return grid
+
+    def buildTreeWithoutOrder(self, node, lags, level):
+
+        if level not in lags:
+            return
+
+        for s in lags[level]:
+            node.appendChild(tree.FLRGTreeNode(s))
+
+        for child in node.getChildren():
+            self.buildTreeWithoutOrder(child, lags, level + 1)
+
+    def forecastAheadDistribution2(self, data, steps, resolution):
         ret = []

-        intervals = self.forecastAhead(data, steps)
+        intervals = self.forecastAheadInterval(data, steps)

-        for k in np.arange(self.order, steps):
+        lags = {}

-            grid = self.getGridClean(resolution)
-            grid = self.gridCount(grid, resolution, intervals[k])
+        cc = 0

-            lags = {}
+        for i in intervals:
+            nq = 2 * cc
+            if nq == 0: nq = 1
+            if nq > 50: nq = 50
+            st = 50 / nq

-            cc = 0
-            for x in np.arange(k - self.order, k):
-                tmp = []
-                for qt in np.arange(0, 100, 5):
-                    tmp.append(intervals[x][0] + qt * (intervals[x][1] - intervals[x][0]) / 100)
-                    tmp.append(intervals[x][1] - qt * (intervals[x][1] - intervals[x][0]) / 100)
-                tmp.append(intervals[x][0] + (intervals[x][1] - intervals[x][0]) / 2)
+            quantiles = []

-                lags[cc] = tmp
+            for qt in np.arange(0, 50, st):
+                quantiles.append(i[0] + qt * ((i[1] - i[0]) / 100))
+                quantiles.append(i[0] - qt * ((i[1] - i[0]) / 100))
+            quantiles.append(i[0] + ((i[1] - i[0]) / 2))

-                cc = cc + 1
-            # Build the tree with all possible paths
+            quantiles = list(set(quantiles))

-            root = tree.FLRGTreeNode(None)
+            quantiles.sort()

-            self.buildTree(root, lags, 0)
+            #print(quantiles)

-            # Trace the possible paths and build the PFLRG's
+            lags[cc] = quantiles

-            for p in root.paths():
-                path = list(reversed(list(filter(None.__ne__, p))))
+            cc += 1

-                subset = [kk for kk in path]
+        # Build the tree with all possible paths

-                qtle = self.forecast(subset)
-                grid = self.gridCount(grid, resolution, np.ravel(qtle))
+        root = tree.FLRGTreeNode(None)

-            tmp = np.array([grid[k] for k in sorted(grid)])
+        self.buildTreeWithoutOrder(root, lags, 0)
+
+        #print(root)
+
+        #return
+
+        # Trace the possible paths and build the PFLRG's
+
+        grid = self.getGridClean(resolution)
+
+        ##index = SortedCollection.SortedCollection(key=lambda (k,v): itemgetter(1)(v))
+
+        index = SortedCollection.SortedCollection(iterable=grid.keys())
+
+        grids = []
+        for k in np.arange(0, steps):
+            grids.append(self.getGridClean(resolution))
+
+        for p in root.paths():
+            path = list(reversed(list(filter(None.__ne__, p))))
+
+            #print(path)
+
+            for k in np.arange(self.order, steps + self.order):
+
+                sample = path[k - self.order : k]
+
+                #print(sample)
+
+                qtle = self.forecastInterval(sample)
+
+                #grids[k - self.order] = self.gridCountPoints(grids[k - self.order], resolution, np.ravel(qtle))
+
+                # grids[k - self.order] = self.gridCount(grids[k - self.order], resolution, np.ravel(qtle))
+
+                grids[k - self.order] = self.gridCountIndexed(grids[k - self.order], resolution, index, np.ravel(qtle))
+
+        #return
+
+        #print(grid)
+
+        for k in np.arange(0, steps):
+            tmp = np.array([grids[k][q] for q in sorted(grids[k])])
             ret.append(tmp / sum(tmp))

         grid = self.getGridClean(resolution)
         df = pd.DataFrame(ret, columns=sorted(grid))

         return df
+

     def forecastAheadDistribution(self, data, steps, resolution):
         ret = []
@@ -407,14 +470,18 @@ class ProbabilisticFTS(ifts.IntervalFTS):
             grid = self.getGridClean(resolution)
             grid = self.gridCount(grid, resolution, intervals[k])

-            for qt in np.arange(1, 50, 2):
+            nq = 2 * k
+            if nq > 50: nq = 50
+            st = 50 / nq
+
+            for qt in np.arange(0, 50, st):
                 # print(qt)
                 qtle_lower = self.forecastInterval(
-                    [intervals[x][0] + qt * (intervals[x][1] - intervals[x][0]) / 100 for x in
+                    [intervals[x][0] + qt * ((intervals[x][1] - intervals[x][0]) / 100 ) for x in
                      np.arange(k - self.order, k)])
                 grid = self.gridCount(grid, resolution, np.ravel(qtle_lower))
                 qtle_upper = self.forecastInterval(
-                    [intervals[x][1] - qt * (intervals[x][1] - intervals[x][0]) / 100 for x in
+                    [intervals[x][1] - qt * ((intervals[x][1] - intervals[x][0]) / 100 ) for x in
                      np.arange(k - self.order, k)])
                 grid = self.gridCount(grid, resolution, np.ravel(qtle_upper))
                 qtle_mid = self.forecastInterval(
diff --git a/tests/__init__.py b/tests/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/tests/pfts.py b/tests/pfts.py
new file mode 100644
index 0000000..38d107e
--- /dev/null
+++ b/tests/pfts.py
@@ -0,0 +1,38 @@
+#!/usr/bin/python
+# -*- coding: utf8 -*-
+
+import os
+import numpy as np
+import pandas as pd
+import matplotlib as plt
+import matplotlib.pyplot as plt
+from mpl_toolkits.mplot3d import Axes3D
+
+import pandas as pd
+from pyFTS.partitioners import Grid
+from pyFTS.common import FLR,FuzzySet,Membership
+from pyFTS import fts
+from pyFTS import hofts
+from pyFTS import ifts
+from pyFTS import pfts
+from pyFTS import tree
+from pyFTS.benchmarks import benchmarks as bchmk
+
+
+os.chdir("/home/petronio/dados/Dropbox/Doutorado/Disciplinas/AdvancedFuzzyTimeSeriesModels/")
+
+enrollments = pd.read_csv("DataSets/Enrollments.csv", sep=";")
+enrollments = np.array(enrollments["Enrollments"])
+
+enrollments_fs1 = Grid.GridPartitionerTrimf(enrollments,6)
+
+pfts1_enrollments = pfts.ProbabilisticFTS("1")
+pfts1_enrollments.train(enrollments,enrollments_fs1,1)
+pfts1_enrollments.shortname = "1st Order"
+pfts2_enrollments = pfts.ProbabilisticFTS("2")
+pfts2_enrollments.dump = False
+pfts2_enrollments.shortname = "2nd Order"
+pfts2_enrollments.train(enrollments,enrollments_fs1,2)
+
+
+pfts1_enrollments.forecastAheadDistribution2(enrollments[:15],5,100)
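
For reference, a minimal standalone sketch of the grid-counting idea behind the new forecastAheadDistribution2: each forecast interval is sampled at a bounded number of quantile points and the samples are counted on a fixed-resolution grid, then each step is normalized into an empirical density. This sketch uses only NumPy, leaves out the lag tree and the re-forecasting of each path that the patch performs, and every name and value in it (interval_density, the intervals, lo, hi) is hypothetical rather than part of pyFTS.

import numpy as np

def interval_density(intervals, lo, hi, resolution, max_quantiles=50):
    # Accumulate quantile samples of each forecast interval on a fixed grid
    # and normalize each step into an empirical density (illustration only).
    edges = np.arange(lo, hi + resolution, resolution)
    densities = []
    for step, (a, b) in enumerate(intervals):
        nq = min(max(2 * step, 1), max_quantiles)           # bounded number of quantile points
        samples = a + np.linspace(0.0, 1.0, nq) * (b - a)   # evenly spaced points inside [a, b]
        counts, _ = np.histogram(samples, bins=edges)
        densities.append(counts / counts.sum())
    return edges[:-1], np.array(densities)

# hypothetical forecast intervals, one per step ahead
intervals = [(10.0, 12.0), (9.5, 13.0), (9.0, 14.5)]
grid, dens = interval_density(intervals, lo=8.0, hi=16.0, resolution=0.5)
print(dens.shape)   # (steps, grid bins); each row sums to 1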