Otimização de forecastAheadDistribution para um método greedy mais confiável

2017-01-13 21:42:00 -02:00 · 2017-01-13 21:42:00 -02:00 · 8f2d2c8bcd
commit 8f2d2c8bcd
parent 53029681d8
8 changed files with 386 additions and 48 deletions
--- a/benchmarks/Measures.py
+++ b/benchmarks/Measures.py
@ -1,3 +1,6 @@
+#!/usr/bin/python
+# -*- coding: utf8 -*-
+
 import numpy as np
 import pandas as pd

--- a/benchmarks/benchmarks.py
+++ b/benchmarks/benchmarks.py
@ -1,14 +1,19 @@
+#!/usr/bin/python
+# -*- coding: utf8 -*-
+
 import numpy as np
 import pandas as pd
 import matplotlib as plt
 import matplotlib.colors as pltcolors
 import matplotlib.pyplot as plt
 from mpl_toolkits.mplot3d import Axes3D
-from sklearn.cross_validation import KFold
+#from sklearn.cross_validation import KFold
 from pyFTS.benchmarks import Measures
 from pyFTS.partitioners import Grid
 from pyFTS.common import Membership, FuzzySet, FLR, Transformations
+import time

+current_milli_time = lambda: int(round(time.time() * 1000))

 def getIntervalStatistics(original, models):
    ret = "Model		& RMSE		& MAPE		& Sharpness		& Resolution		& Coverage	\\ \n"
@ -32,6 +37,15 @@ def plotDistribution(dist):
                    vmin=0, vmax=1, edgecolors=None)


+def uniquefilename(name):
+    """Return `name` with the current millisecond timestamp inserted before its extension."""
+    # Split at the final extension dot only: 'a.b.png' keeps '.png' and './out/fig' stays intact.
+    import os.path
+    root, ext = os.path.splitext(name)
+    return root + str(current_milli_time()) + ext
+
+
+
 def plotComparedSeries(original, models, colors, typeonlegend=False, save=False, file=None,tam=[20, 5]):
    fig = plt.figure(figsize=tam)
    ax = fig.add_subplot(111)
@ -76,15 +90,17 @@ def plotComparedSeries(original, models, colors, typeonlegend=False, save=False,
    ax.set_xlim([0, len(original)])

    if save:
-        fig.savefig(file)
+        plt.show()
+        fig.savefig(uniquefilename(file))
        plt.close(fig)


-def plotComparedIntervalsAhead(original, models, colors, distributions, time_from, time_to, interpol=False, save=False, file=None,tam=[20, 5]):
+def plotComparedIntervalsAhead(original, models, colors, distributions, time_from, time_to,
+                               interpol=False, save=False, file=None,tam=[20, 5], resolution=None):
    fig = plt.figure(figsize=tam)
    ax = fig.add_subplot(111)

-    percentile = (max(original) - min(original))/100
+    if resolution is None: resolution = (max(original) - min(original))/100

    mi = []
    ma = []
@ -92,25 +108,26 @@ def plotComparedIntervalsAhead(original, models, colors, distributions, time_fro
    count = 0
    for fts in models:
        if fts.hasDistributionForecasting and distributions[count]:
-            density = fts.forecastAheadDistribution(original[time_from - fts.order:time_from], time_to, percentile)
+            density = fts.forecastAheadDistribution2(original[time_from - fts.order:time_from], time_to, resolution)

            y = density.columns
            t = len(y)

            # interpol between time_from and time_from+1
-            if interpol:
-                diffs = [density[q][0] / 50 for q in density]
-                for p in np.arange(0, 50):
-                    xx = [(time_from - 1) + 0.02 * p for q in np.arange(0, t)]
-                    alpha2 = np.array([diffs[q] * p for q in np.arange(0, t)]) * 100
-                    ax.scatter(xx, y, c=alpha2, marker='s', linewidths=0, cmap='Oranges',
-                               norm=pltcolors.Normalize(vmin=0, vmax=1), vmin=0, vmax=1, edgecolors=None)
+            #if interpol:
+            #    diffs = [density[q][0] / 50 for q in density]
+            #    for p in np.arange(0, 50):
+            #        xx = [(time_from - 1) + 0.02 * p for q in np.arange(0, t)]
+            #        alpha2 = np.array([diffs[q] * p for q in np.arange(0, t)]) * 100
+            #        ax.scatter(xx, y, c=alpha2, marker='s', linewidths=0, cmap='Oranges',
+            #                   norm=pltcolors.Normalize(vmin=0, vmax=1), vmin=0, vmax=1, edgecolors=None)
            for k in density.index:
                alpha = np.array([density[q][k] for q in density]) * 100

                x = [time_from  + k for x in np.arange(0, t)]

-                ax.scatter(x, y, c=alpha, marker='s', linewidths=0, cmap='Oranges',
+                for cc in np.arange(0,resolution,5):
+                    ax.scatter(x, y+cc, c=alpha, marker='s', linewidths=0, cmap='Oranges',
                               norm=pltcolors.Normalize(vmin=0, vmax=1), vmin=0, vmax=1, edgecolors=None)
                if interpol and k < max(density.index):
                    diffs = [(density[q][k + 1] - density[q][k])/50 for q in density]
@ -151,7 +168,8 @@ def plotComparedIntervalsAhead(original, models, colors, distributions, time_fro
    ax.set_xlim([0, len(original)])

    if save:
-        fig.savefig(file)
+        plt.show()
+        fig.savefig(uniquefilename(file))
        plt.close(fig)


--- a/common/SortedCollection.py
+++ b/common/SortedCollection.py
@ -0,0 +1,211 @@
+from bisect import bisect_left, bisect_right
+
+#
+# Original Source Code: https://code.activestate.com/recipes/577197-sortedcollection/
+# Author: RAYMOND HETTINGER
+
+class SortedCollection(object):
+    '''Sequence sorted by a key function.
+
+    SortedCollection() is much easier to work with than using bisect() directly.
+    It supports key functions like those use in sorted(), min(), and max().
+    The result of the key function call is saved so that keys can be searched
+    efficiently.
+
+    Instead of returning an insertion-point which can be hard to interpret, the
+    five find-methods return a specific item in the sequence. They can scan for
+    exact matches, the last item less-than-or-equal to a key, or the first item
+    greater-than-or-equal to a key.
+
+    Once found, an item's ordinal position can be located with the index() method.
+    New items can be added with the insert() and insert_right() methods.
+    Old items can be deleted with the remove() method.
+
+    The usual sequence methods are provided to support indexing, slicing,
+    length lookup, clearing, copying, forward and reverse iteration, contains
+    checking, item counts, item removal, and a nice looking repr.
+
+    Finding and indexing are O(log n) operations while iteration and insertion
+    are O(n).  The initial sort is O(n log n).
+
+    The key function is stored in the 'key' attibute for easy introspection or
+    so that you can assign a new key function (triggering an automatic re-sort).
+
+    In short, the class was designed to handle all of the common use cases for
+    bisect but with a simpler API and support for key functions.
+
+    >>> from pprint import pprint
+    >>> from operator import itemgetter
+
+    >>> s = SortedCollection(key=itemgetter(2))
+    >>> for record in [
+    ...         ('roger', 'young', 30),
+    ...         ('angela', 'jones', 28),
+    ...         ('bill', 'smith', 22),
+    ...         ('david', 'thomas', 32)]:
+    ...     s.insert(record)
+
+    >>> pprint(list(s))         # show records sorted by age
+    [('bill', 'smith', 22),
+     ('angela', 'jones', 28),
+     ('roger', 'young', 30),
+     ('david', 'thomas', 32)]
+
+    >>> s.find_le(29)           # find oldest person aged 29 or younger
+    ('angela', 'jones', 28)
+    >>> s.find_lt(28)           # find oldest person under 28
+    ('bill', 'smith', 22)
+    >>> s.find_gt(28)           # find youngest person over 28
+    ('roger', 'young', 30)
+
+    >>> r = s.find_ge(32)       # find youngest person aged 32 or older
+    >>> s.index(r)              # get the index of their record
+    3
+    >>> s[3]                    # fetch the record at that index
+    ('david', 'thomas', 32)
+
+    >>> s.key = itemgetter(0)   # now sort by first name
+    >>> pprint(list(s))
+    [('angela', 'jones', 28),
+     ('bill', 'smith', 22),
+     ('david', 'thomas', 32),
+     ('roger', 'young', 30)]
+
+    '''
+
+    def __init__(self, iterable=(), key=None):
+        self._given_key = key
+        key = (lambda x: x) if key is None else key
+        decorated = sorted((key(item), item) for item in iterable)
+        self._keys = [k for k, item in decorated]
+        self._items = [item for k, item in decorated]
+        self._key = key
+
+    def _getkey(self):
+        return self._key
+
+    def _setkey(self, key):
+        if key is not self._key:
+            self.__init__(self._items, key=key)
+
+    def _delkey(self):
+        self._setkey(None)
+
+    key = property(_getkey, _setkey, _delkey, 'key function')
+
+    def clear(self):
+        self.__init__([], self._key)
+
+    def copy(self):
+        return self.__class__(self, self._key)
+
+    def __len__(self):
+        return len(self._items)
+
+    def __getitem__(self, i):
+        return self._items[i]
+
+    def __iter__(self):
+        return iter(self._items)
+
+    def __reversed__(self):
+        return reversed(self._items)
+
+    def __repr__(self):
+        return '%s(%r, key=%s)' % (
+            self.__class__.__name__,
+            self._items,
+            getattr(self._given_key, '__name__', repr(self._given_key))
+        )
+
+    def __reduce__(self):
+        return self.__class__, (self._items, self._given_key)
+
+    def __contains__(self, item):
+        k = self._key(item)
+        i = bisect_left(self._keys, k)
+        j = bisect_right(self._keys, k)
+        return item in self._items[i:j]
+
+    def index(self, item):
+        'Find the position of an item.  Raise ValueError if not found.'
+        k = self._key(item)
+        i = bisect_left(self._keys, k)
+        j = bisect_right(self._keys, k)
+        return self._items[i:j].index(item) + i
+
+    def count(self, item):
+        'Return number of occurrences of item'
+        k = self._key(item)
+        i = bisect_left(self._keys, k)
+        j = bisect_right(self._keys, k)
+        return self._items[i:j].count(item)
+
+    def insert(self, item):
+        'Insert a new item.  If equal keys are found, add to the left'
+        k = self._key(item)
+        i = bisect_left(self._keys, k)
+        self._keys.insert(i, k)
+        self._items.insert(i, item)
+
+    def insert_right(self, item):
+        'Insert a new item.  If equal keys are found, add to the right'
+        k = self._key(item)
+        i = bisect_right(self._keys, k)
+        self._keys.insert(i, k)
+        self._items.insert(i, item)
+
+    def remove(self, item):
+        'Remove first occurence of item.  Raise ValueError if not found'
+        i = self.index(item)
+        del self._keys[i]
+        del self._items[i]
+
+    def find(self, k):
+        'Return first item with a key == k.  Raise ValueError if not found.'
+        i = bisect_left(self._keys, k)
+        if i != len(self) and self._keys[i] == k:
+            return self._items[i]
+        raise ValueError('No item found with key equal to: %r' % (k,))
+
+    def find_le(self, k):
+        'Return last item with a key <= k.  Raise ValueError if not found.'
+        i = bisect_right(self._keys, k)
+        if i:
+            return self._items[i-1]
+        raise ValueError('No item found with key at or below: %r' % (k,))
+
+    def find_lt(self, k):
+        'Return last item with a key < k.  Raise ValueError if not found.'
+        i = bisect_left(self._keys, k)
+        if i:
+            return self._items[i-1]
+        raise ValueError('No item found with key below: %r' % (k,))
+
+    def find_ge(self, k):
+        'Return first item with a key >= equal to k.  Raise ValueError if not found'
+        i = bisect_left(self._keys, k)
+        if i != len(self):
+            return self._items[i]
+        raise ValueError('No item found with key at or above: %r' % (k,))
+
+    def find_gt(self, k):
+        'Return first item with a key > k.  Raise ValueError if not found'
+        i = bisect_right(self._keys, k)
+        if i != len(self):
+            return self._items[i]
+        raise ValueError('No item found with key above: %r' % (k,))
+
+    def between(self, ge, le):
+        'Return the items with ge <= key <= le, in key order (empty list if none).'
+        # A bound past the largest key is not an error: bisect clamps to
+        # len(self) and the slice simply runs to the end of the collection.
+        g = bisect_left(self._keys, ge)
+        l = bisect_right(self._keys, le)
+        return self._items[g:l]
+
+    def inside(self, ge, le):
+        'Return the items with ge < key < le, in key order (empty list if none).'
+        g = bisect_right(self._keys, ge)
+        l = bisect_left(self._keys, le)
+        return self._items[g:l]
--- a/ifts.py
+++ b/ifts.py
@ -1,3 +1,6 @@
+#!/usr/bin/python
+# -*- coding: utf8 -*-
+
 import numpy as np
 from pyFTS.common import FuzzySet,FLR
 from pyFTS import hofts, fts, tree
--- a/partitioners/Grid.py
+++ b/partitioners/Grid.py
@ -11,10 +11,8 @@ def GridPartitionerTrimf(data, npart, names=None, prefix="A"):
    sets = []
    dmax = max(data)
    dmax += dmax * 0.1
-    print(dmax)
    dmin = min(data)
    dmin -= dmin * 0.1
-    print(dmin)
    dlen = dmax - dmin
    partlen = math.ceil(dlen / npart)
    #p2 = partlen / 2
--- a/pfts.py
+++ b/pfts.py
@ -1,10 +1,15 @@
+#!/usr/bin/python
+# -*- coding: utf8 -*-
+
 import numpy as np
 import pandas as pd
 import math
-from pyFTS.common import FuzzySet, FLR
+from operator import itemgetter
+from pyFTS.common import FuzzySet, FLR, SortedCollection
 from pyFTS import hofts, ifts, tree


+
 class ProbabilisticFLRG(hofts.HighOrderFLRG):
    def __init__(self, order):
        super(ProbabilisticFLRG, self).__init__(order)
@ -173,9 +178,9 @@ class ProbabilisticFTS(ifts.IntervalFTS):
                idx = np.ravel(tmp)  # flatten the array

                if idx.size == 0:  # the element is out of the bounds of the Universe of Discourse
-                    if math.ceil(ndata[k]) <= self.sets[0].lower:
+                    if ndata[k] <= self.sets[0].lower:
                        idx = [0]
-                    elif math.ceil(ndata[k]) >= self.sets[-1].upper:
+                    elif ndata[k] >= self.sets[-1].upper:
                        idx = [len(self.sets) - 1]
                    else:
                        raise Exception(ndata[k])
@ -349,53 +354,111 @@ class ProbabilisticFTS(ifts.IntervalFTS):
                grid[sbin] = grid[sbin] + 1
        return grid

-    def forecastDistributionAhead2(self, data, steps, resolution):
+    def gridCountIndexed(self, grid, resolution, index, interval):
+        # Add one count to every grid key that index.inside() returns for
+        # (interval[0], interval[1]); `resolution` is not used here.
+        for k in index.inside(interval[0],interval[1]):
+            # `grid` is updated in place and is also the return value.
+            grid[k] += 1
+        return grid
+
+    def buildTreeWithoutOrder(self, node, lags, level):
+        # Recursively grow the tree under `node`: every value in lags[level]
+        # becomes a child, and each child is then expanded with level + 1.

+        # Stop once `lags` has no entry for this depth.
+        if level not in lags:
+            return

+        for s in lags[level]:
+            node.appendChild(tree.FLRGTreeNode(s))

+        for child in node.getChildren():
+            self.buildTreeWithoutOrder(child, lags, level + 1)
+
+    def forecastAheadDistribution2(self, data, steps, resolution):
+        """Return a DataFrame of normalised grid counts, one row per forecast step.
+
+        Interval forecasts for `steps` steps are sampled into quantile points,
+        every combination of points is enumerated as a tree path, and the
+        interval forecast of each window of `self.order` points on a path is
+        counted on that step's grid. Columns are the sorted keys of
+        getGridClean(resolution).
+        """

         ret = []

-        intervals = self.forecastAhead(data, steps)
-
-        for k in np.arange(self.order, steps):
-
-            grid = self.getGridClean(resolution)
-            grid = self.gridCount(grid, resolution, intervals[k])
+        intervals = self.forecastAheadInterval(data, steps)

         lags = {}

         cc = 0
-            for x in np.arange(k - self.order, k):
-                tmp = []
-                for qt in np.arange(0, 100, 5):
-                    tmp.append(intervals[x][0] + qt * (intervals[x][1] - intervals[x][0]) / 100)
-                    tmp.append(intervals[x][1] - qt * (intervals[x][1] - intervals[x][0]) / 100)
-                tmp.append(intervals[x][0] + (intervals[x][1] - intervals[x][0]) / 2)

-                lags[cc] = tmp
+        for i in intervals:
+            # Number of quantile steps grows with the position of the interval, capped at 50.
+            nq = 2 * cc
+            if nq == 0: nq = 1
+            if nq > 50: nq = 50
+            st = 50 / nq
+
+            quantiles = []
+
+            for qt in np.arange(0, 50, st):
+                quantiles.append(i[0] + qt * ((i[1] - i[0]) / 100))
+                # Mirror the offset from the upper bound i[1] (as forecastAheadDistribution
+                # does); subtracting from i[0] would place points below the interval.
+                quantiles.append(i[1] - qt * ((i[1] - i[0]) / 100))
+            quantiles.append(i[0] + ((i[1] - i[0]) / 2))
+
+            quantiles = list(set(quantiles))
+
+            quantiles.sort()
+
+            #print(quantiles)
+
+            lags[cc] = quantiles
+
+            cc += 1

-                cc = cc + 1
         # Build the tree with all possible paths

         root = tree.FLRGTreeNode(None)

-            self.buildTree(root, lags, 0)
+        self.buildTreeWithoutOrder(root, lags, 0)
+
+        #print(root)
+
+        #return

         # Trace the possible paths and build the PFLRG's

+        grid = self.getGridClean(resolution)
+
+        ##index = SortedCollection.SortedCollection(key=lambda (k,v): itemgetter(1)(v))
+
+        index = SortedCollection.SortedCollection(iterable=grid.keys())
+
+        grids = []
+        for k in np.arange(0, steps):
+            grids.append(self.getGridClean(resolution))
+
         for p in root.paths():
             path = list(reversed(list(filter(None.__ne__, p))))

-                subset = [kk for kk in path]
+            #print(path)

-                qtle = self.forecast(subset)
-                grid = self.gridCount(grid, resolution, np.ravel(qtle))
+            for k in np.arange(self.order, steps + self.order):

-            tmp = np.array([grid[k] for k in sorted(grid)])
+                sample = path[k - self.order : k]
+
+                #print(sample)
+
+                qtle = self.forecastInterval(sample)
+
+                #grids[k - self.order] = self.gridCountPoints(grids[k - self.order], resolution, np.ravel(qtle))
+
+                # grids[k - self.order] = self.gridCount(grids[k - self.order], resolution, np.ravel(qtle))
+
+                grids[k - self.order] = self.gridCountIndexed(grids[k - self.order], resolution, index, np.ravel(qtle))
+
+                #return
+
+                #print(grid)
+
+        for k in np.arange(0, steps):
+            tmp = np.array([grids[k][q] for q in sorted(grids[k])])
             ret.append(tmp / sum(tmp))

         grid = self.getGridClean(resolution)
         df = pd.DataFrame(ret, columns=sorted(grid))
         return df

+
    def forecastAheadDistribution(self, data, steps, resolution):

        ret = []
@ -407,14 +470,18 @@ class ProbabilisticFTS(ifts.IntervalFTS):
            grid = self.getGridClean(resolution)
            grid = self.gridCount(grid, resolution, intervals[k])

-            for qt in np.arange(1, 50, 2):
+            nq = 2 * k
+            if nq > 50: nq = 50
+            st = 50 / nq
+
+            for qt in np.arange(0, 50, st):
                # print(qt)
                qtle_lower = self.forecastInterval(
-                    [intervals[x][0] + qt * (intervals[x][1] - intervals[x][0]) / 100 for x in
+                    [intervals[x][0] + qt * ((intervals[x][1] - intervals[x][0]) / 100 ) for x in
                     np.arange(k - self.order, k)])
                grid = self.gridCount(grid, resolution, np.ravel(qtle_lower))
                qtle_upper = self.forecastInterval(
-                    [intervals[x][1] - qt * (intervals[x][1] - intervals[x][0]) / 100 for x in
+                    [intervals[x][1] - qt * ((intervals[x][1] - intervals[x][0]) / 100 ) for x in
                     np.arange(k - self.order, k)])
                grid = self.gridCount(grid, resolution, np.ravel(qtle_upper))
            qtle_mid = self.forecastInterval(
--- a/tests/init.py
+++ b/tests/init.py
--- a/tests/pfts.py
+++ b/tests/pfts.py
@ -0,0 +1,38 @@
+#!/usr/bin/python
+# -*- coding: utf8 -*-
+
+import os
+import numpy as np
+import pandas as pd
+import matplotlib as plt
+import matplotlib.pyplot as plt
+from mpl_toolkits.mplot3d import Axes3D
+
+import pandas as pd
+from pyFTS.partitioners import Grid
+from pyFTS.common import FLR,FuzzySet,Membership
+from pyFTS import fts
+from pyFTS import hofts
+from pyFTS import ifts
+from pyFTS import pfts
+from pyFTS import tree
+from pyFTS.benchmarks import benchmarks as bchmk
+
+
+# NOTE(review): hard-coded absolute path; the relative CSV path below is
+# resolved against it, so this script only runs where that directory exists.
+os.chdir("/home/petronio/dados/Dropbox/Doutorado/Disciplinas/AdvancedFuzzyTimeSeriesModels/")
+
+# Load the ';'-separated CSV and keep the "Enrollments" column as a NumPy array.
+enrollments = pd.read_csv("DataSets/Enrollments.csv", sep=";")
+enrollments = np.array(enrollments["Enrollments"])
+
+# Partition the data with GridPartitionerTrimf using npart=6.
+enrollments_fs1 = Grid.GridPartitionerTrimf(enrollments,6)
+
+# Train two models on the same data and partitions; the last train() argument
+# is presumably the model order (1 and 2) -- TODO confirm against train().
+pfts1_enrollments = pfts.ProbabilisticFTS("1")
+pfts1_enrollments.train(enrollments,enrollments_fs1,1)
+pfts1_enrollments.shortname = "1st Order"
+pfts2_enrollments = pfts.ProbabilisticFTS("2")
+pfts2_enrollments.dump = False
+pfts2_enrollments.shortname = "2nd Order"
+pfts2_enrollments.train(enrollments,enrollments_fs1,2)
+
+
+# Exercise forecastAheadDistribution2 with the first 15 samples, steps=5 and
+# resolution=100; the returned DataFrame is discarded (nothing is asserted).
+pfts1_enrollments.forecastAheadDistribution2(enrollments[:15],5,100)