From 1ec03d34de299eab6bfeaf48d3bc0efe976677e4 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Petr=C3=B4nio=20C=C3=A2ndido?=
Date: Wed, 20 Sep 2017 14:31:45 -0300
Subject: [PATCH] PWFTS: new one-step-ahead probabilistic and quantile-interval
 forecasts

---
 pyFTS/benchmarks/benchmarks.py                 |  46 ++--
 .../probabilistic/ProbabilityDistribution.py   |  81 ++++---
 pyFTS/pwfts.py                                 | 203 ++++++++++--------
 pyFTS/tests/pwfts.py                           |  35 ++-
 4 files changed, 203 insertions(+), 162 deletions(-)

diff --git a/pyFTS/benchmarks/benchmarks.py b/pyFTS/benchmarks/benchmarks.py
index 41441fa..6a35fcb 100644
--- a/pyFTS/benchmarks/benchmarks.py
+++ b/pyFTS/benchmarks/benchmarks.py
@@ -497,6 +497,20 @@ def print_interval_statistics(original, models):
     print(ret)
 
 
+def plot_interval(axis, intervals, order, label, color='red', typeonlegend=False, ls='-', linewidth=1):
+    lower = [kk[0] for kk in intervals]
+    upper = [kk[1] for kk in intervals]
+    mi = min(lower) * 0.95
+    ma = max(upper) * 1.05
+    for k in np.arange(0, order):
+        lower.insert(0, None)
+        upper.insert(0, None)
+    if typeonlegend: label += " (Interval)"
+    axis.plot(lower, color=color, label=label, ls=ls, linewidth=linewidth)
+    axis.plot(upper, color=color, ls=ls, linewidth=linewidth)
+    return [mi, ma]
+
+
 def plot_compared_series(original, models, colors, typeonlegend=False, save=False, file=None, tam=[20, 5],
                          points=True, intervals=True, linewidth=1.5):
     """
@@ -526,34 +540,28 @@ def plot_compared_series(original, models, colors, typeonlegend=False, save=Fals
 
     for count, fts in enumerate(models, start=0):
         if fts.has_point_forecasting and points:
-            forecasted = fts.forecast(original)
-            if isinstance(forecasted, np.ndarray):
-                forecasted = forecasted.tolist()
-            mi.append(min(forecasted) * 0.95)
-            ma.append(max(forecasted) * 1.05)
+            forecasts = fts.forecast(original)
+            if isinstance(forecasts, np.ndarray):
+                forecasts = forecasts.tolist()
+            mi.append(min(forecasts) * 0.95)
+            ma.append(max(forecasts) * 1.05)
             for k in np.arange(0, fts.order):
-                forecasted.insert(0, None)
+                forecasts.insert(0, None)
             lbl = fts.shortname + str(fts.order if fts.is_high_order and not fts.benchmark_only else "")
             if typeonlegend: lbl += " (Point)"
-            ax.plot(forecasted, color=colors[count], label=lbl, ls="-", linewidth=linewidth)
+            ax.plot(forecasts, color=colors[count], label=lbl, ls="-", linewidth=linewidth)
 
         if fts.has_interval_forecasting and intervals:
-            forecasted = fts.forecastInterval(original)
-            lower = [kk[0] for kk in forecasted]
-            upper = [kk[1] for kk in forecasted]
-            mi.append(min(lower) * 0.95)
-            ma.append(max(upper) * 1.05)
-            for k in np.arange(0, fts.order):
-                lower.insert(0, None)
-                upper.insert(0, None)
+            forecasts = fts.forecastInterval(original)
             lbl = fts.shortname + " " + str(fts.order if fts.is_high_order and not fts.benchmark_only else "")
-            if typeonlegend: lbl += " (Interval)"
             if not points and intervals:
                 ls = "-"
             else:
                 ls = "--"
-            ax.plot(lower, color=colors[count], label=lbl, ls=ls, linewidth=linewidth)
-            ax.plot(upper, color=colors[count], ls=ls, linewidth=linewidth)
+            tmpmi, tmpma = plot_interval(ax, forecasts, fts.order, label=lbl, typeonlegend=typeonlegend,
+                                         color=colors[count], ls=ls, linewidth=linewidth)
+            mi.append(tmpmi)
+            ma.append(tmpma)
 
     handles0, labels0 = ax.get_legend_handles_labels()
     lgd = ax.legend(handles0, labels0, loc=2, bbox_to_anchor=(1, 1))
@@ -825,8 +833,10 @@ def plot_density_rectange(ax, cmap, density, fig, resolution, time_from, time_to
     cb = fig.colorbar(pc, ax=ax)
     cb.set_label('Density')
 
+from pyFTS.common import Transformations
+
 
 def plot_probabilitydistribution_density(ax, cmap, probabilitydist, fig, time_from):
     from matplotlib.patches import Rectangle
     from matplotlib.collections import PatchCollection
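The refactored plotting path above delegates interval rendering to the new plot_interval helper, which can also be called directly. A minimal usage sketch, assuming a trained pyFTS model whose forecastInterval returns a list of [lower, upper] pairs; the names model and test_data are illustrative placeholders, not part of this patch:

    import matplotlib.pyplot as plt
    from pyFTS.benchmarks import benchmarks as bchmk

    fig, ax = plt.subplots(figsize=(15, 5))
    ax.plot(test_data, color='black', label='Original')   # observed series
    intervals = model.forecastInterval(test_data)         # [[lower, upper], ...]
    # draw the interval envelope, shifted right by model.order lags
    mi, ma = bchmk.plot_interval(ax, intervals, model.order, label=model.shortname,
                                 color='blue', typeonlegend=True)
    ax.set_ylim([mi, ma])   # helper returns a padded [min, max] for the y-axis
    ax.legend(loc=2)
    plt.show()
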
diff --git a/pyFTS/probabilistic/ProbabilityDistribution.py b/pyFTS/probabilistic/ProbabilityDistribution.py
index c826b2a..f1eb149 100644
--- a/pyFTS/probabilistic/ProbabilityDistribution.py
+++ b/pyFTS/probabilistic/ProbabilityDistribution.py
@@ -26,11 +26,16 @@ class ProbabilityDistribution(object):
         if self.bins is None:
             self.bins = np.linspace(int(self.uod[0]), int(self.uod[1]), int(self.nbins)).tolist()
-            self.resolution = (self.uod[1] - self.uod[0])/self.nbins
             self.labels = [str(k) for k in self.bins]
 
-        self.index = SortedCollection.SortedCollection(iterable=sorted(self.bins))
+        if self.uod is not None:
+            self.resolution = (self.uod[1] - self.uod[0]) / self.nbins
+
+        self.bin_index = SortedCollection.SortedCollection(iterable=sorted(self.bins))
+        self.quantile_index = None
         self.distribution = {}
+        self.cdf = None
+        self.qtl = None
         self.count = 0
         for k in self.bins:
             self.distribution[k] = 0
@@ -44,13 +49,13 @@ class ProbabilityDistribution(object):
         self.name = kwargs.get("name", "")
 
     def set(self, value, density):
-        k = self.index.find_ge(np.round(value,3))
+        k = self.bin_index.find_ge(value)
         self.distribution[k] = density
 
     def append(self, values):
         if self.type == "histogram":
             for k in values:
-                v = self.index.find_ge(k)
+                v = self.bin_index.find_ge(k)
                 self.distribution[v] += 1
                 self.count += 1
         else:
@@ -63,7 +68,7 @@ class ProbabilityDistribution(object):
     def appendInterval(self, intervals):
         if self.type == "histogram":
             for interval in intervals:
-                for k in self.index.inside(interval[0], interval[1]):
+                for k in self.bin_index.inside(interval[0], interval[1]):
                     self.distribution[k] += 1
                     self.count += 1
 
@@ -77,13 +82,13 @@ class ProbabilityDistribution(object):
 
         for k in values:
             if self.type == "histogram":
-                v = self.index.find_ge(k)
+                v = self.bin_index.find_ge(k)
                 ret.append(self.distribution[v] / self.count)
             elif self.type == "KDE":
                 v = self.kde.probability(k, self.data)
                 ret.append(v)
             else:
-                v = self.index.find_ge(k)
+                v = self.bin_index.find_ge(k)
                 ret.append(self.distribution[v])
 
         if scalar:
@@ -91,28 +96,51 @@ class ProbabilityDistribution(object):
 
         return ret
 
-    def cdf(self, value):
-        ret = 0
-        for k in self.bins:
-            if k < value:
-                ret += self.density(k)
-            else:
-                return ret
+    def build_cdf_qtl(self):
+        ret = 0.0
+        self.cdf = {}
+        self.qtl = {}
+        for k in sorted(self.bins):
+            ret += self.density(k)
+            if k not in self.cdf:
+                self.cdf[k] = ret
 
-        return ret
+            if str(ret) not in self.qtl:
+                self.qtl[str(ret)] = []
+
+            self.qtl[str(ret)].append(k)
+
+        _keys = [float(k) for k in sorted(self.qtl.keys())]
+
+        self.quantile_index = SortedCollection.SortedCollection(iterable=_keys)
 
     def cummulative(self, values):
+        if self.cdf is None:
+            self.build_cdf_qtl()
+
         if isinstance(values, list):
             ret = []
-            for k in values:
-                ret.append(self.cdf(k))
+            for val in values:
+                k = self.bin_index.find_ge(val)
+                ret.append(self.cdf[k])
+            return ret
         else:
-            return self.cdf(values)
+            k = self.bin_index.find_ge(values)
+            return self.cdf[k]
 
+    def quantile(self, values):
+        if self.qtl is None:
+            self.build_cdf_qtl()
 
-    def quantile(self, qt):
-        pass
+        if isinstance(values, list):
+            ret = []
+            for val in values:
+                k = self.quantile_index.find_ge(val)
+                ret.append(self.qtl[str(k)][0])
+        else:
+            k = self.quantile_index.find_ge(values)
+            ret = self.qtl[str(k)][0]
+
+        return ret
 
     def entropy(self):
         h = -sum([self.distribution[k] * np.log(self.distribution[k])
                   if self.distribution[k] > 0 else 0
@@ -157,6 +185,7 @@ class ProbabilityDistribution(object):
         return _s / len(data)
 
     def plot(self,axis=None,color="black",tam=[10, 6], title = None):
+
         if axis is None:
             fig = plt.figure(figsize=tam)
             axis = fig.add_subplot(111)
@@ -177,12 +206,12 @@ class ProbabilityDistribution(object):
         axis.set_ylabel('Probability')
 
     def __str__(self):
-        head = '|'
-        body = '|'
+        ret = ""
         for k in sorted(self.distribution.keys()):
-            head += str(round(k,2)) + '\t|'
+            ret += str(round(k,2)) + ':\t'
             if self.type == "histogram":
-                body += str(round(self.distribution[k] / self.count,3)) + '\t|'
+                ret += str(round(self.distribution[k] / self.count,3))
             else:
-                body += str(round(self.distribution[k], 3)) + '\t|'
-        return head + '\n' + body
+                ret += str(round(self.distribution[k], 6))
+            ret += '\n'
+        return ret
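The new build_cdf_qtl / cummulative / quantile trio can be exercised in isolation. A short sketch, assuming the constructor accepts the type, uod, and num_bins keywords that the attributes read in __init__ suggest; the uniform sample is illustrative:

    import numpy as np
    from pyFTS.probabilistic import ProbabilityDistribution

    # histogram distribution over an assumed universe of discourse [0, 100]
    dist = ProbabilityDistribution.ProbabilityDistribution(type='histogram',
                                                           uod=[0, 100], num_bins=20)
    dist.append(np.random.uniform(0, 100, 5000).tolist())

    print(dist.cummulative(50.0))        # P(X <= 50), close to 0.5 for uniform data
    print(dist.quantile([0.05, 0.95]))   # bin values bracketing ~90% of the mass

Note that the CDF and quantile tables are built lazily on first use, so repeated quantile queries against the same distribution reuse the cached SortedCollection index.
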
diff --git a/pyFTS/pwfts.py b/pyFTS/pwfts.py
index 068f740..5327017 100644
--- a/pyFTS/pwfts.py
+++ b/pyFTS/pwfts.py
@@ -54,8 +54,8 @@ class ProbabilisticWeightedFLRG(hofts.HighOrderFLRG):
 
     def lhs_membership(self,x):
         mv = []
-        for set in self.LHS:
-            mv.append(set.membership(x))
+        for count, set in enumerate(self.LHS):
+            mv.append(set.membership(x[count]))
 
         min_mv = np.prod(mv)
         return min_mv
@@ -92,6 +92,8 @@ class ProbabilisticWeightedFTS(ifts.IntervalFTS):
         self.has_probability_forecasting = True
         self.is_high_order = True
         self.auto_update = kwargs.get('update',False)
+        self.interval_method = kwargs.get('interval_method','extremum')
+        self.alpha = kwargs.get('alpha', 0.05)
 
     def train(self, data, sets, order=1,parameters='Fuzzy'):
 
@@ -374,6 +376,12 @@ class ProbabilisticWeightedFTS(ifts.IntervalFTS):
 
     def forecastInterval(self, data, **kwargs):
 
+        if 'method' in kwargs:
+            self.interval_method = kwargs.get('method','quantile')
+
+        if 'alpha' in kwargs:
+            self.alpha = kwargs.get('alpha', 0.05)
+
         ndata = np.array(self.doTransformations(data))
 
         l = len(ndata)
@@ -382,105 +390,112 @@ class ProbabilisticWeightedFTS(ifts.IntervalFTS):
 
         for k in np.arange(self.order - 1, l):
 
-            # print(k)
-
-            affected_flrgs = []
-            affected_flrgs_memberships = []
-            norms = []
-
-            up = []
-            lo = []
-
-            # Find the sets which membership > 0 for each lag
-            count = 0
-            lags = {}
-            if self.order > 1:
-                subset = ndata[k - (self.order - 1): k + 1]
-
-                for instance in subset:
-                    mb = FuzzySet.fuzzyInstance(instance, self.sets)
-                    tmp = np.argwhere(mb)
-                    idx = np.ravel(tmp)  # flatten the array
-
-                    if idx.size == 0:  # the element is out of the bounds of the Universe of Discourse
-                        if math.isclose(instance, self.sets[0].lower) or instance < self.sets[0].lower:
-                            idx = [0]
-                        elif math.isclose(instance, self.sets[-1].upper) or instance > self.sets[-1].upper:
-                            idx = [len(self.sets) - 1]
-                        else:
-                            raise Exception("Data exceed the known bounds [%s, %s] of universe of discourse: %s" %
-                                            (self.sets[0].lower, self.sets[-1].upper, instance))
-
-                    lags[count] = idx
-                    count += 1
-
-                # Build the tree with all possible paths
-
-                root = tree.FLRGTreeNode(None)
-
-                self.buildTree(root, lags, 0)
-
-                # Trace the possible paths and build the PFLRG's
-
-                for p in root.paths():
-                    path = list(reversed(list(filter(None.__ne__, p))))
-                    flrg = hofts.HighOrderFLRG(self.order)
-                    for kk in path: flrg.appendLHS(self.sets[kk])
-
-                    assert len(flrg.LHS) == subset.size, str(subset) + " -> " + str([s.name for s in flrg.LHS])
-
-                    affected_flrgs.append(flrg)
-
-                    # Find the general membership of FLRG
-                    affected_flrgs_memberships.append(min(self.getSequenceMembership(subset, flrg.LHS)))
-
+            if self.interval_method == 'extremum':
+                self.interval_extremum(k, ndata, ret)
             else:
-
-                mv = FuzzySet.fuzzyInstance(ndata[k], self.sets)  # get all membership values
-                tmp = np.argwhere(mv)  # get the indices of values > 0
-                idx = np.ravel(tmp)  # flatten the array
-
-                if idx.size == 0:  # the element is out of the bounds of the Universe of Discourse
-                    if math.isclose(ndata[k], self.sets[0].lower) or ndata[k] < self.sets[0].lower:
-                        idx = [0]
-                    elif math.isclose(ndata[k], self.sets[-1].upper) or ndata[k] > self.sets[-1].upper:
-                        idx = [len(self.sets) - 1]
-                    else:
-                        raise Exception("Data exceed the known bounds [%s, %s] of universe of discourse: %s" %
-                                        (self.sets[0].lower, self.sets[-1].upper, ndata[k]))
-
-                for kk in idx:
-                    flrg = hofts.HighOrderFLRG(self.order)
-                    flrg.appendLHS(self.sets[kk])
-                    affected_flrgs.append(flrg)
-                    affected_flrgs_memberships.append(mv[kk])
-
-            for count, flrg in enumerate(affected_flrgs):
-                # find the bounds of each FLRG, weighted by probability and membership
-                norm = self.get_flrg_global_probability(flrg) * affected_flrgs_memberships[count]
-                if norm == 0:
-                    norm = self.get_flrg_global_probability(flrg)  # * 0.001
-                up.append(norm * self.getUpper(flrg))
-                lo.append(norm * self.getLower(flrg))
-                norms.append(norm)
-
-            # generate the interval
-            norm = sum(norms)
-            if norm == 0:
-                ret.append([0, 0])
-            else:
-                lo_ = sum(lo) / norm
-                up_ = sum(up) / norm
-                ret.append([lo_, up_])
+                self.interval_quantile(k, ndata, ret)
 
         ret = self.doInverseTransformations(ret, params=[data[self.order - 1:]], interval=True)
 
         return ret
 
+    def interval_quantile(self, k, ndata, ret):
+        # one-step-ahead distribution conditioned on the lag window ending at k
+        dist = self.forecastDistribution(ndata[k - self.order + 1: k + 1])
+        lo_qt = dist[0].quantile(self.alpha)
+        up_qt = dist[0].quantile(1.0 - self.alpha)
+        ret.append([lo_qt, up_qt])
+
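With the kwargs handled in forecastInterval above, both interval strategies are now selectable per call. A short sketch, with model and test_data as illustrative placeholders for a trained ProbabilisticWeightedFTS and a test series:

    # weighted FLRG bounds (the original behaviour)
    ext = model.forecastInterval(test_data, method='extremum')

    # alpha / (1 - alpha) quantiles of the one-step-ahead distribution
    qtl = model.forecastInterval(test_data, method='quantile', alpha=0.05)

    for (elo, eup), (qlo, qup) in zip(ext, qtl):
        print('extremum [%.2f, %.2f]  quantile [%.2f, %.2f]' % (elo, eup, qlo, qup))
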
+    def interval_extremum(self, k, ndata, ret):
+        affected_flrgs = []
+        affected_flrgs_memberships = []
+        norms = []
+        up = []
+        lo = []
+        # Find the sets which membership > 0 for each lag
+        count = 0
+        lags = {}
+        if self.order > 1:
+            subset = ndata[k - (self.order - 1): k + 1]
+
+            for instance in subset:
+                mb = FuzzySet.fuzzyInstance(instance, self.sets)
+                tmp = np.argwhere(mb)
+                idx = np.ravel(tmp)  # flatten the array
+
+                if idx.size == 0:  # the element is out of the bounds of the Universe of Discourse
+                    if math.isclose(instance, self.sets[0].lower) or instance < self.sets[0].lower:
+                        idx = [0]
+                    elif math.isclose(instance, self.sets[-1].upper) or instance > self.sets[-1].upper:
+                        idx = [len(self.sets) - 1]
+                    else:
+                        raise Exception("Data exceeds the known bounds [%s, %s] of the universe of discourse: %s" %
+                                        (self.sets[0].lower, self.sets[-1].upper, instance))
+
+                lags[count] = idx
+                count += 1
+
+            # Build the tree with all possible paths
+
+            root = tree.FLRGTreeNode(None)
+
+            self.buildTree(root, lags, 0)
+
+            # Trace the possible paths and build the PFLRG's
+
+            for p in root.paths():
+                path = list(reversed(list(filter(None.__ne__, p))))
+                flrg = hofts.HighOrderFLRG(self.order)
+                for kk in path: flrg.appendLHS(self.sets[kk])
+
+                assert len(flrg.LHS) == subset.size, str(subset) + " -> " + str([s.name for s in flrg.LHS])
+
+                affected_flrgs.append(flrg)
+
+                # Find the general membership of FLRG
+                affected_flrgs_memberships.append(min(self.getSequenceMembership(subset, flrg.LHS)))
+
+        else:
+
+            mv = FuzzySet.fuzzyInstance(ndata[k], self.sets)  # get all membership values
+            tmp = np.argwhere(mv)  # get the indices of values > 0
+            idx = np.ravel(tmp)  # flatten the array
+
+            if idx.size == 0:  # the element is out of the bounds of the Universe of Discourse
+                if math.isclose(ndata[k], self.sets[0].lower) or ndata[k] < self.sets[0].lower:
+                    idx = [0]
+                elif math.isclose(ndata[k], self.sets[-1].upper) or ndata[k] > self.sets[-1].upper:
+                    idx = [len(self.sets) - 1]
+                else:
+                    raise Exception("Data exceeds the known bounds [%s, %s] of the universe of discourse: %s" %
+                                    (self.sets[0].lower, self.sets[-1].upper, ndata[k]))
+
+            for kk in idx:
+                flrg = hofts.HighOrderFLRG(self.order)
+                flrg.appendLHS(self.sets[kk])
+                affected_flrgs.append(flrg)
+                affected_flrgs_memberships.append(mv[kk])
+
+        for count, flrg in enumerate(affected_flrgs):
+            # find the bounds of each FLRG, weighted by probability and membership
+            norm = self.get_flrg_global_probability(flrg) * affected_flrgs_memberships[count]
+            if norm == 0:
+                norm = self.get_flrg_global_probability(flrg)  # * 0.001
+            up.append(norm * self.getUpper(flrg))
+            lo.append(norm * self.getLower(flrg))
+            norms.append(norm)
+
+        # generate the interval
+        norm = sum(norms)
+        if norm == 0:
+            ret.append([0, 0])
+        else:
+            lo_ = sum(lo) / norm
+            up_ = sum(up) / norm
+            ret.append([lo_, up_])
+
     def forecastDistribution(self, data, **kwargs):
 
-        smooth = kwargs.get("smooth", "histogram")
+        smooth = kwargs.get("smooth", "none")
        nbins = kwargs.get("num_bins", 100)
 
         ndata = np.array(self.doTransformations(data))
@@ -513,8 +528,6 @@ class ProbabilisticWeightedFTS(ifts.IntervalFTS):
 
         return ret
 
-
-
     def forecastAhead(self, data, steps, **kwargs):
 
         ret = [data[k] for k in np.arange(len(data) - self.order, len(data))]
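End to end, the quantile path composes forecastDistribution with the new ProbabilityDistribution.quantile. The sketch below mirrors the enrollments test that follows; train is an illustrative placeholder for any univariate series:

    from pyFTS.partitioners import Grid
    from pyFTS import pwfts

    fs = Grid.GridPartitioner(train, 10)
    model = pwfts.ProbabilisticWeightedFTS('pwfts', partitioner=fs)
    model.train(train, None, 1)

    dist = model.forecastDistribution([train[-1]])   # one-step-ahead distribution
    print(dist[0].quantile([0.05, 0.5, 0.95]))       # median plus a 90% interval
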
diff --git a/pyFTS/tests/pwfts.py b/pyFTS/tests/pwfts.py
index 749e539..437847a 100644
--- a/pyFTS/tests/pwfts.py
+++ b/pyFTS/tests/pwfts.py
@@ -38,19 +38,24 @@ from pyFTS.benchmarks import benchmarks as bchmk
 
 #uod = [10162, 21271]
 
 enrollments_fs1 = Grid.GridPartitioner(enrollments, 6)
-for s in enrollments_fs1.sets:
-    print(s) #.partition_function(uod, 100))
+#for s in enrollments_fs1.sets:
+#    print(s) #.partition_function(uod, 100))
 
 pfts1_enrollments = pwfts.ProbabilisticWeightedFTS("1", partitioner=enrollments_fs1)
 pfts1_enrollments.train(enrollments, None, 1)
 pfts1_enrollments.shortname = "1st Order"
 
-print(pfts1_enrollments)
+#print(pfts1_enrollments)
+
+tmp = pfts1_enrollments.forecastDistribution([15000])
+#print(tmp[0])
+
+print(tmp[0].quantile([0.05, 0.95]))
 
 #pfts1_enrollments.AprioriPDF
-norm = pfts1_enrollments.global_frequency_count
-uod = pfts1_enrollments.get_UoD()
-print(uod)
+#norm = pfts1_enrollments.global_frequency_count
+#uod = pfts1_enrollments.get_UoD()
+
 #for k in sorted(pfts1_enrollments.flrgs.keys())
 #    flrg = pfts1_enrollments.flrgs[k]
 #    tmp = flrg.get_LHSprobability(15000, norm, uod, 100)
@@ -62,23 +67,7 @@ print(uod)
 
 #print(flrg.get_LHSprobability(15000, norm, uod, 100))
 
 #    print(sum([flrg.get_LHSprobability(k, norm, uod, 100) for k in np.linspace(uod[0],uod[1],100)]))
 
-print("P(T+1 | T")
-sets = pfts1_enrollments.setsDict
-t = 15000
-pf = 0.0
-for t1 in np.linspace(uod[0], uod[1], 100):
-    num = []
-    den = []
-    for s in sorted(pfts1_enrollments.flrgs.keys()):
-        flrg = pfts1_enrollments.flrgs[s]
-        pk = flrg.get_LHSprobability(t, norm, uod, 100)
-        wi = flrg.get_RHS_conditional_probability(t1, sets, uod, 100)
-        num.append(wi * pk)
-        den.append(pk)
-    pt1 = sum(num)/sum(den)
-    pf += pt1
-    print(str(round(t1,0)) + ": " + str(round(pt1, 3))) #/sum(den))
-print(pf)
+
 
 '''
 pfts2_enrollments = pwfts.ProbabilisticWeightedFTS("2")