Minor bugfixes on pwfts.models

parent f3c6eda2ec
commit 33dbeb8965
@@ -215,6 +215,17 @@ def pinball_mean(tau, targets, forecasts):
         print(ex)
 
 
+def brier_score(targets, densities):
+    '''Brier (1950). "Verification of Forecasts Expressed in Terms of Probability". Monthly Weather Review. 78: 1–3. '''
+    ret = []
+    for ct, d in enumerate(densities):
+        v = d.bin_index.find_ge(targets[ct])
+        score = sum([d.distribution[k] ** 2 for k in d.bins if k != v])
+        score += (d.distribution[v] - 1) ** 2
+        ret.append(score)
+    return sum(ret)/len(ret)
+
+
 def pmf_to_cdf(density):
     ret = []
     for row in density.index:
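For context, the new brier_score averages, over all observations, the squared distance between the forecast probability mass function and the one-hot indicator of the bin containing the observed value. A minimal self-contained sketch; plain dicts stand in here for the pyFTS distribution objects, which expose `bins`, `distribution` and `bin_index` in the real API:

```python
# Hedged sketch of the Brier score over discretized densities.
# Assumption: each density is a dict {bin_value: probability};
# real pyFTS distributions expose .bins, .distribution and .bin_index.

def brier_score_sketch(targets, densities):
    scores = []
    for target, density in zip(targets, densities):
        # smallest bin >= target, mirroring bin_index.find_ge()
        v = min(b for b in density if b >= target)
        # squared error against the one-hot "perfect" forecast
        score = sum(p ** 2 for b, p in density.items() if b != v)
        score += (density[v] - 1) ** 2
        scores.append(score)
    return sum(scores) / len(scores)

# A forecast that puts all mass on the correct bin scores 0 (best case).
print(brier_score_sketch([10], [{5: 0.0, 10: 1.0, 15: 0.0}]))  # 0.0
```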
@@ -236,7 +247,6 @@ def heavyside_cdf(bins, targets):
     df = pd.DataFrame(ret, columns=bins)
     return df
 
-
 def crps(targets, densities):
     '''
     Continuous Ranked Probability Score
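The CRPS compares the forecast CDF against the Heaviside step function at the observed value (cf. pmf_to_cdf and heavyside_cdf above). A rough sketch of one discrete form, under the simplifying assumption of uniform bin weights; the pyFTS implementation may weight and normalize differently:

```python
import numpy as np

# Sketch: discrete CRPS as the mean squared gap between the forecast CDF
# and the empirical (Heaviside) CDF, averaged over bins and observations.
def crps_sketch(targets, cdfs, bins):
    total = 0.0
    for target, cdf in zip(targets, cdfs):
        heaviside = np.array([1.0 if b >= target else 0.0 for b in bins])
        total += np.mean((np.array(cdf) - heaviside) ** 2)
    return total / len(targets)

bins = [0, 5, 10, 15]
# a perfectly sharp forecast at bin 10 scores 0
print(crps_sketch([10], [[0.0, 0.0, 1.0, 1.0]], bins))  # 0.0
```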
@@ -277,6 +287,7 @@ def get_point_statistics(data, model, **kwargs):
     '''
 
     steps_ahead = kwargs.get('steps_ahead',1)
+    kwargs['type'] = 'point'
 
     indexer = kwargs.get('indexer', None)
 
@@ -301,7 +312,7 @@ def get_point_statistics(data, model, **kwargs):
         nforecasts = []
         for k in np.arange(model.order, len(ndata)-steps_ahead,steps_ahead_sampler):
             sample = ndata[k - model.order: k]
-            tmp = model.forecast_ahead(sample, steps_ahead, **kwargs)
+            tmp = model.predict(sample, **kwargs)
             nforecasts.append(tmp[-1])
 
         start = model.order + steps_ahead -1
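The pattern above is a rolling-origin evaluation: slide a window of `model.order` past observations, forecast `steps_ahead` points with predict() (the horizon now travels inside kwargs instead of a positional argument), and keep only the last forecast of each call. A standalone sketch with a hypothetical toy model:

```python
import numpy as np

class ToyModel:
    """Hypothetical stand-in for an FTS model: order-sized input, naive forecasts."""
    order = 2
    def predict(self, sample, **kwargs):
        steps_ahead = kwargs.get('steps_ahead', 1)
        return [sample[-1]] * steps_ahead  # repeat the last observed value

data = list(range(20))
model, steps_ahead = ToyModel(), 3
forecasts = []
for k in np.arange(model.order, len(data) - steps_ahead):
    sample = data[k - model.order: k]
    tmp = model.predict(sample, steps_ahead=steps_ahead)
    forecasts.append(tmp[-1])  # keep only the horizon-th forecast
print(forecasts[:5])
```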
@@ -323,6 +334,7 @@ def get_interval_statistics(data, model, **kwargs):
     '''
 
     steps_ahead = kwargs.get('steps_ahead', 1)
+    kwargs['type'] = 'interval'
 
     ret = list()
 
@@ -339,7 +351,7 @@ def get_interval_statistics(data, model, **kwargs):
         forecasts = []
         for k in np.arange(model.order, len(data) - steps_ahead):
             sample = data[k - model.order: k]
-            tmp = model.predict(sample, steps_ahead, **kwargs)
+            tmp = model.predict(sample, **kwargs)
             forecasts.append(tmp[-1])
 
         start = model.order + steps_ahead -1
@@ -362,12 +374,13 @@ def get_distribution_statistics(data, model, **kwargs):
     :return: a list with the CRPS and execution time
     '''
     steps_ahead = kwargs.get('steps_ahead', 1)
+    kwargs['type'] = 'distribution'
 
     ret = list()
 
     if steps_ahead == 1:
         _s1 = time.time()
-        forecasts = model.forecast_distribution(data, **kwargs)
+        forecasts = model.predict(data, **kwargs)
         _e1 = time.time()
         ret.append(round(crps(data, forecasts), 3))
         ret.append(round(_e1 - _s1, 3))
@@ -377,7 +390,7 @@ def get_distribution_statistics(data, model, **kwargs):
         _s1 = time.time()
         for k in np.arange(model.order, len(data) - steps_ahead, skip):
             sample = data[k - model.order: k]
-            tmp = model.forecast_ahead_distribution(sample, steps_ahead, **kwargs)
+            tmp = model.predict(sample, **kwargs)
             forecasts.append(tmp[-1])
         _e1 = time.time()
 
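After this change all three measurement helpers funnel through predict(), with `kwargs['type']` selecting the forecast kind. A hedged usage example, grounded in the calls visible elsewhere in this commit (dataset split and partition count are illustrative):

```python
from pyFTS.benchmarks import Measures
from pyFTS.models import pwfts
from pyFTS.partitioners import Grid
from pyFTS.data import TAIEX

dataset = TAIEX.get_data()
partitioner = Grid.GridPartitioner(data=dataset[:800], npart=20)
model = pwfts.ProbabilisticWeightedFTS('', partitioner=partitioner)
model.fit(dataset[:800])

# returns [CRPS, execution time], per the docstring above
crps_value, elapsed = Measures.get_distribution_statistics(dataset[800:1000], model)
print(crps_value, elapsed)
```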
@@ -56,10 +56,13 @@ def sliding_window_benchmarks(data, windowsize, train=0.8, **kwargs):
     partitions and partitioning method will be created a partitioner model. And for each partitioner, order,
     steps ahead and FTS method a forecasting model will be trained.
 
-    Then all trained models are benchmarked on the test data and the metrics are stored in a dataframe for
-    posterior analysis.
+    Then all trained models are benchmarked on the test data and the metrics are stored on a sqlite3 database
+    (identified by the 'file' parameter) for posterior analysis.
 
-    The number of experiments is determined by the windowsize and inc.
+    All this process can be distributed on a dispy cluster, by setting the attribute 'distributed' to True and
+    informing the list of dispy nodes in the 'nodes' parameter.
+
+    The number of experiments is determined by the 'windowsize' and 'inc' parameters.
 
     :param data: test data
     :param windowsize: size of sliding window
@@ -67,35 +70,31 @@ def sliding_window_benchmarks(data, windowsize, train=0.8, **kwargs):
     :param kwargs: dict, optional arguments
 
     :keyword
+    benchmark_methods: a list with Non FTS models to benchmark. The default is None.
+    benchmark_methods_parameters: a list with Non FTS models parameters. The default is None.
+    dataset: the dataset name, to identify the current set of benchmark results on the database.
+    distributed: a boolean value indicating if the forecasting procedure will be distributed in a dispy cluster. The default is False.
+    file: file path to save the results. The default is benchmarks.db.
+    inc: a float on interval [0,1] indicating the percentage of the windowsize to move the window.
-    models: a list with prebuilt FTS objects. The default is None.
     methods: a list with FTS class names. The default depends on the forecasting type and contains the list of all FTS methods.
+    models: a list with prebuilt FTS objects. The default is None.
+    nodes: a list with the dispy cluster nodes addresses. The default is [127.0.0.1].
+    orders: a list with orders of the models (for high order models). The default is [1,2,3].
+    partitions: a list with the numbers of partitions on the Universe of Discourse. The default is [10].
     partitioners_models: a list with prebuilt Universe of Discourse partitioners objects. The default is None.
     partitioners_methods: a list with Universe of Discourse partitioners class names. The default is [partitioners.Grid.GridPartitioner].
-    partitions: a list with the numbers of partitions on the Universe of Discourse. The default is [10].
-    orders: a list with orders of the models (for high order models). The default is [1,2,3].
-    type: the forecasting type, one of these values: point (default), interval or distribution.
-    steps_ahead: a list with the forecasting horizons, i. e., the number of steps ahead to forecast. The default is 1.
-    start: in the multi step forecasting, the index of the data where to start forecasting. The default is 0.
-    transformation: data transformation. The default is None.
-    indexer: seasonal indexer. The default is None.
     progress: if true a progress bar will be displayed during the benchmarks. The default is False.
-    distributed: a boolean value indicating if the forecasting procedure will be distributed in a dispy cluster. The default is False.
-    nodes: a list with the dispy cluster nodes addresses. The default is [127.0.0.1].
-    benchmark_methods: a list with Non FTS models to benchmark. The default is None.
-    benchmark_methods_parameters: a list with Non FTS models parameters. The default is None.
-    save: save results. The default is False.
-    file: file path to save the results. The default is None.
-    sintetic: if true, only the average and standard deviation of the results are kept. The default is False.
-
-    :return: DataFrame with the benchmark results
+    start: in the multi step forecasting, the index of the data where to start forecasting. The default is 0.
+    steps_ahead: a list with the forecasting horizons, i. e., the number of steps ahead to forecast. The default is 1.
+    tag: a name to identify the current set of benchmark results on the database.
+    type: the forecasting type, one of these values: point (default), interval or distribution.
+    transformations: a list with data transformations to apply. The default is [None].
     """
 
+    tag = __pop('tag', None, kwargs)
+    dataset = __pop('dataset', None, kwargs)
+
     distributed = __pop('distributed', False, kwargs)
     save = __pop('save', False, kwargs)
 
     transformations = kwargs.get('transformations', [None])
     progress = kwargs.get('progress', None)
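A hedged example of the reorganized entry point, using only keywords documented above and mirroring the test script at the end of this commit (values are illustrative):

```python
import numpy as np
from pyFTS.benchmarks import benchmarks as bchmk
from pyFTS.data import TAIEX

dataset = TAIEX.get_data()

# Point-forecasting benchmark over a sliding window; the metrics land in
# the sqlite3 file named by 'file', keyed by 'dataset' and 'tag'.
bchmk.sliding_window_benchmarks(dataset, 1000, train=0.8, inc=0.2,
                                orders=[1, 2, 3],
                                partitions=np.arange(10, 100, 5),
                                progress=True, type='point',
                                file="benchmarks.db",
                                dataset="TAIEX", tag="example")
```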
@@ -71,6 +71,7 @@ class Differential(Transformation):
     def inverse(self, data, param, **kwargs):
 
         type = kwargs.get("type","point")
+        steps_ahead = kwargs.get("steps_ahead", 1)
 
         if isinstance(data, (np.ndarray, np.generic)):
             data = data.tolist()
@@ -83,14 +84,30 @@ class Differential(Transformation):
         # print(n)
         # print(len(param))
 
-        if type == "point":
-            inc = [data[t] + param[t] for t in np.arange(0, n)]
-        elif type == "interval":
-            inc = [[data[t][0] + param[t], data[t][1] + param[t]] for t in np.arange(0, n)]
-        elif type == "distribution":
-            for t in np.arange(0, n):
-                data[t].differential_offset(param[t])
-            inc = data
+        if steps_ahead == 1:
+            if type == "point":
+                inc = [data[t] + param[t] for t in np.arange(0, n)]
+            elif type == "interval":
+                inc = [[data[t][0] + param[t], data[t][1] + param[t]] for t in np.arange(0, n)]
+            elif type == "distribution":
+                for t in np.arange(0, n):
+                    data[t].differential_offset(param[t])
+                inc = data
+        else:
+            if type == "point":
+                inc = [data[0] + param[0]]
+                for t in np.arange(1, steps_ahead):
+                    inc.append(data[t] + inc[t-1])
+            elif type == "interval":
+                inc = [[data[0][0] + param[0], data[0][1] + param[0]]]
+                for t in np.arange(1, steps_ahead):
+                    inc.append([data[t][0] + np.nanmean(inc[t-1]), data[t][1] + np.nanmean(inc[t-1])])
+            elif type == "distribution":
+                data[0].differential_offset(param[0])
+                for t in np.arange(1, steps_ahead):
+                    ex = data[t-1].expected_value()
+                    data[t].differential_offset(ex)
+                inc = data
 
         if n == 1:
             return inc[0]
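The new else branch reconstructs a multi-step forecast from first differences recursively: the first forecast is offset by the last observed level, and each later forecast is offset by the previously reconstructed level (its midpoint or expected value for intervals and distributions). A sketch of the point case:

```python
# Sketch: undo first-order differencing for a multi-step point forecast.
# 'diffs' are forecast differences; 'last_level' is the last observed value.
def inverse_diff_multistep(diffs, last_level):
    levels = [diffs[0] + last_level]
    for t in range(1, len(diffs)):
        # each step is offset by the previously reconstructed level
        levels.append(diffs[t] + levels[t - 1])
    return levels

# series ... 98, 100; forecast differences [2, 3, -1]
print(inverse_diff_multistep([2, 3, -1], 100))  # [102, 105, 104]
```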
@@ -114,9 +114,9 @@ class FTS(object):
 
             ret = Util.distributed_predict(self, kwargs, nodes, ndata, num_batches)
 
-        if type != 'distribution' and not self.is_multivariate:
-            interval = True if type == 'interval' else False
-            ret = self.apply_inverse_transformations(ret, params=[data[self.order - 1:]], interval=interval)
+        if not self.is_multivariate:
+            kwargs['type'] = type
+            ret = self.apply_inverse_transformations(ret, params=[data[self.order - 1:]], **kwargs)
 
         return ret
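With this change the forecast type travels to the inverse transformation through kwargs instead of a boolean `interval` flag, so transformations such as Differential above can branch on 'point', 'interval' or 'distribution' (and on steps_ahead) themselves. A rough sketch of the dispatch contract this assumes, with names taken from the hunks above:

```python
# Hedged sketch of the contract between FTS.predict and
# Transformation.inverse after this commit (not the actual pyFTS code).
def inverse(data, param, **kwargs):
    type = kwargs.get('type', 'point')          # set by FTS.predict
    steps_ahead = kwargs.get('steps_ahead', 1)  # set by the caller
    if type == 'point':
        return [d + p for d, p in zip(data, param)]
    elif type == 'interval':
        return [[lo + p, hi + p] for (lo, hi), p in zip(data, param)]
    raise ValueError('distribution handling omitted in this sketch')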
@@ -50,7 +50,10 @@ class ConventionalFTS(fts.FTS):
     def train(self, data, **kwargs):
-        tmpdata = FuzzySet.fuzzyfy_series_old(data, self.sets)
+        if kwargs.get('sets', None) is not None:
+            self.sets = kwargs.get('sets', None)
+        else:
+            self.sets = self.partitioner.sets
+
+        tmpdata = FuzzySet.fuzzyfy_series(data, self.sets, method='maximum')
         flrs = FLR.generate_non_recurrent_flrs(tmpdata)
         self.generate_flrg(flrs)
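The same train() pattern recurs in several models below: take fuzzy sets from kwargs when given, otherwise fall back to the model's partitioner. In isolation, with a hypothetical partitioner class for illustration:

```python
# Recurring pattern from the train() hunks: explicit sets win,
# otherwise use the sets produced by the model's partitioner.
def resolve_sets(partitioner, **kwargs):
    if kwargs.get('sets', None) is not None:
        return kwargs.get('sets', None)
    return partitioner.sets

class FakePartitioner:
    sets = {'A1': object(), 'A2': object()}

print(resolve_sets(FakePartitioner()))                 # partitioner sets
print(resolve_sets(FakePartitioner(), sets={'B': 1}))  # explicit sets win
```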
@@ -97,6 +97,8 @@ class HighOrderFTS(fts.FTS):
 
         if kwargs.get('sets', None) is not None:
             self.sets = kwargs.get('sets', None)
+        else:
+            self.sets = self.partitioner.sets
 
         self.generate_flrg(data)
@@ -63,8 +63,10 @@ class ImprovedWeightedFTS(fts.FTS):
     def train(self, ndata, **kwargs):
         if kwargs.get('sets', None) is not None:
             self.sets = kwargs.get('sets', None)
+        else:
+            self.sets = self.partitioner.sets
 
-        tmpdata = FuzzySet.fuzzyfy_series(ndata, self.sets, method="maximum")
+        tmpdata = FuzzySet.fuzzyfy_series(ndata, self.sets, method='maximum')
         flrs = FLR.generate_recurrent_flrs(tmpdata)
         self.generate_flrg(flrs)
@@ -69,6 +69,9 @@ class ExponentialyWeightedFTS(fts.FTS):
         self.c = kwargs.get('parameters', default_c)
+        if kwargs.get('sets', None) is not None:
+            self.sets = kwargs.get('sets', None)
+        else:
+            self.sets = self.partitioner.sets
 
         tmpdata = FuzzySet.fuzzyfy_series(data, self.sets, method='maximum')
         flrs = FLR.generate_recurrent_flrs(tmpdata)
         self.generate_flrg(flrs, self.c)
@@ -78,7 +81,7 @@ class ExponentialyWeightedFTS(fts.FTS):
 
         ordered_sets = FuzzySet.set_ordered(self.sets)
 
-        data = np.array(data)
+        data = np.array(ndata)
 
         l = len(ndata)
 
@@ -39,7 +39,7 @@ class ConventionalFTS(fts.FTS):
 
     def operation_matrix(self, flrs):
         l = len(self.sets)
-        if self.R is None:
+        if self.R is None or len(self.R) == 0:
            self.R = np.zeros((l, l))
            for k in flrs:
                mm = self.flr_membership_matrix(k)
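The extended guard also rebuilds the operation matrix when self.R exists but is empty, e.g. a zero-length array left over from a previous state, which `is None` alone would skip. Note that `len()` is used rather than truthiness, since the truth value of a multi-element numpy array is ambiguous:

```python
import numpy as np

R = np.zeros((0, 0))   # an existing but empty matrix
print(R is None)       # False -> the old guard would NOT rebuild
print(len(R) == 0)     # True  -> the new guard rebuilds it
# note: 'if R:' would raise ValueError for multi-element arrays, hence len()
```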
@@ -51,6 +51,8 @@ class ConventionalFTS(fts.FTS):
     def train(self, data, **kwargs):
         if kwargs.get('sets', None) is not None:
             self.sets = kwargs.get('sets', None)
+        else:
+            self.sets = self.partitioner.sets
 
         tmpdata = FuzzySet.fuzzyfy_series(data, self.sets, method='maximum')
         flrs = FLR.generate_non_recurrent_flrs(tmpdata)
@@ -60,8 +60,10 @@ class WeightedFTS(fts.FTS):
     def train(self, ndata, **kwargs):
         if kwargs.get('sets', None) is not None:
             self.sets = kwargs.get('sets', None)
+        else:
+            self.sets = self.partitioner.sets
 
-        tmpdata = FuzzySet.fuzzyfy_series_old(ndata, self.sets)
+        tmpdata = FuzzySet.fuzzyfy_series(ndata, self.sets, method='maximum')
         flrs = FLR.generate_recurrent_flrs(tmpdata)
         self.generate_FLRG(flrs)
@@ -15,33 +15,37 @@ from pyFTS.data import TAIEX
 
 dataset = TAIEX.get_data()
 
-from pyFTS.benchmarks import benchmarks as bchmk, Util as bUtil
+from pyFTS.benchmarks import benchmarks as bchmk, Util as bUtil, Measures
 
 from pyFTS.models import pwfts
 
 '''
 from pyFTS.partitioners import Grid, Util as pUtil
 partitioner = Grid.GridPartitioner(data=dataset[:800], npart=10, transformation=tdiff)
 
 model = pwfts.ProbabilisticWeightedFTS('',partitioner=partitioner)
-#model.append_transformation(tdiff)
+model.append_transformation(tdiff)
 model.fit(dataset[:800])
-print(model.predict(dataset[800:1000], type='interval'))
 
+print(Measures.get_distribution_statistics(dataset[800:1000], model, steps_ahead=7))
+#tmp = model.predict(dataset[800:1000], type='distribution', steps_ahead=7)
 #for tmp2 in tmp:
 #    print(tmp2)
 '''
-bchmk.sliding_window_benchmarks(dataset, 1000, train=0.8, inc=0.2, methods=[pwfts.ProbabilisticWeightedFTS],
 
+#'''
+bchmk.sliding_window_benchmarks(dataset[:1000], 1000, train=0.8, inc=0.2,
+                                #methods=[pwfts.ProbabilisticWeightedFTS],
                                 benchmark_models=False,
                                 #transformations=[tdiff],
-                                orders=[1, 2, 3],
-                                partitions=np.arange(10, 100, 5),
-                                progress=False, type='distribution',
+                                orders=[1], #[1, 2, 3],
+                                partitions=[20], #np.arange(10, 100, 5),
+                                progress=True, type='point',
+                                #steps_ahead=[1,4,7,10], #steps_ahead=[1]
-                                distributed=True, nodes=['192.168.0.110', '192.168.0.100','192.168.0.106'],
-                                file="benchmarks.db", dataset="TAIEX", tag="partitioning")
+                                #distributed=True, nodes=['192.168.0.110', '192.168.0.105','192.168.0.106'],
+                                file="benchmarks.tmp", dataset="TAIEX", tag="comparisons")
                                 #save=True, file="tmp.db")
 
 
-'''
+#'''
+'''
 dat = pd.read_csv('pwfts_taiex_partitioning.csv', sep=';')
 print(bUtil.analytic_tabular_dataframe(dat))