Improvements to hyperparam.Evolutionary: make the GA more flexible and modular

Petrônio Cândido 2019-06-17 13:50:40 -03:00
parent 6e32f1ecb9
commit f2bf2b5d87
2 changed files with 180 additions and 139 deletions
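
This change makes the genetic algorithm modular: the initialization, evaluation, selection, crossover, mutation and elitism operators are now injected through keyword arguments, and the FTS method being optimized is itself a parameter (fts_method). A minimal usage sketch of the new interface, with an illustrative synthetic series and keyword names taken from the diff below (the data and parameter values are assumptions, not part of the commit):

    import numpy as np
    from pyFTS.hyperparam import Evolutionary
    from pyFTS.models import pwfts

    # Illustrative data; in practice a real time series would be used.
    dataset = np.cumsum(np.random.normal(0, 1, 3000))

    best, statistics = Evolutionary.GeneticAlgorithm(
        dataset,
        ngen=10, npop=10, psel=.6, pcross=.5, pmut=.3,
        fts_method=pwfts.ProbabilisticWeightedFTS,   # the FTS method to optimize
        window_size=1000, train_rate=.9, increment_rate=1,
        collect_statistics=True,
        distributed=False)

    # The best genotype is a dict with the hyperparameters and fitness values.
    print(best['order'], best['npart'], best['rmse'])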

Changed file 1 of 2:

@@ -19,7 +19,7 @@ __measures = ['f1', 'f2', 'rmse', 'size']
 def genotype(mf, npart, partitioner, order, alpha, lags, f1, f2):
-    '''
+    """
     Create the individual genotype
     :param mf: membership function
@@ -31,18 +31,18 @@ def genotype(mf, npart, partitioner, order, alpha, lags, f1, f2):
     :param f1: accuracy fitness value
     :param f2: parsimony fitness value
     :return: the genotype, a dictionary with all hyperparameters
-    '''
+    """
     ind = dict(mf=mf, npart=npart, partitioner=partitioner, order=order,
                alpha=alpha, lags=lags, f1=f1, f2=f2)
     return ind

 def random_genotype():
-    '''
+    """
     Create random genotype
     :return: the genotype, a dictionary with all hyperparameters
-    '''
+    """
     order = random.randint(1, 3)
     lags = [k for k in np.arange(1, order+1)]
     return genotype(
@@ -59,58 +59,54 @@ def random_genotype():
 #

 def initial_population(n):
-    '''
+    """
     Create a random population of size n
     :param n: the size of the population
     :return: a list with n random individuals
-    '''
+    """
     pop = []
     for i in range(n):
         pop.append(random_genotype())
     return pop

-def phenotype(individual, train, parameters={}):
-    '''
+def phenotype(individual, train, fts_method=hofts.WeightedHighOrderFTS, parameters={}):
+    """
     Instantiate the genotype, creating a fitted model with the genotype hyperparameters
     :param individual: a genotype
     :param train: the training dataset
+    :param fts_method: the FTS method
     :param parameters: dict with model specific arguments for fit method.
     :return: a fitted FTS model
-    '''
-    try:
-        if individual['mf'] == 1:
-            mf = Membership.trimf
-        elif individual['mf'] == 2:
-            mf = Membership.trapmf
-        elif individual['mf'] == 3 and individual['partitioner'] != 2:
-            mf = Membership.gaussmf
-        else:
-            mf = Membership.trimf
-        #if individual['partitioner'] == 1:
-        partitioner = Grid.GridPartitioner(data=train, npart=individual['npart'], func=mf)
-        #elif individual['partitioner'] == 2:
-        #    partitioner = Entropy.EntropyPartitioner(data=train, npart=individual['npart'], func=mf)
-        model = hofts.WeightedHighOrderFTS(partitioner=partitioner,
-                                           lags=individual['lags'],
-                                           alpha_cut=individual['alpha'],
-                                           order=individual['order'])
-        model.fit(train, **parameters)
-        return model
-    except Exception as ex:
-        print("PHENOTYPE EXCEPTION!", str(ex), str(individual))
-        return None
+    """
+    if individual['mf'] == 1:
+        mf = Membership.trimf
+    elif individual['mf'] == 2:
+        mf = Membership.trapmf
+    elif individual['mf'] == 3 and individual['partitioner'] != 2:
+        mf = Membership.gaussmf
+    else:
+        mf = Membership.trimf
+    #if individual['partitioner'] == 1:
+    partitioner = Grid.GridPartitioner(data=train, npart=individual['npart'], func=mf)
+    #elif individual['partitioner'] == 2:
+    #    partitioner = Entropy.EntropyPartitioner(data=train, npart=individual['npart'], func=mf)
+    model = fts_method(partitioner=partitioner,
+                       lags=individual['lags'],
+                       alpha_cut=individual['alpha'],
+                       order=individual['order'])
+    model.fit(train, **parameters)
+    return model

 def evaluate(dataset, individual, **kwargs):
-    '''
+    """
     Evaluate an individual using a sliding window cross validation over the dataset.
     :param dataset: Evaluation dataset
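
Since phenotype() now takes the FTS method as an argument, any pyFTS method that accepts the same constructor keywords can be plugged in. A minimal sketch of the idea (the training series below is illustrative, and ProbabilisticWeightedFTS is assumed to accept the keywords shown in the diff above):

    import numpy as np
    from pyFTS.hyperparam.Evolutionary import random_genotype, phenotype
    from pyFTS.models import pwfts

    train = np.cumsum(np.random.normal(0, 1, 2000))   # illustrative training data
    individual = random_genotype()

    # Build and fit a model from the genotype with a non-default FTS method.
    model = phenotype(individual, train, fts_method=pwfts.ProbabilisticWeightedFTS)
    print(model)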
@@ -120,7 +116,7 @@ def evaluate(dataset, individual, **kwargs):
     :param increment_rate: The increment of the scrolling window, relative to the window_size ([0,1])
     :param parameters: dict with model specific arguments for fit method.
     :return: a tuple (len_lags, rmse) with the parsimony fitness value and the accuracy fitness value
-    '''
+    """
     from pyFTS.common import Util
     from pyFTS.benchmarks import Measures
     from pyFTS.hyperparam.Evolutionary import phenotype, __measures
@@ -129,73 +125,60 @@ def evaluate(dataset, individual, **kwargs):
     window_size = kwargs.get('window_size', 800)
     train_rate = kwargs.get('train_rate', .8)
     increment_rate = kwargs.get('increment_rate', .2)
+    fts_method = kwargs.get('fts_method', hofts.WeightedHighOrderFTS)
     parameters = kwargs.get('parameters',{})

     if individual['f1'] is not None and individual['f2'] is not None:
         return { key: individual[key] for key in __measures }

-    try:
-        errors = []
-        lengths = []
-        for count, train, test in Util.sliding_window(dataset, window_size, train=train_rate, inc=increment_rate):
-            model = phenotype(individual, train, parameters=parameters)
-            if model is None:
-                raise Exception("Phenotype returned None")
-            forecasts = model.predict(test)
-            rmse = Measures.rmse(test[model.max_lag:], forecasts) #.get_point_statistics(test, model)
-            lengths.append(len(model))
-            errors.append(rmse)
-        _lags = sum(model.lags) * 100
-        _rmse = np.nanmean(errors)
-        _len = np.nanmean(lengths)
-        f1 = np.nansum([.6 * _rmse, .4 * np.nanstd(errors)])
-        f2 = np.nansum([.4 * _len, .6 * _lags])
-        #print("EVALUATION {}".format(individual))
-        return {'f1': f1, 'f2': f2, 'rmse': _rmse, 'size': _len }
-    except Exception as ex:
-        #print("EVALUATION EXCEPTION!", str(ex), str(individual))
-        return {'f1': np.inf, 'f2': np.inf, 'rmse': np.inf, 'size': np.inf }
+    errors = []
+    lengths = []
+    for count, train, test in Util.sliding_window(dataset, window_size, train=train_rate, inc=increment_rate):
+        model = phenotype(individual, train, fts_method=fts_method, parameters=parameters)
+        forecasts = model.predict(test)
+        rmse = Measures.rmse(test[model.max_lag:], forecasts[:-1])
+        lengths.append(len(model))
+        errors.append(rmse)
+    _lags = sum(model.lags) * 100
+    _rmse = np.nanmean(errors)
+    _len = np.nanmean(lengths)
+    f1 = np.nansum([.6 * _rmse, .4 * np.nanstd(errors)])
+    f2 = np.nansum([.4 * _len, .6 * _lags])
+    return {'f1': f1, 'f2': f2, 'rmse': _rmse, 'size': _len }

-def tournament(population, objective):
-    '''
+def tournament(population, objective, **kwargs):
+    """
     Simple tournament selection strategy.
     :param population: the population
     :param objective: the objective to be considered on tournament
     :return:
-    '''
+    """
     n = len(population) - 1
-    try:
-        r1 = random.randint(0, n) if n > 2 else 0
-        r2 = random.randint(0, n) if n > 2 else 1
-        ix = r1 if population[r1][objective] < population[r2][objective] else r2
-        return population[ix]
-    except Exception as ex:
-        print(r1, population[r1])
-        print(r2, population[r2])
-        raise ex
+    r1 = random.randint(0, n) if n > 2 else 0
+    r2 = random.randint(0, n) if n > 2 else 1
+    ix = r1 if population[r1][objective] < population[r2][objective] else r2
+    return population[ix]

-def double_tournament(population):
-    '''
+def double_tournament(population, **kwargs):
+    """
     Double tournament selection strategy.
     :param population:
     :return:
-    '''
+    """
     ancestor1 = tournament(population, 'f1')
     ancestor2 = tournament(population, 'f1')
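
The two objectives computed by evaluate() weight accuracy against parsimony: f1 mixes the mean and standard deviation of the per-window RMSE, while f2 mixes the average number of rules with the (scaled) sum of lags. A small worked sketch of that aggregation, using made-up numbers only to show the arithmetic:

    import numpy as np

    errors = [12.0, 14.0, 13.0]      # RMSE of each sliding window (made-up values)
    lengths = [120, 150, 135]        # number of rules of each fitted model
    _lags = sum([1, 2, 3]) * 100     # lags of the last model, scaled by 100

    f1 = np.nansum([.6 * np.nanmean(errors), .4 * np.nanstd(errors)])  # accuracy fitness
    f2 = np.nansum([.4 * np.nanmean(lengths), .6 * _lags])             # parsimony fitness

    print(f1, f2)   # lower is better for both objectives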
@@ -206,13 +189,13 @@ def double_tournament(population):
 def lag_crossover2(best, worst):
-    '''
+    """
     Cross over two lag genes
     :param best: best genotype
     :param worst: worst genotype
     :return: a tuple (order, lags)
-    '''
+    """
     order = int(round(.7 * best['order'] + .3 * worst['order']))
     lags = []
@@ -233,27 +216,26 @@ def lag_crossover2(best, worst):
     return order, lags

-# Crossover
-def crossover(parents):
-    '''
+def crossover(population, **kwargs):
+    """
     Crossover operation between two parents
-    :param parents: a list with two genotypes
+    :param population: the original population
     :return: a genotype
-    '''
+    """
     import random

-    n = len(parents) - 1
+    n = len(population) - 1
     r1 = random.randint(0, n)
     r2 = random.randint(0, n)

-    if parents[r1]['f1'] < parents[r2]['f1']:
-        best = parents[r1]
-        worst = parents[r2]
+    if population[r1]['f1'] < population[r2]['f1']:
+        best = population[r1]
+        worst = population[r2]
     else:
-        best = parents[r2]
-        worst = parents[r1]
+        best = population[r2]
+        worst = population[r1]

     npart = int(round(.7 * best['npart'] + .3 * worst['npart']))
     alpha = float(.7 * best['alpha'] + .3 * worst['alpha'])
@@ -272,13 +254,13 @@ def crossover(parents):
 def mutation_lags(lags, order):
-    '''
+    """
     Mutation operation for lags gene
     :param lags:
     :param order:
     :return:
-    '''
+    """
     try:
         l = len(lags)
         new = []
@@ -298,43 +280,40 @@ def mutation_lags(lags, order):
         print(lags, order, new, lag)

-def mutation(individual, pmut):
-    '''
+def mutation(individual, **kwargs):
+    """
     Mutation operator
-    :param population:
-    :param pmut: individual probability o
+    :param individual: an individual genotype
     :return:
-    '''
+    """
     import numpy.random

-    rnd = random.uniform(0, 1)
-    if rnd < pmut:
-        print('mutation')
-        individual['npart'] = min(50, max(3, int(individual['npart'] + np.random.normal(0, 4))))
-        individual['alpha'] = min(.5, max(0, individual['alpha'] + np.random.normal(0, .5)))
-        individual['mf'] = random.randint(1, 2)
-        individual['partitioner'] = random.randint(1, 2)
-        individual['order'] = min(5, max(1, int(individual['order'] + np.random.normal(0, 1))))
-        # Calls the mutation_lags function
-        individual['lags'] = mutation_lags( individual['lags'], individual['order'])
-        individual['f1'] = None
-        individual['f2'] = None
+    print('mutation')
+    individual['npart'] = min(50, max(3, int(individual['npart'] + np.random.normal(0, 4))))
+    individual['alpha'] = min(.5, max(0, individual['alpha'] + np.random.normal(0, .5)))
+    individual['mf'] = random.randint(1, 2)
+    individual['partitioner'] = random.randint(1, 2)
+    individual['order'] = min(5, max(1, int(individual['order'] + np.random.normal(0, 1))))
+    # Calls the mutation_lags function
+    individual['lags'] = mutation_lags( individual['lags'], individual['order'])
+    individual['f1'] = None
+    individual['f2'] = None

     return individual

 def elitism(population, new_population):
-    '''
+    """
     Elitism operation, always select the best individual of the population and discard the worst
     :param population:
     :param new_population:
     :return:
-    '''
+    """
     population = sorted(population, key=itemgetter('f1'))
     best = population[0]
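
With mutation now receiving only the individual (the mutation probability is applied by the GA loop itself), a custom operator only needs to keep the mutation(individual, **kwargs) signature and reset the cached fitness values. A hypothetical sketch, not part of the library:

    import numpy as np

    def npart_only_mutation(individual, **kwargs):
        # Hypothetical operator: perturb only the number of partitions,
        # then invalidate the cached fitness so the GA re-evaluates the individual.
        individual['npart'] = min(50, max(3, int(individual['npart'] + np.random.normal(0, 2))))
        individual['f1'] = None
        individual['f2'] = None
        return individual

    # Plugged in through the new keyword argument:
    # Evolutionary.GeneticAlgorithm(dataset, mutation_operator=npart_only_mutation, ...)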
@@ -348,31 +327,55 @@ def elitism(population, new_population):
 def GeneticAlgorithm(dataset, **kwargs):
-    '''
+    """
     Genetic algorithm for hyperparameter optimization
-    :param dataset:
-    :param ngen: Max number of generations
-    :param mgen: Max number of generations without improvement
-    :param npop: Population size
-    :param pcruz: Probability of crossover
-    :param pmut: Probability of mutation
-    :param window_size: The length of scrolling window for train/test on dataset
-    :param train_rate: The train/test split ([0,1])
-    :param increment_rate: The increment of the scrolling window, relative to the window_size ([0,1])
-    :param parameters: dict with model specific arguments for fit method.
+    :param dataset: The time series to optimize the FTS
+    :keyword ngen: An integer value with the maximum number of generations, default value: 30
+    :keyword mgen: An integer value with the maximum number of generations without improvement to stop, default value: 7
+    :keyword npop: An integer value with the population size, default value: 20
+    :keyword pcross: A float value between 0 and 1 with the probability of crossover, default: .5
+    :keyword psel: A float value between 0 and 1 with the probability of selection, default: .5
+    :keyword pmut: A float value between 0 and 1 with the probability of mutation, default: .3
+    :keyword fts_method: The FTS method to optimize
+    :keyword parameters: dict with model specific arguments for fts_method
+    :keyword elitism: A boolean value indicating if the best individual must always survive to the next population
+    :keyword initial_operator: a function that receives npop and returns a random population with size npop
+    :keyword evaluation_operator: a function that receives a dataset and an individual and returns its fitness
+    :keyword selection_operator: a function that receives the whole population and returns a selected individual
+    :keyword crossover_operator: a function that receives the whole population and returns a descendant individual
+    :keyword mutation_operator: a function that receives one individual and returns a changed individual
+    :keyword window_size: An integer value with the length of the scrolling window for train/test on the dataset
+    :keyword train_rate: A float value between 0 and 1 with the train/test split ([0,1])
+    :keyword increment_rate: A float value between 0 and 1 with the increment of the scrolling window,
+             relative to the window_size ([0,1])
+    :keyword collect_statistics: A boolean value indicating whether to collect statistics for each generation
+    :keyword distributed: A value indicating if the execution will be local and sequential (distributed=False),
+             or parallel and distributed (distributed='dispy' or distributed='spark')
+    :keyword cluster: If distributed='dispy', the list of cluster nodes; if distributed='spark', the master node
     :return: the best genotype
-    '''
+    """
     statistics = []

     ngen = kwargs.get('ngen',30)
     mgen = kwargs.get('mgen', 7)
     npop = kwargs.get('npop',20)
-    pcruz = kwargs.get('pcruz',.5)
+    psel = kwargs.get('psel', .5)
+    pcross = kwargs.get('pcross',.5)
     pmut = kwargs.get('pmut',.3)

     distributed = kwargs.get('distributed', False)

+    initial_operator = kwargs.get('initial_operator', initial_population)
+    evaluation_operator = kwargs.get('evaluation_operator', evaluate)
+    selection_operator = kwargs.get('selection_operator', double_tournament)
+    crossover_operator = kwargs.get('crossover_operator', crossover)
+    mutation_operator = kwargs.get('mutation_operator', mutation)
+
+    _elitism = kwargs.get('elitism', True)
+    elitism_operator = kwargs.get('elitism_operator', elitism)

     if distributed == 'dispy':
         cluster = kwargs.pop('cluster', None)
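
Each *_operator keyword expects a plain function: the selection operator, for instance, receives the evaluated population and returns one individual (double_tournament is the default). A hypothetical replacement, shown only to illustrate the expected signature:

    import random

    def roulette_selection(population, **kwargs):
        # Hypothetical inverse-fitness roulette wheel on the accuracy objective f1.
        weights = [1. / (1. + ind['f1']) for ind in population]
        total = sum(weights)
        pick = random.uniform(0, total)
        acc = 0.
        for ind, w in zip(population, weights):
            acc += w
            if acc >= pick:
                return ind
        return population[-1]

    # Evolutionary.GeneticAlgorithm(dataset, selection_operator=roulette_selection, ...)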
@@ -382,7 +385,7 @@ def GeneticAlgorithm(dataset, **kwargs):
     new_population = []

-    population = initial_population(npop)
+    population = initial_operator(npop)

     last_best = population[0]
     best = population[1]
@@ -390,7 +393,7 @@ def GeneticAlgorithm(dataset, **kwargs):
     print("Evaluating initial population {}".format(time.time()))
     if not distributed:
         for individual in population:
-            ret = evaluate(dataset, individual, **kwargs)
+            ret = evaluation_operator(dataset, individual, **kwargs)
             for key in __measures:
                 individual[key] = ret[key]
     elif distributed=='dispy':
@@ -414,19 +417,21 @@ def GeneticAlgorithm(dataset, **kwargs):
         generation_statistics = {}

         # Selection
-        for j in range(int(npop / 2)):
-            new_population.append(double_tournament(population))
-            new_population.append(double_tournament(population))
+        for j in range(int(npop * psel)):
+            new_population.append(selection_operator(population))

         # Crossover
         new = []
-        for j in range(int(npop * pcruz)):
-            new.append(crossover(new_population))
+        for j in range(int(npop * pcross)):
+            new.append(crossover_operator(new_population))

         new_population.extend(new)

         # Mutation
         for ct, individual in enumerate(new_population):
-            new_population[ct] = mutation(individual, pmut)
+            rnd = random.uniform(0, 1)
+            if rnd < pmut:
+                new_population[ct] = mutation_operator(individual)

         # Evaluation
         if collect_statistics:
@@ -436,7 +441,7 @@ def GeneticAlgorithm(dataset, **kwargs):
         if not distributed:
             for individual in new_population:
-                ret = evaluate(dataset, individual, **kwargs)
+                ret = evaluation_operator(dataset, individual, **kwargs)
                 for key in __measures:
                     individual[key] = ret[key]
                     if collect_statistics: stats[key].append(ret[key])
@@ -466,7 +471,8 @@ def GeneticAlgorithm(dataset, **kwargs):
             generation_statistics['population'] = mean_stats

         # Elitism
-        population = elitism(population, new_population)
+        if _elitism:
+            population = elitism_operator(population, new_population)

         population = population[:npop]
@@ -487,14 +493,13 @@ def GeneticAlgorithm(dataset, **kwargs):
            pmut += .05
         else:
            no_improvement_count = 0
-           pcruz = kwargs.get('pcruz', .5)
+           pcross = kwargs.get('pcross', .5)
            pmut = kwargs.get('pmut', .3)
         print(best)

         if no_improvement_count == mgen:
            break

     return best, statistics
@@ -506,7 +511,7 @@ def process_experiment(result, datasetname, conn):
 def persist_statistics(statistics):
     import json
-    with open('statistics{}.txt'.format(time.time()), 'w') as file:
+    with open('statistics{}.json'.format(time.time()), 'w') as file:
         file.write(json.dumps(statistics))
@@ -523,7 +528,41 @@ def log_result(conn, datasetname, result):
 def execute(datasetname, dataset, **kwargs):
-    conn = hUtil.open_hyperparam_db('hyperparam.db')
+    """
+    :param datasetname:
+    :param dataset: The time series to optimize the FTS
+    :keyword database_file:
+    :keyword experiments:
+    :keyword distributed:
+    :keyword ngen: An integer value with the maximum number of generations, default value: 30
+    :keyword mgen: An integer value with the maximum number of generations without improvement to stop, default value: 7
+    :keyword npop: An integer value with the population size, default value: 20
+    :keyword pcross: A float value between 0 and 1 with the probability of crossover, default: .5
+    :keyword psel: A float value between 0 and 1 with the probability of selection, default: .5
+    :keyword pmut: A float value between 0 and 1 with the probability of mutation, default: .3
+    :keyword fts_method: The FTS method to optimize
+    :keyword parameters: dict with model specific arguments for fts_method
+    :keyword elitism: A boolean value indicating if the best individual must always survive to the next population
+    :keyword initial_operator: a function that receives npop and returns a random population with size npop
+    :keyword evaluation_operator: a function that receives a dataset and an individual and returns its fitness
+    :keyword selection_operator: a function that receives the whole population and returns a selected individual
+    :keyword crossover_operator: a function that receives the whole population and returns a descendant individual
+    :keyword mutation_operator: a function that receives one individual and returns a changed individual
+    :keyword window_size: An integer value with the length of the scrolling window for train/test on the dataset
+    :keyword train_rate: A float value between 0 and 1 with the train/test split ([0,1])
+    :keyword increment_rate: A float value between 0 and 1 with the increment of the scrolling window,
+             relative to the window_size ([0,1])
+    :keyword collect_statistics: A boolean value indicating whether to collect statistics for each generation
+    :keyword distributed: A value indicating if the execution will be local and sequential (distributed=False),
+             or parallel and distributed (distributed='dispy' or distributed='spark')
+    :keyword cluster: If distributed='dispy', the list of cluster nodes; if distributed='spark', the master node
+    :return: the best genotype
+    """
+    file = kwargs.get('database_file', 'hyperparam.db')
+    conn = hUtil.open_hyperparam_db(file)

     experiments = kwargs.get('experiments', 30)
@@ -550,4 +589,3 @@ def execute(datasetname, dataset, **kwargs):
     dUtil.stop_dispy_cluster(cluster, http_server)

     return ret

Changed file 2 of 2:

@@ -1,6 +1,7 @@
 import numpy as np
 import pandas as pd
 from pyFTS.hyperparam import GridSearch, Evolutionary
+from pyFTS.models import pwfts

 def get_dataset():
     from pyFTS.data import SONDA
@@ -41,10 +42,12 @@ datsetname, dataset = get_dataset()
 #                                 window_size=10000, train_rate=.9, increment_rate=1,)

 ret = Evolutionary.execute(datsetname, dataset,
-                           ngen=30, npop=20, pcruz=.5, pmut=.3,
+                           ngen=30, npop=20, psel=0.6, pcross=.5, pmut=.3,
                            window_size=10000, train_rate=.9, increment_rate=1,
                            experiments=1,
-                           distributed='dispy', nodes=nodes)
+                           fts_method=pwfts.ProbabilisticWeightedFTS,
+                           database_file='experiments.db',
+                           distributed=False, nodes=nodes)

 #res = GridSearch.cluster_method({'mf':1, 'partitioner': 1, 'npart': 10, 'lags':[1], 'alpha': 0.0, 'order': 1},
 #                                dataset, window_size = 10000, train_rate = .9, increment_rate = 1)