Improvements to hyperparam.Evolutionary: more flexible and modular GA (genetic algorithm)

This commit is contained in:
Petrônio Cândido 2019-06-17 13:50:40 -03:00
parent 6e32f1ecb9
commit f2bf2b5d87
2 changed files with 180 additions and 139 deletions

View File

@ -19,7 +19,7 @@ __measures = ['f1', 'f2', 'rmse', 'size']
def genotype(mf, npart, partitioner, order, alpha, lags, f1, f2): def genotype(mf, npart, partitioner, order, alpha, lags, f1, f2):
''' """
Create the individual genotype Create the individual genotype
:param mf: membership function :param mf: membership function
@ -31,18 +31,18 @@ def genotype(mf, npart, partitioner, order, alpha, lags, f1, f2):
:param f1: accuracy fitness value :param f1: accuracy fitness value
:param f2: parsimony fitness value :param f2: parsimony fitness value
:return: the genotype, a dictionary with all hyperparameters :return: the genotype, a dictionary with all hyperparameters
''' """
ind = dict(mf=mf, npart=npart, partitioner=partitioner, order=order, ind = dict(mf=mf, npart=npart, partitioner=partitioner, order=order,
alpha=alpha, lags=lags, f1=f1, f2=f2) alpha=alpha, lags=lags, f1=f1, f2=f2)
return ind return ind
def random_genotype(): def random_genotype():
''' """
Create random genotype Create random genotype
:return: the genotype, a dictionary with all hyperparameters :return: the genotype, a dictionary with all hyperparameters
''' """
order = random.randint(1, 3) order = random.randint(1, 3)
lags = [k for k in np.arange(1, order+1)] lags = [k for k in np.arange(1, order+1)]
return genotype( return genotype(
@ -59,28 +59,28 @@ def random_genotype():
# #
def initial_population(n): def initial_population(n):
''' """
Create a random population of size n Create a random population of size n
:param n: the size of the population :param n: the size of the population
:return: a list with n random individuals :return: a list with n random individuals
''' """
pop = [] pop = []
for i in range(n): for i in range(n):
pop.append(random_genotype()) pop.append(random_genotype())
return pop return pop
def phenotype(individual, train, parameters={}): def phenotype(individual, train, fts_method=hofts.WeightedHighOrderFTS, parameters={}):
''' """
Instantiate the genotype, creating a fitted model with the genotype hyperparameters Instantiate the genotype, creating a fitted model with the genotype hyperparameters
:param individual: a genotype :param individual: a genotype
:param train: the training dataset :param train: the training dataset
:param fts_method: the FTS method
:param parameters: dict with model specific arguments for fit method. :param parameters: dict with model specific arguments for fit method.
:return: a fitted FTS model :return: a fitted FTS model
''' """
try:
if individual['mf'] == 1: if individual['mf'] == 1:
mf = Membership.trimf mf = Membership.trimf
elif individual['mf'] == 2: elif individual['mf'] == 2:
@ -95,7 +95,7 @@ def phenotype(individual, train, parameters={}):
#elif individual['partitioner'] == 2: #elif individual['partitioner'] == 2:
# partitioner = Entropy.EntropyPartitioner(data=train, npart=individual['npart'], func=mf) # partitioner = Entropy.EntropyPartitioner(data=train, npart=individual['npart'], func=mf)
model = hofts.WeightedHighOrderFTS(partitioner=partitioner, model = fts_method(partitioner=partitioner,
lags=individual['lags'], lags=individual['lags'],
alpha_cut=individual['alpha'], alpha_cut=individual['alpha'],
order=individual['order']) order=individual['order'])
@ -104,13 +104,9 @@ def phenotype(individual, train, parameters={}):
return model return model
except Exception as ex:
print("PHENOTYPE EXCEPTION!", str(ex), str(individual))
return None
def evaluate(dataset, individual, **kwargs): def evaluate(dataset, individual, **kwargs):
''' """
Evaluate an individual using a sliding window cross validation over the dataset. Evaluate an individual using a sliding window cross validation over the dataset.
:param dataset: Evaluation dataset :param dataset: Evaluation dataset
@ -120,7 +116,7 @@ def evaluate(dataset, individual, **kwargs):
:param increment_rate: The increment of the scrolling window, relative to the window_size ([0,1]) :param increment_rate: The increment of the scrolling window, relative to the window_size ([0,1])
:param parameters: dict with model specific arguments for fit method. :param parameters: dict with model specific arguments for fit method.
:return: a dict with the keys 'f1' (accuracy fitness), 'f2' (parsimony fitness), 'rmse' and 'size' :return: a dict with the keys 'f1' (accuracy fitness), 'f2' (parsimony fitness), 'rmse' and 'size'
''' """
from pyFTS.common import Util from pyFTS.common import Util
from pyFTS.benchmarks import Measures from pyFTS.benchmarks import Measures
from pyFTS.hyperparam.Evolutionary import phenotype, __measures from pyFTS.hyperparam.Evolutionary import phenotype, __measures
@ -129,25 +125,22 @@ def evaluate(dataset, individual, **kwargs):
window_size = kwargs.get('window_size', 800) window_size = kwargs.get('window_size', 800)
train_rate = kwargs.get('train_rate', .8) train_rate = kwargs.get('train_rate', .8)
increment_rate = kwargs.get('increment_rate', .2) increment_rate = kwargs.get('increment_rate', .2)
fts_method = kwargs.get('fts_method', hofts.WeightedHighOrderFTS)
parameters = kwargs.get('parameters',{}) parameters = kwargs.get('parameters',{})
if individual['f1'] is not None and individual['f2'] is not None: if individual['f1'] is not None and individual['f2'] is not None:
return { key: individual[key] for key in __measures } return { key: individual[key] for key in __measures }
try:
errors = [] errors = []
lengths = [] lengths = []
for count, train, test in Util.sliding_window(dataset, window_size, train=train_rate, inc=increment_rate): for count, train, test in Util.sliding_window(dataset, window_size, train=train_rate, inc=increment_rate):
model = phenotype(individual, train, parameters=parameters) model = phenotype(individual, train, fts_method=fts_method, parameters=parameters)
if model is None:
raise Exception("Phenotype returned None")
forecasts = model.predict(test) forecasts = model.predict(test)
rmse = Measures.rmse(test[model.max_lag:], forecasts) #.get_point_statistics(test, model) rmse = Measures.rmse(test[model.max_lag:], forecasts[:-1])
lengths.append(len(model)) lengths.append(len(model))
errors.append(rmse) errors.append(rmse)
@ -160,42 +153,32 @@ def evaluate(dataset, individual, **kwargs):
f1 = np.nansum([.6 * _rmse, .4 * np.nanstd(errors)]) f1 = np.nansum([.6 * _rmse, .4 * np.nanstd(errors)])
f2 = np.nansum([.4 * _len, .6 * _lags]) f2 = np.nansum([.4 * _len, .6 * _lags])
#print("EVALUATION {}".format(individual))
return {'f1': f1, 'f2': f2, 'rmse': _rmse, 'size': _len } return {'f1': f1, 'f2': f2, 'rmse': _rmse, 'size': _len }
except Exception as ex:
#print("EVALUATION EXCEPTION!", str(ex), str(individual))
return {'f1': np.inf, 'f2': np.inf, 'rmse': np.inf, 'size': np.inf }
def tournament(population, objective, **kwargs):
def tournament(population, objective): """
'''
Simple tournament selection strategy. Simple tournament selection strategy.
:param population: the population :param population: the population
:param objective: the objective to be considered on tournament :param objective: the objective to be considered on tournament
:return: :return:
''' """
n = len(population) - 1 n = len(population) - 1
try:
r1 = random.randint(0, n) if n > 2 else 0 r1 = random.randint(0, n) if n > 2 else 0
r2 = random.randint(0, n) if n > 2 else 1 r2 = random.randint(0, n) if n > 2 else 1
ix = r1 if population[r1][objective] < population[r2][objective] else r2 ix = r1 if population[r1][objective] < population[r2][objective] else r2
return population[ix] return population[ix]
except Exception as ex:
print(r1, population[r1])
print(r2, population[r2])
raise ex
def double_tournament(population): def double_tournament(population, **kwargs):
''' """
Double tournament selection strategy. Double tournament selection strategy.
:param population: :param population:
:return: :return:
''' """
ancestor1 = tournament(population, 'f1') ancestor1 = tournament(population, 'f1')
ancestor2 = tournament(population, 'f1') ancestor2 = tournament(population, 'f1')
@ -206,13 +189,13 @@ def double_tournament(population):
def lag_crossover2(best, worst): def lag_crossover2(best, worst):
''' """
Cross over two lag genes Cross over two lag genes
:param best: best genotype :param best: best genotype
:param worst: worst genotype :param worst: worst genotype
:return: a tuple (order, lags) :return: a tuple (order, lags)
''' """
order = int(round(.7 * best['order'] + .3 * worst['order'])) order = int(round(.7 * best['order'] + .3 * worst['order']))
lags = [] lags = []
@ -233,27 +216,26 @@ def lag_crossover2(best, worst):
return order, lags return order, lags
# Cruzamento def crossover(population, **kwargs):
def crossover(parents): """
'''
Crossover operation between two parents Crossover operation between two parents
:param parents: a list with two genotypes :param population: the original population
:return: a genotype :return: a genotype
''' """
import random import random
n = len(parents) - 1 n = len(population) - 1
r1 = random.randint(0, n) r1 = random.randint(0, n)
r2 = random.randint(0, n) r2 = random.randint(0, n)
if parents[r1]['f1'] < parents[r2]['f1']: if population[r1]['f1'] < population[r2]['f1']:
best = parents[r1] best = population[r1]
worst = parents[r2] worst = population[r2]
else: else:
best = parents[r2] best = population[r2]
worst = parents[r1] worst = population[r1]
npart = int(round(.7 * best['npart'] + .3 * worst['npart'])) npart = int(round(.7 * best['npart'] + .3 * worst['npart']))
alpha = float(.7 * best['alpha'] + .3 * worst['alpha']) alpha = float(.7 * best['alpha'] + .3 * worst['alpha'])
@ -272,13 +254,13 @@ def crossover(parents):
def mutation_lags(lags, order): def mutation_lags(lags, order):
''' """
Mutation operation for lags gene Mutation operation for lags gene
:param lags: :param lags:
:param order: :param order:
:return: :return:
''' """
try: try:
l = len(lags) l = len(lags)
new = [] new = []
@ -298,19 +280,16 @@ def mutation_lags(lags, order):
print(lags, order, new, lag) print(lags, order, new, lag)
def mutation(individual, pmut): def mutation(individual, **kwargs):
''' """
Mutation operator Mutation operator
:param population: :param individual: an individual genotype
:param pmut: individual probability o
:return: :return:
''' """
import numpy.random import numpy.random
rnd = random.uniform(0, 1)
if rnd < pmut:
print('mutation') print('mutation')
individual['npart'] = min(50, max(3, int(individual['npart'] + np.random.normal(0, 4)))) individual['npart'] = min(50, max(3, int(individual['npart'] + np.random.normal(0, 4))))
@ -328,13 +307,13 @@ def mutation(individual, pmut):
def elitism(population, new_population): def elitism(population, new_population):
''' """
Elitism operation, always select the best individual of the population and discard the worst Elitism operation, always select the best individual of the population and discard the worst
:param population: :param population:
:param new_population: :param new_population:
:return: :return:
''' """
population = sorted(population, key=itemgetter('f1')) population = sorted(population, key=itemgetter('f1'))
best = population[0] best = population[0]
@ -348,31 +327,55 @@ def elitism(population, new_population):
def GeneticAlgorithm(dataset, **kwargs): def GeneticAlgorithm(dataset, **kwargs):
''' """
Genetic algorithm for hyperparameter optimization Genetic algorithm for hyperparameter optimization
:param dataset: :param dataset: The time series to optimize the FTS
:param ngen: Max number of generations :keyword ngen: An integer value with the maximum number of generations, default value: 30
:param mgen: Max number of generations without improvement :keyword mgen: An integer value with the maximum number of generations without improvement to stop, default value 7
:param npop: Population size :keyword npop: An integer value with the population size, default value: 20
:param pcruz: Probability of crossover :keyword pcross: A float value between 0 and 1 with the probability of crossover, default: .5
:param pmut: Probability of mutation :keyword psel: A float value between 0 and 1 with the probability of selection, default: .5
:param window_size: The length of scrolling window for train/test on dataset :keyword pmut: A float value between 0 and 1 with the probability of mutation, default: .3
:param train_rate: The train/test split ([0,1]) :keyword fts_method: The FTS method to optimize
:param increment_rate: The increment of the scrolling window, relative to the window_size ([0,1]) :keyword parameters: dict with model specific arguments for fts_method
:param parameters: dict with model specific arguments for fit method. :keyword elitism: A boolean value indicating if the best individual must always survive to next population
:keyword initial_operator: a function that receives npop and returns a random population with size npop
:keyword evaluation_operator: a function that receives a dataset and an individual and returns its fitness
:keyword selection_operator: a function that receives the whole population and returns a selected individual
:keyword crossover_operator: a function that receives the whole population and returns a descendant individual
:keyword mutation_operator: a function that receives one individual and returns a changed individual
:keyword window_size: An integer value with the length of the scrolling window for train/test on dataset
:keyword train_rate: A float value between 0 and 1 with the train/test split ([0,1])
:keyword increment_rate: A float value between 0 and 1 with the increment of the scrolling window,
relative to the window_size ([0,1])
:keyword collect_statistics: A boolean value indicating to collect statistics for each generation
:keyword distributed: A value indicating if the execution will be local and sequential (distributed=False),
or parallel and distributed (distributed='dispy' or distributed='spark')
:keyword cluster: If distributed='dispy' the list of cluster nodes, else if distributed='spark' it is the master node
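:return: a tuple (best, statistics) with the best genotype and the list of collected statistics :return: a tuple (best, statistics) with the best genotype and the list of collected statistics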
''' """
statistics = [] statistics = []
ngen = kwargs.get('ngen',30) ngen = kwargs.get('ngen',30)
mgen = kwargs.get('mgen', 7) mgen = kwargs.get('mgen', 7)
npop = kwargs.get('npop',20) npop = kwargs.get('npop',20)
pcruz = kwargs.get('pcruz',.5) psel = kwargs.get('psel', .5)
pcross = kwargs.get('pcross',.5)
pmut = kwargs.get('pmut',.3) pmut = kwargs.get('pmut',.3)
distributed = kwargs.get('distributed', False) distributed = kwargs.get('distributed', False)
initial_operator = kwargs.get('initial_operator', initial_population)
evaluation_operator = kwargs.get('evaluation_operator', evaluate)
selection_operator = kwargs.get('selection_operator', double_tournament)
crossover_operator = kwargs.get('crossover_operator', crossover)
mutation_operator = kwargs.get('mutation_operator', mutation)
_elitism = kwargs.get('elitism', True)
elitism_operator = kwargs.get('elitism_operator', elitism)
if distributed == 'dispy': if distributed == 'dispy':
cluster = kwargs.pop('cluster', None) cluster = kwargs.pop('cluster', None)
@ -382,7 +385,7 @@ def GeneticAlgorithm(dataset, **kwargs):
new_population = [] new_population = []
population = initial_population(npop) population = initial_operator(npop)
last_best = population[0] last_best = population[0]
best = population[1] best = population[1]
@ -390,7 +393,7 @@ def GeneticAlgorithm(dataset, **kwargs):
print("Evaluating initial population {}".format(time.time())) print("Evaluating initial population {}".format(time.time()))
if not distributed: if not distributed:
for individual in population: for individual in population:
ret = evaluate(dataset, individual, **kwargs) ret = evaluation_operator(dataset, individual, **kwargs)
for key in __measures: for key in __measures:
individual[key] = ret[key] individual[key] = ret[key]
elif distributed=='dispy': elif distributed=='dispy':
@ -414,19 +417,21 @@ def GeneticAlgorithm(dataset, **kwargs):
generation_statistics = {} generation_statistics = {}
# Selection # Selection
for j in range(int(npop / 2)): for j in range(int(npop * psel)):
new_population.append(double_tournament(population)) new_population.append(selection_operator(population))
new_population.append(double_tournament(population))
# Crossover # Crossover
new = [] new = []
for j in range(int(npop * pcruz)): for j in range(int(npop * pcross)):
new.append(crossover(new_population)) new.append(crossover_operator(new_population))
new_population.extend(new) new_population.extend(new)
# Mutation # Mutation
for ct, individual in enumerate(new_population): for ct, individual in enumerate(new_population):
new_population[ct] = mutation(individual, pmut) rnd = random.uniform(0, 1)
if rnd < pmut:
new_population[ct] = mutation_operator(individual)
# Evaluation # Evaluation
if collect_statistics: if collect_statistics:
@ -436,7 +441,7 @@ def GeneticAlgorithm(dataset, **kwargs):
if not distributed: if not distributed:
for individual in new_population: for individual in new_population:
ret = evaluate(dataset, individual, **kwargs) ret = evaluation_operator(dataset, individual, **kwargs)
for key in __measures: for key in __measures:
individual[key] = ret[key] individual[key] = ret[key]
if collect_statistics: stats[key].append(ret[key]) if collect_statistics: stats[key].append(ret[key])
@ -466,7 +471,8 @@ def GeneticAlgorithm(dataset, **kwargs):
generation_statistics['population'] = mean_stats generation_statistics['population'] = mean_stats
# Elitism # Elitism
population = elitism(population, new_population) if _elitism:
population = elitism_operator(population, new_population)
population = population[:npop] population = population[:npop]
@ -487,14 +493,13 @@ def GeneticAlgorithm(dataset, **kwargs):
pmut += .05 pmut += .05
else: else:
no_improvement_count = 0 no_improvement_count = 0
pcruz = kwargs.get('pcruz', .5) pcross = kwargs.get('pcross', .5)
pmut = kwargs.get('pmut', .3) pmut = kwargs.get('pmut', .3)
print(best) print(best)
if no_improvement_count == mgen: if no_improvement_count == mgen:
break break
return best, statistics return best, statistics
@ -506,7 +511,7 @@ def process_experiment(result, datasetname, conn):
def persist_statistics(statistics): def persist_statistics(statistics):
import json import json
with open('statistics{}.txt'.format(time.time()), 'w') as file: with open('statistics{}.json'.format(time.time()), 'w') as file:
file.write(json.dumps(statistics)) file.write(json.dumps(statistics))
@ -523,7 +528,41 @@ def log_result(conn, datasetname, result):
def execute(datasetname, dataset, **kwargs): def execute(datasetname, dataset, **kwargs):
conn = hUtil.open_hyperparam_db('hyperparam.db') """
:param datasetname:
:param dataset: The time series to optimize the FTS
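:keyword database_file: the file passed to hUtil.open_hyperparam_db, default value: 'hyperparam.db'
:keyword experiments: An integer value, default value: 30 (presumably the number of independent GA executions; to be confirmed)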
:keyword distributed:
:keyword ngen: An integer value with the maximum number of generations, default value: 30
:keyword mgen: An integer value with the maximum number of generations without improvement to stop, default value 7
:keyword npop: An integer value with the population size, default value: 20
:keyword pcross: A float value between 0 and 1 with the probability of crossover, default: .5
:keyword psel: A float value between 0 and 1 with the probability of selection, default: .5
:keyword pmut: A float value between 0 and 1 with the probability of mutation, default: .3
:keyword fts_method: The FTS method to optimize
:keyword parameters: dict with model specific arguments for fts_method
:keyword elitism: A boolean value indicating if the best individual must always survive to next population
:keyword initial_operator: a function that receives npop and returns a random population with size npop
:keyword evaluation_operator: a function that receives a dataset and an individual and returns its fitness
:keyword selection_operator: a function that receives the whole population and returns a selected individual
:keyword crossover_operator: a function that receives the whole population and returns a descendant individual
:keyword mutation_operator: a function that receives one individual and returns a changed individual
:keyword window_size: An integer value with the length of the scrolling window for train/test on dataset
:keyword train_rate: A float value between 0 and 1 with the train/test split ([0,1])
:keyword increment_rate: A float value between 0 and 1 with the increment of the scrolling window,
relative to the window_size ([0,1])
:keyword collect_statistics: A boolean value indicating to collect statistics for each generation
:keyword distributed: A value indicating if the execution will be local and sequential (distributed=False),
or parallel and distributed (distributed='dispy' or distributed='spark')
:keyword cluster: If distributed='dispy' the list of cluster nodes, else if distributed='spark' it is the master node
:return: the best genotype
"""
file = kwargs.get('database_file', 'hyperparam.db')
conn = hUtil.open_hyperparam_db(file)
experiments = kwargs.get('experiments', 30) experiments = kwargs.get('experiments', 30)
@ -550,4 +589,3 @@ def execute(datasetname, dataset, **kwargs):
dUtil.stop_dispy_cluster(cluster, http_server) dUtil.stop_dispy_cluster(cluster, http_server)
return ret return ret

View File

@ -1,6 +1,7 @@
import numpy as np import numpy as np
import pandas as pd import pandas as pd
from pyFTS.hyperparam import GridSearch, Evolutionary from pyFTS.hyperparam import GridSearch, Evolutionary
from pyFTS.models import pwfts
def get_dataset(): def get_dataset():
from pyFTS.data import SONDA from pyFTS.data import SONDA
@ -41,10 +42,12 @@ datsetname, dataset = get_dataset()
# window_size=10000, train_rate=.9, increment_rate=1,) # window_size=10000, train_rate=.9, increment_rate=1,)
ret = Evolutionary.execute(datsetname, dataset, ret = Evolutionary.execute(datsetname, dataset,
ngen=30, npop=20, pcruz=.5, pmut=.3, ngen=30, npop=20,psel=0.6, pcross=.5, pmut=.3,
window_size=10000, train_rate=.9, increment_rate=1, window_size=10000, train_rate=.9, increment_rate=1,
experiments=1, experiments=1,
distributed='dispy', nodes=nodes) fts_method=pwfts.ProbabilisticWeightedFTS,
database_file='experiments.db',
distributed=False, nodes=nodes)
#res = GridSearch.cluster_method({'mf':1, 'partitioner': 1, 'npart': 10, 'lags':[1], 'alpha': 0.0, 'order': 1}, #res = GridSearch.cluster_method({'mf':1, 'partitioner': 1, 'npart': 10, 'lags':[1], 'alpha': 0.0, 'order': 1},
# dataset, window_size = 10000, train_rate = .9, increment_rate = 1) # dataset, window_size = 10000, train_rate = .9, increment_rate = 1)