Spark Distributed; Hyperparameter optimization

This commit is contained in:
Petrônio Cândido 2019-01-18 09:06:53 -02:00
parent 87686e5ff0
commit 2e1d7fa11a
20 changed files with 904 additions and 301 deletions

View File

@ -0,0 +1,178 @@
<!doctype html>
<html xmlns="http://www.w3.org/1999/xhtml">
<head>
<meta http-equiv="X-UA-Compatible" content="IE=Edge" />
<meta http-equiv="Content-Type" content="text/html; charset=utf-8" /><script type="text/javascript">
var _gaq = _gaq || [];
_gaq.push(['_setAccount', 'UA-55120145-3']);
_gaq.push(['_trackPageview']);
(function() {
var ga = document.createElement('script'); ga.type = 'text/javascript'; ga.async = true;
ga.src = ('https:' == document.location.protocol ? 'https://ssl' : 'http://www') + '.google-analytics.com/ga.js';
var s = document.getElementsByTagName('script')[0]; s.parentNode.insertBefore(ga, s);
})();
</script>
<title>pyFTS.partitioners.Simple &#8212; pyFTS 1.4 documentation</title>
<link rel="stylesheet" href="../../../_static/bizstyle.css" type="text/css" />
<link rel="stylesheet" href="../../../_static/pygments.css" type="text/css" />
<script type="text/javascript" src="../../../_static/documentation_options.js"></script>
<script type="text/javascript" src="../../../_static/jquery.js"></script>
<script type="text/javascript" src="../../../_static/underscore.js"></script>
<script type="text/javascript" src="../../../_static/doctools.js"></script>
<script type="text/javascript" src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.1/MathJax.js?config=TeX-AMS-MML_HTMLorMML"></script>
<script type="text/javascript" src="../../../_static/bizstyle.js"></script>
<link rel="index" title="Index" href="../../../genindex.html" />
<link rel="search" title="Search" href="../../../search.html" />
<meta name="viewport" content="width=device-width,initial-scale=1.0">
<!--[if lt IE 9]>
<script type="text/javascript" src="../../../_static/css3-mediaqueries.js"></script>
<![endif]-->
</head><body>
<div class="related" role="navigation" aria-label="related navigation">
<h3>Navigation</h3>
<ul>
<li class="right" style="margin-right: 10px">
<a href="../../../genindex.html" title="General Index"
accesskey="I">index</a></li>
<li class="right" >
<a href="../../../py-modindex.html" title="Python Module Index"
>modules</a> |</li>
<li class="nav-item nav-item-0"><a href="../../../index.html">pyFTS 1.4 documentation</a> &#187;</li>
<li class="nav-item nav-item-1"><a href="../../index.html" accesskey="U">Module code</a> &#187;</li>
</ul>
</div>
<div class="sphinxsidebar" role="navigation" aria-label="main navigation">
<div class="sphinxsidebarwrapper">
<p class="logo"><a href="../../../index.html">
<img class="logo" src="../../../_static/logo_heading2.png" alt="Logo"/>
</a></p>
<div id="searchbox" style="display: none" role="search">
<h3>Quick search</h3>
<div class="searchformwrapper">
<form class="search" action="../../../search.html" method="get">
<input type="text" name="q" />
<input type="submit" value="Go" />
<input type="hidden" name="check_keywords" value="yes" />
<input type="hidden" name="area" value="default" />
</form>
</div>
</div>
<script type="text/javascript">$('#searchbox').show(0);</script>
</div>
</div>
<div class="document">
<div class="documentwrapper">
<div class="bodywrapper">
<div class="body" role="main">
<h1>Source code for pyFTS.partitioners.Simple</h1><div class="highlight"><pre>
<span></span><span class="sd">&quot;&quot;&quot;Simple Partitioner for manually informed fuzzy sets&quot;&quot;&quot;</span>
<span class="kn">import</span> <span class="nn">numpy</span> <span class="k">as</span> <span class="nn">np</span>
<span class="kn">import</span> <span class="nn">math</span>
<span class="kn">import</span> <span class="nn">random</span> <span class="k">as</span> <span class="nn">rnd</span>
<span class="kn">import</span> <span class="nn">functools</span><span class="o">,</span> <span class="nn">operator</span>
<span class="kn">from</span> <span class="nn">pyFTS.common</span> <span class="k">import</span> <span class="n">FuzzySet</span><span class="p">,</span> <span class="n">Membership</span>
<span class="kn">from</span> <span class="nn">pyFTS.partitioners</span> <span class="k">import</span> <span class="n">partitioner</span>
<div class="viewcode-block" id="SimplePartitioner"><a class="viewcode-back" href="../../../pyFTS.partitioners.html#pyFTS.partitioners.Simple.SimplePartitioner">[docs]</a><span class="k">class</span> <span class="nc">SimplePartitioner</span><span class="p">(</span><span class="n">partitioner</span><span class="o">.</span><span class="n">Partitioner</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;Simple Partitioner for manually informed fuzzy sets&quot;&quot;&quot;</span>
<span class="k">def</span> <span class="nf">__init__</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="o">**</span><span class="n">kwargs</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Simple Partitioner - the fuzzy sets are informed manually</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">kwargs</span><span class="p">[</span><span class="s1">&#39;preprocess&#39;</span><span class="p">]</span> <span class="o">=</span> <span class="kc">False</span>
<span class="nb">super</span><span class="p">(</span><span class="n">SimplePartitioner</span><span class="p">,</span> <span class="bp">self</span><span class="p">)</span><span class="o">.</span><span class="fm">__init__</span><span class="p">(</span><span class="n">name</span><span class="o">=</span><span class="s2">&quot;Simple&quot;</span><span class="p">,</span> <span class="o">**</span><span class="n">kwargs</span><span class="p">)</span>
<span class="bp">self</span><span class="o">.</span><span class="n">partitions</span> <span class="o">=</span> <span class="mi">0</span>
<div class="viewcode-block" id="SimplePartitioner.append"><a class="viewcode-back" href="../../../pyFTS.partitioners.html#pyFTS.partitioners.Simple.SimplePartitioner.append">[docs]</a> <span class="k">def</span> <span class="nf">append</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">name</span><span class="p">,</span> <span class="n">mf</span><span class="p">,</span> <span class="n">parameters</span><span class="p">,</span> <span class="o">**</span><span class="n">kwargs</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Append a new partition (fuzzy set) to the partitioner</span>
<span class="sd"> :param name: Fuzzy set name</span>
<span class="sd"> :param mf: One of the pyFTS.common.Membership functions</span>
<span class="sd"> :param parameters: A list with the parameters for the membership function</span>
<span class="sd"> :param kwargs: Optional arguments for the fuzzy set</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">if</span> <span class="n">name</span> <span class="ow">is</span> <span class="kc">None</span> <span class="ow">or</span> <span class="nb">len</span><span class="p">(</span><span class="n">name</span><span class="p">)</span> <span class="o">==</span> <span class="mi">0</span><span class="p">:</span>
<span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span><span class="s2">&quot;The name of the fuzzy set cannot be empty&quot;</span><span class="p">)</span>
<span class="k">if</span> <span class="n">name</span> <span class="ow">in</span> <span class="bp">self</span><span class="o">.</span><span class="n">sets</span><span class="p">:</span>
<span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span><span class="s2">&quot;This name has already been used&quot;</span><span class="p">)</span>
<span class="k">if</span> <span class="n">mf</span> <span class="ow">is</span> <span class="kc">None</span> <span class="ow">or</span> <span class="n">mf</span> <span class="ow">not</span> <span class="ow">in</span> <span class="p">(</span><span class="n">Membership</span><span class="o">.</span><span class="n">trimf</span><span class="p">,</span> <span class="n">Membership</span><span class="o">.</span><span class="n">gaussmf</span><span class="p">,</span>
<span class="n">Membership</span><span class="o">.</span><span class="n">trapmf</span><span class="p">,</span> <span class="n">Membership</span><span class="o">.</span><span class="n">singleton</span><span class="p">,</span>
<span class="n">Membership</span><span class="o">.</span><span class="n">sigmf</span><span class="p">):</span>
<span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span><span class="s2">&quot;The mf parameter should be one of pyFTS.common.Membership functions&quot;</span><span class="p">)</span>
<span class="k">if</span> <span class="n">mf</span> <span class="o">==</span> <span class="n">Membership</span><span class="o">.</span><span class="n">trimf</span><span class="p">:</span>
<span class="k">if</span> <span class="nb">len</span><span class="p">(</span><span class="n">parameters</span><span class="p">)</span> <span class="o">!=</span> <span class="mi">3</span><span class="p">:</span>
<span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span><span class="s2">&quot;Incorrect number of parameters for the Membership.trimf&quot;</span><span class="p">)</span>
<span class="n">centroid</span> <span class="o">=</span> <span class="n">parameters</span><span class="p">[</span><span class="mi">1</span><span class="p">]</span>
<span class="k">elif</span> <span class="n">mf</span> <span class="o">==</span> <span class="n">Membership</span><span class="o">.</span><span class="n">gaussmf</span><span class="p">:</span>
<span class="k">if</span> <span class="nb">len</span><span class="p">(</span><span class="n">parameters</span><span class="p">)</span> <span class="o">!=</span> <span class="mi">2</span><span class="p">:</span>
<span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span><span class="s2">&quot;Incorrect number of parameters for the Membership.gaussmf&quot;</span><span class="p">)</span>
<span class="n">centroid</span> <span class="o">=</span> <span class="n">parameters</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span>
<span class="k">elif</span> <span class="n">mf</span> <span class="o">==</span> <span class="n">Membership</span><span class="o">.</span><span class="n">trapmf</span><span class="p">:</span>
<span class="k">if</span> <span class="nb">len</span><span class="p">(</span><span class="n">parameters</span><span class="p">)</span> <span class="o">!=</span> <span class="mi">4</span><span class="p">:</span>
<span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span><span class="s2">&quot;Incorrect number of parameters for the Membership.trapmf&quot;</span><span class="p">)</span>
<span class="n">centroid</span> <span class="o">=</span> <span class="p">(</span><span class="n">parameters</span><span class="p">[</span><span class="mi">1</span><span class="p">]</span><span class="o">+</span><span class="n">parameters</span><span class="p">[</span><span class="mi">2</span><span class="p">])</span><span class="o">/</span><span class="mi">2</span>
<span class="k">elif</span> <span class="n">mf</span> <span class="o">==</span> <span class="n">Membership</span><span class="o">.</span><span class="n">singleton</span><span class="p">:</span>
<span class="k">if</span> <span class="nb">len</span><span class="p">(</span><span class="n">parameters</span><span class="p">)</span> <span class="o">!=</span> <span class="mi">1</span><span class="p">:</span>
<span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span><span class="s2">&quot;Incorrect number of parameters for the Membership.singleton&quot;</span><span class="p">)</span>
<span class="n">centroid</span> <span class="o">=</span> <span class="n">parameters</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span>
<span class="k">elif</span> <span class="n">mf</span> <span class="o">==</span> <span class="n">Membership</span><span class="o">.</span><span class="n">sigmf</span><span class="p">:</span>
<span class="k">if</span> <span class="nb">len</span><span class="p">(</span><span class="n">parameters</span><span class="p">)</span> <span class="o">!=</span> <span class="mi">2</span><span class="p">:</span>
<span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span><span class="s2">&quot;Incorrect number of parameters for the Membership.sigmf&quot;</span><span class="p">)</span>
<span class="n">centroid</span> <span class="o">=</span> <span class="n">parameters</span><span class="p">[</span><span class="mi">1</span><span class="p">]</span> <span class="o">+</span> <span class="p">(</span><span class="n">parameters</span><span class="p">[</span><span class="mi">1</span><span class="p">]</span> <span class="o">/</span> <span class="p">(</span><span class="mi">2</span> <span class="o">*</span> <span class="n">parameters</span><span class="p">[</span><span class="mi">0</span><span class="p">]))</span>
<span class="bp">self</span><span class="o">.</span><span class="n">sets</span><span class="p">[</span><span class="n">name</span><span class="p">]</span> <span class="o">=</span> <span class="n">FuzzySet</span><span class="o">.</span><span class="n">FuzzySet</span><span class="p">(</span><span class="n">name</span><span class="p">,</span> <span class="n">mf</span><span class="p">,</span> <span class="n">parameters</span><span class="p">,</span> <span class="n">centroid</span><span class="p">,</span> <span class="o">**</span><span class="n">kwargs</span><span class="p">)</span>
<span class="bp">self</span><span class="o">.</span><span class="n">partitions</span> <span class="o">+=</span> <span class="mi">1</span>
<span class="bp">self</span><span class="o">.</span><span class="n">ordered_sets</span> <span class="o">=</span> <span class="p">[</span><span class="n">key</span> <span class="k">for</span> <span class="n">key</span> <span class="ow">in</span> <span class="nb">sorted</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">sets</span><span class="o">.</span><span class="n">keys</span><span class="p">(),</span> <span class="n">key</span><span class="o">=</span><span class="k">lambda</span> <span class="n">k</span><span class="p">:</span> <span class="bp">self</span><span class="o">.</span><span class="n">sets</span><span class="p">[</span><span class="n">k</span><span class="p">]</span><span class="o">.</span><span class="n">centroid</span><span class="p">)]</span>
<span class="bp">self</span><span class="o">.</span><span class="n">min</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">sets</span><span class="p">[</span><span class="bp">self</span><span class="o">.</span><span class="n">ordered_sets</span><span class="p">[</span><span class="mi">0</span><span class="p">]]</span><span class="o">.</span><span class="n">lower</span>
<span class="bp">self</span><span class="o">.</span><span class="n">max</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">sets</span><span class="p">[</span><span class="bp">self</span><span class="o">.</span><span class="n">ordered_sets</span><span class="p">[</span><span class="o">-</span><span class="mi">1</span><span class="p">]]</span><span class="o">.</span><span class="n">upper</span></div></div>
</pre></div>
</div>
</div>
</div>
<div class="clearer"></div>
</div>
<div class="related" role="navigation" aria-label="related navigation">
<h3>Navigation</h3>
<ul>
<li class="right" style="margin-right: 10px">
<a href="../../../genindex.html" title="General Index"
>index</a></li>
<li class="right" >
<a href="../../../py-modindex.html" title="Python Module Index"
>modules</a> |</li>
<li class="nav-item nav-item-0"><a href="../../../index.html">pyFTS 1.4 documentation</a> &#187;</li>
<li class="nav-item nav-item-1"><a href="../../index.html" >Module code</a> &#187;</li>
</ul>
</div>
<div class="footer" role="contentinfo">
&#169; Copyright 2018, Machine Intelligence and Data Science Laboratory - UFMG - Brazil.
Created using <a href="http://sphinx-doc.org/">Sphinx</a> 1.7.2.
</div>
</body>
</html>

View File

@ -26,6 +26,10 @@ class FuzzySet(FuzzySet.FuzzySet):
self.mf = [] self.mf = []
self.parameters = [] self.parameters = []
self.lower = None
self.upper = None
self.centroid = None
def membership(self, x): def membership(self, x):
""" """
@ -62,3 +66,13 @@ class FuzzySet(FuzzySet.FuzzySet):
:return: :return:
""" """
self.sets.append(set) self.sets.append(set)
if self.lower is None or self.lower > set.lower:
self.lower = set.lower
if self.upper is None or self.upper < set.upper:
self.upper = set.upper
if self.centroid is None or self.centroid < set.centroid:
self.centroid = set.centroid

View File

@ -125,6 +125,7 @@ def fuzzyfy(data, partitioner, **kwargs):
:keyword method: the fuzzyfication method (fuzzy: all fuzzy memberships, maximum: only the maximum membership) :keyword method: the fuzzyfication method (fuzzy: all fuzzy memberships, maximum: only the maximum membership)
:keyword mode: the fuzzyfication mode (sets: return the fuzzy sets names, vector: return a vector with the membership :keyword mode: the fuzzyfication mode (sets: return the fuzzy sets names, vector: return a vector with the membership
values for all fuzzy sets, both: return a list with tuples (fuzzy set, membership value) ) values for all fuzzy sets, both: return a list with tuples (fuzzy set, membership value) )
:returns: a list with the fuzzyfied values, depending on the mode :returns: a list with the fuzzyfied values, depending on the mode
""" """
alpha_cut = kwargs.get('alpha_cut', 0.) alpha_cut = kwargs.get('alpha_cut', 0.)

View File

@ -74,7 +74,7 @@ def sigmf(x, parameters):
:param x: :param x:
:param parameters: a list with 2 real values (smoothness and midpoint) :param parameters: a list with 2 real values (smoothness and midpoint)
:return: :return
""" """
return 1 / (1 + math.exp(-parameters[0] * (x - parameters[1]))) return 1 / (1 + math.exp(-parameters[0] * (x - parameters[1])))

View File

@ -38,6 +38,9 @@ class FTS(object):
"""A boolean value indicating if the model support probabilistic forecasting, default: False""" """A boolean value indicating if the model support probabilistic forecasting, default: False"""
self.is_multivariate = False self.is_multivariate = False
"""A boolean value indicating if the model support multivariate time series (Pandas DataFrame), default: False""" """A boolean value indicating if the model support multivariate time series (Pandas DataFrame), default: False"""
self.is_clustered = False
"""A boolean value indicating if the model supports multivariate time series (Pandas DataFrame), but works like
a univariate method, default: False"""
self.dump = False self.dump = False
self.transformations = [] self.transformations = []
"""A list with the data transformations (common.Transformations) applied on model pre and post processing, default: []""" """A list with the data transformations (common.Transformations) applied on model pre and post processing, default: []"""
@ -61,6 +64,8 @@ class FTS(object):
"""Flag indicating if the test data will be clipped inside the training Universe of Discourse""" """Flag indicating if the test data will be clipped inside the training Universe of Discourse"""
self.alpha_cut = kwargs.get("alpha_cut", 0.0) self.alpha_cut = kwargs.get("alpha_cut", 0.0)
"""A float with the minimal membership to be considered on fuzzyfication process""" """A float with the minimal membership to be considered on fuzzyfication process"""
self.lags = kwargs.get("lags", None)
"""The list of lag indexes for high order models"""
self.max_lag = self.order self.max_lag = self.order
"""An integer indicating the largest lag used by the model. This value also indicates the minimum number of past lags """An integer indicating the largest lag used by the model. This value also indicates the minimum number of past lags
needed to forecast a single step ahead""" needed to forecast a single step ahead"""

View File

@ -3,6 +3,7 @@ import pandas as pd
from pyFTS.data import Enrollments, TAIEX from pyFTS.data import Enrollments, TAIEX
from pyFTS.partitioners import Grid, Simple from pyFTS.partitioners import Grid, Simple
from pyFTS.models.multivariate import partitioner as mv_partitioner
from pyFTS.models import hofts from pyFTS.models import hofts
from pyspark import SparkConf from pyspark import SparkConf
@ -10,44 +11,141 @@ from pyspark import SparkContext
import os import os
# make sure pyspark tells workers to use python3 not 2 if both are installed # make sure pyspark tells workers to use python3 not 2 if both are installed
SPARK_ADDR = 'spark://192.168.0.110:7077'
os.environ['PYSPARK_PYTHON'] = '/usr/bin/python3' os.environ['PYSPARK_PYTHON'] = '/usr/bin/python3'
os.environ['PYSPARK_DRIVER_PYTHON'] = '/usr/bin/python3' os.environ['PYSPARK_DRIVER_PYTHON'] = '/usr/bin/python3'
def get_partitioner(shared_partitioner): def get_partitioner(shared_partitioner, type='common', variables=[]):
""" """
:param part: :param part:
:return: :return:
""" """
fs_tmp = Simple.SimplePartitioner() if type=='common':
fs_tmp = Simple.SimplePartitioner()
for fset in shared_partitioner.value.keys(): for fset in shared_partitioner.value.keys():
fz = shared_partitioner.value[fset] fz = shared_partitioner.value[fset]
fs_tmp.append(fset, fz.mf, fz.parameters) if type=='common':
fs_tmp.append_complex(fz)
elif type == 'multivariate':
fs_tmp.append(fz)
return fs_tmp return fs_tmp
def slave_train(data, shared_method, shared_partitioner, shared_order): def get_clustered_partitioner(explanatory_variables, target_variable, **parameters):
from pyFTS.models.multivariate.common import MultivariateFuzzySet
fs_tmp = mv_partitioner.MultivariatePartitioner(explanatory_variables=explanatory_variables,
target_variable=target_variable)
for tmp in parameters['partitioner_names'].value:
fs = MultivariateFuzzySet(target_variable=target_variable)
for var, fset in parameters['partitioner_{}'.format(tmp)].value:
fs.append_set(var, fset)
fs_tmp.append(fs)
fs_tmp.build_index()
return fs_tmp
def get_variables(**parameters):
    """
    Build the list of model variables, and pick the target one, from ``parameters``.

    Every value in ``parameters`` is read through its ``.value`` attribute.

    :param parameters: keyword arguments holding, at least, the keys 'variables' (an iterable with
                       the variable names) and 'target' (the name of the target variable) and, for
                       each variable name, the keys '<name>_type', '<name>_label', '<name>_alpha',
                       '<name>_partitioner' and '<name>_partitioner_type'
    :return: a tuple (explanatory_variables, target_variable); target_variable is None when no
             variable name equals parameters['target'].value
    """
    # Function-local import (as in the original), but executed once instead of once per variable.
    from pyFTS.models.multivariate import variable

    explanatory_variables = []
    target_variable = None

    for name in parameters['variables'].value:
        # NOTE: data_type and mask are not forwarded to Variable; the original had them disabled.
        var = variable.Variable(name,
                                type=parameters['{}_type'.format(name)].value,
                                data_label=parameters['{}_label'.format(name)].value,
                                alpha_cut=parameters['{}_alpha'.format(name)].value)

        # The partitioner is rebuilt from the '<name>_partitioner' entry and its type is then
        # overwritten with the '<name>_partitioner_type' entry.
        var.partitioner = get_partitioner(parameters['{}_partitioner'.format(name)])
        var.partitioner.type = parameters['{}_partitioner_type'.format(name)].value

        explanatory_variables.append(var)

        if var.name == parameters['target'].value:
            target_variable = var

    return (explanatory_variables, target_variable)
def slave_train_univariate(data, **parameters):
""" """
:param data: :param data:
:return: :return:
""" """
model = shared_method.value(partitioner=get_partitioner(shared_partitioner), if parameters['type'].value == 'common':
order=shared_order.value)
ndata = [k for k in data] if parameters['order'].value > 1:
model = parameters['method'].value(partitioner=get_partitioner(parameters['partitioner']),
order=parameters['order'].value, alpha_cut=parameters['alpha_cut'].value,
lags=parameters['lags'].value)
else:
model = parameters['method'].value(partitioner=get_partitioner(parameters['partitioner']),
alpha_cut=parameters['alpha_cut'].value)
ndata = [k for k in data]
else:
pass
model.train(ndata) model.train(ndata)
return [(k, model.flrgs[k]) for k in model.flrgs] return [(k, model.flrgs[k]) for k in model.flrgs.keys()]
def distributed_train(model, data, url='spark://192.168.0.110:7077', app='pyFTS'): def slave_train_multivariate(data, **parameters):
explanatory_variables, target_variable = get_variables(**parameters)
#vars = [(v.name, v.name) for v in explanatory_variables]
#return [('vars', vars), ('target',[target_variable.name])]
if parameters['type'].value == 'clustered':
fs = get_clustered_partitioner(explanatory_variables, target_variable, **parameters)
model = parameters['method'].value(explanatory_variables=explanatory_variables,
target_variable=target_variable,
partitioner=fs,
order=parameters['order'].value,
alpha_cut=parameters['alpha_cut'].value,
lags=parameters['lags'].value)
else:
if parameters['order'].value > 1:
model = parameters['method'].value(explanatory_variables=explanatory_variables,
target_variable=target_variable,
order=parameters['order'].value,
alpha_cut=parameters['alpha_cut'].value,
lags=parameters['lags'].value)
else:
model = parameters['method'].value(explanatory_variables=explanatory_variables,
target_variable=target_variable,
alpha_cut=parameters['alpha_cut'].value)
rows = [k for k in data]
ndata = pd.DataFrame.from_records(rows, columns=parameters['columns'].value)
model.train(ndata)
if parameters['type'].value == 'clustered':
counts = [(fset, count) for fset,count in model.partitioner.count.items()]
flrgs = [(k, v) for k,v in model.flrgs.items()]
return [('counts', counts), ('flrgs', flrgs)]
else:
return [(k, v) for k,v in model.flrgs.items()]
def distributed_train(model, data, url=SPARK_ADDR, app='pyFTS'):
""" """
@ -61,22 +159,92 @@ def distributed_train(model, data, url='spark://192.168.0.110:7077', app='pyFTS'
conf = SparkConf() conf = SparkConf()
conf.setMaster(url) conf.setMaster(url)
conf.setAppName(app) conf.setAppName(app)
conf.set("spark.executor.memory", "2g")
conf.set("spark.driver.memory", "2g")
conf.set("spark.memory.offHeap.enabled",True)
conf.set("spark.memory.offHeap.size","16g")
parameters = {}
with SparkContext(conf=conf) as context: with SparkContext(conf=conf) as context:
shared_partitioner = context.broadcast(model.partitioner.sets)
shared_order = context.broadcast(model.order)
shared_method = context.broadcast(type(model))
func = lambda x: slave_train(x, shared_method, shared_partitioner, shared_order) nodes = context.defaultParallelism
flrgs = context.parallelize(data).mapPartitions(func) if not model.is_multivariate:
parameters['type'] = context.broadcast('common')
parameters['partitioner'] = context.broadcast(model.partitioner.sets)
parameters['alpha_cut'] = context.broadcast(model.alpha_cut)
parameters['order'] = context.broadcast(model.order)
parameters['method'] = context.broadcast(type(model))
parameters['lags'] = context.broadcast(model.lags)
parameters['max_lag'] = context.broadcast(model.max_lag)
for k in flrgs.collect(): func = lambda x: slave_train_univariate(x, **parameters)
model.append_rule(k[1])
return model flrgs = context.parallelize(data).repartition(nodes*2).mapPartitions(func)
for k in flrgs.collect():
model.append_rule(k[1])
return model
else:
if model.is_clustered:
parameters['type'] = context.broadcast('clustered')
names = []
for name, fset in model.partitioner.sets.items():
names.append(name)
parameters['partitioner_{}'.format(name)] = context.broadcast([(k,v) for k,v in fset.sets.items()])
parameters['partitioner_names'] = context.broadcast(names)
else:
parameters['type'] = context.broadcast('multivariate')
names = []
for var in model.explanatory_variables:
#if var.data_type is None:
# raise Exception("It is mandatory to inform the data_type parameter for each variable when the training is distributed! ")
names.append(var.name)
parameters['{}_type'.format(var.name)] = context.broadcast(var.type)
#parameters['{}_data_type'.format(var.name)] = context.broadcast(var.data_type)
#parameters['{}_mask'.format(var.name)] = context.broadcast(var.mask)
parameters['{}_label'.format(var.name)] = context.broadcast(var.data_label)
parameters['{}_alpha'.format(var.name)] = context.broadcast(var.alpha_cut)
parameters['{}_partitioner'.format(var.name)] = context.broadcast(var.partitioner.sets)
parameters['{}_partitioner_type'.format(var.name)] = context.broadcast(var.partitioner.type)
parameters['variables'] = context.broadcast(names)
parameters['target'] = context.broadcast(model.target_variable.name)
parameters['columns'] = context.broadcast(data.columns.values)
data = data.to_dict(orient='records')
parameters['alpha_cut'] = context.broadcast(model.alpha_cut)
parameters['order'] = context.broadcast(model.order)
parameters['method'] = context.broadcast(type(model))
parameters['lags'] = context.broadcast(model.lags)
parameters['max_lag'] = context.broadcast(model.max_lag)
func = lambda x: slave_train_multivariate(x, **parameters)
flrgs = context.parallelize(data).mapPartitions(func)
for k in flrgs.collect():
print(k)
#for g in k:
# print(g)
#return
if parameters['type'].value == 'clustered':
if k[0] == 'counts':
for fset, count in k[1]:
model.partitioner.count[fset] = count
elif k[0] == 'flrgs':
model.append_rule(k[1])
else:
model.append_rule(k[1])
return model
def distributed_predict(data, model, url=SPARK_ADDR, app='pyFTS'):
def distributed_predict(data, model, url='spark://192.168.0.110:7077', app='pyFTS'):
return None return None

View File

@ -15,15 +15,32 @@ from pyFTS.common import Membership
from pyFTS.hyperparam import Util as hUtil from pyFTS.hyperparam import Util as hUtil
# Gera indivíduos após operadores #
def genotype(mf, npart, partitioner, order, alpha, lags, len_lags, rmse): def genotype(mf, npart, partitioner, order, alpha, lags, len_lags, rmse):
ind = dict(mf=mf, npart=npart, partitioner=partitioner, order=order, alpha=alpha, lags=lags, len_lags=len_lags, '''
rmse=rmse) Create the individual genotype
:param mf: membership function
:param npart: number of partitions
:param partitioner: partitioner method
:param order: model order
:param alpha: alpha-cut
:param lags: array with lag indexes
:param len_lags: parsimony fitness value
:param rmse: accuracy fitness value
:return: the genotype, a dictionary with all hyperparameters
'''
ind = dict(mf=mf, npart=npart, partitioner=partitioner, order=order,
alpha=alpha, lags=lags, len_lags=len_lags, rmse=rmse)
return ind return ind
# Gera indivíduos
def random_genotype(): def random_genotype():
'''
Create random genotype
:return: the genotype, a dictionary with all hyperparameters
'''
order = random.randint(1, 3) order = random.randint(1, 3)
return genotype( return genotype(
random.randint(1, 4), random.randint(1, 4),
@ -32,21 +49,34 @@ def random_genotype():
order, order,
random.uniform(0, .5), random.uniform(0, .5),
sorted(random.sample(range(1, 50), order)), sorted(random.sample(range(1, 50), order)),
[], None,
[] None
) )
# Gera uma população de tamanho n #
def initial_population(n): def initial_population(n):
'''
Create a random population of size n
:param n: the size of the population
:return: a list with n random individuals
'''
pop = [] pop = []
for i in range(n): for i in range(n):
pop.append(random_genotype()) pop.append(random_genotype())
return pop return pop
# Função de avaliação def phenotype(individual, train, parameters={}):
def phenotype(individual, train): '''
Instantiate the genotype, creating a fitted model with the genotype hyperparameters
:param individual: a genotype
:param train: the training dataset
:param parameters: dict with model specific arguments for fit method.
:return: a fitted FTS model
'''
try: try:
if individual['mf'] == 1: if individual['mf'] == 1:
mf = Membership.trimf mf = Membership.trimf
@ -67,28 +97,48 @@ def phenotype(individual, train):
alpha_cut=individual['alpha'], alpha_cut=individual['alpha'],
order=individual['order']) order=individual['order'])
model.fit(train) model.fit(train, **parameters)
return model return model
except Exception as ex: except Exception as ex:
print("EXCEPTION!", str(ex), str(individual)) print("PHENOTYPE EXCEPTION!", str(ex), str(individual))
return None return None
def evaluation1(dataset, individual): def evaluate(dataset, individual, **kwargs):
'''
Evaluate an individual using a sliding window cross validation over the dataset.
:param dataset: Evaluation dataset
:param individual: genotype to be tested
:param window_size: The length of scrolling window for train/test on dataset
:param train_rate: The train/test split ([0,1])
:param increment_rate: The increment of the scrolling window, relative to the window_size ([0,1])
:param parameters: dict with model specific arguments for fit method.
:return: a tuple (len_lags, rmse) with the parsimony fitness value and the accuracy fitness value
'''
from pyFTS.common import Util from pyFTS.common import Util
from pyFTS.benchmarks import Measures from pyFTS.benchmarks import Measures
window_size = kwargs.get('window_size', 800)
train_rate = kwargs.get('train_rate', .8)
increment_rate = kwargs.get('increment_rate', .2)
parameters = kwargs.get('parameters',{})
if individual['rmse'] is not None and individual['len_lags'] is not None:
return individual['len_lags'], individual['rmse']
try: try:
results = [] results = []
lengths = [] lengths = []
for count, train, test in Util.sliding_window(dataset, 800, train=.8, inc=.25): for count, train, test in Util.sliding_window(dataset, window_size, train=train_rate, inc=increment_rate):
model = phenotype(individual, train)
model = phenotype(individual, train, parameters=parameters)
if model is None: if model is None:
return (None) raise Exception("Phenotype returned None")
rmse, _, _ = Measures.get_point_statistics(test, model) rmse, _, _ = Measures.get_point_statistics(test, model)
lengths.append(len(model)) lengths.append(len(model))
@ -100,36 +150,59 @@ def evaluation1(dataset, individual):
rmse = np.nansum([.6 * np.nanmean(results), .4 * np.nanstd(results)]) rmse = np.nansum([.6 * np.nanmean(results), .4 * np.nanstd(results)])
len_lags = np.nansum([.4 * np.nanmean(lengths), .6 * _lags]) len_lags = np.nansum([.4 * np.nanmean(lengths), .6 * _lags])
#print("EVALUATION {}".format(individual))
return len_lags, rmse return len_lags, rmse
except Exception as ex: except Exception as ex:
print("EXCEPTION!", str(ex), str(individual)) print("EVALUATION EXCEPTION!", str(ex), str(individual))
return np.inf return np.inf, np.inf
def tournament(population, objective):
    '''
    Binary tournament selection: two individuals are compared on a single
    objective and the one with the smaller value wins.

    With exactly two individuals both are compared directly; a single
    individual wins by default; for larger populations the two contestants
    are drawn uniformly at random (the same individual may be drawn twice).

    :param population: a non-empty list of genotypes (dictionaries)
    :param objective: the genotype key to be minimized (e.g. 'rmse')
    :return: the winning genotype (the same object stored in population)
    :raises IndexError: if the population is empty
    '''
    size = len(population)

    if size == 0:
        raise IndexError("tournament requires a non-empty population")

    if size == 1:
        return population[0]

    if size == 2:
        # Deterministic duel, used by the second stage of the double tournament
        r1, r2 = 0, 1
    else:
        # Every individual, including the last one, must be eligible
        r1 = random.randint(0, size - 1)
        r2 = random.randint(0, size - 1)

    try:
        ix = r1 if population[r1][objective] < population[r2][objective] else r2
    except Exception:
        # Debugging aid: show the contestants that could not be compared
        # (e.g. a missing or unevaluated objective) and propagate the error
        print(r1, population[r1])
        print(r2, population[r2])
        raise

    return population[ix]
def double_tournament(population):
    '''
    Double tournament selection strategy: two accuracy ('rmse') tournaments
    pick the finalists, then a parsimony ('len_lags') tournament between
    the finalists picks the selected individual.

    :param population: the population, a list of evaluated genotypes
    :return: the selected genotype
    '''
    finalists = [tournament(population, 'rmse') for _ in range(2)]
    return tournament(finalists, 'len_lags')
def lag_crossover2(best, worst): def lag_crossover2(best, worst):
'''
Cross over two lag genes
:param best: best genotype
:param worst: worst genotype
:return: a tuple (order, lags)
'''
order = int(round(.7 * best['order'] + .3 * worst['order'])) order = int(round(.7 * best['order'] + .3 * worst['order']))
lags = [] lags = []
@ -151,15 +224,26 @@ def lag_crossover2(best, worst):
# Cruzamento # Cruzamento
def crossover(pais): def crossover(parents):
'''
Crossover operation between two parents
:param parents: a list with two genotypes
:return: a genotype
'''
import random import random
if pais[0]['rmse'] < pais[1]['rmse']: n = len(parents) - 1
best = pais[0]
worst = pais[1] r1 = random.randint(0, n)
r2 = random.randint(0, n)
if parents[r1]['rmse'] < parents[r2]['rmse']:
best = parents[r1]
worst = parents[r2]
else: else:
best = pais[1] best = parents[r2]
worst = pais[0] worst = parents[r1]
npart = int(round(.7 * best['npart'] + .3 * worst['npart'])) npart = int(round(.7 * best['npart'] + .3 * worst['npart']))
alpha = float(.7 * best['alpha'] + .3 * worst['alpha']) alpha = float(.7 * best['alpha'] + .3 * worst['alpha'])
@ -172,119 +256,197 @@ def crossover(pais):
order, lags = lag_crossover2(best, worst) order, lags = lag_crossover2(best, worst)
rmse = [] descendent = genotype(mf, npart, partitioner, order, alpha, lags, None, None)
len_lags = []
filho = genotype(mf, npart, partitioner, order, alpha, lags, len_lags, rmse) return descendent
return filho
# Mutação | p é a probabilidade de mutação
def mutation_lags(lags, order):
    '''
    Mutation operation for the lags gene.

    Each inherited lag (at most ``order`` of them) is shifted by a random
    integer offset and clamped to [1, 50]. When ``order`` is greater than
    the number of inherited lags, the missing ones are generated after the
    last lag already produced. Finally the lags are forced to be strictly
    increasing, which may push the last ones above 50.

    :param lags: the current list of lags
    :param order: the number of lags of the mutated gene
    :return: a list with ``order`` strictly increasing integer lags
    '''
    inherited = len(lags)
    new = []

    for position in range(order):
        if position < inherited:
            shifted = int(lags[position] + np.random.randint(-5, 5))
            new.append(min(50, max(1, shifted)))
        else:
            # No lag to inherit at this position: grow from the previous one
            # (or from zero when the parent had no lags at all)
            previous = new[-1] if new else 0
            new.append(int(previous + np.random.randint(1, 5)))

    # Shifting and clamping can break the ordering, so each lag is pushed
    # forward until it is greater than its predecessor
    for k in range(1, order):
        while new[k] <= new[k - 1]:
            new[k] = int(new[k] + np.random.randint(1, 5))

    return new
def mutation(individual, pmut):
    '''
    Mutation operator. With probability pmut every gene of the individual
    is perturbed in place and its fitness values are cleared.

    :param individual: the genotype to be mutated (modified in place)
    :param pmut: the probability of mutation
    :return: the same individual, mutated or not
    '''
    import numpy.random

    if not random.uniform(0, 1) < pmut:
        return individual

    print('mutation')

    def clamp(low, value, high):
        return min(high, max(low, value))

    individual['npart'] = clamp(3, int(individual['npart'] + np.random.normal(0, 4)), 50)
    individual['alpha'] = clamp(0, individual['alpha'] + np.random.normal(0, .5), .5)
    individual['mf'] = random.randint(1, 2)
    individual['partitioner'] = random.randint(1, 2)
    individual['order'] = clamp(1, int(individual['order'] + np.random.normal(0, 1)), 5)

    # The lags must follow the (possibly changed) order
    individual['lags'] = mutation_lags(individual['lags'], individual['order'])

    # The stored fitness no longer describes the mutated genotype
    individual['rmse'] = None
    individual['len_lags'] = None

    return individual
# Elitismo
def elitism(population, new_population):
    '''
    Elitism operation: makes sure the best individual of the current
    population is not lost in the next generation.

    The new population is returned sorted by ascending 'rmse'. When the best
    individual of the current population has a strictly lower 'rmse' than
    every new individual, it is inserted at the head of the result, which is
    then one element longer than new_population. No individual is discarded
    by this function.

    :param population: the current population, a list of evaluated genotypes
    :param new_population: the offspring, a list of evaluated genotypes
    :return: a new list with the individuals sorted by 'rmse'
    '''
    ranked = sorted(new_population, key=itemgetter('rmse'))

    if not population:
        # Nothing to preserve
        return ranked

    best = min(population, key=itemgetter('rmse'))

    if not ranked or ranked[0]['rmse'] > best['rmse']:
        ranked.insert(0, best)

    return ranked
def GeneticAlgorithm(dataset, **kwargs):
    '''
    Genetic algorithm for hyperparameter optimization

    :param dataset: the dataset used to evaluate the individuals
    :param ngen: Max number of generations
    :param mgen: Max number of generations without improvement
    :param npop: Population size
    :param pcruz: Probability of crossover
    :param pmut: Probability of mutation
    :param collect_statistics: if True, per-generation statistics are also returned
    :param window_size: The length of scrolling window for train/test on dataset
    :param train_rate: The train/test split ([0,1])
    :param increment_rate: The increment of the scrolling window, relative to the window_size ([0,1])
    :param parameters: dict with model specific arguments for fit method.
    :return: the best genotype or, when collect_statistics is True, a tuple
             (best genotype, list with one statistics dict per generation)
    '''

    statistics = []

    ngen = kwargs.get('ngen', 30)
    mgen = kwargs.get('mgen', 7)
    npop = kwargs.get('npop', 20)
    pcruz = kwargs.get('pcruz', .5)
    pmut = kwargs.get('pmut', .3)
    collect_statistics = kwargs.get('collect_statistics', False)

    no_improvement_count = 0

    population = initial_population(npop)

    for individual in population:
        individual['len_lags'], individual['rmse'] = evaluate(dataset, individual, **kwargs)

    # The reference for the first improvement check is the best individual
    # actually evaluated, not an arbitrary position of the random population
    best = min(population, key=lambda ind: ind['rmse'])

    for i in range(ngen):
        print("GENERATION {}".format(i))

        generation_statistics = {}

        # Selection. The winners are copied because mutation works in place:
        # without the copy the survivors of the current population (the
        # elite included) would be modified together with their offspring.
        new_population = []
        for j in range(int(npop / 2)):
            new_population.append(dict(double_tournament(population)))
            new_population.append(dict(double_tournament(population)))

        # Crossover
        offspring = [crossover(new_population) for j in range(int(npop * pcruz))]
        new_population.extend(offspring)

        # Mutation
        new_population = [mutation(individual, pmut) for individual in new_population]

        # Evaluation. One independent list per objective.
        _f1 = []
        _f2 = []
        for individual in new_population:
            f1, f2 = evaluate(dataset, individual, **kwargs)
            individual['len_lags'], individual['rmse'] = f1, f2
            if collect_statistics:
                _f1.append(f1)
                _f2.append(f2)

        if collect_statistics:
            generation_statistics['population'] = {'f1': np.nanmedian(_f1), 'f2': np.nanmedian(_f2)}

        # Elitism, then truncation back to the population size
        population = elitism(population, new_population)
        population = population[:npop]

        last_best = best
        best = population[0]

        if collect_statistics:
            generation_statistics['best'] = {'f1': best["len_lags"], 'f2': best["rmse"]}
            statistics.append(generation_statistics)

        if last_best['rmse'] <= best['rmse'] and last_best['len_lags'] <= best['len_lags']:
            no_improvement_count += 1
            # Stagnation: increase the mutation pressure
            pmut += .05
        else:
            no_improvement_count = 0
            pcruz = kwargs.get('pcruz', .5)
            pmut = kwargs.get('pmut', .3)

        if no_improvement_count >= mgen:
            break

    if collect_statistics:
        return best, statistics
    else:
        return best
def cluster_method(dataset, ngen, npop, pcruz, pmut, option=1): def cluster_method(dataset, **kwargs):
print(ngen, npop, pcruz, pmut, option) from pyFTS.hyperparam.Evolutionary import GeneticAlgorithm
from pyFTS.hyperparam.Evolutionary import genetico
inicio = time.time() inicio = time.time()
ret = genetico(dataset, ngen, npop, pcruz, pmut, option) ret = GeneticAlgorithm(dataset, **kwargs)
fim = time.time() fim = time.time()
ret['time'] = fim - inicio ret['time'] = fim - inicio
ret['size'] = ret['len_lags'] ret['size'] = ret['len_lags']
@ -297,16 +459,7 @@ def process_jobs(jobs, datasetname, conn):
if job.status == dispy.DispyJob.Finished and result is not None: if job.status == dispy.DispyJob.Finished and result is not None:
print("Processing result of {}".format(result)) print("Processing result of {}".format(result))
metrics = ['rmse', 'size', 'time'] log_result(conn, datasetname, result)
for metric in metrics:
record = (datasetname, 'Evolutive', 'WHOFTS', None, result['mf'],
result['order'], result['partitioner'], result['npart'],
result['alpha'], str(result['lags']), metric, result[metric])
print(record)
hUtil.insert_hyperparam(record, conn)
else: else:
@ -314,25 +467,47 @@ def process_jobs(jobs, datasetname, conn):
print(job.stdout) print(job.stdout)
def log_result(conn, datasetname, result):
    '''
    Store one optimization result in the hyperparameter database, one
    record for each of the metrics 'rmse', 'size' and 'time'.

    :param conn: the hyperparameter database connection
    :param datasetname: the name of the dataset, stored in every record
    :param result: the genotype with its metrics, a dictionary
    '''
    genes = (datasetname, 'Evolutive', 'WHOFTS', None, result['mf'],
             result['order'], result['partitioner'], result['npart'],
             result['alpha'], str(result['lags']))

    for metric in ('rmse', 'size', 'time'):
        record = genes + (metric, result[metric])
        print(record)
        hUtil.insert_hyperparam(record, conn)
def execute(datasetname, dataset, **kwargs):
    '''
    Run the hyperparameter optimization several times and log every result
    in the 'hyperparam.db' database.

    :param datasetname: the name of the dataset, stored with each result
    :param dataset: the dataset used by the optimization
    :param distributed: False (default) to run the experiments locally or
                        'dispy' to submit them to a dispy cluster
    :param experiments: the number of independent runs (default 30)
    :param nodes: list of dispy nodes, only used when distributed is 'dispy'
    :param kwargs: all keyword arguments are forwarded to cluster_method
    :return: for local execution the result of the last experiment (None if
             no experiment was run); for 'dispy' nothing, the results are
             only logged
    :raises ValueError: if distributed has an unsupported value
    '''
    conn = hUtil.open_hyperparam_db('hyperparam.db')

    distributed = kwargs.get('distributed', False)

    experiments = kwargs.get('experiments', 30)

    if not distributed:
        # Defined up front so that zero experiments returns None
        result = None
        for i in range(experiments):
            result = cluster_method(dataset, **kwargs)
            log_result(conn, datasetname, result)

        return result

    if distributed == 'dispy':
        nodes = kwargs.get('nodes', ['127.0.0.1'])

        cluster, http_server = Util.start_dispy_cluster(cluster_method, nodes=nodes)

        try:
            jobs = []

            for i in range(experiments):
                print("Experiment {}".format(i))
                jobs.append(cluster.submit(dataset, **kwargs))

            process_jobs(jobs, datasetname, conn)
        finally:
            # The cluster must be released even if a job submission fails
            Util.stop_dispy_cluster(cluster, http_server)

        return None

    raise ValueError(
        "Unsupported value for 'distributed': {!r} (expected False or 'dispy')".format(distributed))

View File

@ -90,7 +90,6 @@ class HighOrderFTS(fts.FTS):
self.is_high_order = True self.is_high_order = True
self.min_order = 1 self.min_order = 1
self.order= kwargs.get("order", self.min_order) self.order= kwargs.get("order", self.min_order)
self.lags = kwargs.get("lags", None)
self.configure_lags(**kwargs) self.configure_lags(**kwargs)
def configure_lags(self, **kwargs): def configure_lags(self, **kwargs):

View File

@ -19,7 +19,7 @@ class FLR(object):
self.RHS = set self.RHS = set
def __str__(self): def __str__(self):
return str([self.LHS[k] for k in self.LHS.keys()]) + " -> " + self.RHS return "{} -> {}".format([self.LHS[k] for k in self.LHS.keys()], self.RHS)

View File

@ -13,13 +13,6 @@ class ClusteredMVFTS(mvfts.MVFTS):
def __init__(self, **kwargs): def __init__(self, **kwargs):
super(ClusteredMVFTS, self).__init__(**kwargs) super(ClusteredMVFTS, self).__init__(**kwargs)
self.cluster_method = kwargs.get('cluster_method', grid.GridCluster)
"""The cluster method to be called when a new model is build"""
self.cluster_params = kwargs.get('cluster_params', {})
"""The cluster method parameters"""
self.cluster = kwargs.get('cluster', None)
"""The trained clusterer"""
self.fts_method = kwargs.get('fts_method', hofts.WeightedHighOrderFTS) self.fts_method = kwargs.get('fts_method', hofts.WeightedHighOrderFTS)
"""The FTS method to be called when a new model is build""" """The FTS method to be called when a new model is build"""
self.fts_params = kwargs.get('fts_params', {}) self.fts_params = kwargs.get('fts_params', {})
@ -30,6 +23,8 @@ class ClusteredMVFTS(mvfts.MVFTS):
self.is_high_order = True self.is_high_order = True
self.is_clustered = True
self.order = kwargs.get("order", 2) self.order = kwargs.get("order", 2)
self.lags = kwargs.get("lags", None) self.lags = kwargs.get("lags", None)
self.alpha_cut = kwargs.get('alpha_cut', 0.25) self.alpha_cut = kwargs.get('alpha_cut', 0.25)
@ -43,16 +38,13 @@ class ClusteredMVFTS(mvfts.MVFTS):
ndata = [] ndata = []
for index, row in data.iterrows(): for index, row in data.iterrows():
data_point = self.format_data(row) data_point = self.format_data(row)
ndata.append(common.fuzzyfy_instance_clustered(data_point, self.cluster, alpha_cut=self.alpha_cut)) ndata.append(common.fuzzyfy_instance_clustered(data_point, self.partitioner, alpha_cut=self.alpha_cut))
return ndata return ndata
def train(self, data, **kwargs): def train(self, data, **kwargs):
if self.cluster is None: self.model = self.fts_method(partitioner=self.partitioner, **self.fts_params)
self.cluster = self.cluster_method(data=data, mvfts=self, neighbors=self.knn, **self.cluster_params)
self.model = self.fts_method(partitioner=self.cluster, **self.fts_params)
if self.model.is_high_order: if self.model.is_high_order:
self.model.order = self.order self.model.order = self.order
@ -60,7 +52,7 @@ class ClusteredMVFTS(mvfts.MVFTS):
self.model.train(ndata, fuzzyfied=self.pre_fuzzyfy) self.model.train(ndata, fuzzyfied=self.pre_fuzzyfy)
self.cluster.prune() self.partitioner.prune()
def check_data(self, data): def check_data(self, data):
if self.pre_fuzzyfy: if self.pre_fuzzyfy:
@ -84,8 +76,8 @@ class ClusteredMVFTS(mvfts.MVFTS):
for var in self.explanatory_variables: for var in self.explanatory_variables:
if self.target_variable.name != var.name: if self.target_variable.name != var.name:
self.target_variable = var self.target_variable = var
self.cluster.change_target_variable(var) self.partitioner.change_target_variable(var)
self.model.partitioner = self.cluster self.model.partitioner = self.partitioner
self.model.reset_calculated_values() self.model.reset_calculated_values()
ret[var.name] = self.model.forecast(ndata, fuzzyfied=self.pre_fuzzyfy, **kwargs) ret[var.name] = self.model.forecast(ndata, fuzzyfied=self.pre_fuzzyfy, **kwargs)

View File

@ -7,12 +7,12 @@ class MultivariateFuzzySet(Composite.FuzzySet):
""" """
Multivariate Composite Fuzzy Set Multivariate Composite Fuzzy Set
""" """
def __init__(self, name, **kwargs): def __init__(self, **kwargs):
""" """
Create an empty composite fuzzy set Create an empty composite fuzzy set
:param name: fuzzy set name :param name: fuzzy set name
""" """
super(MultivariateFuzzySet, self).__init__(name) super(MultivariateFuzzySet, self).__init__("")
self.sets = {} self.sets = {}
self.target_variable = kwargs.get('target_variable',None) self.target_variable = kwargs.get('target_variable',None)
@ -28,10 +28,10 @@ class MultivariateFuzzySet(Composite.FuzzySet):
if variable == self.target_variable.name: if variable == self.target_variable.name:
self.centroid = set.centroid self.centroid = set.centroid
self.name += set.name
def set_target_variable(self, variable): def set_target_variable(self, variable):
#print(self.target_variable, variable)
self.target_variable = variable self.target_variable = variable
#print(self.centroid,self.sets[variable.name].centroid)
self.centroid = self.sets[variable.name].centroid self.centroid = self.sets[variable.name].centroid
def membership(self, x): def membership(self, x):
@ -42,7 +42,6 @@ class MultivariateFuzzySet(Composite.FuzzySet):
return np.nanmin(mv) return np.nanmin(mv)
def fuzzyfy_instance(data_point, var): def fuzzyfy_instance(data_point, var):
fsets = FuzzySet.fuzzyfy(data_point, var.partitioner, mode='sets', method='fuzzy', alpha_cut=var.alpha_cut) fsets = FuzzySet.fuzzyfy(data_point, var.partitioner, mode='sets', method='fuzzy', alpha_cut=var.alpha_cut)
return [(var.name, fs) for fs in fsets] return [(var.name, fs) for fs in fsets]

View File

@ -1,4 +1,4 @@
from pyFTS.partitioners import partitioner from pyFTS.models.multivariate import partitioner
from pyFTS.models.multivariate.common import MultivariateFuzzySet, fuzzyfy_instance_clustered from pyFTS.models.multivariate.common import MultivariateFuzzySet, fuzzyfy_instance_clustered
from itertools import product from itertools import product
from scipy.spatial import KDTree from scipy.spatial import KDTree
@ -6,106 +6,28 @@ import numpy as np
import pandas as pd import pandas as pd
class GridCluster(partitioner.MultivariatePartitioner):
    """
    A cartesian product of all fuzzy sets of all variables
    """

    def __init__(self, **kwargs):
        # MultivariatePartitioner.__init__ already calls self.build(), which
        # resolves to the build method below, so the sets and the index are
        # created exactly once and must not be rebuilt here.
        super(GridCluster, self).__init__(**kwargs)
        self.name = "GridCluster"

    def build(self, data):
        """
        Create one MultivariateFuzzySet for each combination of the fuzzy
        sets of the explanatory variables and index them.

        :param data: not used by this partitioner
        """
        fsets = [list(k.partitioner.sets.values())
                 for k in self.explanatory_variables]

        for combination in product(*fsets):
            mvfset = MultivariateFuzzySet(target_variable=self.target_variable)
            for fset in combination:
                mvfset.append_set(fset.variable, fset)

            self.sets[mvfset.name] = mvfset

        self.build_index()

View File

@ -12,8 +12,8 @@ class MVFTS(fts.FTS):
""" """
def __init__(self, **kwargs): def __init__(self, **kwargs):
super(MVFTS, self).__init__(**kwargs) super(MVFTS, self).__init__(**kwargs)
self.explanatory_variables = [] self.explanatory_variables = kwargs.get('explanatory_variables',[])
self.target_variable = None self.target_variable = kwargs.get('target_variable',None)
self.flrgs = {} self.flrgs = {}
self.is_multivariate = True self.is_multivariate = True
self.shortname = "MVFTS" self.shortname = "MVFTS"

View File

@ -0,0 +1,90 @@
from pyFTS.partitioners import partitioner
from pyFTS.models.multivariate.common import MultivariateFuzzySet, fuzzyfy_instance_clustered
from itertools import product
from scipy.spatial import KDTree
import numpy as np
import pandas as pd
class MultivariatePartitioner(partitioner.Partitioner):
    """
    Base class for partitioners which use the MultivariateFuzzySet
    """

    def __init__(self, **kwargs):
        super(MultivariatePartitioner, self).__init__(name="MultivariatePartitioner", preprocess=False, **kwargs)

        self.type = 'multivariate'
        self.sets = {}
        # KD-tree over the centroids of the fuzzy sets, created by build_index()
        self.kdtree = None
        # Maps the position of a point in the KD-tree to its fuzzy set name
        self.index = {}
        self.explanatory_variables = kwargs.get('explanatory_variables', [])
        self.target_variable = kwargs.get('target_variable', None)
        # Number of nearest fuzzy sets returned by knn()
        self.neighbors = kwargs.get('neighbors', 2)
        # When True, knn() records the sets it returns and prune() removes
        # the sets that were never returned
        self.optimize = kwargs.get('optimize', True)
        if self.optimize:
            self.count = {}
        data = kwargs.get('data', None)
        self.build(data)

    def build(self, data):
        """
        Create the fuzzy sets. Does nothing here, subclasses override it.

        :param data: the training data, if any
        """
        pass

    def append(self, fset):
        """
        Add a fuzzy set to the partitioner, indexed by its name.

        :param fset: the fuzzy set to be added
        """
        self.sets[fset.name] = fset

    def prune(self):
        """
        Remove the fuzzy sets never returned by knn() and rebuild the index.
        Does nothing when the optimize flag is off.
        """
        if not self.optimize:
            return

        # Iterate over a snapshot of the names, the dictionary shrinks in the loop
        for name in list(self.sets):
            if name not in self.count:
                del self.sets[name]

        self.build_index()

    def knn(self, data):
        """
        Find the fuzzy sets whose centroids are the nearest to a data point.

        :param data: the data point, indexable by the explanatory variable names
        :return: a list with the names of at most self.neighbors fuzzy sets
        """
        point = [data[k.name]
                 for k in self.explanatory_variables]
        _, ix = self.kdtree.query(point, self.neighbors)

        # A single neighbor is returned as a scalar
        if not isinstance(ix, (list, np.ndarray)):
            ix = [ix]

        # When there are fewer sets than neighbors requested, the KD-tree
        # marks the missing ones with an out of range index: skip them
        names = [self.index[k] for k in ix if k in self.index]

        if self.optimize:
            for name in names:
                self.count[name] = 1

        return names

    def fuzzyfy(self, data, **kwargs):
        """
        Fuzzyfy a data point by delegating to fuzzyfy_instance_clustered.

        :param data: the data point
        :param kwargs: forwarded to fuzzyfy_instance_clustered
        """
        return fuzzyfy_instance_clustered(data, self, **kwargs)

    def change_target_variable(self, variable):
        """
        Set a new target variable on every fuzzy set.

        :param variable: the new target variable
        """
        for fset in self.sets.values():
            fset.set_target_variable(variable)

    def build_index(self):
        """
        Rebuild the KD-tree and the position to name index from the
        centroids of the current fuzzy sets.
        """
        midpoints = []

        self.index = {}

        for ct, fset in enumerate(self.sets.values()):
            mp = [fset.sets[vr.name].centroid for vr in self.explanatory_variables]
            midpoints.append(mp)
            self.index[ct] = fset.name

        import sys

        # The recursion limit is raised only while the tree is built and the
        # caller's own limit is restored afterwards, even on failure
        previous_limit = sys.getrecursionlimit()
        sys.setrecursionlimit(max(previous_limit, 100000))
        try:
            self.kdtree = KDTree(midpoints)
        finally:
            sys.setrecursionlimit(previous_limit)

View File

@ -1,3 +1,4 @@
import pandas as pd
from pyFTS.common import fts, FuzzySet, FLR, Membership, tree from pyFTS.common import fts, FuzzySet, FLR, Membership, tree
from pyFTS.partitioners import Grid from pyFTS.partitioners import Grid
from pyFTS.models.multivariate import FLR as MVFLR from pyFTS.models.multivariate import FLR as MVFLR
@ -24,6 +25,10 @@ class Variable:
self.data_label = kwargs.get('data_label', self.name) self.data_label = kwargs.get('data_label', self.name)
"""A string with the column name on DataFrame""" """A string with the column name on DataFrame"""
self.type = kwargs.get('type', 'common') self.type = kwargs.get('type', 'common')
self.data_type = kwargs.get('data_type', None)
"""The type of the data column on Pandas Dataframe"""
self.mask = kwargs.get('mask', None)
"""The mask for format the data column on Pandas Dataframe"""
self.transformation = kwargs.get('transformation', None) self.transformation = kwargs.get('transformation', None)
self.transformation_params = kwargs.get('transformation_params', None) self.transformation_params = kwargs.get('transformation_params', None)
self.partitioner = None self.partitioner = None

View File

@ -20,11 +20,12 @@ class WeightedFLRG(mvflrg.FLRG):
self.w = None self.w = None
def append_rhs(self, fset, **kwargs):
    """
    Accumulate the occurrences of a fuzzy set on the right hand side.

    :param fset: the fuzzy set key on self.RHS
    :param count: the amount to be added, 1.0 by default
    """
    amount = kwargs.get('count', 1.0)
    if fset in self.RHS:
        self.RHS[fset] += amount
    else:
        self.RHS[fset] = amount
    self.count += amount
def weights(self): def weights(self):
if self.w is None: if self.w is None:
@ -51,10 +52,6 @@ class WeightedMVFTS(mvfts.MVFTS):
""" """
def __init__(self, **kwargs): def __init__(self, **kwargs):
super(WeightedMVFTS, self).__init__(order=1, **kwargs) super(WeightedMVFTS, self).__init__(order=1, **kwargs)
self.explanatory_variables = []
self.target_variable = None
self.flrgs = {}
self.is_multivariate = True
self.shortname = "WeightedMVFTS" self.shortname = "WeightedMVFTS"
self.name = "Weighted Multivariate FTS" self.name = "Weighted Multivariate FTS"

View File

@ -21,6 +21,15 @@ class SimplePartitioner(partitioner.Partitioner):
self.partitions = 0 self.partitions = 0
def append_complex(self, fs):
    """
    Register an already built fuzzy set and refresh the ordering of the
    sets and the bounds of the partitioner.

    :param fs: the fuzzy set, stored under fs.name
    """
    self.sets[fs.name] = fs
    self.partitions += 1

    # Set names ordered by the centroid of each set
    by_centroid = sorted(self.sets, key=lambda name: self.sets[name].centroid)
    self.ordered_sets = by_centroid

    # The bounds come from the first and the last set of that ordering
    self.min = self.sets[by_centroid[0]].lower
    self.max = self.sets[by_centroid[-1]].upper
def append(self, name, mf, parameters, **kwargs): def append(self, name, mf, parameters, **kwargs):
""" """
Append a new partition (fuzzy set) to the partitioner Append a new partition (fuzzy set) to the partitioner
@ -39,7 +48,7 @@ class SimplePartitioner(partitioner.Partitioner):
if mf is None or mf not in (Membership.trimf, Membership.gaussmf, if mf is None or mf not in (Membership.trimf, Membership.gaussmf,
Membership.trapmf, Membership.singleton, Membership.trapmf, Membership.singleton,
Membership.sigmf): Membership.sigmf):
raise ValueError("The mf parameter should be one of pyFTS.common.Membership functions") raise ValueError("The mf parameter should be one of pyFTS.common.Membership functions, not {}".format(mf))
if mf == Membership.trimf: if mf == Membership.trimf:
if len(parameters) != 3: if len(parameters) != 3:

View File

@ -2,14 +2,14 @@ import numpy as np
from pyFTS.hyperparam import GridSearch, Evolutionary from pyFTS.hyperparam import GridSearch, Evolutionary
def get_dataset(): def get_dataset():
#from pyFTS.data import SONDA
from pyFTS.data import Malaysia from pyFTS.data import Malaysia
ds = Malaysia.get_data('temperature')[:1000] #data = SONDA.get_data('temperature')[:1000]
# ds = pd.read_csv('Malaysia.csv',delimiter=',' )[['temperature']].values[:2000].flatten().tolist() data = Malaysia.get_data('temperature')[:1000]
#train = ds[:800]
#test = ds[800:]
return 'Malaysia.temperature', ds #train, test #return 'SONDA.glo_avg', data #train, test
return 'Malaysia.temperature', data #train, test
""" """
hyperparams = { hyperparams = {
@ -39,4 +39,28 @@ datsetname, dataset = get_dataset()
#Evolutionary.cluster_method(dataset, 70, 20, .8, .3, 1) #Evolutionary.cluster_method(dataset, 70, 20, .8, .3, 1)
Evolutionary.execute(datsetname, dataset, nodes=nodes, ngen=50, npop=30, ) '''
from pyFTS.models import hofts
from pyFTS.partitioners import Grid
from pyFTS.benchmarks import Measures
fs = Grid.GridPartitioner(data=dataset[:800], npart=30)
model = hofts.WeightedHighOrderFTS(partitioner=fs, order=2)
model.fit(dataset[:800])
model.predict(dataset[800:1000])
Measures.get_point_statistics(dataset[800:1000], model)
print(model)
'''
ret = Evolutionary.execute(datsetname, dataset,
ngen=30, npop=20, pcruz=.5, pmut=.3,
window_size=800, experiments=30)
#parameters={'distributed': 'spark', 'url': 'spark://192.168.0.106:7077'})
print(ret)
#'''

View File

@ -28,8 +28,6 @@ test_uv = dataset['value'].values[24505:]
train_mv = dataset.iloc[:24505] train_mv = dataset.iloc[:24505]
test_mv = dataset.iloc[24505:] test_mv = dataset.iloc[24505:]
print(train_mv)
sp = {'seasonality': DateTime.minute_of_day, 'names': [str(k)+'hs' for k in range(0,24)]} sp = {'seasonality': DateTime.minute_of_day, 'names': [str(k)+'hs' for k in range(0,24)]}
vhour = variable.Variable("Hour", data_label="date", partitioner=seasonal.TimeGridPartitioner, npart=24, vhour = variable.Variable("Hour", data_label="date", partitioner=seasonal.TimeGridPartitioner, npart=24,
@ -46,21 +44,16 @@ parameters = [
{'order':2, 'knn': 3}, {'order':2, 'knn': 3},
] ]
for ct, method in enumerate([mvfts.MVFTS, wmvfts.WeightedMVFTS, #for ct, method in enumerate([, wmvfts.WeightedMVFTS,
cmvfts.ClusteredMVFTS,cmvfts.ClusteredMVFTS,cmvfts.ClusteredMVFTS]): # cmvfts.ClusteredMVFTS,cmvfts.ClusteredMVFTS,cmvfts.ClusteredMVFTS]):
print(method) model = mvfts.MVFTS()
model = method(**parameters[ct])
model.shortname += str(ct)
model.append_variable(vhour)
model.append_variable(vvalue)
model.target_variable = vvalue
model.fit(train_mv)
Util.persist_obj(model, model.shortname) model.append_variable(vhour)
model.append_variable(vvalue)
model.target_variable = vvalue
model.fit(train_mv)
forecasts = model.predict(test_mv.iloc[:100]) print(model)
print(model)

View File

@ -1,7 +1,8 @@
import numpy as np import numpy as np
import pandas as pd import pandas as pd
import time
from pyFTS.data import Enrollments, TAIEX from pyFTS.data import Enrollments, TAIEX, SONDA
from pyFTS.partitioners import Grid, Simple from pyFTS.partitioners import Grid, Simple
from pyFTS.models import hofts from pyFTS.models import hofts
@ -12,20 +13,51 @@ import os
# make sure pyspark tells workers to use python3 not 2 if both are installed # make sure pyspark tells workers to use python3 not 2 if both are installed
os.environ['PYSPARK_PYTHON'] = '/usr/bin/python3' os.environ['PYSPARK_PYTHON'] = '/usr/bin/python3'
os.environ['PYSPARK_DRIVER_PYTHON'] = '/usr/bin/python3' os.environ['PYSPARK_DRIVER_PYTHON'] = '/usr/bin/python3'
#'''
data = TAIEX.get_data() data = SONDA.get_data('glo_avg')
fs = Grid.GridPartitioner(data=data, npart=50) fs = Grid.GridPartitioner(data=data, npart=50)
model = hofts.WeightedHighOrderFTS(partitioner=fs, order=2) model = hofts.WeightedHighOrderFTS(partitioner=fs, order=2)
model.fit(data, distributed='spark', url='spark://192.168.0.110:7077') _s1 = time.time()
model.fit(data, distributed='spark', url='spark://192.168.0.106:7077')
_s2 = time.time()
print(_s2-_s1)
#model.fit(data, distributed='dispy', nodes=['192.168.0.110']) #model.fit(data, distributed='dispy', nodes=['192.168.0.110'])
'''
from pyFTS.models.multivariate import common, variable, mvfts, wmvfts, cmvfts, grid
from pyFTS.models.seasonal import partitioner as seasonal
from pyFTS.models.seasonal.common import DateTime
dataset = pd.read_csv('/home/petronio/Downloads/kalang.csv', sep=',')
dataset['date'] = pd.to_datetime(dataset["date"], format='%Y-%m-%d %H:%M:%S')
train_mv = dataset.iloc[:24505]
test_mv = dataset.iloc[24505:]
sp = {'seasonality': DateTime.minute_of_day, 'names': [str(k)+'hs' for k in range(0,24)]}
vhour = variable.Variable("Hour", data_label="date", partitioner=seasonal.TimeGridPartitioner, npart=24,
data=train_mv, partitioner_specific=sp, data_type=pd.datetime, mask='%Y-%m-%d %H:%M:%S')
vvalue = variable.Variable("Pollution", data_label="value", alias='value',
partitioner=Grid.GridPartitioner, npart=35, data_type=np.float64,
data=train_mv)
fs = grid.GridCluster(explanatory_variables=[vhour, vvalue], target_variable=vvalue)
#model = wmvfts.WeightedMVFTS(explanatory_variables=[vhour, vvalue], target_variable=vvalue)
model = cmvfts.ClusteredMVFTS(explanatory_variables=[vhour, vvalue], target_variable=vvalue,
partitioner=fs)
model.fit(train_mv, distributed='spark', url='spark://192.168.0.106:7077')
#'''
print(model) print(model)
''' '''
def fun(x): def fun(x):
return (x, x % 2) return (x, x % 2)