Spark Distributed; Hyperparameter optimization

This commit is contained in:
Petrônio Cândido 2019-01-18 09:06:53 -02:00
parent 87686e5ff0
commit 2e1d7fa11a
20 changed files with 904 additions and 301 deletions

View File

@ -0,0 +1,178 @@
<!doctype html>
<html xmlns="http://www.w3.org/1999/xhtml">
<head>
<meta http-equiv="X-UA-Compatible" content="IE=Edge" />
<meta http-equiv="Content-Type" content="text/html; charset=utf-8" /><script type="text/javascript">
var _gaq = _gaq || [];
_gaq.push(['_setAccount', 'UA-55120145-3']);
_gaq.push(['_trackPageview']);
(function() {
var ga = document.createElement('script'); ga.type = 'text/javascript'; ga.async = true;
ga.src = ('https:' == document.location.protocol ? 'https://ssl' : 'http://www') + '.google-analytics.com/ga.js';
var s = document.getElementsByTagName('script')[0]; s.parentNode.insertBefore(ga, s);
})();
</script>
<title>pyFTS.partitioners.Simple &#8212; pyFTS 1.4 documentation</title>
<link rel="stylesheet" href="../../../_static/bizstyle.css" type="text/css" />
<link rel="stylesheet" href="../../../_static/pygments.css" type="text/css" />
<script type="text/javascript" src="../../../_static/documentation_options.js"></script>
<script type="text/javascript" src="../../../_static/jquery.js"></script>
<script type="text/javascript" src="../../../_static/underscore.js"></script>
<script type="text/javascript" src="../../../_static/doctools.js"></script>
<script type="text/javascript" src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.1/MathJax.js?config=TeX-AMS-MML_HTMLorMML"></script>
<script type="text/javascript" src="../../../_static/bizstyle.js"></script>
<link rel="index" title="Index" href="../../../genindex.html" />
<link rel="search" title="Search" href="../../../search.html" />
<meta name="viewport" content="width=device-width,initial-scale=1.0">
<!--[if lt IE 9]>
<script type="text/javascript" src="_static/css3-mediaqueries.js"></script>
<![endif]-->
</head><body>
<div class="related" role="navigation" aria-label="related navigation">
<h3>Navigation</h3>
<ul>
<li class="right" style="margin-right: 10px">
<a href="../../../genindex.html" title="General Index"
accesskey="I">index</a></li>
<li class="right" >
<a href="../../../py-modindex.html" title="Python Module Index"
>modules</a> |</li>
<li class="nav-item nav-item-0"><a href="../../../index.html">pyFTS 1.4 documentation</a> &#187;</li>
<li class="nav-item nav-item-1"><a href="../../index.html" accesskey="U">Module code</a> &#187;</li>
</ul>
</div>
<div class="sphinxsidebar" role="navigation" aria-label="main navigation">
<div class="sphinxsidebarwrapper">
<p class="logo"><a href="../../../index.html">
<img class="logo" src="../../../_static/logo_heading2.png" alt="Logo"/>
</a></p>
<div id="searchbox" style="display: none" role="search">
<h3>Quick search</h3>
<div class="searchformwrapper">
<form class="search" action="../../../search.html" method="get">
<input type="text" name="q" />
<input type="submit" value="Go" />
<input type="hidden" name="check_keywords" value="yes" />
<input type="hidden" name="area" value="default" />
</form>
</div>
</div>
<script type="text/javascript">$('#searchbox').show(0);</script>
</div>
</div>
<div class="document">
<div class="documentwrapper">
<div class="bodywrapper">
<div class="body" role="main">
<h1>Source code for pyFTS.partitioners.Simple</h1><div class="highlight"><pre>
<span></span><span class="sd">&quot;&quot;&quot;Simple Partitioner for manually informed fuzzy sets&quot;&quot;&quot;</span>
<span class="kn">import</span> <span class="nn">numpy</span> <span class="k">as</span> <span class="nn">np</span>
<span class="kn">import</span> <span class="nn">math</span>
<span class="kn">import</span> <span class="nn">random</span> <span class="k">as</span> <span class="nn">rnd</span>
<span class="kn">import</span> <span class="nn">functools</span><span class="o">,</span> <span class="nn">operator</span>
<span class="kn">from</span> <span class="nn">pyFTS.common</span> <span class="k">import</span> <span class="n">FuzzySet</span><span class="p">,</span> <span class="n">Membership</span>
<span class="kn">from</span> <span class="nn">pyFTS.partitioners</span> <span class="k">import</span> <span class="n">partitioner</span>
<div class="viewcode-block" id="SimplePartitioner"><a class="viewcode-back" href="../../../pyFTS.partitioners.html#pyFTS.partitioners.Simple.SimplePartitioner">[docs]</a><span class="k">class</span> <span class="nc">SimplePartitioner</span><span class="p">(</span><span class="n">partitioner</span><span class="o">.</span><span class="n">Partitioner</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;Simple Partitioner for manually informed fuzzy sets&quot;&quot;&quot;</span>
<span class="k">def</span> <span class="nf">__init__</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="o">**</span><span class="n">kwargs</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Simple Partitioner - the fuzzy sets are informed manually</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">kwargs</span><span class="p">[</span><span class="s1">&#39;preprocess&#39;</span><span class="p">]</span> <span class="o">=</span> <span class="kc">False</span>
<span class="nb">super</span><span class="p">(</span><span class="n">SimplePartitioner</span><span class="p">,</span> <span class="bp">self</span><span class="p">)</span><span class="o">.</span><span class="fm">__init__</span><span class="p">(</span><span class="n">name</span><span class="o">=</span><span class="s2">&quot;Simple&quot;</span><span class="p">,</span> <span class="o">**</span><span class="n">kwargs</span><span class="p">)</span>
<span class="bp">self</span><span class="o">.</span><span class="n">partitions</span> <span class="o">=</span> <span class="mi">0</span>
<div class="viewcode-block" id="SimplePartitioner.append"><a class="viewcode-back" href="../../../pyFTS.partitioners.html#pyFTS.partitioners.Simple.SimplePartitioner.append">[docs]</a> <span class="k">def</span> <span class="nf">append</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">name</span><span class="p">,</span> <span class="n">mf</span><span class="p">,</span> <span class="n">parameters</span><span class="p">,</span> <span class="o">**</span><span class="n">kwargs</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Append a new partition (fuzzy set) to the partitioner</span>
<span class="sd"> :param name: Fuzzy set name</span>
<span class="sd"> :param mf: One of the pyFTS.common.Membership functions</span>
<span class="sd"> :param parameters: A list with the parameters for the membership function</span>
<span class="sd"> :param kwargs: Optional arguments for the fuzzy set</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">if</span> <span class="n">name</span> <span class="ow">is</span> <span class="kc">None</span> <span class="ow">or</span> <span class="nb">len</span><span class="p">(</span><span class="n">name</span><span class="p">)</span> <span class="o">==</span> <span class="mi">0</span><span class="p">:</span>
<span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span><span class="s2">&quot;The name of the fuzzy set cannot be empty&quot;</span><span class="p">)</span>
<span class="k">if</span> <span class="n">name</span> <span class="ow">in</span> <span class="bp">self</span><span class="o">.</span><span class="n">sets</span><span class="p">:</span>
<span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span><span class="s2">&quot;This name has already been used&quot;</span><span class="p">)</span>
<span class="k">if</span> <span class="n">mf</span> <span class="ow">is</span> <span class="kc">None</span> <span class="ow">or</span> <span class="n">mf</span> <span class="ow">not</span> <span class="ow">in</span> <span class="p">(</span><span class="n">Membership</span><span class="o">.</span><span class="n">trimf</span><span class="p">,</span> <span class="n">Membership</span><span class="o">.</span><span class="n">gaussmf</span><span class="p">,</span>
<span class="n">Membership</span><span class="o">.</span><span class="n">trapmf</span><span class="p">,</span> <span class="n">Membership</span><span class="o">.</span><span class="n">singleton</span><span class="p">,</span>
<span class="n">Membership</span><span class="o">.</span><span class="n">sigmf</span><span class="p">):</span>
<span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span><span class="s2">&quot;The mf parameter should be one of pyFTS.common.Membership functions&quot;</span><span class="p">)</span>
<span class="k">if</span> <span class="n">mf</span> <span class="o">==</span> <span class="n">Membership</span><span class="o">.</span><span class="n">trimf</span><span class="p">:</span>
<span class="k">if</span> <span class="nb">len</span><span class="p">(</span><span class="n">parameters</span><span class="p">)</span> <span class="o">!=</span> <span class="mi">3</span><span class="p">:</span>
<span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span><span class="s2">&quot;Incorrect number of parameters for the Membership.trimf&quot;</span><span class="p">)</span>
<span class="n">centroid</span> <span class="o">=</span> <span class="n">parameters</span><span class="p">[</span><span class="mi">1</span><span class="p">]</span>
<span class="k">elif</span> <span class="n">mf</span> <span class="o">==</span> <span class="n">Membership</span><span class="o">.</span><span class="n">gaussmf</span><span class="p">:</span>
<span class="k">if</span> <span class="nb">len</span><span class="p">(</span><span class="n">parameters</span><span class="p">)</span> <span class="o">!=</span> <span class="mi">2</span><span class="p">:</span>
<span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span><span class="s2">&quot;Incorrect number of parameters for the Membership.gaussmf&quot;</span><span class="p">)</span>
<span class="n">centroid</span> <span class="o">=</span> <span class="n">parameters</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span>
<span class="k">elif</span> <span class="n">mf</span> <span class="o">==</span> <span class="n">Membership</span><span class="o">.</span><span class="n">trapmf</span><span class="p">:</span>
<span class="k">if</span> <span class="nb">len</span><span class="p">(</span><span class="n">parameters</span><span class="p">)</span> <span class="o">!=</span> <span class="mi">4</span><span class="p">:</span>
<span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span><span class="s2">&quot;Incorrect number of parameters for the Membership.trapmf&quot;</span><span class="p">)</span>
<span class="n">centroid</span> <span class="o">=</span> <span class="p">(</span><span class="n">parameters</span><span class="p">[</span><span class="mi">1</span><span class="p">]</span><span class="o">+</span><span class="n">parameters</span><span class="p">[</span><span class="mi">2</span><span class="p">])</span><span class="o">/</span><span class="mi">2</span>
<span class="k">elif</span> <span class="n">mf</span> <span class="o">==</span> <span class="n">Membership</span><span class="o">.</span><span class="n">singleton</span><span class="p">:</span>
<span class="k">if</span> <span class="nb">len</span><span class="p">(</span><span class="n">parameters</span><span class="p">)</span> <span class="o">!=</span> <span class="mi">1</span><span class="p">:</span>
<span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span><span class="s2">&quot;Incorrect number of parameters for the Membership.singleton&quot;</span><span class="p">)</span>
<span class="n">centroid</span> <span class="o">=</span> <span class="n">parameters</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span>
<span class="k">elif</span> <span class="n">mf</span> <span class="o">==</span> <span class="n">Membership</span><span class="o">.</span><span class="n">sigmf</span><span class="p">:</span>
<span class="k">if</span> <span class="nb">len</span><span class="p">(</span><span class="n">parameters</span><span class="p">)</span> <span class="o">!=</span> <span class="mi">2</span><span class="p">:</span>
<span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span><span class="s2">&quot;Incorrect number of parameters for the Membership.sigmf&quot;</span><span class="p">)</span>
<span class="n">centroid</span> <span class="o">=</span> <span class="n">parameters</span><span class="p">[</span><span class="mi">1</span><span class="p">]</span> <span class="o">+</span> <span class="p">(</span><span class="n">parameters</span><span class="p">[</span><span class="mi">1</span><span class="p">]</span> <span class="o">/</span> <span class="p">(</span><span class="mi">2</span> <span class="o">*</span> <span class="n">parameters</span><span class="p">[</span><span class="mi">0</span><span class="p">]))</span>
<span class="bp">self</span><span class="o">.</span><span class="n">sets</span><span class="p">[</span><span class="n">name</span><span class="p">]</span> <span class="o">=</span> <span class="n">FuzzySet</span><span class="o">.</span><span class="n">FuzzySet</span><span class="p">(</span><span class="n">name</span><span class="p">,</span> <span class="n">mf</span><span class="p">,</span> <span class="n">parameters</span><span class="p">,</span> <span class="n">centroid</span><span class="p">,</span> <span class="o">**</span><span class="n">kwargs</span><span class="p">)</span>
<span class="bp">self</span><span class="o">.</span><span class="n">partitions</span> <span class="o">+=</span> <span class="mi">1</span>
<span class="bp">self</span><span class="o">.</span><span class="n">ordered_sets</span> <span class="o">=</span> <span class="p">[</span><span class="n">key</span> <span class="k">for</span> <span class="n">key</span> <span class="ow">in</span> <span class="nb">sorted</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">sets</span><span class="o">.</span><span class="n">keys</span><span class="p">(),</span> <span class="n">key</span><span class="o">=</span><span class="k">lambda</span> <span class="n">k</span><span class="p">:</span> <span class="bp">self</span><span class="o">.</span><span class="n">sets</span><span class="p">[</span><span class="n">k</span><span class="p">]</span><span class="o">.</span><span class="n">centroid</span><span class="p">)]</span>
<span class="bp">self</span><span class="o">.</span><span class="n">min</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">sets</span><span class="p">[</span><span class="bp">self</span><span class="o">.</span><span class="n">ordered_sets</span><span class="p">[</span><span class="mi">0</span><span class="p">]]</span><span class="o">.</span><span class="n">lower</span>
<span class="bp">self</span><span class="o">.</span><span class="n">max</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">sets</span><span class="p">[</span><span class="bp">self</span><span class="o">.</span><span class="n">ordered_sets</span><span class="p">[</span><span class="o">-</span><span class="mi">1</span><span class="p">]]</span><span class="o">.</span><span class="n">upper</span></div></div>
</pre></div>
</div>
</div>
</div>
<div class="clearer"></div>
</div>
<div class="related" role="navigation" aria-label="related navigation">
<h3>Navigation</h3>
<ul>
<li class="right" style="margin-right: 10px">
<a href="../../../genindex.html" title="General Index"
>index</a></li>
<li class="right" >
<a href="../../../py-modindex.html" title="Python Module Index"
>modules</a> |</li>
<li class="nav-item nav-item-0"><a href="../../../index.html">pyFTS 1.4 documentation</a> &#187;</li>
<li class="nav-item nav-item-1"><a href="../../index.html" >Module code</a> &#187;</li>
</ul>
</div>
<div class="footer" role="contentinfo">
&#169; Copyright 2018, Machine Intelligence and Data Science Laboratory - UFMG - Brazil.
Created using <a href="http://sphinx-doc.org/">Sphinx</a> 1.7.2.
</div>
</body>
</html>

View File

@ -26,6 +26,10 @@ class FuzzySet(FuzzySet.FuzzySet):
self.mf = []
self.parameters = []
self.lower = None
self.upper = None
self.centroid = None
def membership(self, x):
"""
@ -62,3 +66,13 @@ class FuzzySet(FuzzySet.FuzzySet):
:return:
"""
self.sets.append(set)
if self.lower is None or self.lower > set.lower:
self.lower = set.lower
if self.upper is None or self.upper < set.upper:
self.upper = set.upper
if self.centroid is None or self.centroid < set.centroid:
self.centroid = set.centroid

View File

@ -125,6 +125,7 @@ def fuzzyfy(data, partitioner, **kwargs):
:keyword method: the fuzzyfication method (fuzzy: all fuzzy memberships, maximum: only the maximum membership)
:keyword mode: the fuzzyfication mode (sets: return the fuzzy sets names, vector: return a vector with the membership
values for all fuzzy sets, both: return a list with tuples (fuzzy set, membership value) )
:returns: a list with the fuzzyfied values, depending on the mode
"""
alpha_cut = kwargs.get('alpha_cut', 0.)

View File

@ -74,7 +74,7 @@ def sigmf(x, parameters):
:param x:
:param parameters: a list with 2 real values (smoothness and midpoint)
:return:
:return: the sigmoid membership value of x
"""
return 1 / (1 + math.exp(-parameters[0] * (x - parameters[1])))

View File

@ -38,6 +38,9 @@ class FTS(object):
"""A boolean value indicating if the model supports probabilistic forecasting, default: False"""
self.is_multivariate = False
"""A boolean value indicating if the model supports multivariate time series (Pandas DataFrame), default: False"""
self.is_clustered = False
"""A boolean value indicating if the model supports multivariate time series (Pandas DataFrame), but works like
a monovariate method, default: False"""
self.dump = False
self.transformations = []
"""A list with the data transformations (common.Transformations) applied on model pre and post processing, default: []"""
@ -61,6 +64,8 @@ class FTS(object):
"""Flag indicating if the test data will be clipped inside the training Universe of Discourse"""
self.alpha_cut = kwargs.get("alpha_cut", 0.0)
"""A float with the minimal membership to be considered on fuzzyfication process"""
self.lags = kwargs.get("lags", None)
"""The list of lag indexes for high order models"""
self.max_lag = self.order
"""An integer indicating the largest lag used by the model. This value also indicates the minimum number of past lags
needed to forecast a single step ahead"""

View File

@ -3,6 +3,7 @@ import pandas as pd
from pyFTS.data import Enrollments, TAIEX
from pyFTS.partitioners import Grid, Simple
from pyFTS.models.multivariate import partitioner as mv_partitioner
from pyFTS.models import hofts
from pyspark import SparkConf
@ -10,44 +11,141 @@ from pyspark import SparkContext
import os
# make sure pyspark tells workers to use python3 not 2 if both are installed
SPARK_ADDR = 'spark://192.168.0.110:7077'
os.environ['PYSPARK_PYTHON'] = '/usr/bin/python3'
os.environ['PYSPARK_DRIVER_PYTHON'] = '/usr/bin/python3'
def get_partitioner(shared_partitioner):
def get_partitioner(shared_partitioner, type='common', variables=[]):
"""
:param part:
:return:
"""
if type=='common':
fs_tmp = Simple.SimplePartitioner()
for fset in shared_partitioner.value.keys():
fz = shared_partitioner.value[fset]
fs_tmp.append(fset, fz.mf, fz.parameters)
if type=='common':
fs_tmp.append_complex(fz)
elif type == 'multivariate':
fs_tmp.append(fz)
return fs_tmp
def slave_train(data, shared_method, shared_partitioner, shared_order):
def get_clustered_partitioner(explanatory_variables, target_variable, **parameters):
    """
    Rebuild a multivariate partitioner from the broadcast fuzzy set definitions.

    :param explanatory_variables: forwarded to MultivariatePartitioner
    :param target_variable: forwarded to MultivariatePartitioner and to every MultivariateFuzzySet
    :param parameters: broadcast variables; reads 'partitioner_names' and one
        'partitioner_<name>' entry (an iterable of (variable, fuzzy set) pairs) per listed name
    :return: the rebuilt MultivariatePartitioner, after build_index() was called on it
    """
    from pyFTS.models.multivariate.common import MultivariateFuzzySet

    rebuilt = mv_partitioner.MultivariatePartitioner(
        explanatory_variables=explanatory_variables,
        target_variable=target_variable)

    for set_name in parameters['partitioner_names'].value:
        composite = MultivariateFuzzySet(target_variable=target_variable)
        members = parameters['partitioner_{}'.format(set_name)].value
        for var, fset in members:
            composite.append_set(var, fset)
        rebuilt.append(composite)

    rebuilt.build_index()
    return rebuilt
def get_variables(**parameters):
    """
    Recreate the model variables from the broadcast parameters.

    For every name in parameters['variables'] a Variable is built from the
    '<name>_type', '<name>_label' and '<name>_alpha' entries, and its partitioner
    is rebuilt from '<name>_partitioner' and '<name>_partitioner_type'.

    :param parameters: broadcast variables, each exposing its payload through .value
    :return: a tuple (explanatory_variables, target_variable); target_variable stays
        None when no variable name equals parameters['target']
    """
    explanatory_variables = []
    target_variable = None

    for name in parameters['variables'].value:
        from pyFTS.models.multivariate import common, variable

        # NOTE: data_type and mask are not forwarded to the rebuilt variable
        settings = dict(
            type=parameters['{}_type'.format(name)].value,
            data_label=parameters['{}_label'.format(name)].value,
            alpha_cut=parameters['{}_alpha'.format(name)].value,
        )
        current = variable.Variable(name, **settings)

        current.partitioner = get_partitioner(parameters['{}_partitioner'.format(name)])
        current.partitioner.type = parameters['{}_partitioner_type'.format(name)].value

        explanatory_variables.append(current)
        if current.name == parameters['target'].value:
            target_variable = current

    return (explanatory_variables, target_variable)
def slave_train_univariate(data, **parameters):
"""
:param data:
:return:
"""
model = shared_method.value(partitioner=get_partitioner(shared_partitioner),
order=shared_order.value)
if parameters['type'].value == 'common':
if parameters['order'].value > 1:
model = parameters['method'].value(partitioner=get_partitioner(parameters['partitioner']),
order=parameters['order'].value, alpha_cut=parameters['alpha_cut'].value,
lags=parameters['lags'].value)
else:
model = parameters['method'].value(partitioner=get_partitioner(parameters['partitioner']),
alpha_cut=parameters['alpha_cut'].value)
ndata = [k for k in data]
else:
pass
model.train(ndata)
return [(k, model.flrgs[k]) for k in model.flrgs]
return [(k, model.flrgs[k]) for k in model.flrgs.keys()]
def distributed_train(model, data, url='spark://192.168.0.110:7077', app='pyFTS'):
def slave_train_multivariate(data, **parameters):
    """
    Train one multivariate model on a single data partition.

    :param data: iterable of records for this partition
    :param parameters: broadcast variables; reads 'type', 'method', 'order',
        'alpha_cut', 'lags' and 'columns', plus whatever get_variables and
        get_clustered_partitioner read
    :return: for the 'clustered' type, [('counts', <partitioner count items>),
        ('flrgs', <flrgs items>)]; otherwise the list of (key, flrg) pairs
    """
    explanatory_variables, target_variable = get_variables(**parameters)

    is_clustered = parameters['type'].value == 'clustered'

    model_args = dict(explanatory_variables=explanatory_variables,
                      target_variable=target_variable)

    if is_clustered:
        model_args['partitioner'] = get_clustered_partitioner(
            explanatory_variables, target_variable, **parameters)

    if is_clustered or parameters['order'].value > 1:
        # clustered and high order models also receive the order and the lags
        model_args.update(order=parameters['order'].value,
                          alpha_cut=parameters['alpha_cut'].value,
                          lags=parameters['lags'].value)
    else:
        model_args['alpha_cut'] = parameters['alpha_cut'].value

    model = parameters['method'].value(**model_args)

    # materialize the partition iterator before building the DataFrame
    ndata = pd.DataFrame.from_records(list(data), columns=parameters['columns'].value)

    model.train(ndata)

    if not is_clustered:
        return list(model.flrgs.items())

    counts = list(model.partitioner.count.items())
    flrgs = list(model.flrgs.items())
    return [('counts', counts), ('flrgs', flrgs)]
def distributed_train(model, data, url=SPARK_ADDR, app='pyFTS'):
"""
@ -61,22 +159,92 @@ def distributed_train(model, data, url='spark://192.168.0.110:7077', app='pyFTS'
conf = SparkConf()
conf.setMaster(url)
conf.setAppName(app)
conf.set("spark.executor.memory", "2g")
conf.set("spark.driver.memory", "2g")
conf.set("spark.memory.offHeap.enabled",True)
conf.set("spark.memory.offHeap.size","16g")
parameters = {}
with SparkContext(conf=conf) as context:
shared_partitioner = context.broadcast(model.partitioner.sets)
shared_order = context.broadcast(model.order)
shared_method = context.broadcast(type(model))
func = lambda x: slave_train(x, shared_method, shared_partitioner, shared_order)
nodes = context.defaultParallelism
flrgs = context.parallelize(data).mapPartitions(func)
if not model.is_multivariate:
parameters['type'] = context.broadcast('common')
parameters['partitioner'] = context.broadcast(model.partitioner.sets)
parameters['alpha_cut'] = context.broadcast(model.alpha_cut)
parameters['order'] = context.broadcast(model.order)
parameters['method'] = context.broadcast(type(model))
parameters['lags'] = context.broadcast(model.lags)
parameters['max_lag'] = context.broadcast(model.max_lag)
func = lambda x: slave_train_univariate(x, **parameters)
flrgs = context.parallelize(data).repartition(nodes*2).mapPartitions(func)
for k in flrgs.collect():
model.append_rule(k[1])
return model
else:
if model.is_clustered:
parameters['type'] = context.broadcast('clustered')
names = []
for name, fset in model.partitioner.sets.items():
names.append(name)
parameters['partitioner_{}'.format(name)] = context.broadcast([(k,v) for k,v in fset.sets.items()])
parameters['partitioner_names'] = context.broadcast(names)
else:
parameters['type'] = context.broadcast('multivariate')
names = []
for var in model.explanatory_variables:
#if var.data_type is None:
# raise Exception("It is mandatory to inform the data_type parameter for each variable when the training is distributed! ")
names.append(var.name)
parameters['{}_type'.format(var.name)] = context.broadcast(var.type)
#parameters['{}_data_type'.format(var.name)] = context.broadcast(var.data_type)
#parameters['{}_mask'.format(var.name)] = context.broadcast(var.mask)
parameters['{}_label'.format(var.name)] = context.broadcast(var.data_label)
parameters['{}_alpha'.format(var.name)] = context.broadcast(var.alpha_cut)
parameters['{}_partitioner'.format(var.name)] = context.broadcast(var.partitioner.sets)
parameters['{}_partitioner_type'.format(var.name)] = context.broadcast(var.partitioner.type)
parameters['variables'] = context.broadcast(names)
parameters['target'] = context.broadcast(model.target_variable.name)
parameters['columns'] = context.broadcast(data.columns.values)
data = data.to_dict(orient='records')
parameters['alpha_cut'] = context.broadcast(model.alpha_cut)
parameters['order'] = context.broadcast(model.order)
parameters['method'] = context.broadcast(type(model))
parameters['lags'] = context.broadcast(model.lags)
parameters['max_lag'] = context.broadcast(model.max_lag)
func = lambda x: slave_train_multivariate(x, **parameters)
flrgs = context.parallelize(data).mapPartitions(func)
for k in flrgs.collect():
print(k)
#for g in k:
# print(g)
#return
if parameters['type'].value == 'clustered':
if k[0] == 'counts':
for fset, count in k[1]:
model.partitioner.count[fset] = count
elif k[0] == 'flrgs':
model.append_rule(k[1])
else:
model.append_rule(k[1])
return model
def distributed_predict(data, model, url='spark://192.168.0.110:7077', app='pyFTS'):
def distributed_predict(data, model, url=SPARK_ADDR, app='pyFTS'):
return None

View File

@ -15,15 +15,32 @@ from pyFTS.common import Membership
from pyFTS.hyperparam import Util as hUtil
# Generates individuals after the operators
#
def genotype(mf, npart, partitioner, order, alpha, lags, len_lags, rmse):
ind = dict(mf=mf, npart=npart, partitioner=partitioner, order=order, alpha=alpha, lags=lags, len_lags=len_lags,
rmse=rmse)
'''
Create the individual genotype
:param mf: membership function
:param npart: number of partitions
:param partitioner: partitioner method
:param order: model order
:param alpha: alpha-cut
:param lags: array with lag indexes
:param len_lags: parsimony fitness value
:param rmse: accuracy fitness value
:return: the genotype, a dictionary with all hyperparameters
'''
ind = dict(mf=mf, npart=npart, partitioner=partitioner, order=order,
alpha=alpha, lags=lags, len_lags=len_lags, rmse=rmse)
return ind
# Generates individuals
def random_genotype():
'''
Create random genotype
:return: the genotype, a dictionary with all hyperparameters
'''
order = random.randint(1, 3)
return genotype(
random.randint(1, 4),
@ -32,21 +49,34 @@ def random_genotype():
order,
random.uniform(0, .5),
sorted(random.sample(range(1, 50), order)),
[],
[]
None,
None
)
# Generates a population of size n
#
def initial_population(n):
    '''
    Build the starting population for the search.

    :param n: number of individuals to create
    :return: a list holding n genotypes, each produced by random_genotype()
    '''
    return [random_genotype() for _ in range(n)]
# Evaluation function
def phenotype(individual, train):
def phenotype(individual, train, parameters={}):
'''
Instantiate the genotype, creating a fitted model with the genotype hyperparameters
:param individual: a genotype
:param train: the training dataset
:param parameters: dict with model specific arguments for fit method.
:return: a fitted FTS model
'''
try:
if individual['mf'] == 1:
mf = Membership.trimf
@ -67,28 +97,48 @@ def phenotype(individual, train):
alpha_cut=individual['alpha'],
order=individual['order'])
model.fit(train)
model.fit(train, **parameters)
return model
except Exception as ex:
print("EXCEPTION!", str(ex), str(individual))
print("PHENOTYPE EXCEPTION!", str(ex), str(individual))
return None
def evaluation1(dataset, individual):
def evaluate(dataset, individual, **kwargs):
'''
Evaluate an individual using a sliding window cross validation over the dataset.
:param dataset: Evaluation dataset
:param individual: genotype to be tested
:param window_size: The length of scrolling window for train/test on dataset
:param train_rate: The train/test split ([0,1])
:param increment_rate: The increment of the scrolling window, relative to the window_size ([0,1])
:param parameters: dict with model specific arguments for fit method.
:return: a tuple (len_lags, rmse) with the parsimony fitness value and the accuracy fitness value
'''
from pyFTS.common import Util
from pyFTS.benchmarks import Measures
window_size = kwargs.get('window_size', 800)
train_rate = kwargs.get('train_rate', .8)
increment_rate = kwargs.get('increment_rate', .2)
parameters = kwargs.get('parameters',{})
if individual['rmse'] is not None and individual['len_lags'] is not None:
return individual['len_lags'], individual['rmse']
try:
results = []
lengths = []
for count, train, test in Util.sliding_window(dataset, 800, train=.8, inc=.25):
model = phenotype(individual, train)
for count, train, test in Util.sliding_window(dataset, window_size, train=train_rate, inc=increment_rate):
model = phenotype(individual, train, parameters=parameters)
if model is None:
return (None)
raise Exception("Phenotype returned None")
rmse, _, _ = Measures.get_point_statistics(test, model)
lengths.append(len(model))
@ -100,36 +150,59 @@ def evaluation1(dataset, individual):
rmse = np.nansum([.6 * np.nanmean(results), .4 * np.nanstd(results)])
len_lags = np.nansum([.4 * np.nanmean(lengths), .6 * _lags])
#print("EVALUATION {}".format(individual))
return len_lags, rmse
except Exception as ex:
print("EXCEPTION!", str(ex), str(individual))
return np.inf
print("EVALUATION EXCEPTION!", str(ex), str(individual))
return np.inf, np.inf
def tournament(population, objective):
    '''
    Simple tournament selection strategy.

    Two individuals are drawn and the one with the smaller value for the
    objective wins; on a tie the second draw is returned.

    :param population: the population, an indexable collection of genotypes
    :param objective: the objective (genotype key) to be considered on tournament
    :return: the winning individual
    :raises: re-raises, unchanged, any error raised while comparing the two draws
        (e.g. IndexError when the population has fewer than two individuals)
    '''
    n = len(population) - 1

    # With three or fewer individuals the draw is fixed to the first two positions
    r1 = random.randint(0, n) if n > 2 else 0
    r2 = random.randint(0, n) if n > 2 else 1

    try:
        ix = r1 if population[r1][objective] < population[r2][objective] else r2
        return population[ix]
    except Exception:
        # Diagnostics must not fail themselves: only index the population when
        # the drawn position is valid, so the original error is never masked.
        for r in (r1, r2):
            print(r, population[r] if 0 <= r <= n else None)
        raise
def selection1(population):
pais = []
prob = .8
def double_tournament(population):
'''
Double tournament selection strategy.
# for i in range(len(population)):
pai1 = tournament(population, 'rmse')
pai2 = tournament(population, 'rmse')
:param population:
:return:
'''
finalista = tournament([pai1, pai2], 'len_lags')
ancestor1 = tournament(population, 'rmse')
ancestor2 = tournament(population, 'rmse')
return finalista
selected = tournament([ancestor1, ancestor2], 'len_lags')
return selected
def lag_crossover2(best, worst):
'''
Cross over two lag genes
:param best: best genotype
:param worst: worst genotype
:return: a tuple (order, lags)
'''
order = int(round(.7 * best['order'] + .3 * worst['order']))
lags = []
@ -151,15 +224,26 @@ def lag_crossover2(best, worst):
# Cruzamento
def crossover(pais):
def crossover(parents):
'''
Crossover operation between two parents
:param parents: a list with two genotypes
:return: a genotype
'''
import random
if pais[0]['rmse'] < pais[1]['rmse']:
best = pais[0]
worst = pais[1]
n = len(parents) - 1
r1 = random.randint(0, n)
r2 = random.randint(0, n)
if parents[r1]['rmse'] < parents[r2]['rmse']:
best = parents[r1]
worst = parents[r2]
else:
best = pais[1]
worst = pais[0]
best = parents[r2]
worst = parents[r1]
npart = int(round(.7 * best['npart'] + .3 * worst['npart']))
alpha = float(.7 * best['alpha'] + .3 * worst['alpha'])
@ -172,20 +256,27 @@ def crossover(pais):
order, lags = lag_crossover2(best, worst)
rmse = []
len_lags = []
descendent = genotype(mf, npart, partitioner, order, alpha, lags, None, None)
filho = genotype(mf, npart, partitioner, order, alpha, lags, len_lags, rmse)
return descendent
return filho
# Mutação | p é a probabilidade de mutação
def mutation_lags(lags, order):
new = sorted(random.sample(range(1, 50), order))
for lag in np.arange(len(lags) - 1):
new[lag] = min(50, max(1, int(lags[lag] + np.random.normal(0, 0.5))))
'''
Mutation operation for lags gene
:param lags:
:param order:
:return:
'''
try:
l = len(lags)
new = []
for lag in np.arange(order):
if lag < l:
new.append( min(50, max(1, int(lags[lag] + np.random.randint(-5, 5)))) )
else:
new.append( new[-1] + np.random.randint(1, 5) )
if order > 1:
for k in np.arange(1, order):
@ -193,98 +284,169 @@ def mutation_lags(lags, order):
new[k] = int(new[k] + np.random.randint(1, 5))
return new
except Exception as ex:
print(lags, order, new, lag)
def mutation(individual):
def mutation(individual, pmut):
'''
Mutation operator
:param individual: the genotype to be mutated
:param pmut: probability of mutation
:return:
'''
import numpy.random
individual['npart'] = min(50, max(3, int(individual['npart'] + np.random.normal(0, 2))))
individual['alpha'] = min(.5, max(0, individual['alpha'] + np.random.normal(0, .1)))
rnd = random.uniform(0, 1)
if rnd < pmut:
print('mutation')
individual['npart'] = min(50, max(3, int(individual['npart'] + np.random.normal(0, 4))))
individual['alpha'] = min(.5, max(0, individual['alpha'] + np.random.normal(0, .5)))
individual['mf'] = random.randint(1, 2)
individual['partitioner'] = random.randint(1, 2)
individual['order'] = min(5, max(1, int(individual['order'] + np.random.normal(0, 0.5))))
individual['order'] = min(5, max(1, int(individual['order'] + np.random.normal(0, 1))))
# Chama a função mutation_lags
individual['lags'] = mutation_lags( individual['lags'], individual['order'])
#individual['lags'] = sorted(random.sample(range(1, 50), individual['order']))
individual['rmse'] = None
individual['len_lags'] = None
return individual
# Elitismo
def elitism(population, new_population):
# Pega melhor indivíduo da população corrente
'''
Elitism operation, always select the best individual of the population and discard the worst
:param population:
:param new_population:
:return:
'''
population = sorted(population, key=itemgetter('rmse'))
best = population[0]
# Ordena a nova população e insere o melhor1 no lugar do pior
new_population = sorted(new_population, key=itemgetter('rmse'))
new_population[-1] = best
# Ordena novamente e pega o melhor
new_population = sorted(new_population, key=itemgetter('rmse'))
if new_population[0]["rmse"] > best["rmse"]:
new_population.insert(0,best)
return new_population
def genetico(dataset, ngen, npop, pcruz, pmut, option=1):
new_populacao = populacao_nova = []
# Gerar população inicial
populacao = initial_population(npop)
def GeneticAlgorithm(dataset, **kwargs):
'''
Genetic algorithm for hyperparameter optimization
# Avaliar população inicial
result = [evaluation1(dataset, k) for k in populacao]
:param dataset:
:param ngen: Max number of generations
:param mgen: Max number of generations without improvement
:param npop: Population size
:param pcruz: Probability of crossover
:param pmut: Probability of mutation
:param window_size: The length of scrolling window for train/test on dataset
:param train_rate: The train/test split ([0,1])
:param increment_rate: The increment of the scrolling window, relative to the window_size ([0,1])
:param parameters: dict with model specific arguments for fit method.
:return: the best genotype
'''
for i in range(npop):
if option == 1:
populacao[i]['len_lags'], populacao[i]['rmse'] = result[i]
else:
populacao[i]['rmse'] = result[i]
statistics = []
ngen = kwargs.get('ngen',30)
mgen = kwargs.get('mgen', 7)
npop = kwargs.get('npop',20)
pcruz = kwargs.get('pcruz',.5)
pmut = kwargs.get('pmut',.3)
collect_statistics = kwargs.get('collect_statistics', False)
no_improvement_count = 0
new_population = []
population = initial_population(npop)
last_best = population[0]
best = population[1]
for individual in population:
individual['len_lags'], individual['rmse'] = evaluate(dataset, individual, **kwargs)
# Gerações
for i in range(ngen):
# Iteração para gerar a nova população
print("GENERATION {}".format(i))
generation_statistics = {}
# Selection
for j in range(int(npop / 2)):
# Selecao de pais
pais = []
pais.append(selection1(populacao))
pais.append(selection1(populacao))
new_population.append(double_tournament(population))
new_population.append(double_tournament(population))
# Cruzamento com probabilidade pcruz
rnd = random.uniform(0, 1)
filho1 = crossover(pais) if pcruz > rnd else pais[0]
rnd = random.uniform(0, 1)
filho2 = crossover(pais) if pcruz > rnd else pais[1]
# Crossover
new = []
for j in range(int(npop * pcruz)):
new.append(crossover(new_population))
new_population.extend(new)
# Mutação com probabilidade pmut
rnd = random.uniform(0, 1)
filho11 = mutation(filho1) if pmut > rnd else filho1
rnd = random.uniform(0, 1)
filho22 = mutation(filho2) if pmut > rnd else filho2
# Mutation
for ct, individual in enumerate(new_population):
new_population[ct] = mutation(individual, pmut)
# Insere filhos na nova população
new_populacao.append(filho11)
new_populacao.append(filho22)
# Evaluation
_f1 = _f2 = []
for individual in new_population:
f1, f2 = evaluate(dataset, individual, **kwargs)
individual['len_lags'], individual['rmse'] = f1, f2
if collect_statistics:
_f1.append(f1)
_f2.append(f2)
#print('eval {}'.format(individual))
result = [evaluation1(dataset, k) for k in new_populacao]
if collect_statistics:
generation_statistics['population'] = {'f1': np.nanmedian(_f1), 'f2': np.nanmedian(_f2)}
for i in range(len(new_populacao)):
new_populacao[i]['len_lags'], new_populacao[i]['rmse'] = result[i]
# Elitism
population = elitism(population, new_population)
populacao = elitism(populacao, new_populacao)
population = population[:npop]
new_populacao = []
new_population = []
melhorT = sorted(populacao, key=lambda item: item['rmse'])[0]
last_best = best
return melhorT
best = population[0]
if collect_statistics:
generation_statistics['best'] = {'f1': best["len_lags"], 'f2': best["rmse"]}
statistics.append(generation_statistics)
if last_best['rmse'] <= best['rmse'] and last_best['len_lags'] <= best['len_lags']:
no_improvement_count += 1
#print("WITHOUT IMPROVEMENT {}".format(no_improvement_count))
pmut += .05
else:
no_improvement_count = 0
pcruz = kwargs.get('pcruz', .5)
pmut = kwargs.get('pmut', .3)
#print(best)
if no_improvement_count == mgen:
break
if collect_statistics:
return best, generation_statistics
else:
return best
def cluster_method(dataset, ngen, npop, pcruz, pmut, option=1):
print(ngen, npop, pcruz, pmut, option)
from pyFTS.hyperparam.Evolutionary import genetico
def cluster_method(dataset, **kwargs):
from pyFTS.hyperparam.Evolutionary import GeneticAlgorithm
inicio = time.time()
ret = genetico(dataset, ngen, npop, pcruz, pmut, option)
ret = GeneticAlgorithm(dataset, **kwargs)
fim = time.time()
ret['time'] = fim - inicio
ret['size'] = ret['len_lags']
@ -297,8 +459,16 @@ def process_jobs(jobs, datasetname, conn):
if job.status == dispy.DispyJob.Finished and result is not None:
print("Processing result of {}".format(result))
metrics = ['rmse', 'size', 'time']
log_result(conn, datasetname, result)
else:
print(job.exception)
print(job.stdout)
def log_result(conn, datasetname, result):
metrics = ['rmse', 'size', 'time']
for metric in metrics:
record = (datasetname, 'Evolutive', 'WHOFTS', None, result['mf'],
result['order'], result['partitioner'], result['npart'],
@ -309,28 +479,33 @@ def process_jobs(jobs, datasetname, conn):
hUtil.insert_hyperparam(record, conn)
else:
print(job.exception)
print(job.stdout)
def execute(datasetname, dataset, **kwargs):
conn = hUtil.open_hyperparam_db('hyperparam.db')
distributed = kwargs.get('distributed', False)
experiments = kwargs.get('experiments', 30)
if not distributed:
ret = []
for i in range(experiments):
result = cluster_method(dataset, **kwargs)
log_result(conn, datasetname, result)
ret.append(result)
return result
elif distributed=='dispy':
nodes = kwargs.get('nodes', ['127.0.0.1'])
cluster, http_server = Util.start_dispy_cluster(cluster_method, nodes=nodes)
conn = hUtil.open_hyperparam_db('hyperparam.db')
ngen = kwargs.get('ngen', 70)
npop = kwargs.get('npop', 20)
pcruz = kwargs.get('pcruz', .8)
pmut = kwargs.get('pmut', .2)
option = kwargs.get('option', 1)
jobs = []
for i in range(kwargs.get('experiments', 30)):
for i in range(experiments):
print("Experiment {}".format(i))
job = cluster.submit(dataset, ngen, npop, pcruz, pmut, option)
job = cluster.submit(dataset, **kwargs)
jobs.append(job)
process_jobs(jobs, datasetname, conn)

View File

@ -90,7 +90,6 @@ class HighOrderFTS(fts.FTS):
self.is_high_order = True
self.min_order = 1
self.order= kwargs.get("order", self.min_order)
self.lags = kwargs.get("lags", None)
self.configure_lags(**kwargs)
def configure_lags(self, **kwargs):

View File

@ -19,7 +19,7 @@ class FLR(object):
self.RHS = set
def __str__(self):
return str([self.LHS[k] for k in self.LHS.keys()]) + " -> " + self.RHS
return "{} -> {}".format([self.LHS[k] for k in self.LHS.keys()], self.RHS)

View File

@ -13,13 +13,6 @@ class ClusteredMVFTS(mvfts.MVFTS):
def __init__(self, **kwargs):
super(ClusteredMVFTS, self).__init__(**kwargs)
self.cluster_method = kwargs.get('cluster_method', grid.GridCluster)
"""The cluster method to be called when a new model is build"""
self.cluster_params = kwargs.get('cluster_params', {})
"""The cluster method parameters"""
self.cluster = kwargs.get('cluster', None)
"""The trained clusterer"""
self.fts_method = kwargs.get('fts_method', hofts.WeightedHighOrderFTS)
"""The FTS method to be called when a new model is build"""
self.fts_params = kwargs.get('fts_params', {})
@ -30,6 +23,8 @@ class ClusteredMVFTS(mvfts.MVFTS):
self.is_high_order = True
self.is_clustered = True
self.order = kwargs.get("order", 2)
self.lags = kwargs.get("lags", None)
self.alpha_cut = kwargs.get('alpha_cut', 0.25)
@ -43,16 +38,13 @@ class ClusteredMVFTS(mvfts.MVFTS):
ndata = []
for index, row in data.iterrows():
data_point = self.format_data(row)
ndata.append(common.fuzzyfy_instance_clustered(data_point, self.cluster, alpha_cut=self.alpha_cut))
ndata.append(common.fuzzyfy_instance_clustered(data_point, self.partitioner, alpha_cut=self.alpha_cut))
return ndata
def train(self, data, **kwargs):
if self.cluster is None:
self.cluster = self.cluster_method(data=data, mvfts=self, neighbors=self.knn, **self.cluster_params)
self.model = self.fts_method(partitioner=self.cluster, **self.fts_params)
self.model = self.fts_method(partitioner=self.partitioner, **self.fts_params)
if self.model.is_high_order:
self.model.order = self.order
@ -60,7 +52,7 @@ class ClusteredMVFTS(mvfts.MVFTS):
self.model.train(ndata, fuzzyfied=self.pre_fuzzyfy)
self.cluster.prune()
self.partitioner.prune()
def check_data(self, data):
if self.pre_fuzzyfy:
@ -84,8 +76,8 @@ class ClusteredMVFTS(mvfts.MVFTS):
for var in self.explanatory_variables:
if self.target_variable.name != var.name:
self.target_variable = var
self.cluster.change_target_variable(var)
self.model.partitioner = self.cluster
self.partitioner.change_target_variable(var)
self.model.partitioner = self.partitioner
self.model.reset_calculated_values()
ret[var.name] = self.model.forecast(ndata, fuzzyfied=self.pre_fuzzyfy, **kwargs)

View File

@ -7,12 +7,12 @@ class MultivariateFuzzySet(Composite.FuzzySet):
"""
Multivariate Composite Fuzzy Set
"""
def __init__(self, name, **kwargs):
def __init__(self, **kwargs):
"""
Create an empty composite fuzzy set
:param name: fuzzy set name
"""
super(MultivariateFuzzySet, self).__init__(name)
super(MultivariateFuzzySet, self).__init__("")
self.sets = {}
self.target_variable = kwargs.get('target_variable',None)
@ -28,10 +28,10 @@ class MultivariateFuzzySet(Composite.FuzzySet):
if variable == self.target_variable.name:
self.centroid = set.centroid
self.name += set.name
def set_target_variable(self, variable):
    """
    Switch the target variable of this multivariate fuzzy set.

    The centroid is refreshed from the component set stored in
    ``self.sets`` under ``variable.name``.

    :param variable: the new target variable; ``variable.name`` must be a key of ``self.sets``
    """
    self.target_variable = variable
    component = self.sets[variable.name]
    self.centroid = component.centroid
def membership(self, x):
@ -42,7 +42,6 @@ class MultivariateFuzzySet(Composite.FuzzySet):
return np.nanmin(mv)
def fuzzyfy_instance(data_point, var):
    """
    Fuzzyfy a single value with the partitioner of a variable.

    :param data_point: the value to be fuzzyfied
    :param var: the variable; its ``partitioner``, ``alpha_cut`` and ``name`` are used
    :return: a list of ``(variable name, fuzzy set)`` tuples, one for each
             item returned by ``FuzzySet.fuzzyfy``
    """
    pairs = []
    for fuzzy_set in FuzzySet.fuzzyfy(data_point, var.partitioner, mode='sets',
                                      method='fuzzy', alpha_cut=var.alpha_cut):
        pairs.append((var.name, fuzzy_set))
    return pairs

View File

@ -1,4 +1,4 @@
from pyFTS.partitioners import partitioner
from pyFTS.models.multivariate import partitioner
from pyFTS.models.multivariate.common import MultivariateFuzzySet, fuzzyfy_instance_clustered
from itertools import product
from scipy.spatial import KDTree
@ -6,106 +6,28 @@ import numpy as np
import pandas as pd
class GridCluster(partitioner.Partitioner):
class GridCluster(partitioner.MultivariatePartitioner):
"""
A cartesian product of all fuzzy sets of all variables
"""
def __init__(self, **kwargs):
super(GridCluster, self).__init__(name="GridCluster", preprocess=False, **kwargs)
self.mvfts = kwargs.get('mvfts', None)
self.sets = {}
self.kdtree = None
self.index = {}
self.neighbors = kwargs.get('neighbors', 2)
self.optmize = kwargs.get('optmize', False)
if self.optmize:
self.count = {}
data = kwargs.get('data', [None])
self.build(data)
super(GridCluster, self).__init__(**kwargs)
self.name="GridCluster"
self.build(None)
def build(self, data):
fsets = [[x for x in k.partitioner.sets.values()]
for k in self.mvfts.explanatory_variables]
midpoints = []
for k in self.explanatory_variables]
c = 0
for k in product(*fsets):
#key = self.prefix+str(c)
mvfset = MultivariateFuzzySet(name="", target_variable=self.mvfts.target_variable)
mp = []
_key = ""
mvfset = MultivariateFuzzySet(target_variable=self.target_variable)
for fset in k:
mvfset.append_set(fset.variable, fset)
mp.append(fset.centroid)
_key += fset.name
mvfset.name = _key
self.sets[_key] = mvfset
midpoints.append(mp)
self.index[c] = _key
self.sets[mvfset.name] = mvfset
c += 1
import sys
sys.setrecursionlimit(100000)
self.build_index()
self.kdtree = KDTree(midpoints)
sys.setrecursionlimit(1000)
def prune(self):
if not self.optmize:
return
for fset in [fs for fs in self.sets.keys()]:
if fset not in self.count:
fs = self.sets.pop(fset)
del (fs)
vars = [k.name for k in self.mvfts.explanatory_variables]
midpoints = []
self.index = {}
for ct, fset in enumerate(self.sets.values()):
mp = []
for vr in vars:
mp.append(fset.sets[vr].centroid)
midpoints.append(mp)
self.index[ct] = fset.name
import sys
sys.setrecursionlimit(100000)
self.kdtree = KDTree(midpoints)
sys.setrecursionlimit(1000)
def knn(self, data):
tmp = [data[k.name]
for k in self.mvfts.explanatory_variables]
tmp, ix = self.kdtree.query(tmp, self.neighbors)
if not isinstance(ix, (list, np.ndarray)):
ix = [ix]
if self.optmize:
tmp = []
for k in ix:
tmp.append(self.index[k])
self.count[self.index[k]] = 1
return tmp
else:
return [self.index[k] for k in ix]
def fuzzyfy(self, data, **kwargs):
return fuzzyfy_instance_clustered(data, self, **kwargs)
def change_target_variable(self, variable):
for fset in self.sets:
self.sets[fset].set_target_variable(variable)

View File

@ -12,8 +12,8 @@ class MVFTS(fts.FTS):
"""
def __init__(self, **kwargs):
super(MVFTS, self).__init__(**kwargs)
self.explanatory_variables = []
self.target_variable = None
self.explanatory_variables = kwargs.get('explanatory_variables',[])
self.target_variable = kwargs.get('target_variable',None)
self.flrgs = {}
self.is_multivariate = True
self.shortname = "MVFTS"

View File

@ -0,0 +1,90 @@
from pyFTS.partitioners import partitioner
from pyFTS.models.multivariate.common import MultivariateFuzzySet, fuzzyfy_instance_clustered
from itertools import product
from scipy.spatial import KDTree
import numpy as np
import pandas as pd
class MultivariatePartitioner(partitioner.Partitioner):
    """
    Base class for partitioners which use the MultivariateFuzzySet

    Keyword arguments read by the constructor (all of them are also
    forwarded to the base Partitioner):

    :param explanatory_variables: list of variables (default ``[]``)
    :param target_variable: the target variable (default ``None``)
    :param neighbors: number of neighbours requested by ``knn`` (default 2)
    :param optimize: when true, the sets returned by ``knn`` are recorded
                     so ``prune`` can discard the unused ones (default ``True``)
    :param data: value handed to ``build`` (default ``None``)
    """

    def __init__(self, **kwargs):
        super(MultivariatePartitioner, self).__init__(name="MultivariatePartitioner", preprocess=False, **kwargs)

        self.type = 'multivariate'
        # Multivariate fuzzy sets, keyed by set name
        self.sets = {}
        # Spatial index over the set midpoints, created by build_index()
        self.kdtree = None
        # Maps the position of a midpoint inside the KD-tree to its set name
        self.index = {}
        self.explanatory_variables = kwargs.get('explanatory_variables', [])
        self.target_variable = kwargs.get('target_variable', None)
        self.neighbors = kwargs.get('neighbors', 2)
        self.optimize = kwargs.get('optimize', True)
        if self.optimize:
            # Names of the sets that were returned by knn() at least once
            self.count = {}
        data = kwargs.get('data', None)
        self.build(data)

    def build(self, data):
        """
        Create the fuzzy sets. The base implementation does nothing.

        :param data: the value of the ``data`` keyword argument
        """
        pass

    def append(self, fset):
        """
        Store a fuzzy set under its own name, replacing any set with the same name.

        :param fset: the fuzzy set to be stored
        """
        self.sets[fset.name] = fset

    def prune(self):
        """
        Remove the sets that were never returned by ``knn`` and rebuild the
        spatial index. Does nothing when ``optimize`` is disabled.
        """
        if not self.optimize:
            return

        # Iterate over a snapshot of the keys, the dict is changed in the loop
        for key in list(self.sets):
            if key not in self.count:
                del self.sets[key]

        self.build_index()

    def knn(self, data):
        """
        Find the names of the sets whose midpoints are nearest to a data point.

        :param data: object indexable by the name of each explanatory variable
        :return: list with at most ``neighbors`` set names
        """
        point = [data[k.name] for k in self.explanatory_variables]
        _, ix = self.kdtree.query(point, self.neighbors)

        # A query for a single neighbour returns a scalar index
        if not isinstance(ix, (list, np.ndarray)):
            ix = [ix]

        names = []
        for k in ix:
            # When fewer points than requested neighbours exist, the query
            # pads the result with an index that is not part of the tree
            if k not in self.index:
                continue
            name = self.index[k]
            names.append(name)
            if self.optimize:
                self.count[name] = 1
        return names

    def fuzzyfy(self, data, **kwargs):
        """
        Fuzzyfy a data point by delegating to ``fuzzyfy_instance_clustered``.

        :param data: the data point
        :param kwargs: forwarded unchanged to ``fuzzyfy_instance_clustered``
        :return: whatever ``fuzzyfy_instance_clustered`` returns
        """
        return fuzzyfy_instance_clustered(data, self, **kwargs)

    def change_target_variable(self, variable):
        """
        Set a new target variable on every stored fuzzy set.

        :param variable: the new target variable
        """
        for fset in self.sets.values():
            fset.set_target_variable(variable)

    def build_index(self):
        """
        Rebuild ``index`` and ``kdtree`` from the centroids of the stored sets.

        The recursion limit is raised while the KD-tree is created and the
        previous limit is restored afterwards, even if the creation fails.
        """
        import sys

        midpoints = []
        self.index = {}
        for ct, fset in enumerate(self.sets.values()):
            midpoints.append([fset.sets[vr.name].centroid
                              for vr in self.explanatory_variables])
            self.index[ct] = fset.name

        previous_limit = sys.getrecursionlimit()
        # Never lower a limit that is already above the one needed here
        sys.setrecursionlimit(max(previous_limit, 100000))
        try:
            self.kdtree = KDTree(midpoints)
        finally:
            sys.setrecursionlimit(previous_limit)

View File

@ -1,3 +1,4 @@
import pandas as pd
from pyFTS.common import fts, FuzzySet, FLR, Membership, tree
from pyFTS.partitioners import Grid
from pyFTS.models.multivariate import FLR as MVFLR
@ -24,6 +25,10 @@ class Variable:
self.data_label = kwargs.get('data_label', self.name)
"""A string with the column name on DataFrame"""
self.type = kwargs.get('type', 'common')
self.data_type = kwargs.get('data_type', None)
"""The type of the data column on Pandas Dataframe"""
self.mask = kwargs.get('mask', None)
"""The mask for format the data column on Pandas Dataframe"""
self.transformation = kwargs.get('transformation', None)
self.transformation_params = kwargs.get('transformation_params', None)
self.partitioner = None

View File

@ -20,11 +20,12 @@ class WeightedFLRG(mvflrg.FLRG):
self.w = None
def append_rhs(self, fset, **kwargs):
count = kwargs.get('count', 1.0)
if fset not in self.RHS:
self.RHS[fset] = 1.0
self.RHS[fset] = count
else:
self.RHS[fset] += 1.0
self.count += 1.0
self.RHS[fset] += count
self.count += count
def weights(self):
if self.w is None:
@ -51,10 +52,6 @@ class WeightedMVFTS(mvfts.MVFTS):
"""
def __init__(self, **kwargs):
super(WeightedMVFTS, self).__init__(order=1, **kwargs)
self.explanatory_variables = []
self.target_variable = None
self.flrgs = {}
self.is_multivariate = True
self.shortname = "WeightedMVFTS"
self.name = "Weighted Multivariate FTS"

View File

@ -21,6 +21,15 @@ class SimplePartitioner(partitioner.Partitioner):
self.partitions = 0
def append_complex(self, fs):
    """
    Append an already built fuzzy set to the partitioner

    :param fs: fuzzy set object with the attributes ``name``, ``centroid``,
               ``lower`` and ``upper``
    """
    # Only a new name adds a partition; appending an existing name replaces
    # the stored set and must not inflate the counter
    if fs.name not in self.sets:
        self.partitions += 1
    self.sets[fs.name] = fs

    # Set names ordered by the centroid of their sets
    self.ordered_sets = sorted(self.sets, key=lambda k: self.sets[k].centroid)

    # Bounds are taken from the first and the last set in centroid order
    self.min = self.sets[self.ordered_sets[0]].lower
    self.max = self.sets[self.ordered_sets[-1]].upper
def append(self, name, mf, parameters, **kwargs):
"""
Append a new partition (fuzzy set) to the partitioner
@ -39,7 +48,7 @@ class SimplePartitioner(partitioner.Partitioner):
if mf is None or mf not in (Membership.trimf, Membership.gaussmf,
Membership.trapmf, Membership.singleton,
Membership.sigmf):
raise ValueError("The mf parameter should be one of pyFTS.common.Membership functions")
raise ValueError("The mf parameter should be one of pyFTS.common.Membership functions, not {}".format(mf))
if mf == Membership.trimf:
if len(parameters) != 3:

View File

@ -2,14 +2,14 @@ import numpy as np
from pyFTS.hyperparam import GridSearch, Evolutionary
def get_dataset():
#from pyFTS.data import SONDA
from pyFTS.data import Malaysia
ds = Malaysia.get_data('temperature')[:1000]
# ds = pd.read_csv('Malaysia.csv',delimiter=',' )[['temperature']].values[:2000].flatten().tolist()
#train = ds[:800]
#test = ds[800:]
#data = SONDA.get_data('temperature')[:1000]
data = Malaysia.get_data('temperature')[:1000]
return 'Malaysia.temperature', ds #train, test
#return 'SONDA.glo_avg', data #train, test
return 'Malaysia.temperature', data #train, test
"""
hyperparams = {
@ -39,4 +39,28 @@ datsetname, dataset = get_dataset()
#Evolutionary.cluster_method(dataset, 70, 20, .8, .3, 1)
Evolutionary.execute(datsetname, dataset, nodes=nodes, ngen=50, npop=30, )
'''
from pyFTS.models import hofts
from pyFTS.partitioners import Grid
from pyFTS.benchmarks import Measures
fs = Grid.GridPartitioner(data=dataset[:800], npart=30)
model = hofts.WeightedHighOrderFTS(partitioner=fs, order=2)
model.fit(dataset[:800])
model.predict(dataset[800:1000])
Measures.get_point_statistics(dataset[800:1000], model)
print(model)
'''
ret = Evolutionary.execute(datsetname, dataset,
ngen=30, npop=20, pcruz=.5, pmut=.3,
window_size=800, experiments=30)
#parameters={'distributed': 'spark', 'url': 'spark://192.168.0.106:7077'})
print(ret)
#'''

View File

@ -28,8 +28,6 @@ test_uv = dataset['value'].values[24505:]
train_mv = dataset.iloc[:24505]
test_mv = dataset.iloc[24505:]
print(train_mv)
sp = {'seasonality': DateTime.minute_of_day, 'names': [str(k)+'hs' for k in range(0,24)]}
vhour = variable.Variable("Hour", data_label="date", partitioner=seasonal.TimeGridPartitioner, npart=24,
@ -46,20 +44,15 @@ parameters = [
{'order':2, 'knn': 3},
]
for ct, method in enumerate([mvfts.MVFTS, wmvfts.WeightedMVFTS,
cmvfts.ClusteredMVFTS,cmvfts.ClusteredMVFTS,cmvfts.ClusteredMVFTS]):
print(method)
model = method(**parameters[ct])
model.shortname += str(ct)
#for ct, method in enumerate([, wmvfts.WeightedMVFTS,
# cmvfts.ClusteredMVFTS,cmvfts.ClusteredMVFTS,cmvfts.ClusteredMVFTS]):
model = mvfts.MVFTS()
model.append_variable(vhour)
model.append_variable(vvalue)
model.target_variable = vvalue
model.fit(train_mv)
Util.persist_obj(model, model.shortname)
forecasts = model.predict(test_mv.iloc[:100])
print(model)

View File

@ -1,7 +1,8 @@
import numpy as np
import pandas as pd
import time
from pyFTS.data import Enrollments, TAIEX
from pyFTS.data import Enrollments, TAIEX, SONDA
from pyFTS.partitioners import Grid, Simple
from pyFTS.models import hofts
@ -12,20 +13,51 @@ import os
# make sure pyspark tells workers to use python3 not 2 if both are installed
os.environ['PYSPARK_PYTHON'] = '/usr/bin/python3'
os.environ['PYSPARK_DRIVER_PYTHON'] = '/usr/bin/python3'
data = TAIEX.get_data()
#'''
data = SONDA.get_data('glo_avg')
fs = Grid.GridPartitioner(data=data, npart=50)
model = hofts.WeightedHighOrderFTS(partitioner=fs, order=2)
model.fit(data, distributed='spark', url='spark://192.168.0.110:7077')
_s1 = time.time()
model.fit(data, distributed='spark', url='spark://192.168.0.106:7077')
_s2 = time.time()
print(_s2-_s1)
#model.fit(data, distributed='dispy', nodes=['192.168.0.110'])
'''
from pyFTS.models.multivariate import common, variable, mvfts, wmvfts, cmvfts, grid
from pyFTS.models.seasonal import partitioner as seasonal
from pyFTS.models.seasonal.common import DateTime
dataset = pd.read_csv('/home/petronio/Downloads/kalang.csv', sep=',')
dataset['date'] = pd.to_datetime(dataset["date"], format='%Y-%m-%d %H:%M:%S')
train_mv = dataset.iloc[:24505]
test_mv = dataset.iloc[24505:]
sp = {'seasonality': DateTime.minute_of_day, 'names': [str(k)+'hs' for k in range(0,24)]}
vhour = variable.Variable("Hour", data_label="date", partitioner=seasonal.TimeGridPartitioner, npart=24,
data=train_mv, partitioner_specific=sp, data_type=pd.datetime, mask='%Y-%m-%d %H:%M:%S')
vvalue = variable.Variable("Pollution", data_label="value", alias='value',
partitioner=Grid.GridPartitioner, npart=35, data_type=np.float64,
data=train_mv)
fs = grid.GridCluster(explanatory_variables=[vhour, vvalue], target_variable=vvalue)
#model = wmvfts.WeightedMVFTS(explanatory_variables=[vhour, vvalue], target_variable=vvalue)
model = cmvfts.ClusteredMVFTS(explanatory_variables=[vhour, vvalue], target_variable=vvalue,
partitioner=fs)
model.fit(train_mv, distributed='spark', url='spark://192.168.0.106:7077')
#'''
print(model)
'''
def fun(x):
return (x, x % 2)