distributed.spark optimizations and bugfixes
parent 6373afe136
commit e91d44afd1
@@ -8,7 +8,6 @@ import dill
 import numpy as np
-
 
 
 def plot_rules(model, size=[5, 5], axis=None, rules_by_axis=None, columns=1):
     if axis is None and rules_by_axis is None:
         rows = 1
@@ -126,8 +125,8 @@ def show_and_save_image(fig, file, flag, lgd=None):
     :param flag: if True the image will be saved
     :param lgd: legend
     """
+    plt.show()
     if flag:
-        plt.show()
         if lgd is not None:
             fig.savefig(file, additional_artists=lgd,bbox_inches='tight') #bbox_extra_artists=(lgd,), )
         else:
@@ -163,9 +163,7 @@ class FTS(object):
         elif distributed == 'spark':
             from pyFTS.distributed import spark
 
-            nodes = kwargs.get("nodes", 'spark://192.168.0.110:7077')
-            app = kwargs.get("app", 'pyFTS')
-            ret = spark.distributed_predict(data=ndata, model=self, url=nodes, app=app)
+            ret = spark.distributed_predict(data=ndata, model=self, **kwargs)
 
 
         if not self.is_multivariate:
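
With this change the predict path forwards every keyword argument straight to spark.distributed_predict instead of unpacking url/app itself. A minimal usage sketch, not taken from the commit: the master URL is a placeholder, and the keyword names simply mirror the ones read by the spark module.

    # hypothetical call; 'spark://127.0.0.1:7077' is a placeholder master URL
    forecasts = model.predict(test_data,
                              distributed='spark',
                              url='spark://127.0.0.1:7077',
                              app='pyFTS',
                              num_batches=4)
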
@@ -16,7 +16,21 @@ SPARK_ADDR = 'spark://192.168.0.110:7077'
 os.environ['PYSPARK_PYTHON'] = '/usr/bin/python3'
 os.environ['PYSPARK_DRIVER_PYTHON'] = '/usr/bin/python3'
 
+def create_spark_conf(**kwargs):
+    spark_executor_memory = kwargs.get("spark_executor_memory", "2g")
+    spark_driver_memory = kwargs.get("spark_driver_memory", "2g")
+    url = kwargs.get("url", SPARK_ADDR)
+    app = kwargs.get("app", 'pyFTS')
+
+    conf = SparkConf()
+    conf.setMaster(url)
+    conf.setAppName(app)
+    conf.set("spark.executor.memory", spark_executor_memory)
+    conf.set("spark.driver.memory", spark_driver_memory)
+    conf.set("spark.memory.offHeap.enabled",True)
+    conf.set("spark.memory.offHeap.size","16g")
+
+    return conf
 
 
 def get_partitioner(shared_partitioner, type='common', variables=[]):
     """
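
The new create_spark_conf helper centralizes the SparkConf assembly that used to be inlined in distributed_train. A short sketch of calling it directly, assuming a pyFTS build that contains this commit; the master URL, app name and memory sizes below are illustrative, not the commit's defaults ("2g", SPARK_ADDR, 'pyFTS').

    from pyFTS.distributed import spark

    # all keyword arguments are optional and fall back to the defaults above
    conf = spark.create_spark_conf(url='spark://127.0.0.1:7077',   # placeholder master
                                   app='pyFTS-train',
                                   spark_executor_memory='4g',
                                   spark_driver_memory='4g')
    print(conf.toDebugString())   # inspect the resulting Spark settings
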
@@ -74,6 +88,16 @@ def get_variables(**parameters):
 
     return (explanatory_variables, target_variable)
 
+def create_univariate_model(**parameters):
+    if parameters['order'].value > 1:
+        model = parameters['method'].value(partitioner=get_partitioner(parameters['partitioner']),
+                                           order=parameters['order'].value, alpha_cut=parameters['alpha_cut'].value,
+                                           lags=parameters['lags'].value)
+    else:
+        model = parameters['method'].value(partitioner=get_partitioner(parameters['partitioner']),
+                                           alpha_cut=parameters['alpha_cut'].value)
+
+    return model
 
 def slave_train_univariate(data, **parameters):
     """
@@ -82,27 +106,32 @@ def slave_train_univariate(data, **parameters):
     :return:
     """
 
-    if parameters['type'].value == 'common':
-
-        if parameters['order'].value > 1:
-            model = parameters['method'].value(partitioner=get_partitioner(parameters['partitioner']),
-                                               order=parameters['order'].value, alpha_cut=parameters['alpha_cut'].value,
-                                               lags=parameters['lags'].value)
-        else:
-            model = parameters['method'].value(partitioner=get_partitioner(parameters['partitioner']),
-                                               alpha_cut=parameters['alpha_cut'].value)
-
-        ndata = [k for k in data]
-
-    else:
-        pass
+    model = create_univariate_model(**parameters)
+
+    ndata = [k for k in data]
 
     model.train(ndata)
 
     return [(k, model.flrgs[k]) for k in model.flrgs.keys()]
 
 
+def slave_forecast_univariate(data, **parameters):
+    """
+
+    :param data:
+    :return:
+    """
+
+    model = create_univariate_model(**parameters)
+
+    ndata = [k for k in data]
+
+    forecasts = model.predict(ndata)
+
+    return [(k, k) for k in forecasts]
+
+
-def slave_train_multivariate(data, **parameters):
+def create_multivariate_model(**parameters):
     explanatory_variables, target_variable = get_variables(**parameters)
     #vars = [(v.name, v.name) for v in explanatory_variables]
 
@@ -129,8 +158,13 @@ def slave_train_multivariate(data, **parameters):
                                        target_variable=target_variable,
                                        alpha_cut=parameters['alpha_cut'].value)
 
+    return model
+
+
+def slave_train_multivariate(data, **parameters):
+
+    model = create_multivariate_model(**parameters)
 
     rows = [k for k in data]
     ndata = pd.DataFrame.from_records(rows, columns=parameters['columns'].value)
 
@@ -145,7 +179,68 @@ def slave_train_multivariate(data, **parameters):
     return [(k, v) for k,v in model.flrgs.items()]
 
 
-def distributed_train(model, data, url=SPARK_ADDR, app='pyFTS'):
+def slave_forecast_multivariate(data, **parameters):
+
+    model = create_multivariate_model(**parameters)
+
+    rows = [k for k in data]
+    ndata = pd.DataFrame.from_records(rows, columns=parameters['columns'].value)
+
+    forecasts = model.predict(ndata)
+
+    return [(k, k) for k in forecasts]
+
+
+def share_parameters(model, context, data):
+    parameters = {}
+    if not model.is_multivariate:
+        parameters['type'] = context.broadcast('common')
+        parameters['partitioner'] = context.broadcast(model.partitioner.sets)
+        parameters['alpha_cut'] = context.broadcast(model.alpha_cut)
+        parameters['order'] = context.broadcast(model.order)
+        parameters['method'] = context.broadcast(type(model))
+        parameters['lags'] = context.broadcast(model.lags)
+        parameters['max_lag'] = context.broadcast(model.max_lag)
+    else:
+        if model.is_clustered:
+            parameters['type'] = context.broadcast('clustered')
+            names = []
+            for name, fset in model.partitioner.sets.items():
+                names.append(name)
+                parameters['partitioner_{}'.format(name)] = context.broadcast([(k,v) for k,v in fset.sets.items()])
+
+            parameters['partitioner_names'] = context.broadcast(names)
+
+        else:
+            parameters['type'] = context.broadcast('multivariate')
+        names = []
+        for var in model.explanatory_variables:
+            #if var.data_type is None:
+            #    raise Exception("It is mandatory to inform the data_type parameter for each variable when the training is distributed! ")
+            names.append(var.name)
+            parameters['{}_type'.format(var.name)] = context.broadcast(var.type)
+            #parameters['{}_data_type'.format(var.name)] = context.broadcast(var.data_type)
+            #parameters['{}_mask'.format(var.name)] = context.broadcast(var.mask)
+            parameters['{}_label'.format(var.name)] = context.broadcast(var.data_label)
+            parameters['{}_alpha'.format(var.name)] = context.broadcast(var.alpha_cut)
+            parameters['{}_partitioner'.format(var.name)] = context.broadcast(var.partitioner.sets)
+            parameters['{}_partitioner_type'.format(var.name)] = context.broadcast(var.partitioner.type)
+
+        parameters['variables'] = context.broadcast(names)
+        parameters['target'] = context.broadcast(model.target_variable.name)
+
+        parameters['columns'] = context.broadcast(data.columns.values)
+
+        parameters['alpha_cut'] = context.broadcast(model.alpha_cut)
+        parameters['order'] = context.broadcast(model.order)
+        parameters['method'] = context.broadcast(type(model))
+        parameters['lags'] = context.broadcast(model.lags)
+        parameters['max_lag'] = context.broadcast(model.max_lag)
+
+    return parameters
+
+
+def distributed_train(model, data, **kwargs):
     """
 
 
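
share_parameters wraps every model hyperparameter in a Spark broadcast variable so that each mapPartitions task can rebuild the model locally and the driver ships each value only once. A self-contained sketch of that broadcast-and-unwrap pattern, using a local master and toy values rather than the pyFTS code itself:

    from pyspark import SparkConf, SparkContext

    conf = SparkConf().setMaster('local[2]').setAppName('broadcast-sketch')

    with SparkContext(conf=conf) as sc:
        # the driver broadcasts read-only values once...
        params = {'order': sc.broadcast(2), 'alpha_cut': sc.broadcast(0.3)}

        def slave(rows, **parameters):
            # ...and each partition-level task unwraps them through .value
            order = parameters['order'].value
            alpha = parameters['alpha_cut'].value
            return [(order, alpha, len(list(rows)))]

        print(sc.parallelize(range(100), 4).mapPartitions(lambda x: slave(x, **params)).collect())
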
@@ -155,85 +250,34 @@ def distributed_train(model, data, url=SPARK_ADDR, app='pyFTS'):
     :param app:
     :return:
     """
 
-    conf = SparkConf()
-    conf.setMaster(url)
-    conf.setAppName(app)
-    conf.set("spark.executor.memory", "2g")
-    conf.set("spark.driver.memory", "2g")
-    conf.set("spark.memory.offHeap.enabled",True)
-    conf.set("spark.memory.offHeap.size","16g")
-    parameters = {}
+    num_batches = kwargs.get("num_batches", 4)
+
+    conf = create_spark_conf(**kwargs)
 
     with SparkContext(conf=conf) as context:
 
         nodes = context.defaultParallelism
 
+        parameters = share_parameters(model, context, data)
+
         if not model.is_multivariate:
-            parameters['type'] = context.broadcast('common')
-            parameters['partitioner'] = context.broadcast(model.partitioner.sets)
-            parameters['alpha_cut'] = context.broadcast(model.alpha_cut)
-            parameters['order'] = context.broadcast(model.order)
-            parameters['method'] = context.broadcast(type(model))
-            parameters['lags'] = context.broadcast(model.lags)
-            parameters['max_lag'] = context.broadcast(model.max_lag)
-
             func = lambda x: slave_train_univariate(x, **parameters)
 
-            flrgs = context.parallelize(data).repartition(nodes*4).mapPartitions(func)
+            flrgs = context.parallelize(data).repartition(nodes*num_batches).mapPartitions(func)
 
             for k in flrgs.collect():
                 model.append_rule(k[1])
-
-            return model
         else:
-            if model.is_clustered:
-                parameters['type'] = context.broadcast('clustered')
-                names = []
-                for name, fset in model.partitioner.sets.items():
-                    names.append(name)
-                    parameters['partitioner_{}'.format(name)] = context.broadcast([(k,v) for k,v in fset.sets.items()])
-
-                parameters['partitioner_names'] = context.broadcast(names)
-
-            else:
-                parameters['type'] = context.broadcast('multivariate')
-            names = []
-            for var in model.explanatory_variables:
-                #if var.data_type is None:
-                #    raise Exception("It is mandatory to inform the data_type parameter for each variable when the training is distributed! ")
-                names.append(var.name)
-                parameters['{}_type'.format(var.name)] = context.broadcast(var.type)
-                #parameters['{}_data_type'.format(var.name)] = context.broadcast(var.data_type)
-                #parameters['{}_mask'.format(var.name)] = context.broadcast(var.mask)
-                parameters['{}_label'.format(var.name)] = context.broadcast(var.data_label)
-                parameters['{}_alpha'.format(var.name)] = context.broadcast(var.alpha_cut)
-                parameters['{}_partitioner'.format(var.name)] = context.broadcast(var.partitioner.sets)
-                parameters['{}_partitioner_type'.format(var.name)] = context.broadcast(var.partitioner.type)
-
-            parameters['variables'] = context.broadcast(names)
-            parameters['target'] = context.broadcast(model.target_variable.name)
-
-            parameters['columns'] = context.broadcast(data.columns.values)
-
             data = data.to_dict(orient='records')
 
-            parameters['alpha_cut'] = context.broadcast(model.alpha_cut)
-            parameters['order'] = context.broadcast(model.order)
-            parameters['method'] = context.broadcast(type(model))
-            parameters['lags'] = context.broadcast(model.lags)
-            parameters['max_lag'] = context.broadcast(model.max_lag)
-
             func = lambda x: slave_train_multivariate(x, **parameters)
 
             flrgs = context.parallelize(data).mapPartitions(func)
 
             for k in flrgs.collect():
-                print(k)
-                #for g in k:
-                #    print(g)
-
-                #return
                 if parameters['type'].value == 'clustered':
                     if k[0] == 'counts':
                         for fset, count in k[1]:
@@ -243,8 +287,46 @@ def distributed_train(model, data, url=SPARK_ADDR, app='pyFTS'):
                     else:
                         model.append_rule(k[1])
 
        return model
 
 
-def distributed_predict(data, model, url=SPARK_ADDR, app='pyFTS'):
-    return None
+def distributed_predict(data, model, **kwargs):
+    """
+
+    :param model:
+    :param data:
+    :param url:
+    :param app:
+    :return:
+    """
+
+    num_batches = kwargs.get("num_batches", 4)
+
+    conf = create_spark_conf(**kwargs)
+
+    ret = []
+
+    with SparkContext(conf=conf) as context:
+
+        nodes = context.defaultParallelism
+
+        parameters = share_parameters(model, context)
+
+        if not model.is_multivariate:
+            func = lambda x: slave_forecast_univariate(x, **parameters)
+
+            forecasts = context.parallelize(data).repartition(nodes * num_batches).mapPartitions(func)
+
+        else:
+
+            data = data.to_dict(orient='records')
+
+            func = lambda x: slave_forecast_multivariate(x, **parameters)
+
+            forecasts = context.parallelize(data).repartition(nodes * num_batches).mapPartitions(func)
+
+        for k in forecasts.collect():
+            ret.extend(k)
+
+    return ret
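
End to end, both entry points are still reached through the model object: fit and predict pass their keyword arguments through to distributed_train and distributed_predict. A hedged sketch in the style of the test scripts touched by this commit; the master URL is a placeholder and the distributed predict call assumes the wiring added above is available in your pyFTS build.

    from pyFTS.data import TAIEX
    from pyFTS.partitioners import Grid
    from pyFTS.models import hofts

    data = TAIEX.get_data()

    fs = Grid.GridPartitioner(data=data[:3000], npart=35)
    model = hofts.WeightedHighOrderFTS(partitioner=fs, order=2)

    # num_batches controls the repartition factor (defaultParallelism * num_batches)
    model.fit(data[:3000], distributed='spark',
              url='spark://127.0.0.1:7077', num_batches=4)

    forecasts = model.predict(data[3000:3100], distributed='spark',
                              url='spark://127.0.0.1:7077', num_batches=4)
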
@@ -96,7 +96,7 @@ t1 = time()
 Evolutionary.execute('SONDA', dataset,
                      ngen=20, mgen=5, npop=15, pcruz=.5, pmut=.3,
                      window_size=35000, train_rate=.6, increment_rate=1,
-                     collect_statistics=True, experiments=1)
+                     collect_statistics=True, experiments=5)
                      #distributed='dispy', nodes=['192.168.0.110','192.168.0.106','192.168.0.107'])
 
 t2 = time()
@@ -1,20 +1,98 @@
+import numpy as np
 import pandas as pd
-import matplotlib.pylab as plt
-from pyFTS.data import TAIEX, Malaysia
-from pyFTS.common import Transformations
+import time
 
-from pyFTS.benchmarks import Measures
-from pyFTS.partitioners import Grid, Util as pUtil
-from pyFTS.common import Transformations, Util
-from pyFTS.models import pwfts
-from pyFTS.models.multivariate import common, variable, mvfts, wmvfts
+from pyFTS.data import Enrollments, TAIEX, SONDA
+from pyFTS.partitioners import Grid, Simple
+from pyFTS.common import Util
+from pyspark import SparkConf
+from pyspark import SparkContext
+
+import os
+# make sure pyspark tells workers to use python3 not 2 if both are installed
+os.environ['PYSPARK_PYTHON'] = '/usr/bin/python3'
+os.environ['PYSPARK_DRIVER_PYTHON'] = '/usr/bin/python3'
+#'''
+data = SONDA.get_dataframe()
+
+data = data[['datahora','glo_avg']]
+
+data = data[~(np.isnan(data['glo_avg']) | np.equal(data['glo_avg'], 0.0))]
+
+train = data.iloc[:1500000]
+test = data.iloc[1500000:]
+
+from pyFTS.models.multivariate import common, variable, wmvfts
 from pyFTS.models.seasonal import partitioner as seasonal
 from pyFTS.models.seasonal.common import DateTime
+from pyFTS.partitioners import Grid
 
-bc = Transformations.BoxCox(0)
-tdiff = Transformations.Differential(1)
+import matplotlib.pyplot as plt
 
-from pyFTS.models.multivariate import common, variable, mvfts, cmvfts
+'''
+#fig, ax = plt.subplots(nrows=3, ncols=1, figsize=[15,5])
+
+
+sp = {'seasonality': DateTime.day_of_year , 'names': ['Jan','Feb','Mar','Apr','May','Jun','Jul', 'Aug','Sep','Oct','Nov','Dec']}
+
+vmonth = variable.Variable("Month", data_label="datahora", partitioner=seasonal.TimeGridPartitioner, npart=12, alpha_cut=.25,
+                           data=train, partitioner_specific=sp)
+
+#vmonth.partitioner.plot(ax[0])
+
+sp = {'seasonality': DateTime.minute_of_day, 'names': [str(k) for k in range(0,24)]}
+
+vhour = variable.Variable("Hour", data_label="datahora", partitioner=seasonal.TimeGridPartitioner, npart=24, alpha_cut=.2,
+                          data=train, partitioner_specific=sp)
+
+#vhour.partitioner.plot(ax[1])
+
+
+vavg = variable.Variable("Radiation", data_label="glo_avg", alias='R',
+                         partitioner=Grid.GridPartitioner, npart=35, alpha_cut=.3,
+                         data=train)
+
+#vavg.partitioner.plot(ax[2])
+
+#plt.tight_layout()
+
+#Util.show_and_save_image(fig, 'variables', True)
+
+model = wmvfts.WeightedMVFTS(explanatory_variables=[vmonth,vhour,vavg], target_variable=vavg)
+
+
+_s1 = time.time()
+model.fit(train)
+#model.fit(data, distributed='spark', url='spark://192.168.0.106:7077', num_batches=4)
+_s2 = time.time()
+
+print(_s2-_s1)
+
+Util.persist_obj(model, 'sonda_wmvfts')
+'''
+
+model = Util.load_obj('sonda_wmvfts')
+
+'''
+from pyFTS.benchmarks import Measures
+
+_s1 = time.time()
+print(Measures.get_point_statistics(test, model))
+_s2 = time.time()
+
+print(_s2-_s1)
+'''
+
+print(len(model))
+
+
+#
+
+#model.fit(data, distributed='dispy', nodes=['192.168.0.110'])
+'''
+
+from pyFTS.models.multivariate import common, variable, mvfts, wmvfts, cmvfts, grid
 from pyFTS.models.seasonal import partitioner as seasonal
 from pyFTS.models.seasonal.common import DateTime
 
@@ -22,70 +100,83 @@ dataset = pd.read_csv('/home/petronio/Downloads/kalang.csv', sep=',')
 
 dataset['date'] = pd.to_datetime(dataset["date"], format='%Y-%m-%d %H:%M:%S')
 
-train_uv = dataset['value'].values[:24505]
-test_uv = dataset['value'].values[24505:]
 
 train_mv = dataset.iloc[:24505]
 test_mv = dataset.iloc[24505:]
 
 sp = {'seasonality': DateTime.minute_of_day, 'names': [str(k)+'hs' for k in range(0,24)]}
 
 vhour = variable.Variable("Hour", data_label="date", partitioner=seasonal.TimeGridPartitioner, npart=24,
-                          data=train_mv, partitioner_specific=sp)
+                          data=train_mv, partitioner_specific=sp, data_type=pd.datetime, mask='%Y-%m-%d %H:%M:%S')
 
 vvalue = variable.Variable("Pollution", data_label="value", alias='value',
-                           partitioner=Grid.GridPartitioner, npart=35,
+                           partitioner=Grid.GridPartitioner, npart=35, data_type=np.float64,
                            data=train_mv)
 
-parameters = [
-    {},{},
-    {'order':2, 'knn': 1},
-    {'order':2, 'knn': 2},
-    {'order':2, 'knn': 3},
-]
-
-#for ct, method in enumerate([, wmvfts.WeightedMVFTS,
-#                             cmvfts.ClusteredMVFTS,cmvfts.ClusteredMVFTS,cmvfts.ClusteredMVFTS]):
-model = mvfts.MVFTS()
-
-model.append_variable(vhour)
-model.append_variable(vvalue)
-model.target_variable = vvalue
-model.fit(train_mv)
-
-print(model)
+fs = grid.GridCluster(explanatory_variables=[vhour, vvalue], target_variable=vvalue)
+#model = wmvfts.WeightedMVFTS(explanatory_variables=[vhour, vvalue], target_variable=vvalue)
+model = cmvfts.ClusteredMVFTS(explanatory_variables=[vhour, vvalue], target_variable=vvalue,
+                              partitioner=fs)
 
 
+model.fit(train_mv, distributed='spark', url='spark://192.168.0.106:7077')
+#'''
+#print(model)
 
 '''
-from pyFTS.data import henon
-df = henon.get_dataframe(iterations=1000)
-
-from pyFTS.models.multivariate import variable, cmvfts
-
-vx = variable.Variable("x", data_label="x", partitioner=Grid.GridPartitioner, npart=15, data=df)
-vy = variable.Variable("y", data_label="y", partitioner=Grid.GridPartitioner, npart=15, data=df)
-
-model = cmvfts.ClusteredMVFTS(pre_fuzzyfy=False, knn=3, order=2, fts_method=pwfts.ProbabilisticWeightedFTS)
-model.append_variable(vx)
-model.append_variable(vy)
-model.target_variable = vx
-
-model.fit(df.iloc[:800])
-
-test = df.iloc[800:]
-
-forecasts = model.predict(test, type='multivariate')
-
-fig, ax = plt.subplots(nrows=2, ncols=2, figsize=[15,7])
-ax[0][0].plot(test['x'].values)
-ax[0][0].plot(forecasts['x'].values)
-ax[0][1].scatter(test['x'].values,test['y'].values)
-ax[0][1].scatter(forecasts['x'].values,forecasts['y'].values)
-ax[1][0].scatter(test['y'].values,test['x'].values)
-ax[1][0].scatter(forecasts['y'].values,forecasts['x'].values)
-ax[1][1].plot(test['y'].values)
-ax[1][1].plot(forecasts['y'].values)
-
-print(forecasts)
-'''
+def fun(x):
+    return (x, x % 2)
+
+
+def get_fs():
+    fs_tmp = Simple.SimplePartitioner()
+
+    for fset in part.value.keys():
+        fz = part.value[fset]
+        fs_tmp.append(fset, fz.mf, fz.parameters)
+
+    return fs_tmp
+
+
+def fuzzyfy(x):
+
+    fs_tmp = get_fs()
+
+    ret = []
+
+    for k in x:
+        ret.append(fs_tmp.fuzzyfy(k, mode='both'))
+
+    return ret
+
+
+def train(fuzzyfied):
+    model = hofts.WeightedHighOrderFTS(partitioner=get_fs(), order=order.value)
+
+    ndata = [k for k in fuzzyfied]
+
+    model.train(ndata)
+
+    return [(k, model.flrgs[k]) for k in model.flrgs]
+
+
+with SparkContext(conf=conf) as sc:
+
+    part = sc.broadcast(fs.sets)
+
+    order = sc.broadcast(2)
+
+    #ret = sc.parallelize(np.arange(0,100)).map(fun)
+
+    #fuzzyfied = sc.parallelize(data).mapPartitions(fuzzyfy)
+
+    flrgs = sc.parallelize(data).mapPartitions(train)
+
+    model = hofts.WeightedHighOrderFTS(partitioner=fs, order=order.value)
+
+    for k in flrgs.collect():
+        model.append_rule(k[1])
+
+    print(model)
+
+'''
@@ -4,7 +4,7 @@ import time
 
 from pyFTS.data import Enrollments, TAIEX, SONDA
 from pyFTS.partitioners import Grid, Simple
-from pyFTS.models import hofts
+from pyFTS.common import Util
 
 from pyspark import SparkConf
 from pyspark import SparkContext
@@ -14,18 +14,66 @@ import os
 os.environ['PYSPARK_PYTHON'] = '/usr/bin/python3'
 os.environ['PYSPARK_DRIVER_PYTHON'] = '/usr/bin/python3'
 #'''
-data = SONDA.get_data('glo_avg')
+data = SONDA.get_dataframe()
 
-fs = Grid.GridPartitioner(data=data, npart=50)
+data = data[['datahora','glo_avg']]
+
+data = data[~(np.isnan(data['glo_avg']) | np.equal(data['glo_avg'], 0.0))]
+
+train = data.iloc[:1500000]
+test = data.iloc[1500000:]
+
+from pyFTS.models.multivariate import common, variable, wmvfts
+from pyFTS.models.seasonal import partitioner as seasonal
+from pyFTS.models.seasonal.common import DateTime
+from pyFTS.partitioners import Grid
+
+import matplotlib.pyplot as plt
+
+#fig, ax = plt.subplots(nrows=3, ncols=1, figsize=[15,5])
+
+
+sp = {'seasonality': DateTime.day_of_year , 'names': ['Jan','Feb','Mar','Apr','May','Jun','Jul', 'Aug','Sep','Oct','Nov','Dec']}
+
+vmonth = variable.Variable("Month", data_label="datahora", partitioner=seasonal.TimeGridPartitioner, npart=12, alpha_cut=.25,
+                           data=train, partitioner_specific=sp)
+
+#vmonth.partitioner.plot(ax[0])
+
+sp = {'seasonality': DateTime.minute_of_day, 'names': [str(k) for k in range(0,24)]}
+
+vhour = variable.Variable("Hour", data_label="datahora", partitioner=seasonal.TimeGridPartitioner, npart=24, alpha_cut=.2,
+                          data=train, partitioner_specific=sp)
+
+#vhour.partitioner.plot(ax[1])
+
+
+vavg = variable.Variable("Radiation", data_label="glo_avg", alias='R',
+                         partitioner=Grid.GridPartitioner, npart=35, alpha_cut=.3,
+                         data=train)
+
+#vavg.partitioner.plot(ax[2])
+
+#plt.tight_layout()
+
+#Util.show_and_save_image(fig, 'variables', True)
+
+model = wmvfts.WeightedMVFTS(explanatory_variables=[vmonth,vhour,vavg], target_variable=vavg)
 
-model = hofts.WeightedHighOrderFTS(partitioner=fs, order=2)
 
 _s1 = time.time()
-model.fit(data, distributed='spark', url='spark://192.168.0.106:7077')
+#model.fit(train)
+model.fit(data, distributed='spark', url='spark://192.168.0.106:7077', num_batches=5)
 _s2 = time.time()
 
 print(_s2-_s1)
 
+Util.persist_obj(model, 'sonda_wmvfts')
+
+from pyFTS.benchmarks import Measures
+
+#print(Measures.get_point_statistics(test, model))
+
 #model.fit(data, distributed='dispy', nodes=['192.168.0.110'])
 '''
 
@@ -56,7 +104,7 @@ model = cmvfts.ClusteredMVFTS(explanatory_variables=[vhour, vvalue], target_vari
 
 model.fit(train_mv, distributed='spark', url='spark://192.168.0.106:7077')
 #'''
-print(model)
+#print(model)
 
 '''
 def fun(x):