2017-02-16 00:16:13 +04:00
|
|
|
import numpy as np
|
|
|
|
import matplotlib.pyplot as plt
|
2022-04-10 21:32:24 +04:00
|
|
|
from pyFTS.common import SortedCollection
|
2017-07-04 01:39:10 +04:00
|
|
|
from pyFTS.probabilistic import kde
|
2018-02-27 19:56:05 +04:00
|
|
|
|
2017-02-16 00:16:13 +04:00
|
|
|
|
2019-06-21 22:10:19 +04:00
|
|
|
def from_point(x,**kwargs):
    """
    Build a probability distribution concentrated on a single scalar value,
    i.e. P(x) = 1.

    :param x: scalar value
    :param kwargs: common parameters of the distribution
    :return: the ProbabilityDistribution object
    """
    dist = ProbabilityDistribution(**kwargs)
    dist.set(x, 1.0)
    return dist
|
|
|
|
|
|
|
|
|
2017-02-16 00:16:13 +04:00
|
|
|
class ProbabilityDistribution(object):
    """
    Represents a discrete or continous probability distribution

    If type is histogram, the PDF is discrete
    If type is KDE the PDF is continuous
    """

    def __init__(self, type = "KDE", **kwargs):
        """
        Create a probability distribution.

        :param type: "histogram" for a discrete PDF, "KDE" for a continuous one
        :keyword uod: universe of discourse as [min, max]; for KDE, inferred from data when absent
        :keyword data: optional initial sample used to populate the distribution
        :keyword bins: explicit list of bin centers for a discrete PDF
        :keyword bins_labels: labels for the bins
        :keyword num_bins: number of bins when `bins` is not given (default 100)
        :keyword h: KDE bandwidth (default 0.5)
        :keyword kernel: KDE kernel name (default "epanechnikov")
        :keyword name: distribution name
        """
        self.uod = kwargs.get("uod", None)
        """Universe of discourse"""

        self.data = []

        data = kwargs.get("data", None)

        self.type = type
        """
        If type is histogram, the PDF is discrete
        If type is KDE the PDF is continuous
        """

        self.bins = kwargs.get("bins", None)
        """Number of bins on a discrete PDF"""
        self.labels = kwargs.get("bins_labels", None)
        """Bins labels on a discrete PDF"""

        if self.type == "KDE":
            self.kde = kde.KernelSmoothing(h=kwargs.get("h", 0.5), kernel=kwargs.get("kernel", "epanechnikov"))

            # Infer the universe of discourse from the data extremes,
            # padded by 30% on each side so the PDF tails are representable.
            if data is not None and self.uod is None:
                _min = np.nanmin(data)
                _min = _min * .7 if _min > 0 else _min * 1.3
                _max = np.nanmax(data)
                _max = _max * 1.3 if _max > 0 else _max * .7
                self.uod = [_min, _max]

        self.nbins = kwargs.get("num_bins", 100)

        # NOTE(review): this assumes self.uod is set whenever bins were not
        # given explicitly — TODO confirm callers always provide one of them
        if self.bins is None:
            self.bins = np.linspace(int(self.uod[0]), int(self.uod[1]), int(self.nbins)).tolist()
            self.labels = [str(k) for k in self.bins]

        if self.uod is not None:
            self.resolution = (self.uod[1] - self.uod[0]) / self.nbins

        # Sorted index over the bins, used for nearest-bin (find_ge) lookups
        self.bin_index = SortedCollection.SortedCollection(iterable=sorted(self.bins))
        self.quantile_index = None
        self.distribution = {}
        # Lazily built CDF / quantile tables (see build_cdf_qtl)
        self.cdf = None
        self.qtl = None
        self.count = 0
        for k in self.bins: self.distribution[k] = 0

        if data is not None:
            self.append(data)

        self.name = kwargs.get("name", "")
|
2017-02-16 00:16:13 +04:00
|
|
|
|
2017-07-11 06:37:09 +04:00
|
|
|
def set(self, value, density):
|
2018-09-29 02:35:07 +04:00
|
|
|
"""
|
|
|
|
Assert a probability 'density' for a certain value 'value', such that P(value) = density
|
|
|
|
|
|
|
|
:param value: A value in the universe of discourse from the distribution
|
|
|
|
:param density: The probability density to assign to the value
|
|
|
|
"""
|
2019-05-28 21:32:06 +04:00
|
|
|
k = self.bin_index.find_ge(value)
|
2017-07-11 06:37:09 +04:00
|
|
|
self.distribution[k] = density
|
|
|
|
|
2017-02-16 00:16:13 +04:00
|
|
|
def append(self, values):
|
2018-09-29 02:35:07 +04:00
|
|
|
"""
|
|
|
|
Increment the frequency count for the values
|
|
|
|
|
|
|
|
:param values: A list of values to account the frequency
|
|
|
|
"""
|
2017-07-02 02:42:45 +04:00
|
|
|
if self.type == "histogram":
|
|
|
|
for k in values:
|
2019-05-28 21:32:06 +04:00
|
|
|
v = self.bin_index.find_ge(k)
|
2017-07-02 02:42:45 +04:00
|
|
|
self.distribution[v] += 1
|
|
|
|
self.count += 1
|
|
|
|
else:
|
|
|
|
self.data.extend(values)
|
2017-07-04 19:18:07 +04:00
|
|
|
self.distribution = {}
|
|
|
|
dens = self.density(self.bins)
|
|
|
|
for v,d in enumerate(dens):
|
2017-07-04 23:30:53 +04:00
|
|
|
self.distribution[self.bins[v]] = d
|
2017-02-16 00:16:13 +04:00
|
|
|
|
2018-02-26 20:11:29 +04:00
|
|
|
def append_interval(self, intervals):
|
2018-09-29 02:35:07 +04:00
|
|
|
"""
|
|
|
|
Increment the frequency count for all values inside an interval
|
|
|
|
|
|
|
|
:param intervals: A list of intervals do increment the frequency
|
|
|
|
"""
|
2017-07-11 06:37:09 +04:00
|
|
|
if self.type == "histogram":
|
|
|
|
for interval in intervals:
|
2019-05-28 21:32:06 +04:00
|
|
|
for k in self.bin_index.inside(interval[0], interval[1]):
|
|
|
|
self.distribution[k] += 1
|
2017-07-11 06:37:09 +04:00
|
|
|
self.count += 1
|
|
|
|
|
2017-02-16 00:16:13 +04:00
|
|
|
def density(self, values):
|
2018-09-29 02:35:07 +04:00
|
|
|
"""
|
|
|
|
Return the probability densities for the input values
|
|
|
|
|
|
|
|
:param values: List of values to return the densities
|
|
|
|
:return: List of probability densities for the input values
|
|
|
|
"""
|
2017-07-04 01:39:10 +04:00
|
|
|
ret = []
|
2017-07-12 04:43:13 +04:00
|
|
|
scalar = False
|
|
|
|
|
|
|
|
if not isinstance(values, list):
|
|
|
|
values = [values]
|
|
|
|
scalar = True
|
|
|
|
|
2017-07-04 01:39:10 +04:00
|
|
|
for k in values:
|
|
|
|
if self.type == "histogram":
|
2019-05-28 21:32:06 +04:00
|
|
|
v = self.bin_index.find_ge(k)
|
2018-04-26 18:53:53 +04:00
|
|
|
ret.append(self.distribution[v] / (self.count + 1e-5))
|
2017-07-11 06:37:09 +04:00
|
|
|
elif self.type == "KDE":
|
2019-05-28 21:32:06 +04:00
|
|
|
v = self.kde.probability(k, data=self.data)
|
2017-07-04 01:39:10 +04:00
|
|
|
ret.append(v)
|
2017-07-11 06:37:09 +04:00
|
|
|
else:
|
2019-05-28 21:32:06 +04:00
|
|
|
v = self.bin_index.find_ge(k)
|
2017-07-11 06:37:09 +04:00
|
|
|
ret.append(self.distribution[v])
|
2017-07-12 04:43:13 +04:00
|
|
|
|
|
|
|
if scalar:
|
|
|
|
return ret[0]
|
|
|
|
|
2017-07-11 06:37:09 +04:00
|
|
|
return ret
|
|
|
|
|
2018-04-22 23:59:17 +04:00
|
|
|
def differential_offset(self, value):
|
2018-09-29 02:35:07 +04:00
|
|
|
"""
|
|
|
|
Auxiliary function for probability distributions of differentiated data
|
|
|
|
|
|
|
|
:param value:
|
|
|
|
:return:
|
|
|
|
"""
|
2018-04-22 23:59:17 +04:00
|
|
|
nbins = []
|
|
|
|
dist = {}
|
|
|
|
|
|
|
|
for k in self.bins:
|
|
|
|
nk = k+value
|
|
|
|
nbins.append(nk)
|
|
|
|
dist[nk] = self.distribution[k]
|
|
|
|
|
|
|
|
self.bins = nbins
|
|
|
|
self.distribution = dist
|
|
|
|
self.labels = [str(k) for k in self.bins]
|
|
|
|
|
2019-05-28 21:32:06 +04:00
|
|
|
self.bin_index = SortedCollection.SortedCollection(iterable=sorted(self.bins))
|
2018-04-22 23:59:17 +04:00
|
|
|
self.quantile_index = None
|
|
|
|
self.cdf = None
|
|
|
|
self.qtl = None
|
|
|
|
|
2018-04-11 07:19:05 +04:00
|
|
|
def expected_value(self):
|
2018-09-29 02:35:07 +04:00
|
|
|
"""
|
|
|
|
Return the expected value of the distribution, as E[X] = ∑ x * P(x)
|
|
|
|
|
|
|
|
:return: The expected value of the distribution
|
|
|
|
"""
|
2018-04-11 07:19:05 +04:00
|
|
|
return np.nansum([v * self.distribution[v] for v in self.bins])
|
|
|
|
|
2017-09-20 21:31:45 +04:00
|
|
|
def build_cdf_qtl(self):
|
|
|
|
ret = 0.0
|
|
|
|
self.cdf = {}
|
|
|
|
self.qtl = {}
|
|
|
|
for k in sorted(self.bins):
|
|
|
|
ret += self.density(k)
|
|
|
|
if k not in self.cdf:
|
|
|
|
self.cdf[k] = ret
|
2017-07-11 06:37:09 +04:00
|
|
|
|
2017-09-20 21:31:45 +04:00
|
|
|
if str(ret) not in self.qtl:
|
|
|
|
self.qtl[str(ret)] = []
|
2017-07-02 02:42:45 +04:00
|
|
|
|
2018-03-05 22:07:02 +04:00
|
|
|
self.qtl[str(ret)].append(k)
|
2017-09-20 21:31:45 +04:00
|
|
|
|
|
|
|
_keys = [float(k) for k in sorted(self.qtl.keys())]
|
|
|
|
|
2019-05-28 21:32:06 +04:00
|
|
|
self.quantile_index = SortedCollection.SortedCollection(iterable=_keys)
|
2017-02-16 00:16:13 +04:00
|
|
|
|
2018-09-29 02:35:07 +04:00
|
|
|
def cumulative(self, values):
|
|
|
|
"""
|
2019-05-28 21:32:06 +04:00
|
|
|
Return the cumulative probability densities for the input values,
|
2018-10-01 22:42:17 +04:00
|
|
|
such that F(x) = P(X <= x)
|
2018-09-29 02:35:07 +04:00
|
|
|
|
|
|
|
:param values: A list of input values
|
|
|
|
:return: The cumulative probability densities for the input values
|
|
|
|
"""
|
2017-09-20 21:31:45 +04:00
|
|
|
if self.cdf is None:
|
|
|
|
self.build_cdf_qtl()
|
|
|
|
|
2017-07-11 06:37:09 +04:00
|
|
|
if isinstance(values, list):
|
|
|
|
ret = []
|
2017-09-20 21:31:45 +04:00
|
|
|
for val in values:
|
2019-06-22 16:20:53 +04:00
|
|
|
try:
|
|
|
|
k = self.bin_index.find_ge(val)
|
2019-06-23 18:51:40 +04:00
|
|
|
#ret.append(self.cdf[k])
|
|
|
|
ret.append(self.cdf[val])
|
2019-06-22 16:20:53 +04:00
|
|
|
except:
|
|
|
|
ret.append(np.nan)
|
2017-07-11 06:37:09 +04:00
|
|
|
else:
|
2019-06-22 16:20:53 +04:00
|
|
|
try:
|
|
|
|
k = self.bin_index.find_ge(values)
|
2019-06-23 01:29:40 +04:00
|
|
|
return self.cdf[k]
|
2019-06-22 16:20:53 +04:00
|
|
|
except:
|
|
|
|
return np.nan
|
2017-07-11 06:37:09 +04:00
|
|
|
|
2017-09-20 21:31:45 +04:00
|
|
|
def quantile(self, values):
|
2018-09-29 02:35:07 +04:00
|
|
|
"""
|
2019-05-28 21:32:06 +04:00
|
|
|
Return the Universe of Discourse values in relation to the quantile input values,
|
2018-10-01 22:42:17 +04:00
|
|
|
such that Q(tau) = min( {x | F(x) >= tau })
|
2018-09-29 02:35:07 +04:00
|
|
|
|
|
|
|
:param values: input values
|
|
|
|
:return: The list of the quantile values for the input values
|
|
|
|
"""
|
2017-09-20 21:31:45 +04:00
|
|
|
if self.qtl is None:
|
|
|
|
self.build_cdf_qtl()
|
2017-03-03 15:53:55 +04:00
|
|
|
|
2017-09-20 21:31:45 +04:00
|
|
|
if isinstance(values, list):
|
|
|
|
ret = []
|
|
|
|
for val in values:
|
2019-06-22 16:20:53 +04:00
|
|
|
try:
|
|
|
|
k = self.quantile_index.find_ge(val)
|
|
|
|
ret.append(self.qtl[str(k)][0])
|
|
|
|
except:
|
|
|
|
ret.append(np.nan)
|
2017-09-20 21:31:45 +04:00
|
|
|
else:
|
2019-06-22 16:20:53 +04:00
|
|
|
try:
|
|
|
|
k = self.quantile_index.find_ge(values)
|
|
|
|
ret = self.qtl[str(k)]
|
|
|
|
except:
|
|
|
|
return np.nan
|
2017-09-20 21:31:45 +04:00
|
|
|
|
|
|
|
return ret
|
2017-03-03 15:53:55 +04:00
|
|
|
|
2017-02-16 00:16:13 +04:00
|
|
|
def entropy(self):
|
2018-09-29 02:35:07 +04:00
|
|
|
"""
|
2018-10-01 22:42:17 +04:00
|
|
|
Return the entropy of the probability distribution, H(P) = E[ -ln P(X) ] = - ∑ P(x) log ( P(x) )
|
2018-09-29 02:35:07 +04:00
|
|
|
|
|
|
|
:return:the entropy of the probability distribution
|
|
|
|
"""
|
2019-06-22 16:20:53 +04:00
|
|
|
h = -np.nansum([self.distribution[k] * np.log(self.distribution[k]) if self.distribution[k] > 0 else 0
|
2017-02-16 00:16:13 +04:00
|
|
|
for k in self.bins])
|
|
|
|
return h
|
|
|
|
|
2017-02-19 08:02:59 +04:00
|
|
|
def crossentropy(self,q):
|
2018-09-29 02:35:07 +04:00
|
|
|
"""
|
2018-10-01 22:42:17 +04:00
|
|
|
Cross entropy between the actual probability distribution and the informed one,
|
|
|
|
H(P,Q) = - ∑ P(x) log ( Q(x) )
|
2018-09-29 02:35:07 +04:00
|
|
|
|
|
|
|
:param q: a probabilistic.ProbabilityDistribution object
|
|
|
|
:return: Cross entropy between this probability distribution and the given distribution
|
|
|
|
"""
|
2019-06-22 16:20:53 +04:00
|
|
|
h = -np.nansum([self.distribution[k] * np.log(q.distribution[k]) if self.distribution[k] > 0 else 0
|
2017-02-19 08:02:59 +04:00
|
|
|
for k in self.bins])
|
|
|
|
return h
|
|
|
|
|
|
|
|
def kullbackleiblerdivergence(self,q):
|
2018-09-29 02:35:07 +04:00
|
|
|
"""
|
|
|
|
Kullback-Leibler divergence between the actual probability distribution and the informed one.
|
2018-10-01 22:42:17 +04:00
|
|
|
DKL(P || Q) = - ∑ P(x) log( P(X) / Q(x) )
|
2018-09-29 02:35:07 +04:00
|
|
|
|
|
|
|
:param q: a probabilistic.ProbabilityDistribution object
|
|
|
|
:return: Kullback-Leibler divergence
|
|
|
|
"""
|
2019-06-22 16:20:53 +04:00
|
|
|
h = np.nansum([self.distribution[k] * np.log(self.distribution[k]/q.distribution[k]) if self.distribution[k] > 0 else 0
|
2017-02-19 08:02:59 +04:00
|
|
|
for k in self.bins])
|
|
|
|
return h
|
|
|
|
|
2017-02-16 00:16:13 +04:00
|
|
|
def empiricalloglikelihood(self):
|
2018-09-29 02:35:07 +04:00
|
|
|
"""
|
2018-10-01 22:42:17 +04:00
|
|
|
Empirical Log Likelihood of the probability distribution, L(P) = ∑ log( P(x) )
|
2018-09-29 02:35:07 +04:00
|
|
|
|
|
|
|
:return:
|
|
|
|
"""
|
2017-02-16 00:16:13 +04:00
|
|
|
_s = 0
|
|
|
|
for k in self.bins:
|
|
|
|
if self.distribution[k] > 0:
|
|
|
|
_s += np.log(self.distribution[k])
|
|
|
|
return _s
|
|
|
|
|
|
|
|
def pseudologlikelihood(self, data):
|
2018-09-29 02:35:07 +04:00
|
|
|
"""
|
|
|
|
Pseudo log likelihood of the probability distribution with respect to data
|
|
|
|
|
|
|
|
:param data:
|
|
|
|
:return:
|
|
|
|
"""
|
2017-02-16 00:16:13 +04:00
|
|
|
|
|
|
|
densities = self.density(data)
|
|
|
|
|
|
|
|
_s = 0
|
|
|
|
for k in densities:
|
|
|
|
if k > 0:
|
|
|
|
_s += np.log(k)
|
|
|
|
return _s
|
|
|
|
|
2017-02-19 08:02:59 +04:00
|
|
|
def averageloglikelihood(self, data):
|
2018-09-29 02:35:07 +04:00
|
|
|
"""
|
|
|
|
Average log likelihood of the probability distribution with respect to data
|
|
|
|
|
|
|
|
:param data:
|
|
|
|
:return:
|
|
|
|
"""
|
2017-02-19 08:02:59 +04:00
|
|
|
|
|
|
|
densities = self.density(data)
|
|
|
|
|
|
|
|
_s = 0
|
|
|
|
for k in densities:
|
|
|
|
if k > 0:
|
|
|
|
_s += np.log(k)
|
|
|
|
return _s / len(data)
|
|
|
|
|
2017-09-08 21:18:02 +04:00
|
|
|
def plot(self,axis=None,color="black",tam=[10, 6], title = None):
|
2017-09-20 21:31:45 +04:00
|
|
|
|
2017-02-16 00:16:13 +04:00
|
|
|
if axis is None:
|
|
|
|
fig = plt.figure(figsize=tam)
|
|
|
|
axis = fig.add_subplot(111)
|
|
|
|
|
2017-07-04 19:18:07 +04:00
|
|
|
if self.type == "histogram":
|
|
|
|
ys = [self.distribution[k]/self.count for k in self.bins]
|
|
|
|
else:
|
|
|
|
ys = [self.distribution[k] for k in self.bins]
|
2017-07-05 22:35:22 +04:00
|
|
|
yp = [0 for k in self.data]
|
|
|
|
axis.plot(self.data, yp, c="red")
|
2017-02-16 00:16:13 +04:00
|
|
|
|
2017-09-08 21:18:02 +04:00
|
|
|
if title is None:
|
|
|
|
title = self.name
|
2017-07-06 22:28:08 +04:00
|
|
|
axis.plot(self.bins, ys, c=color)
|
2017-09-08 21:18:02 +04:00
|
|
|
axis.set_title(title)
|
2017-02-16 00:16:13 +04:00
|
|
|
|
|
|
|
axis.set_xlabel('Universe of Discourse')
|
|
|
|
axis.set_ylabel('Probability')
|
2017-02-16 17:54:37 +04:00
|
|
|
|
|
|
|
def __str__(self):
|
2017-09-20 21:31:45 +04:00
|
|
|
ret = ""
|
2018-04-25 18:36:01 +04:00
|
|
|
for k in sorted(self.bins):
|
2017-09-20 21:31:45 +04:00
|
|
|
ret += str(round(k,2)) + ':\t'
|
2017-07-05 22:35:22 +04:00
|
|
|
if self.type == "histogram":
|
2017-09-20 21:31:45 +04:00
|
|
|
ret += str(round(self.distribution[k] / self.count,3))
|
2018-04-25 18:36:01 +04:00
|
|
|
elif self.type == "KDE":
|
|
|
|
ret += str(round(self.density(k),3))
|
2017-07-05 22:35:22 +04:00
|
|
|
else:
|
2017-09-20 21:31:45 +04:00
|
|
|
ret += str(round(self.distribution[k], 6))
|
|
|
|
ret += '\n'
|
|
|
|
return ret
|