2017-02-16 00:16:13 +04:00
|
|
|
import numpy as np
|
|
|
|
import pandas as pd
|
|
|
|
import matplotlib.pyplot as plt
|
|
|
|
from pyFTS.common import FuzzySet,SortedCollection
|
2017-07-04 01:39:10 +04:00
|
|
|
from pyFTS.probabilistic import kde
|
2017-02-16 00:16:13 +04:00
|
|
|
|
|
|
|
|
|
|
|
class ProbabilityDistribution(object):
    """
    Represents a discrete or continuous probability distribution

    If type is histogram, the PDF is discrete
    If type is KDE the PDF is continuous
    """

    def __init__(self, type="KDE", **kwargs):
        """
        Build the distribution over a fixed list of bins.

        :param type: "histogram" for a discrete count-based distribution;
            "KDE" (default) creates a kernel smoother in ``self.kde``.
            (The name shadows the builtin but is kept for compatibility.)
        :param kwargs: recognised keys:
            ``uod`` - pair whose two entries, truncated with ``int()``, bound
            the generated bins when ``bins`` is not given;
            ``h`` (default 10) and ``method`` (default "epanechnikov") -
            forwarded positionally to ``kde.KernelSmoothing``
            (presumably bandwidth and kernel name - confirm against kde);
            ``num_bins`` (default 100) - number of generated bins;
            ``bins`` - explicit list of bin values;
            ``bins_labels`` - labels for explicit bins;
            ``data`` - initial values passed to :meth:`append`;
            ``name`` (default "") - label used by :meth:`plot`.
        :raises TypeError: if neither ``bins`` nor ``uod`` is given.
        """
        self.uod = kwargs.get("uod", None)

        self.type = type
        if self.type == "KDE":
            self.kde = kde.KernelSmoothing(kwargs.get("h", 10), kwargs.get("method", "epanechnikov"))

        self.nbins = kwargs.get("num_bins", 100)

        self.bins = kwargs.get("bins", None)
        self.labels = kwargs.get("bins_labels", None)

        if self.bins is None:
            if self.uod is None:
                # Same exception type the bare ``self.uod[0]`` used to raise,
                # but with a message that tells the caller what is missing.
                raise TypeError("ProbabilityDistribution requires either 'bins' or 'uod'")
            self.bins = np.linspace(int(self.uod[0]), int(self.uod[1]), int(self.nbins)).tolist()
            self.labels = [str(k) for k in self.bins]

        self.index = SortedCollection.SortedCollection(iterable=sorted(self.bins))
        # One entry per bin: a count for histograms, a density otherwise.
        self.distribution = {k: 0 for k in self.bins}
        self.count = 0

        self.data = []

        data = kwargs.get("data", None)
        if data is not None:
            self.append(data)

        self.name = kwargs.get("name", "")

    def _bin_value(self, k):
        """
        Return the value reported for bin ``k``.

        For histograms this is the bin count divided by the total count
        (0.0 while no value has been appended, instead of dividing by zero);
        for any other type the stored value is returned unchanged.
        """
        if self.type != "histogram":
            return self.distribution[k]
        if self.count == 0:
            return 0.0
        return self.distribution[k] / self.count

    def append(self, values):
        """
        Add new observations to the distribution.

        Histogram: each value increments the bin returned by
        ``self.index.find_ge`` (presumably the smallest bin >= value -
        confirm against SortedCollection) and the total count.
        Otherwise: the values are stored in ``self.data`` and the density of
        every bin is recomputed.

        :param values: iterable of observations
        """
        if self.type == "histogram":
            for k in values:
                v = self.index.find_ge(k)
                self.distribution[v] += 1
                self.count += 1
        else:
            self.data.extend(values)
            self.distribution = dict(zip(self.bins, self.density(self.bins)))

    def density(self, values):
        """
        Return the density of each item of ``values``.

        :param values: iterable of points to evaluate
        :return: list with one density per input value; for an empty
            histogram every density is 0.0
        """
        ret = []
        for k in values:
            if self.type == "histogram":
                v = self.index.find_ge(k)
                ret.append(self._bin_value(v))
            else:
                ret.append(self.kde.probability(k, self.data))
        return ret

    def cummulative(self, values):
        """Not implemented yet; currently returns None."""
        pass

    def quantile(self, qt):
        """Not implemented yet; currently returns None."""
        pass

    def entropy(self):
        """
        Return -sum(p * log(p)) over the bins with a positive stored value.

        NOTE(review): reads ``self.distribution`` directly, which holds raw
        counts for histograms - confirm whether normalisation is intended.
        """
        h = -sum(self.distribution[k] * np.log(self.distribution[k]) if self.distribution[k] > 0 else 0
                 for k in self.bins)
        return h

    def crossentropy(self, q):
        """
        Return -sum(p * log(q)) over the bins where this distribution has a
        positive stored value.

        :param q: another distribution exposing the same bins in
            ``q.distribution``
        """
        h = -sum(self.distribution[k] * np.log(q.distribution[k]) if self.distribution[k] > 0 else 0
                 for k in self.bins)
        return h

    def kullbackleiblerdivergence(self, q):
        """
        Return sum(p * log(p / q)) over the bins where this distribution has
        a positive stored value.

        :param q: another distribution exposing the same bins in
            ``q.distribution``
        :return: the divergence, or ``inf`` when ``q`` is zero on a bin where
            this distribution is positive
        """
        h = 0
        for k in self.bins:
            p_k = self.distribution[k]
            if p_k > 0:
                q_k = q.distribution[k]
                if q_k == 0:
                    # p > 0 with q == 0 makes the divergence infinite; say so
                    # explicitly rather than raising ZeroDivisionError.
                    return float("inf")
                h += p_k * np.log(p_k / q_k)
        return h

    def empiricalloglikelihood(self):
        """
        Return the sum of the logs of all positive stored bin values.

        NOTE(review): uses raw ``self.distribution`` values (counts for
        histograms) - confirm this is intended.
        """
        _s = 0
        for k in self.bins:
            if self.distribution[k] > 0:
                _s += np.log(self.distribution[k])
        return _s

    def pseudologlikelihood(self, data):
        """
        Return the sum of the log densities of ``data``, skipping points
        whose density is not positive.

        :param data: iterable of observations
        """
        densities = self.density(data)

        _s = 0
        for k in densities:
            if k > 0:
                _s += np.log(k)
        return _s

    def averageloglikelihood(self, data):
        """
        Return :meth:`pseudologlikelihood` divided by ``len(data)``.

        :param data: sized collection of observations
        :return: the average log density, or 0 when ``data`` is empty
        """
        n = len(data)
        if n == 0:
            # Nothing to average; avoid dividing by zero.
            return 0
        return self.pseudologlikelihood(data) / n

    def plot(self, axis=None, color="black", tam=[10, 6]):
        """
        Plot the value of every bin against the bins.

        :param axis: matplotlib axis to draw on; when None a new figure of
            size ``tam`` with a single subplot is created
        :param color: line color
        :param tam: figure size used only when ``axis`` is None (the default
            list is never mutated)
        """
        if axis is None:
            fig = plt.figure(figsize=tam)
            axis = fig.add_subplot(111)

        ys = [self._bin_value(k) for k in self.bins]

        axis.plot(self.bins, ys, c=color, label=self.name)

        axis.set_xlabel('Universe of Discourse')
        axis.set_ylabel('Probability')

    def __str__(self):
        """
        Return a two-line, tab-separated table: bin values rounded to 2
        places on the first line, their reported values rounded to 3 places
        on the second.
        """
        keys = sorted(self.distribution.keys())
        head = '|' + ''.join(str(round(k, 2)) + '\t|' for k in keys)
        # Normalise by the count only for histograms, as plot() does; other
        # types never update the count, so dividing would always fail.
        body = '|' + ''.join(str(round(self._bin_value(k), 3)) + '\t|' for k in keys)
        return head + '\n' + body
|