Empirical Probability Distribution Function

Empirical Probability Distribution Function 1D

Description

Example written in Python to reconstruct a one dimensional probability distribution based on an Empirical Probability Distribution Function (EPDF).

Running

To try this example you must have PyGObject installed, and numcosmo built with –enable-introspection option. To run the examples without installing follow the instructions here.

example_epdf1d.py:

#!/usr/bin/python2

try:
  import gi
  gi.require_version('NumCosmo', '1.0')
  gi.require_version('NumCosmoMath', '1.0')
except:
  pass

import matplotlib as mpl
import matplotlib.pyplot as plot
from scipy.stats import norm
from scipy.stats import beta
import numpy as np
import math
from gi.repository import NumCosmo as Nc
from gi.repository import NumCosmoMath as Ncm

#
#  Initializing the library objects, this must be called before 
#  any other library function.
#
Ncm.cfg_init ()

np.random.seed (0)

#
# n = number of points to reconstruct the distribution
# Sampling from a Gaussian distribution
#
cut_l  = -0.4
cut_u  =  0.4
peak1  =  0.5
sigma1 =  0.2
peak2  = -0.5
sigma2 =  0.2

# Cumulative distribution function
def true_cdf (x):
  return 0.5 * (norm.cdf (x, peak1, sigma1) + norm.cdf (x, peak2, sigma2))


cdf_cut_l = true_cdf (cut_l)
cdf_cut_u = 1.0 - true_cdf (cut_u)
cdf_cut   = cdf_cut_l + cdf_cut_u
rbnorm = 1.0 - cdf_cut

# Probability density function  
def true_p (x):
  return 0.5 * (norm.pdf (x, peak1, sigma1) + norm.pdf (x, peak2, sigma2)) / rbnorm

n = 800000
s = np.concatenate ((np.random.normal (peak1, sigma1, n), np.random.normal (peak2, sigma2, n)), axis = 0)

sa = []
for si in s:
  if si >= cut_l and si <= cut_u:
    sa.append (si)
s = sa
n = len (s)

print "# Number of points = %u" % (n)

#
# Creating a new Ncm.StatsDist1dEPDF object with
# Maximum of 200 saved samples (exceeding points are joined with 
# their nearest points), standard deviation scale of 0.1 factor
# and minimum distance scale of 0.001.
# 
epdf     = Ncm.StatsDist1dEPDF.new_full (2000, Ncm.StatsDist1dEPDFBw.AUTO, 0.01, 0.001)
epdf_rot = Ncm.StatsDist1dEPDF.new_full (2000, Ncm.StatsDist1dEPDFBw.ROT, 0.01,  0.001)

#
# Adding the points to the epdf object.
#
for i in range (n):
  epdf.add_obs (s[i])
  epdf_rot.add_obs (s[i])

#
# Preparing the object from the given sample.
#
epdf.prepare ()
epdf_rot.prepare ()

#
# Plotting the results.
#
p_a = []
p_rot_a = []
pdf_a = []
pdf_rot_a = []
x_a = []
inv_pdf_a = []
inv_pdf_rot_a = []
u_a = []

for i in range (1000):
  x = epdf.xi + (epdf.xf - epdf.xi) / 999.0 * i;
  u = 1.0 / 1000.0 * i
  
  x_a.append (x)
  u_a.append (u)
  
  p_a.append (epdf.eval_p (x))
  p_rot_a.append (epdf_rot.eval_p (x))
  
  pdf_a.append (epdf.eval_pdf (x))
  pdf_rot_a.append (epdf_rot.eval_pdf (x))

  inv_pdf_a.append (epdf.eval_inv_pdf (u))
  inv_pdf_rot_a.append (epdf_rot.eval_inv_pdf (u))

#
# Plotting the probability density function.
#
fig = plot.subplot ()
plot.title ("PDF")
fig.plot (x_a, p_a, label = "auto-bw")
fig.plot (x_a, p_rot_a, label = "RoT-bw")
fig.plot (x_a, true_p (x_a), label = "true dist")

fig.legend(loc = "upper center")

plot.savefig ("epdf1d_pdf.svg")
plot.clf ()

#
# Plotting the relative difference of the reconstructed distributions and the true one.
#
fig = plot.subplot ()
plot.title ("PDF relative difference with respect to the true distribution")
fig.plot (x_a, np.abs (np.array ((p_a - true_p (x_a)) / true_p (x_a))), label = "auto-bw")
fig.plot (x_a, np.abs (np.array ((p_rot_a - true_p (x_a)) / true_p (x_a))), label = "RoT-bw")
fig.set_ylim ([1.0e-6, 1.0e1])
fig.grid ()

fig.legend(loc = "upper right")
fig.set_yscale ("log")

plot.savefig ("epdf1d_pdf_diff.svg")
plot.clf ()

#
# Plotting the cumulative distribution.
#
fig = plot.subplot ()
plot.title ("CDF")
fig.plot (x_a, pdf_a, label = "auto-bw")
fig.plot (x_a, pdf_rot_a, label = "RoT-bw")
fig.plot (x_a, (true_cdf (x_a) - cdf_cut_l) / rbnorm, label = "true dist")

fig.legend(loc = "upper left")

plot.savefig ("epdf1d_cdf.svg")
plot.clf ()

#
# Plotting the relative difference of the reconstructed cumulative distributions and the true one.
#
fig = plot.subplot ()
plot.title ("CDF relative difference with respect to the true distribution")
fig.plot (x_a, np.abs (pdf_a     / ( (true_cdf (x_a) - cdf_cut_l) / rbnorm ) - 1.0), label = "auto-bw")
fig.plot (x_a, np.abs (pdf_rot_a / ( (true_cdf (x_a) - cdf_cut_l) / rbnorm ) - 1.0), label = "RoT-bw")
fig.grid ()

fig.legend(loc = "upper right")
fig.set_yscale ("log")

plot.savefig ("epdf1d_cdf_diff.svg")
plot.clf ()

#
# Plotting the inverse cumulative distribution.
#
fig = plot.subplot ()
plot.title ("Inverse CDF")
fig.plot (u_a, inv_pdf_a, label = "auto-bw")
fig.plot (u_a, inv_pdf_rot_a, label = "RoT-bw")

fig.legend(loc = "upper left")

plot.savefig ("epdf1d_invcdf.svg")
plot.clf ()

Figure 1: PDF

Figure 1: Reconstructed probability density functions (PDF) using the automatic (auto-bw) and Rule of Thumb (RoT-bw) methods to determine the bandwidth, and the true PDF.

Figure 2: PDF diff

Figure 2: Relative difference between the reconstructed probability density functions, using the automatic (auto-bw) and Rule of Thumb (RoT-bw) methods to determine the bandwidth, with respect to the true distribution.

Figure 3: CDF

Figure 3: Reconstructed cumulative distribution functions (CDF) using the automatic (auto-bw) and Rule of Thumb (RoT-bw) methods to determine the bandwidth, and the true CDF.

Figure 4: CDF diff

Figure 4: Relative difference between the reconstructed cumulative distribution functions (CDF), using the automatic (auto-bw) and Rule of Thumb (RoT-bw) methods to determine the bandwidth, with respect to the true CDF.

Figure 5: CDF Inverse

Figure 5: Reconstructed inverse cumulative distribution functions using the automatic (auto-bw) and Rule of Thumb (RoT-bw) methods to determine the bandwidth.

Empirical Probability Distribution Function