sergeyprokudin/kde.py

## kde.py
#!/usr/bin/env python
# coding: utf-8

# In[32]:


import numpy as np

def hist_pdf(x, data, n_bins=10, minv=None, maxv=None):
    """Histogram density estimator

    [minv, minv+delta, , minv+delta*2, ..., 1]

    for any x in B_l=[minv+delta*j, minv+delta*(j+1)] density is estimated in the following way

    p(x) = p(x | x \in B_l) * p(x \in B_l) = (1/delta) * (\sum_{x_j}{I(x_j \in B_l)} / N)

    where N - number of points in dataset

    See lecture notes for details:

    http://faculty.washington.edu/yenchic/18W_425/Lec6_hist_KDE.pdf

    Parameters
    ----------
    x: float
        point to estimate density at
    data: numpy array
        data points used to construct the density
    n_bins: int
        number of bins
    minv: float or None
        minimum value of the domain. If None, estimated from data
    maxv: float or None
        maximum value of the domain. If None, estimated from data

    Returns
    -------
    pdf: float
        computed density at point x given data
    """

    if minv is None:
        minv = np.min(data)
    if maxv is None:
        maxv = np.max(data)

    delta = (maxv-minv) / n_bins

    bins = np.arange(minv, maxv, delta)

    bin_id = int((x-minv)/delta)
    bin_minv = minv+delta*bin_id
    bin_maxv = minv+delta*(bin_id+1)

    n_data = len(data)
    n_data_in_bin = len(data[np.where((data>bin_minv) & (data<bin_maxv))])

    pdf = (1.0/delta) * (n_data_in_bin / n_data)

    return pdf


def kde_pdf(x, data, h=0.1, minv=None, maxv=None):
    """Kernel density estimator (with Gaussian kernel)

    General form of Kernel Density Estimator:

    p(x) = \frac{1}{N \mul h}\sum_{i=1}{N}{K(\frac{x-x_i}{h})}

    where K(x) = \frac{1}{\sqrt{2\pi}} \exp{\frac{x^2}{2}}

    See lecture notes for details:

    http://faculty.washington.edu/yenchic/18W_425/Lec6_hist_KDE.pdf

    Parameters
    ----------
    x: float
        point to estimate density at
    data: numpy array
        data points used to construct the density
    h: int
        bandwidth of a kernel. Selecting this is a research topic on its own.

    Returns
    -------
    pdf: float
        computed density at point x given data
    """

    N = len(data)
    pdf = (1/(h*N)) * np.sum((1/np.sqrt(2*np.pi)) * np.exp(-((data-x)**2)/(2*(h**2))))

    return pdf


# Demo with Boston Housing Data

from sklearn.datasets import load_boston
import matplotlib.pyplot as plt
ds = load_boston()
data = ds['target']

# Demo histogram
xvals = np.arange(min(data), max(data), 1)
n_bins=15
pdf = [hist_pdf(x, data, n_bins=n_bins) for x in xvals]
plt.plot(xvals, pdf)

# Demo KDE
xvals = np.arange(min(data), max(data), 0.1)
h=1.0 # play with this parameter
pdf = [kde_pdf(x, data, h=h) for x in xvals]
plt.plot(xvals, pdf)
	#!/usr/bin/env python
	# coding: utf-8

	# In[32]:


	import numpy as np

	def hist_pdf(x, data, n_bins=10, minv=None, maxv=None):
	"""Histogram density estimator

	[minv, minv+delta, , minv+delta*2, ..., 1]

	for any x in B_l=[minv+deltaj, minv+delta(j+1)] density is estimated in the following way

	p(x) = p(x \| x \in B_l) * p(x \in B_l) = (1/delta) * (\sum_{x_j}{I(x_j \in B_l)} / N)

	where N - number of points in dataset

	See lecture notes for details:

	http://faculty.washington.edu/yenchic/18W_425/Lec6_hist_KDE.pdf

	Parameters
	----------
	x: float
	point to estimate density at
	data: numpy array
	data points used to construct the density
	n_bins: int
	number of bins
	minv: float or None
	minimum value of the domain. If None, estimated from data
	maxv: float or None
	maximum value of the domain. If None, estimated from data

	Returns
	-------
	pdf: float
	computed density at point x given data
	"""

	if minv is None:
	minv = np.min(data)
	if maxv is None:
	maxv = np.max(data)

	delta = (maxv-minv) / n_bins

	bins = np.arange(minv, maxv, delta)

	bin_id = int((x-minv)/delta)
	bin_minv = minv+delta*bin_id
	bin_maxv = minv+delta*(bin_id+1)

	n_data = len(data)
	n_data_in_bin = len(data[np.where((data>bin_minv) & (data<bin_maxv))])

	pdf = (1.0/delta) * (n_data_in_bin / n_data)

	return pdf


	def kde_pdf(x, data, h=0.1, minv=None, maxv=None):
	"""Kernel density estimator (with Gaussian kernel)

	General form of Kernel Density Estimator:

	p(x) = \frac{1}{N \mul h}\sum_{i=1}{N}{K(\frac{x-x_i}{h})}

	where K(x) = \frac{1}{\sqrt{2\pi}} \exp{\frac{x^2}{2}}

	See lecture notes for details:

	http://faculty.washington.edu/yenchic/18W_425/Lec6_hist_KDE.pdf

	Parameters
	----------
	x: float
	point to estimate density at
	data: numpy array
	data points used to construct the density
	h: int
	bandwidth of a kernel. Selecting this is a research topic on its own.

	Returns
	-------
	pdf: float
	computed density at point x given data
	"""

	N = len(data)
	pdf = (1/(hN)) np.sum((1/np.sqrt(2np.pi)) np.exp(-((data-x)*2)/(2(h**2))))

	return pdf


	# Demo with Boston Housing Data

	from sklearn.datasets import load_boston
	import matplotlib.pyplot as plt
	ds = load_boston()
	data = ds['target']

	# Demo histogram
	xvals = np.arange(min(data), max(data), 1)
	n_bins=15
	pdf = [hist_pdf(x, data, n_bins=n_bins) for x in xvals]
	plt.plot(xvals, pdf)

	# Demo KDE
	xvals = np.arange(min(data), max(data), 0.1)
	h=1.0 # play with this parameter
	pdf = [kde_pdf(x, data, h=h) for x in xvals]
	plt.plot(xvals, pdf)