Skip to content

Instantly share code, notes, and snippets.

@sergeyprokudin
Last active January 5, 2019 19:31
Show Gist options
  • Save sergeyprokudin/fbb35e25aa63987e5e1fefac1f2d1bfb to your computer and use it in GitHub Desktop.
Save sergeyprokudin/fbb35e25aa63987e5e1fefac1f2d1bfb to your computer and use it in GitHub Desktop.
Kernel and Histogram Density Estimation
#!/usr/bin/env python
# coding: utf-8
# In[32]:
import numpy as np
def hist_pdf(x, data, n_bins=10, minv=None, maxv=None):
"""Histogram density estimator
[minv, minv+delta, , minv+delta*2, ..., 1]
for any x in B_l=[minv+delta*j, minv+delta*(j+1)] density is estimated in the following way
p(x) = p(x | x \in B_l) * p(x \in B_l) = (1/delta) * (\sum_{x_j}{I(x_j \in B_l)} / N)
where N - number of points in dataset
See lecture notes for details:
http://faculty.washington.edu/yenchic/18W_425/Lec6_hist_KDE.pdf
Parameters
----------
x: float
point to estimate density at
data: numpy array
data points used to construct the density
n_bins: int
number of bins
minv: float or None
minimum value of the domain. If None, estimated from data
maxv: float or None
maximum value of the domain. If None, estimated from data
Returns
-------
pdf: float
computed density at point x given data
"""
if minv is None:
minv = np.min(data)
if maxv is None:
maxv = np.max(data)
delta = (maxv-minv) / n_bins
bins = np.arange(minv, maxv, delta)
bin_id = int((x-minv)/delta)
bin_minv = minv+delta*bin_id
bin_maxv = minv+delta*(bin_id+1)
n_data = len(data)
n_data_in_bin = len(data[np.where((data>bin_minv) & (data<bin_maxv))])
pdf = (1.0/delta) * (n_data_in_bin / n_data)
return pdf
def kde_pdf(x, data, h=0.1, minv=None, maxv=None):
"""Kernel density estimator (with Gaussian kernel)
General form of Kernel Density Estimator:
p(x) = \frac{1}{N \mul h}\sum_{i=1}{N}{K(\frac{x-x_i}{h})}
where K(x) = \frac{1}{\sqrt{2\pi}} \exp{\frac{x^2}{2}}
See lecture notes for details:
http://faculty.washington.edu/yenchic/18W_425/Lec6_hist_KDE.pdf
Parameters
----------
x: float
point to estimate density at
data: numpy array
data points used to construct the density
h: int
bandwidth of a kernel. Selecting this is a research topic on its own.
Returns
-------
pdf: float
computed density at point x given data
"""
N = len(data)
pdf = (1/(h*N)) * np.sum((1/np.sqrt(2*np.pi)) * np.exp(-((data-x)**2)/(2*(h**2))))
return pdf
# Demo with Boston Housing Data
from sklearn.datasets import load_boston
import matplotlib.pyplot as plt
ds = load_boston()
data = ds['target']
# Demo histogram
xvals = np.arange(min(data), max(data), 1)
n_bins=15
pdf = [hist_pdf(x, data, n_bins=n_bins) for x in xvals]
plt.plot(xvals, pdf)
# Demo KDE
xvals = np.arange(min(data), max(data), 0.1)
h=1.0 # play with this parameter
pdf = [kde_pdf(x, data, h=h) for x in xvals]
plt.plot(xvals, pdf)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment