joferkington / fast_kde.py
Last active Apr 13, 2018

import numpy as np
import scipy.sparse
import scipy.ndimage
import scipy.stats
import scipy.signal
import matplotlib.pyplot as plt

def main():
    x, y = generate_data(int(1e7))
    grid, extents, density = fast_kde(x, y, sample=True)
    image_example(grid, extents)
    scatter_example(x, y, density)
    plt.show()

def generate_data(num):
    x = 10 * np.random.random(num)
    y = x**2 + np.random.normal(0, 5, num)**2
    return x, y

def image_example(grid, extents):
    fig, ax = plt.subplots()
    im = ax.imshow(grid, origin='lower', extent=extents, aspect='auto',
                   cmap='gist_earth_r')
    fig.colorbar(im)

def scatter_example(x, y, density, num_points=10000):
    # Randomly draw a subset based on the _inverse_ of the estimated density
    prob = 1.0 / density
    prob /= prob.sum()
    subset = np.random.choice(np.arange(x.size), num_points, False, prob)
    x, y, density = x[subset], y[subset], density[subset]

    fig, ax = plt.subplots()
    ax.scatter(x, y, c=density, cmap='gist_earth_r')
    ax.axis('tight')

def fast_kde(x, y, gridsize=(400, 400), extents=None, weights=None,
             sample=False):
    """
    Performs a gaussian kernel density estimate over a regular grid using a
    convolution of the gaussian kernel with a 2D histogram of the data.

    This function is typically several orders of magnitude faster than
    scipy.stats.kde.gaussian_kde for large (>1e7) numbers of points and
    produces an essentially identical result.

    Input:
        x: array-like
            The x-coords of the input data points
        y: array-like
            The y-coords of the input data points
        gridsize: tuple, optional
            An (nx, ny) tuple of the size of the output grid.
            Defaults to (400, 400).
        extents: tuple, optional
            An (xmin, xmax, ymin, ymax) tuple of the extents of the output
            grid. Defaults to the min/max of the x & y input.
        weights: array-like or None, optional
            An array of the same shape as x & y that weights each sample
            (x_i, y_i) by the corresponding value w_i. Defaults to an array
            of ones the same size as x & y.
        sample: boolean, optional
            Whether or not to return the estimated density at each input
            point. Defaults to False.

    Output:
        density: 2D array of shape *gridsize*
            The estimated probability distribution function on a regular grid
        extents: tuple
            xmin, xmax, ymin, ymax
        sampled_density: 1D array of len(*x*)
            Only returned if *sample* is True. The estimated density at each
            input point.
    """
    #---- Setup --------------------------------------------------------------
    # Flatten and cast to float (the in-place pixel-coordinate math below
    # requires a floating-point dtype).
    x = np.asarray(x, dtype=float).reshape(-1)
    y = np.asarray(y, dtype=float).reshape(-1)
    if x.size != y.size:
        raise ValueError('Input x & y arrays must be the same size!')

    nx, ny = gridsize
    n = x.size

    if weights is None:
        # Default: Weight all points equally
        weights = np.ones(n)
    else:
        weights = np.squeeze(np.asarray(weights))
        if weights.size != x.size:
            raise ValueError('Input weights must be an array of the same size'
                             ' as input x & y arrays!')

    # Default extents are the extent of the data
    if extents is None:
        xmin, xmax = x.min(), x.max()
        ymin, ymax = y.min(), y.max()
    else:
        xmin, xmax, ymin, ymax = map(float, extents)
    extents = xmin, xmax, ymin, ymax
    dx = (xmax - xmin) / (nx - 1)
    dy = (ymax - ymin) / (ny - 1)
    #---- Preliminary Calculations -------------------------------------------
    # Most of this is a hack to re-implement np.histogram2d using `coo_matrix`
    # for better memory/speed performance with huge numbers of points.

    # First convert x & y over to pixel coordinates
    # (Avoiding np.digitize due to excessive memory usage!)
    ij = np.column_stack((y, x))
    ij -= [ymin, xmin]
    ij /= [dy, dx]
    ij = np.floor(ij, ij).T.astype(np.intp)  # Integer indices for coo_matrix

    # Next, make a 2D histogram of x & y
    # (Avoiding np.histogram2d due to excessive memory usage with many points)
    grid = scipy.sparse.coo_matrix((weights, ij), shape=(ny, nx)).toarray()

    # Calculate the covariance matrix (in pixel coords)
    cov = image_cov(grid)

    # Scaling factor for bandwidth
    scotts_factor = np.power(n, -1.0 / 6)  # For 2D
    #---- Make the gaussian kernel -------------------------------------------
    # First, determine how big the kernel needs to be
    std_devs = np.sqrt(np.diag(cov))
    kern_nx, kern_ny = np.round(scotts_factor * 2 * np.pi * std_devs)
    kern_nx, kern_ny = int(kern_nx), int(kern_ny)

    # Determine the bandwidth to use for the gaussian kernel
    inv_cov = np.linalg.inv(cov * scotts_factor**2)

    # x & y (pixel) coords of the kernel grid, with <x,y> = <0,0> in center
    xx = np.arange(kern_nx, dtype=float) - kern_nx / 2.0
    yy = np.arange(kern_ny, dtype=float) - kern_ny / 2.0
    xx, yy = np.meshgrid(xx, yy)

    # Then evaluate the gaussian function on the kernel grid
    kernel = np.vstack((xx.flatten(), yy.flatten()))
    kernel = np.dot(inv_cov, kernel) * kernel
    kernel = np.sum(kernel, axis=0) / 2.0
    kernel = np.exp(-kernel)
    kernel = kernel.reshape((kern_ny, kern_nx))
    #---- Produce the kernel density estimate --------------------------------
    # Convolve the gaussian kernel with the 2D histogram, producing a gaussian
    # kernel density estimate on a regular grid
    if kern_nx * kern_ny > np.prod(gridsize) / 4.0:
        # Big kernel, use fft-based convolution...
        grid = scipy.signal.fftconvolve(grid, kernel, mode='same')
    else:
        # Small kernel, use direct convolution
        grid = scipy.ndimage.convolve(grid, kernel, mode='constant', cval=0)

    # Normalization factor to divide the result by so that the output is in
    # the same units as scipy.stats.kde.gaussian_kde's output.
    norm_factor = 2 * np.pi * cov * scotts_factor**2
    norm_factor = np.linalg.det(norm_factor)
    norm_factor = n * dx * dy * np.sqrt(norm_factor)

    # Normalize the result
    grid /= norm_factor

    if sample:
        # Look up the density at each input point's pixel
        i, j = ij
        return grid, extents, grid[i, j]
    else:
        return grid, extents

def image_cov(data):
    """Efficiently calculate the covariance matrix of an image."""
    def raw_moment(data, ix, iy, iord, jord):
        data = data * ix**iord * iy**jord
        return data.sum()

    ni, nj = data.shape
    iy, ix = np.mgrid[:ni, :nj]
    data_sum = data.sum()

    # First-order raw moments give the centroid of the image...
    m10 = raw_moment(data, ix, iy, 1, 0)
    m01 = raw_moment(data, ix, iy, 0, 1)
    x_bar = m10 / data_sum
    y_bar = m01 / data_sum

    # ...and second-order central moments give the (co)variances
    u11 = (raw_moment(data, ix, iy, 1, 1) - x_bar * m01) / data_sum
    u20 = (raw_moment(data, ix, iy, 2, 0) - x_bar * m10) / data_sum
    u02 = (raw_moment(data, ix, iy, 0, 2) - y_bar * m01) / data_sum

    cov = np.array([[u20, u11], [u11, u02]])
    return cov

if __name__ == '__main__':
    main()
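
For reference, here is a quick usage sketch (not part of the gist itself) exercising the weights argument, which the demo above never touches. The data and variable names are made up for illustration:

import numpy as np
import matplotlib.pyplot as plt

# Hypothetical weighted example: each point contributes w_i of "mass" to
# the underlying 2D histogram before the kernel convolution.
pts_x = np.random.normal(0, 1, 100000)
pts_y = np.random.normal(0, 1, 100000)
w = np.random.uniform(0.5, 2.0, 100000)

grid, extents = fast_kde(pts_x, pts_y, gridsize=(256, 256), weights=w)

fig, ax = plt.subplots()
im = ax.imshow(grid, origin='lower', extent=extents, aspect='auto',
               cmap='gist_earth_r')
fig.colorbar(im)
plt.show()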
MikeDacre commented Apr 6, 2018

This is amazing! Thanks for sharing.

Do you have an idea of how to make this code show a more smooth plot even when the vast majority of the points are in a small area?

Specifically, I have some R code that uses densCols ([cisVar](https://github.com/TheFraserLab/cisVar/blob/31e5c8c92643c6ccc7efd0ac6d5b50e1cef252c5/scripts/plot_fun.R)) to plot some frequency data and produces this:

[image: smooth density scatter plot produced by the R densCols code]

Whereas when I use this fast kde method, I end up with this:

[image: plot of the same data produced with this fast kde method]

I would really like to adapt this code to do more general density plots, even for highly asymmetric data, but I am not sure how to do that. Maybe by adjusting the weights or the cmap?

joferkington (owner) commented Apr 13, 2018

@MikeDacre - Sorry I didn't notice your comment earlier!

In that case, there's the correct way and then there's a shortcut.

The shortcut is to just plot the log of the values. If you're not displaying a colorbar, that's the route you'll want to take. For example, plot ax.scatter(x, y, c=np.log(density), cmap='gist_earth_r') and you should get something that looks similar to the R plot.

If you want to display a colorbar alongside it and still have the correct values, you'd want to change the Normalize type used for the color plotting. Something like ax.scatter(x, y, c=density, cmap='gist_earth_r', norm=LogNorm(density.min(), density.max())).
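
Putting both suggestions together, a minimal sketch (assuming x, y, and density come from fast_kde(x, y, sample=True) as in the script above):

import numpy as np
import matplotlib.pyplot as plt
from matplotlib.colors import LogNorm

# Shortcut: color by the log of the density. Quick, but a colorbar would
# read in log units, so leave it off.
fig, ax = plt.subplots()
ax.scatter(x, y, c=np.log(density), cmap='gist_earth_r')

# Correct route: keep the raw densities and use a logarithmic color
# normalization so a colorbar still reads in density units.
# (LogNorm requires density.min() > 0.)
fig, ax = plt.subplots()
art = ax.scatter(x, y, c=density, cmap='gist_earth_r',
                 norm=LogNorm(density.min(), density.max()))
fig.colorbar(art)
plt.show()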
