Skip to content

Instantly share code, notes, and snippets.

@joferkington
Last active August 8, 2019 22:53
Show Gist options
  • Star 6 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save joferkington/d95101a61a02e0ba63e5 to your computer and use it in GitHub Desktop.
Save joferkington/d95101a61a02e0ba63e5 to your computer and use it in GitHub Desktop.
import numpy as np
import scipy.sparse
import scipy.ndimage
import scipy.stats
import scipy.signal
import matplotlib.pyplot as plt
def main():
x, y = generate_data(int(1e7))
grid, extents, density = fast_kde(x, y, sample=True)
image_example(grid, extents)
scatter_example(x, y, density)
plt.show()
def generate_data(num):
x = 10 * np.random.random(num)
y = x**2 + np.random.normal(0, 5, num)**2
return x, y
def image_example(grid, extents):
fig, ax = plt.subplots()
im = ax.imshow(grid, origin='lower', extent=extents, aspect='auto',
cmap='gist_earth_r')
fig.colorbar(im)
def scatter_example(x, y, density, num_points=10000):
# Randomly draw a subset based on the _inverse_ of the estimated density
prob = 1.0 / density
prob /= prob.sum()
subset = np.random.choice(np.arange(x.size), num_points, False, prob)
x, y, density = x[subset], y[subset], density[subset]
fig, ax = plt.subplots()
ax.scatter(x, y, c=density, cmap='gist_earth_r')
ax.axis('tight')
def fast_kde(x, y, gridsize=(400, 400), extents=None, weights=None,
sample=False):
"""
Performs a gaussian kernel density estimate over a regular grid using a
convolution of the gaussian kernel with a 2D histogram of the data.
This function is typically several orders of magnitude faster than
scipy.stats.kde.gaussian_kde for large (>1e7) numbers of points and
produces an essentially identical result.
Input:
x: array-like
The x-coords of the input data points
y: array-like
The y-coords of the input data points
gridsize: tuple, optional
An (nx,ny) tuple of the size of the output
grid. Defaults to (400, 400).
extents: tuple, optional
A (xmin, xmax, ymin, ymax) tuple of the extents of output grid.
Defaults to min/max of x & y input.
weights: array-like or None, optional
An array of the same shape as x & y that weighs each sample (x_i,
y_i) by each value in weights (w_i). Defaults to an array of ones
the same size as x & y.
sample: boolean
Whether or not to return the estimated density at each location.
Defaults to False
Output:
density : 2D array of shape *gridsize*
The estimated probability distribution function on a regular grid
extents : tuple
xmin, xmax, ymin, ymax
sampled_density : 1D array of len(*x*)
Only returned if *sample* is True. The estimated density at each
point.
"""
#---- Setup --------------------------------------------------------------
x, y = np.atleast_1d([x, y])
x, y = x.reshape(-1), y.reshape(-1)
if x.size != y.size:
raise ValueError('Input x & y arrays must be the same size!')
nx, ny = gridsize
n = x.size
if weights is None:
# Default: Weight all points equally
weights = np.ones(n)
else:
weights = np.squeeze(np.asarray(weights))
if weights.size != x.size:
raise ValueError('Input weights must be an array of the same size'
' as input x & y arrays!')
# Default extents are the extent of the data
if extents is None:
xmin, xmax = x.min(), x.max()
ymin, ymax = y.min(), y.max()
else:
xmin, xmax, ymin, ymax = map(float, extents)
extents = xmin, xmax, ymin, ymax
dx = (xmax - xmin) / (nx - 1)
dy = (ymax - ymin) / (ny - 1)
#---- Preliminary Calculations -------------------------------------------
# Most of this is a hack to re-implment np.histogram2d using `coo_matrix`
# for better memory/speed performance with huge numbers of points.
# First convert x & y over to pixel coordinates
# (Avoiding np.digitize due to excessive memory usage!)
ij = np.column_stack((y, x))
ij -= [ymin, xmin]
ij /= [dy, dx]
ij = np.floor(ij, ij).T
# Next, make a 2D histogram of x & y
# Avoiding np.histogram2d due to excessive memory usage with many points
grid = scipy.sparse.coo_matrix((weights, ij), shape=(ny, nx)).toarray()
# Calculate the covariance matrix (in pixel coords)
cov = image_cov(grid)
# Scaling factor for bandwidth
scotts_factor = np.power(n, -1.0 / 6) # For 2D
#---- Make the gaussian kernel -------------------------------------------
# First, determine how big the kernel needs to be
std_devs = np.diag(np.sqrt(cov))
kern_nx, kern_ny = np.round(scotts_factor * 2 * np.pi * std_devs)
kern_nx, kern_ny = int(kern_nx), int(kern_ny)
# Determine the bandwidth to use for the gaussian kernel
inv_cov = np.linalg.inv(cov * scotts_factor**2)
# x & y (pixel) coords of the kernel grid, with <x,y> = <0,0> in center
xx = np.arange(kern_nx, dtype=np.float) - kern_nx / 2.0
yy = np.arange(kern_ny, dtype=np.float) - kern_ny / 2.0
xx, yy = np.meshgrid(xx, yy)
# Then evaluate the gaussian function on the kernel grid
kernel = np.vstack((xx.flatten(), yy.flatten()))
kernel = np.dot(inv_cov, kernel) * kernel
kernel = np.sum(kernel, axis=0) / 2.0
kernel = np.exp(-kernel)
kernel = kernel.reshape((kern_ny, kern_nx))
#---- Produce the kernel density estimate --------------------------------
# Convolve the gaussian kernel with the 2D histogram, producing a gaussian
# kernel density estimate on a regular grid
# Big kernel, use fft...
if kern_nx * kern_ny > np.product(gridsize) / 4.0:
grid = scipy.signal.fftconvolve(grid, kernel, mode='same')
# Small kernel, use ndimage
else:
grid = scipy.ndimage.convolve(grid, kernel, mode='constant', cval=0)
# Normalization factor to divide result by so that units are in the same
# units as scipy.stats.kde.gaussian_kde's output.
norm_factor = 2 * np.pi * cov * scotts_factor**2
norm_factor = np.linalg.det(norm_factor)
norm_factor = n * dx * dy * np.sqrt(norm_factor)
# Normalize the result
grid /= norm_factor
if sample:
i, j = ij.astype(int)
return grid, extents, grid[i, j]
else:
return grid, extents
def image_cov(data):
"""Efficiently calculate the cov matrix of an image."""
def raw_moment(data, ix, iy, iord, jord):
data = data * ix**iord * iy**jord
return data.sum()
ni, nj = data.shape
iy, ix = np.mgrid[:ni, :nj]
data_sum = data.sum()
m10 = raw_moment(data, ix, iy, 1, 0)
m01 = raw_moment(data, ix, iy, 0, 1)
x_bar = m10 / data_sum
y_bar = m01 / data_sum
u11 = (raw_moment(data, ix, iy, 1, 1) - x_bar * m01) / data_sum
u20 = (raw_moment(data, ix, iy, 2, 0) - x_bar * m10) / data_sum
u02 = (raw_moment(data, ix, iy, 0, 2) - y_bar * m01) / data_sum
cov = np.array([[u20, u11], [u11, u02]])
return cov
if __name__ == '__main__':
main()
@MikeDacre
Copy link

This is amazing! Thanks for sharing.

Do you have an idea of how to make this code show a more smooth plot even when the vast majority of the points are in a small area?

Specifically, I have some R code that uses densCols ((cisVar)[https://github.com/TheFraserLab/cisVar/blob/31e5c8c92643c6ccc7efd0ac6d5b50e1cef252c5/scripts/plot_fun.R]) to plot some frequency data and produces this:

image

Whereas when I use this fast kde method, I end up with this:

image

I would really like to be able to adapt this code to do more general density plots even in highly asymmetric data but I am not sure how to do that. Maybe by adjusting the weights or the cmap?

@joferkington
Copy link
Author

@MikeDacre - Sorry I didn't notice your comment earlier!

In that case, there's the correct way and then there's a shortcut.

The shortcut is to just plot the log of the values. If you're not displaying a colorbar, that's the route you'll want to take. For example, plot ax.scatter(x, y, c=np.log(density), cmap='gist_earth_r') and you should get something that looks similar to the R plot.

If you want to display a colorbar alongside it and still have the correct values, you'd want to change the Normalize type used for the color plotting. Something like ax.scatter(x, y, c=density, cmap='gist_earth_r', norm=LogNorm(density.min(), density.max())).

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment