lgarrison/numba_ecdf.py

## numba_ecdf.py
import numba as nb
import numpy as np

def nb_ecdf(X,Rmax,ngrid=64,nthreads=1):
    '''Compute the empirical CDF of points X (shape (N,D))
    in domain [0,Rmax) using a histogram with `ngrid` cells
    per dimension.

    Rmax and ngrid can also be length D arrays.
    '''
    # Do some Pythonic things before dropping into Numba
    ngrid = tuple(np.broadcast_to(ngrid, X.shape[-1]))
    Rmax = np.broadcast_to(Rmax, X.shape[-1])

    return _nb_ecdf(X, Rmax, ngrid=ngrid, nthreads=nthreads)

@nb.njit(fastmath=True,parallel=True)
def _nb_ecdf(X,Rmax,ngrid,nthreads):
    N, D = X.shape
    dtype = X.dtype
    Rmax = Rmax.astype(dtype)

    nb.set_num_threads(nthreads)
    # Divide the particle work over threads
    Nstart = np.linspace(0, N, nthreads+1).astype(np.int64)

    ngridflat = np.prod(np.array(ngrid))

    # one hist per thread
    # TODO: could instead spatially sort particles
    hist = np.zeros((nthreads,ngridflat),dtype=np.uint32)

    invRmax = (1./Rmax).astype(dtype)
    for t in nb.prange(nthreads):
        for i in range(Nstart[t],Nstart[t+1]):
            ii = 0
            # Compute the D-dimensional hist index
            for d in range(D):
                ii = ii*ngrid[d] + np.int64(X[i,d]*invRmax[d]*ngrid[d])
            hist[t,ii] += 1

    # Thread reduction
    hist = hist.sum(axis=0).reshape(ngrid)

    cumsum_nd(hist, inplace=True)

    return hist

@nb.njit(parallel=True, fastmath=True)
def cumsum_nd(hist, inplace=True):
    '''An N-dimensional cumulative sum'''
    if not inplace:
        hist = hist.copy()
    ngrid = np.array(hist.shape)
    D = len(ngrid)
    ngridflat = np.prod(ngrid)
    strides = np.array(hist.strides)//hist.itemsize
    hist = hist.reshape(-1)
    for d in range(D):
        # We have `nsums` 1D sums to do
        nsums = ngridflat//ngrid[d]
        ninner = strides[d]  # inner dims are dimensions after the sum dim, stride always 1
        nouter = nsums//ninner  # outer dims are before the sum dim, use the stride 1 above the sum dim
        outerstride = strides[d-1]
        stride = strides[d]

        for i in nb.prange(nouter):
            for ii in range(ninner):
                for j in range(1,ngrid[d]):
                    hist[i*outerstride + ii + j*stride] += hist[i*outerstride + ii + (j-1)*stride]
    return hist


def py_ecdf(X,Rmax,ngrid=64):
    '''A straw-man Python/Numpy ECDF'''
    hist, edges = np.histogramdd(X, bins=ngrid, range=[(0,Rmax) for d in range(X.shape[-1])])
    res = np.cumsum(hist, axis=0)
    for ax in range(1,hist.ndim):
        res = np.cumsum(res, axis=ax)
    return res

# test driver
rng = np.random.default_rng()
X = rng.random((10**8,4), dtype=np.float32)
# these are ipython %timeit "magic" statements (can be used in jupyter)
%timeit nb_ecdf(X, Rmax=1, ngrid=32, nthreads=8)  # 200 ms
%timeit -n1 -r1 py_ecdf(X, Rmax=1, ngrid=32)  # 15 s
print(np.all(nb_ecdf(X, 1, 32) == py_ecdf(X, 1, 32)))  # check correctness
	import numba as nb
	import numpy as np

	def nb_ecdf(X,Rmax,ngrid=64,nthreads=1):
	'''Compute the empirical CDF of points X (shape (N,D))
	in domain [0,Rmax) using a histogram with `ngrid` cells
	per dimension.

	Rmax and ngrid can also be length D arrays.
	'''
	# Do some Pythonic things before dropping into Numba
	ngrid = tuple(np.broadcast_to(ngrid, X.shape[-1]))
	Rmax = np.broadcast_to(Rmax, X.shape[-1])

	return _nb_ecdf(X, Rmax, ngrid=ngrid, nthreads=nthreads)

	@nb.njit(fastmath=True,parallel=True)
	def _nb_ecdf(X,Rmax,ngrid,nthreads):
	N, D = X.shape
	dtype = X.dtype
	Rmax = Rmax.astype(dtype)

	nb.set_num_threads(nthreads)
	# Divide the particle work over threads
	Nstart = np.linspace(0, N, nthreads+1).astype(np.int64)

	ngridflat = np.prod(np.array(ngrid))

	# one hist per thread
	# TODO: could instead spatially sort particles
	hist = np.zeros((nthreads,ngridflat),dtype=np.uint32)

	invRmax = (1./Rmax).astype(dtype)
	for t in nb.prange(nthreads):
	for i in range(Nstart[t],Nstart[t+1]):
	ii = 0
	# Compute the D-dimensional hist index
	for d in range(D):
	ii = iingrid[d] + np.int64(X[i,d]invRmax[d]*ngrid[d])
	hist[t,ii] += 1

	# Thread reduction
	hist = hist.sum(axis=0).reshape(ngrid)

	cumsum_nd(hist, inplace=True)

	return hist

	@nb.njit(parallel=True, fastmath=True)
	def cumsum_nd(hist, inplace=True):
	'''An N-dimensional cumulative sum'''
	if not inplace:
	hist = hist.copy()
	ngrid = np.array(hist.shape)
	D = len(ngrid)
	ngridflat = np.prod(ngrid)
	strides = np.array(hist.strides)//hist.itemsize
	hist = hist.reshape(-1)
	for d in range(D):
	# We have `nsums` 1D sums to do
	nsums = ngridflat//ngrid[d]
	ninner = strides[d] # inner dims are dimensions after the sum dim, stride always 1
	nouter = nsums//ninner # outer dims are before the sum dim, use the stride 1 above the sum dim
	outerstride = strides[d-1]
	stride = strides[d]

	for i in nb.prange(nouter):
	for ii in range(ninner):
	for j in range(1,ngrid[d]):
	hist[iouterstride + ii + jstride] += hist[iouterstride + ii + (j-1)stride]
	return hist


	def py_ecdf(X,Rmax,ngrid=64):
	'''A straw-man Python/Numpy ECDF'''
	hist, edges = np.histogramdd(X, bins=ngrid, range=[(0,Rmax) for d in range(X.shape[-1])])
	res = np.cumsum(hist, axis=0)
	for ax in range(1,hist.ndim):
	res = np.cumsum(res, axis=ax)
	return res

	# test driver
	rng = np.random.default_rng()
	X = rng.random((10**8,4), dtype=np.float32)
	# these are ipython %timeit "magic" statements (can be used in jupyter)
	%timeit nb_ecdf(X, Rmax=1, ngrid=32, nthreads=8) # 200 ms
	%timeit -n1 -r1 py_ecdf(X, Rmax=1, ngrid=32) # 15 s
	print(np.all(nb_ecdf(X, 1, 32) == py_ecdf(X, 1, 32))) # check correctness