Skip to content

Instantly share code, notes, and snippets.

@vhaasteren
Last active March 7, 2024 16:12
Show Gist options
  • Save vhaasteren/1fb63796fcd3cc813a9a00b05af80bf4 to your computer and use it in GitHub Desktop.
Save vhaasteren/1fb63796fcd3cc813a9a00b05af80bf4 to your computer and use it in GitHub Desktop.
Fast bulk interpolation using a CUDA and Apple Metal
#!/usr/bin/python
# -*- coding: utf-8 -*-
# vim: tabstop=4:softtabstop=4:shiftwidth=4:expandtab
"""
cuda_interpolation: Fast bulk interpolation using numpy, cuda, and cupy
author: Rutger van Haasteren
email: rutger@vhaasteren.com
date: March, 2024
"""
import numpy as np
import scipy.interpolate as sint
import scipy.stats as sstats
import cupy as cp
import os
# This is the code for a CUDA kernel that does a parallel binary search
kernel_code = '''
extern "C"
__global__ void binary_search_kernel(const float* rho, const float* knots, int* left_indices, int n_rhos, int n_knots) {
int rho_idx = blockIdx.x * blockDim.x + threadIdx.x;
if (rho_idx >= n_rhos) return;
float rho_val = rho[rho_idx];
const float* row_knots = knots + rho_idx * n_knots;
int left = 0;
int right = n_knots - 1;
int mid = 0;
while (left <= right) {
mid = left + (right - left) / 2;
if (row_knots[mid] <= rho_val) {
left = mid + 1;
} else {
right = mid - 1;
}
}
left_indices[rho_idx] = max(0, left - 1);
}
'''
# This is the code for a CUDA kernel that does the full parallel interpolation
bulk_interpolation_kernel_code = '''
extern "C"
__global__ void bulk_interpolation_kernel(const float* rho, const float* knots, const float* coeffs, float* y, int n_rhos, int n_knots, int n_order) {
int rho_idx = blockIdx.x * blockDim.x + threadIdx.x;
if (rho_idx >= n_rhos) return;
float rho_val = rho[rho_idx];
const float* row_knots = knots + rho_idx * n_knots;
const float* row_coeffs = coeffs + rho_idx * (n_knots-1) * n_order;
int left = 0;
int right = n_knots - 1;
int mid = 0;
int left_index = 0;
while (left <= right) {
mid = left + (right - left) / 2;
if (row_knots[mid] <= rho_val) {
left = mid + 1;
} else {
right = mid - 1;
}
}
left_index = max(0, left - 1);
/* We have the left-indices. Now do the interpolation */
float delta = rho_val - row_knots[left_index];
float multiply = 1.0;
int order = 0;
y[rho_idx] = 0;
for(order=n_order-1; order>=0; order--) {
y[rho_idx] += row_coeffs[left_index*n_order+order] * multiply;
multiply = multiply * delta;
}
}
'''
# Compile the CUDA kernels
binary_search_kernel = cp.RawKernel(kernel_code, 'binary_search_kernel')
bulk_interpolation_kernel = cp.RawKernel(bulk_interpolation_kernel_code, 'bulk_interpolation_kernel')
# The custom CUDA kernel for the bulk binary tree search
def linear_interp_to_spline(spline_func):
"""Convert a scipy.interp1d linear representation to spline rep"""
delta_x = spline_func.x[1:] - spline_func.x[:-1]
delta_y = spline_func.y[1:] - spline_func.y[:-1]
spline_func.c = np.stack([delta_y/delta_x, spline_func.y[:-1]], axis=0)
def get_spline_knots_coeffs(spline_funcs, order=3):
"""Get the knots and coefficients of the splines"""
coeffs = np.stack([s.c.T for s in spline_funcs], axis=0)
knots = np.stack([s.x for s in spline_funcs], axis=0)
return knots, coeffs[:,:,:order]
def find_segment_indices(rho, knots):
"""Find the segment indices for each rho value in its corresponding set of knots."""
# Expand rho to match knots shape for broadcasting
rho_expanded = rho[:, None]
# Find the knot indices
is_less_equal = rho_expanded <= knots
right_knot_indices = np.argmax(is_less_equal, axis=1)
left_knot_indices = right_knot_indices - 1
# Adjust indices to ensure they are within bounds
return np.clip(left_knot_indices, 0, knots.shape[1] - 2)
def find_segment_indices_cupy(rho, knots):
"""Find the segment indices for each rho value in its corresponding set of knots using CuPy."""
rho_expanded = cp.asarray(rho, dtype=np.float32)[:, None]
knots = cp.asarray(knots, dtype=np.float32)
# Find the knot indices
is_less_equal = rho_expanded <= knots
right_knot_indices = cp.argmax(is_less_equal, axis=1)
left_knot_indices = right_knot_indices - 1
return cp.clip(left_knot_indices, 0, knots.shape[1] - 2)
def find_segment_indices_cuda(rho, knots):
"""Find the segment indices for each rho value in its corresponding set of knots using CuPy, optimized with binary search."""
# Ensure rho and knots are CuPy arrays, single precision
rho_cp = cp.asarray(rho, dtype=np.float32)
knots_cp = cp.asarray(knots, dtype=np.float32)
# Prepare output array for indices
left_knot_indices = cp.empty(rho_cp.size, dtype=cp.int32)
# Launch the custom kernel to find the knot indices
threads_per_block = 256
blocks_per_grid = (rho_cp.size + (threads_per_block - 1)) // threads_per_block
binary_search_kernel((blocks_per_grid,), (threads_per_block,), (rho_cp, knots_cp, left_knot_indices, knots_cp.shape[0], knots_cp.shape[1]))
return left_knot_indices
def polynomial_spline_eval_bulk(rho, knots, coeffs):
"""Efficient polynomial interpolation using vectorized operations.
Parameters:
----------
- rho: array of points at which to evaluate the interpolation.
- knots: array of x-values where the interpolator changes ('knots').
- coeffs: array of coefficients for each polynomial segment. For a linear
segment, coeffs[..., 1] is the y-value at the start of the segment, and
coeffs[..., 0] is the slope of the segment.
Assumes the first dimension of knots and coeffs correspond to different interpolators,
and that knots for each interpolator are sorted.
"""
# Find the index of the left knot for each rho
left_knot_indices = find_segment_indices(rho, knots)
# Gather the coefficients for the relevant segment
segment_coeffs = np.take_along_axis(coeffs, left_knot_indices[:, None, None], axis=1).squeeze(axis=1)
# Gather the x values for the left knots
x0 = np.take_along_axis(knots, left_knot_indices[:, None], axis=1).flatten()
# Evaluate the polynomial at rho
delta = rho - x0
y = np.sum(segment_coeffs * delta[..., None] ** np.arange(segment_coeffs.shape[1])[::-1], axis=-1)
return y
def polynomial_spline_eval_bulk_cupy(rho, knots, coeffs):
"""Efficient polynomial interpolation using GPU operations. Make sure that the
input arrays are already on the GPU in np.float32 for efficiency
Parameters:
----------
- rho: array of points at which to evaluate the interpolation.
- knots: array of x-values where the interpolator changes ('knots').
- coeffs: array of coefficients for each polynomial segment. For a linear
segment, coeffs[..., 1] is the y-value at the start of the segment, and
coeffs[..., 0] is the slope of the segment.
Assumes the first dimension of knots and coeffs correspond to different interpolators,
and that knots for each interpolator are sorted.
"""
# Make sure these are all cupy arrays
rho = cp.asarray(rho, dtype=np.float32)
knots = cp.asarray(knots, dtype=np.float32)
coeffs = cp.asarray(coeffs, dtype=np.float32)
# Find the index of the left knot for each rho
left_knot_indices = find_segment_indices_cupy(rho, knots)
# Ensuring that the indexing matches the NumPy approach more closely
segment_coeffs = cp.take_along_axis(coeffs, left_knot_indices[:, None, None], axis=1).squeeze(axis=1)
# Gather the x values for the left knots
x0 = cp.take_along_axis(knots, left_knot_indices[:, None], axis=1).flatten()
# Do the interpolation step
delta = rho - x0
powers = cp.arange(segment_coeffs.shape[1])[::-1]
delta_powers = delta[..., None] ** powers
y = cp.sum(segment_coeffs * delta_powers, axis=-1)
return y
def polynomial_spline_eval_bulk_cuda(rho, knots, coeffs):
"""Efficient polynomial interpolation using custom GPU operations.
This one is identical to polynomial_spline_eval_bulk_cupy, except
that the binary tree search is done with a custom CUDA kernel here
Parameters:
----------
- rho: array of points at which to evaluate the interpolation.
- knots: array of x-values where the interpolator changes ('knots').
- coeffs: array of coefficients for each polynomial segment. For a linear
segment, coeffs[..., 1] is the y-value at the start of the segment, and
coeffs[..., 0] is the slope of the segment.
Assumes the first dimension of knots and coeffs correspond to different interpolators,
and that knots for each interpolator are sorted.
"""
# Make sure these are all cupy arrays
rho = cp.asarray(rho, dtype=np.float32)
knots = cp.asarray(knots, dtype=np.float32)
coeffs = cp.asarray(coeffs, dtype=np.float32)
# Find the index of the left knot for each rho
left_knot_indices = find_segment_indices_cuda(rho, knots)
# Ensuring that the indexing matches the NumPy approach more closely
segment_coeffs = cp.take_along_axis(coeffs, left_knot_indices[:, None, None], axis=1).squeeze(axis=1)
# Gather the x values for the left knots
x0 = cp.take_along_axis(knots, left_knot_indices[:, None], axis=1).flatten()
# Do the interpolation step
delta = rho - x0
powers = cp.arange(segment_coeffs.shape[1])[::-1]
delta_powers = delta[..., None] ** powers
y = cp.sum(segment_coeffs * delta_powers, axis=-1)
return y
def polynomial_spline_eval_custom_cuda_kernel(rho, knots, coeffs):
"""Efficient polynomial interpolation using custom GPU operations.
This one is identical to polynomial_spline_eval_bulk_cupy, except
that the binary tree search is done with a custom CUDA kernel here
Parameters:
----------
- rho: array of points at which to evaluate the interpolation.
- knots: array of x-values where the interpolator changes ('knots').
- coeffs: array of coefficients for each polynomial segment. For a linear
segment, coeffs[..., 1] is the y-value at the start of the segment, and
coeffs[..., 0] is the slope of the segment.
Assumes the first dimension of knots and coeffs correspond to different interpolators,
and that knots for each interpolator are sorted.
"""
# Make sure these are all cupy arrays
rho_cp = cp.asarray(rho, dtype=np.float32)
knots_cp = cp.asarray(knots, dtype=np.float32)
coeffs_cp = cp.asarray(coeffs, dtype=np.float32)
y_cp = cp.empty(rho_cp.size, dtype=np.float32)
# Do it all in the CUDA kernel
threads_per_block = 256
blocks_per_grid = (rho_cp.size + (threads_per_block - 1)) // threads_per_block
bulk_interpolation_kernel((blocks_per_grid,), (threads_per_block,), (rho_cp, knots_cp, coeffs_cp, y_cp, knots_cp.shape[0], knots_cp.shape[1], coeffs_cp.shape[2]))
return y_cp
if __name__=="__main__":
# Set up a test problem
N_bins = 300 # Bins for the interpolator
N_psr, N_freqs = 100, 15 # Pulsars and frequency modes
N_pars = int(0.5 * N_freqs * N_psr * (N_psr+1))
print("Optimal counting:", N_pars) # Output number of parameters of Sigma
x_min, x_max = -10, 10 # Range of parameters (just for demo)
spread = 1 # Spread of mean values (should get posterior from KDE)
mu = np.random.randn(N_pars)*spread # Mean of posterior (should get posterior from KDE)
sigma = 1.0 # Standard deviation of posterior ('')
dist = sstats.norm(loc=0, scale=spread)
print("Generating interpolators")
x_vals = np.linspace(x_min, x_max, N_bins, endpoint=True) # Knots of the interpolators
y_vals = dist.pdf(x_vals[None,:]-mu[:,None]) # PDF values at knots
f_ys = [sint.interp1d(x_vals, y, kind='linear') for y in y_vals] # Create linear interpolators
# Convert splines and get knots and coefficients (could do higher order)
for f_y in f_ys:
linear_interp_to_spline(f_y)
# Because we only included the 'c' for a linear interpolator, the 'order' below here
# will not go to order=3, but only as high as is available
knots, coeffs = get_spline_knots_coeffs(f_ys, order=3)
rho = np.random.rand(N_pars) * 20 - 10
# Move all these quantities to the GPU
rho_cp = cp.asarray(rho, dtype=np.float32)
knots_cp = cp.asarray(knots, dtype=np.float32)
coeffs_cp = cp.asarray(coeffs, dtype=np.float32)
print("Calculating traditionally with scipy")
logp_vals_traditional = [float(f_y(rho_val)) for (f_y, rho_val) in zip(f_ys, rho)]
print("Calculating bulk")
logp_vals_fast = polynomial_spline_eval_bulk(rho, knots, coeffs)
print("Calculating with cupy")
logp_vals_cupy = polynomial_spline_eval_bulk_cupy(rho_cp, knots_cp, coeffs_cp)
print("Calculating with custom cuda kernel")
logp_vals_cuda = polynomial_spline_eval_bulk_cuda(rho_cp, knots_cp, coeffs_cp)
print("Calculating with full custom cuda kernel")
logp_vals_custom = polynomial_spline_eval_custom_cuda_kernel(rho_cp, knots_cp, coeffs_cp)
# Compare the values
print(logp_vals_traditional[:4])
print(logp_vals_fast[:4])
print(logp_vals_cupy[:4])
print(logp_vals_cuda[:4])
print(logp_vals_custom[:4])
#!/usr/bin/python
# -*- coding: utf-8 -*-
# vim: tabstop=4:softtabstop=4:shiftwidth=4:expandtab
"""
metal_interpolation: Fast bulk interpolation using numpy and Apple Metal
author: Rutger van Haasteren
email: rutger@vhaasteren.com
date: March, 2024
"""
import numpy as np
import array
import scipy.interpolate as sint
import scipy.stats as sstats
import metalcompute as mc
# Metal shader code as a string
metal_shader = """
#include <metal_stdlib>
using namespace metal;
struct ScalarParams {
uint n_rhos;
uint n_knots;
uint n_order;
};
kernel void bulk_interpolation_kernel(
device const float* rho [[buffer(0)]],
device const float* knots [[buffer(1)]],
device const float* coeffs [[buffer(2)]],
device float* y [[buffer(3)]],
constant ScalarParams& params [[buffer(4)]],
uint grid_position [[thread_position_in_grid]]
) {
uint n_rhos = params.n_rhos;
uint n_knots = params.n_knots;
uint n_order = params.n_order;
if (grid_position >= n_rhos) return;
float rho_val = rho[grid_position];
const device float* row_knots = knots + grid_position * n_knots;
const device float* row_coeffs = coeffs + grid_position * (n_knots-1) * n_order;
int left = 0;
int right = n_knots - 1;
while (left <= right) {
int mid = left + (right - left) / 2;
if (row_knots[mid] <= rho_val) {
left = mid + 1;
} else {
right = mid - 1;
}
}
int left_index = max(0, left - 1);
/* Interpolation */
float delta = rho_val - row_knots[left_index];
float multiply = 1.0f;
y[grid_position] = 0.0f;
for(int order = n_order - 1; order >= 0; --order) {
y[grid_position] += row_coeffs[left_index * n_order + order] * multiply;
multiply *= delta;
}
}
"""
metal_dev = mc.Device()
metal_interpolation_kernel = metal_dev.kernel(metal_shader).function('bulk_interpolation_kernel')
def linear_interp_to_spline(spline_func):
"""Convert a scipy.interp1d linear representation to spline rep"""
delta_x = spline_func.x[1:] - spline_func.x[:-1]
delta_y = spline_func.y[1:] - spline_func.y[:-1]
spline_func.c = np.stack([delta_y/delta_x, spline_func.y[:-1]], axis=0)
def get_spline_knots_coeffs(spline_funcs, order=3):
"""Get the knots and coefficients of the splines"""
coeffs = np.stack([s.c.T for s in spline_funcs], axis=0)
knots = np.stack([s.x for s in spline_funcs], axis=0)
return knots, coeffs[:,:,:order]
def find_segment_indices(rho, knots):
"""Find the segment indices for each rho value in its corresponding set of knots."""
# Expand rho to match knots shape for broadcasting
rho_expanded = rho[:, None]
# Find the knot indices
is_less_equal = rho_expanded <= knots
right_knot_indices = np.argmax(is_less_equal, axis=1)
left_knot_indices = right_knot_indices - 1
# Adjust indices to ensure they are within bounds
return np.clip(left_knot_indices, 0, knots.shape[1] - 2)
def polynomial_spline_eval_bulk(rho, knots, coeffs):
"""Efficient polynomial interpolation using vectorized operations.
Parameters:
----------
- rho: array of points at which to evaluate the interpolation.
- knots: array of x-values where the interpolator changes ('knots').
- coeffs: array of coefficients for each polynomial segment. For a linear
segment, coeffs[..., 1] is the y-value at the start of the segment, and
coeffs[..., 0] is the slope of the segment.
Assumes the first dimension of knots and coeffs correspond to different interpolators,
and that knots for each interpolator are sorted.
"""
# Find the index of the left knot for each rho
left_knot_indices = find_segment_indices(rho, knots)
# Gather the coefficients for the relevant segment
segment_coeffs = np.take_along_axis(coeffs, left_knot_indices[:, None, None], axis=1).squeeze(axis=1)
# Gather the x values for the left knots
x0 = np.take_along_axis(knots, left_knot_indices[:, None], axis=1).flatten()
# Evaluate the polynomial at rho
delta = rho - x0
y = np.sum(segment_coeffs * delta[..., None] ** np.arange(segment_coeffs.shape[1])[::-1], axis=-1)
return y
def polynomial_spline_eval_bulk_metal(rho, knots_buf, coeffs_buf, n_rhos, n_knots, n_order):
"""Efficient polynomial interpolation using custom GPU operations.
This one is identical to polynomial_spline_eval_bulk_cupy, except
that the binary tree search is done with a custom CUDA kernel here
Parameters:
----------
- rho: array of points at which to evaluate the interpolation.
- knots: array of x-values where the interpolator changes ('knots').
- coeffs: array of coefficients for each polynomial segment. For a linear
segment, coeffs[..., 1] is the y-value at the start of the segment, and
coeffs[..., 0] is the slope of the segment.
Assumes the first dimension of knots and coeffs correspond to different interpolators,
and that knots for each interpolator are sorted.
"""
# Convert data to suitable format
rho = rho.astype(np.float32, copy=False)
ivals = metal_dev.buffer(n_rhos * 4) # 4 because a float32 needs 4 bytes
ivals_view = memoryview(ivals).cast('f')
# Parameters for the kernel
scalar_params = np.array([n_rhos, n_knots, n_order], dtype=np.uint32)
# Execute the kernel
metal_interpolation_kernel(n_rhos, rho, knots_buf, coeffs_buf, ivals, scalar_params)
return np.frombuffer(ivals_view, dtype=np.float32)
if __name__=="__main__":
# Set up a test problem
N_bins = 300 # Bins for the interpolator
N_psr, N_freqs = 100, 15 # Pulsars and frequency modes
N_pars = int(0.5 * N_freqs * N_psr * (N_psr+1))
print("Optimal counting:", N_pars) # Output number of parameters of Sigma
x_min, x_max = -10, 10 # Range of parameters (just for demo)
spread = 1 # Spread of mean values (should get posterior from KDE)
mu = np.random.randn(N_pars)*spread # Mean of posterior (should get posterior from KDE)
sigma = 1.0 # Standard deviation of posterior ('')
dist = sstats.norm(loc=0, scale=spread)
print("Generating interpolators")
x_vals = np.linspace(x_min, x_max, N_bins, endpoint=True) # Knots of the interpolators
y_vals = dist.pdf(x_vals[None,:]-mu[:,None]) # PDF values at knots
f_ys = [sint.interp1d(x_vals, y, kind='linear') for y in y_vals] # Create linear interpolators
# Convert splines and get knots and coefficients (could do higher order)
for f_y in f_ys:
linear_interp_to_spline(f_y)
# Because we only included the 'c' for a linear interpolator, the 'order' below here
# will not go to order=3, but only as high as is available
knots, coeffs = get_spline_knots_coeffs(f_ys, order=3)
rho = np.random.rand(N_pars) * 20 - 10
rho = rho.astype(np.float32)
knots = knots.astype(np.float32)
coeffs = coeffs.astype(np.float32)
# Move these quantities to the GPU
knots_buf = metal_dev.buffer(knots)
coeffs_buf = metal_dev.buffer(coeffs.flatten())
n_rhos, n_knots, n_order = rho.shape[0], knots.shape[1], coeffs.shape[2]
print("Calculating traditionally with scipy")
logp_vals_traditional = [float(f_y(rho_val)) for (f_y, rho_val) in zip(f_ys, rho)]
print("Calculating bulk")
logp_vals_fast = polynomial_spline_eval_bulk(rho, knots, coeffs)
print("Calculating with full custom Metal kernel")
logp_vals_metal = polynomial_spline_eval_bulk_metal(rho, knots_buf, coeffs_buf, n_rhos, n_knots, n_order)
# Compare the values
print(logp_vals_traditional[:4])
print(logp_vals_fast[:4])
print(logp_vals_metal[:4])
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment