@gmarkall
Created August 3, 2021 17:59
CUDA demo presented at the 2021-08-03 Numba meeting (not executable, was modified to exemplify various things)
import math
from numba import cuda, njit, objmode, float32
from time import perf_counter
import numpy as np
import cupy as cp

@njit
def axpy_body(a, x, y):
    return a * x + y


@njit
def axpy(r, a, x, y):
    for i in range(len(r)):
        r[i] = axpy_body(a, x[i], y[i])

np.random.seed(1)

N = 1000000
x = np.random.random(N)
y = np.random.random(N)
r = np.zeros_like(x)
a = 2.0

# Warm up the CPU JIT so compilation time is excluded from the timing loop
axpy(r, a, x, y)

N_ITERATIONS = 100

start = perf_counter()
for i in range(N_ITERATIONS):
    axpy(r, a, x, y)
end = perf_counter()

print(f"CPU time is {end - start}")

@cuda.jit
def cuda_axpy(r, a, x, y):
    # Grid-stride loop: each thread handles elements i, i + gridsize, ...
    i = cuda.grid(1)
    gs = cuda.gridsize(1)
    for i in range(i, len(r), gs):
        r[i] = axpy_body(a, x[i], y[i])

N_THREADS = 256
N_BLOCKS = 72 * 8

# Copy the host data to the device as CuPy arrays (the inputs must match the
# CPU inputs so the results can be compared below)
x_cuda = cp.asarray(x)
y_cuda = cp.asarray(y)
r_cuda = cp.zeros_like(r)
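
# (Added sketch, not in the original gist): warm up the CUDA kernel once so
# that its JIT compilation time is excluded from the timed loop, mirroring the
# CPU warm-up above.
cuda_axpy[N_BLOCKS, N_THREADS](r_cuda, a, x_cuda, y_cuda)
cuda.synchronize()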

start = perf_counter()
for i in range(N_ITERATIONS):
    cuda_axpy[N_BLOCKS, N_THREADS](r_cuda, a, x_cuda, y_cuda)
# Kernel launches are asynchronous, so wait for them to finish before
# stopping the timer
cuda.synchronize()
end = perf_counter()

# Copy the result back to the host and check it against the CPU result.
# allclose rather than exact equality, because the GPU may contract
# a * x + y into a fused multiply-add with slightly different rounding
np.testing.assert_allclose(r, cp.asnumpy(r_cuda))
print(f"GPU time is {end - start}")

@njit
def proxy_cuda_call(r, a, x, y):
    # Do random sampling here
    # ...
    # CUDA kernels can't be launched from nopython mode, so drop into
    # object mode for the launch
    with objmode:
        # copy to device
        # ...
        cuda_axpy[N_BLOCKS, N_THREADS](r, a, x, y)


proxy_cuda_call(r, a, x, y)
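
# (Added sketch, not in the original gist): one possible way to fill in the
# "copy to device" placeholder above is to transfer explicitly with
# cuda.to_device / copy_to_host inside the objmode block. The function name
# proxy_cuda_call_explicit is hypothetical.

@njit
def proxy_cuda_call_explicit(r, a, x, y):
    with objmode:
        d_r = cuda.to_device(r)
        d_x = cuda.to_device(x)
        d_y = cuda.to_device(y)
        cuda_axpy[N_BLOCKS, N_THREADS](d_r, a, d_x, d_y)
        d_r.copy_to_host(r)


proxy_cuda_call_explicit(r, a, x, y)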

# Compile axpy to PTX for inspection. compile_ptx expects a plain Python
# function, so pass the dispatcher's underlying py_func.
ptx, resty = cuda.compile_ptx(
    axpy.py_func, (float32[::1], float32, float32[::1], float32[::1])
)
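
# (Added sketch, not part of the original demo): `ptx` is a string of PTX
# assembly and `resty` is the inferred return type, so a quick way to inspect
# the generated code is simply to print the first few lines of it.
print(resty)
print("\n".join(ptx.splitlines()[:20]))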