@gmarkall
Created August 3, 2021 17:59
CUDA demo presented at the 2021-08-03 Numba meeting (not executable, was modified to exemplify various things)
import math
from numba import cuda, njit, objmode, float32
from time import perf_counter
import numpy as np
import cupy as cp

@njit
def axpy_body(a, x, y):
    return a * x + y


@njit
def axpy(r, a, x, y):
    for i in range(len(r)):
        r[i] = axpy_body(a, x[i], y[i])

np.random.seed(1)

N = 1000000
x = np.random.random(N)
y = np.random.random(N)
r = np.zeros_like(x)
a = 2.0

# Warm up the CPU JIT so compilation time is excluded from the timing loop
axpy(r, a, x, y)

N_ITERATIONS = 100

start = perf_counter()
for i in range(N_ITERATIONS):
    axpy(r, a, x, y)
end = perf_counter()

print(f"CPU time is {end - start}")

@cuda.jit
def cuda_axpy(r, a, x, y):
    # Grid-stride loop: each thread handles elements i, i + gridsize, ...
    i = cuda.grid(1)
    gs = cuda.gridsize(1)
    for i in range(i, len(r), gs):
        r[i] = axpy_body(a, x[i], y[i])

N_THREADS = 256
N_BLOCKS = 72 * 8

# Copy the host data to the device as CuPy arrays (the inputs must match the
# CPU inputs so the results can be compared below)
x_cuda = cp.asarray(x)
y_cuda = cp.asarray(y)
r_cuda = cp.zeros_like(r)
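
# (Added sketch, not in the original gist): warm up the CUDA kernel once so
# that its JIT compilation time is excluded from the timed loop, mirroring the
# CPU warm-up above.
cuda_axpy[N_BLOCKS, N_THREADS](r_cuda, a, x_cuda, y_cuda)
cuda.synchronize()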

start = perf_counter()
for i in range(N_ITERATIONS):
    cuda_axpy[N_BLOCKS, N_THREADS](r_cuda, a, x_cuda, y_cuda)
# Kernel launches are asynchronous, so wait for them to finish before
# stopping the timer
cuda.synchronize()
end = perf_counter()

# Copy the result back to the host and check it against the CPU result.
# allclose rather than exact equality, because the GPU may contract
# a * x + y into a fused multiply-add with slightly different rounding
np.testing.assert_allclose(r, cp.asnumpy(r_cuda))
print(f"GPU time is {end - start}")

@njit
def proxy_cuda_call(r, a, x, y):
    # Do random sampling here
    # ...
    # CUDA kernels can't be launched from nopython mode, so drop into
    # object mode for the launch
    with objmode:
        # copy to device
        # ...
        cuda_axpy[N_BLOCKS, N_THREADS](r, a, x, y)


proxy_cuda_call(r, a, x, y)
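
# (Added sketch, not in the original gist): one possible way to fill in the
# "copy to device" placeholder above is to transfer explicitly with
# cuda.to_device / copy_to_host inside the objmode block. The function name
# proxy_cuda_call_explicit is hypothetical.

@njit
def proxy_cuda_call_explicit(r, a, x, y):
    with objmode:
        d_r = cuda.to_device(r)
        d_x = cuda.to_device(x)
        d_y = cuda.to_device(y)
        cuda_axpy[N_BLOCKS, N_THREADS](d_r, a, d_x, d_y)
        d_r.copy_to_host(r)


proxy_cuda_call_explicit(r, a, x, y)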

# Compile axpy to PTX for inspection. compile_ptx expects a plain Python
# function, so pass the dispatcher's underlying py_func.
ptx, resty = cuda.compile_ptx(
    axpy.py_func, (float32[::1], float32, float32[::1], float32[::1])
)
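
# (Added sketch, not part of the original demo): `ptx` is a string of PTX
# assembly and `resty` is the inferred return type, so a quick way to inspect
# the generated code is simply to print the first few lines of it.
print(resty)
print("\n".join(ptx.splitlines()[:20]))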