NumbaPro example script in response to the blog post: http://www.quantatrisk.com/2014/12/06/gpu-accelerated-finance-in-python-with-numbapro/
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from timeit import default_timer as timer | |
import numpy as np | |
import numbapro | |
from numbapro.cudalib import curand | |
import numba.cuda | |
# Use the builtin matrix_multiply in NumPy for CPU test | |
import numpy.core.umath_tests as ut | |
@numbapro.guvectorize(['void(float32[:,:], float32[:,:], float32[:,:])'], | |
'(m, n),(n, p)->(m, p)', target='gpu') | |
def batch_matrix_mult(a, b, c): | |
for i in range(c.shape[0]): | |
for j in range(c.shape[1]): | |
tmp = 0 | |
for n in range(a.shape[1]): | |
tmp += a[i, n] * b[n, j] | |
c[i, j] = tmp | |
# batch_matrix_mult.max_blocksize = 512 | |
prng = curand.PRNG() | |
def main(): | |
n = 400000 | |
dim = 2 | |
sK = 0 | |
KN = 10 | |
# this array can be reused | |
db = numba.cuda.device_array((n, dim, dim), dtype=np.float32) | |
for K in range(KN): | |
# Matrix Multiplication: c = a x b | |
a = np.random.random(n * dim * dim).astype(np.float32).reshape(n, dim, | |
dim) | |
c = np.random.random(n * dim * dim).astype(np.float32).reshape(n, dim, | |
dim) | |
# NUMPY ------------------------------------------------------------- | |
start = timer() | |
b = np.random.random(n * dim * dim).astype(np.float32).reshape(n, dim, | |
dim) | |
d = ut.matrix_multiply(a, b) | |
np_time = timer() - start | |
# CUDA -------------------------------------------------------------- | |
dc = numba.cuda.device_array_like(c) | |
da = numba.cuda.to_device(a) | |
start = timer() | |
prng.uniform(db.ravel()) # generate random number on the device | |
batch_matrix_mult(da, db, out=dc) | |
numba.cuda.synchronize() | |
dc.copy_to_host(c) | |
cuda_time = timer() - start | |
sK += np_time / cuda_time | |
del da | |
print("\nThe average CUDA speed-up: %.5fx") % (sK / KN) | |
if __name__ == '__main__': | |
main() |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import numpy as np | |
from numbapro import cuda | |
import numbapro.cudalib.cublas as cublas | |
import numba | |
from timeit import default_timer as timer | |
from numba import float32 | |
bpg = 32 | |
tpb = 32 | |
n = bpg * tpb | |
shared_mem_size = (tpb, tpb) | |
griddim = bpg, bpg | |
blockdim = tpb, tpb | |
@numba.cuda.jit("void(float32[:,:], float32[:,:], float32[:,:])") | |
def naive_matrix_mult(A, B, C): | |
x, y = cuda.grid(2) | |
if x >= n or y >= n: | |
return | |
C[y, x] = 0 | |
for i in range(n): | |
C[y, x] += A[y, i] * B[i, x] | |
@numba.cuda.jit("void(float32[:,:], float32[:,:], float32[:,:])") | |
def optimized_matrix_mult(A, B, C): | |
# Declare shared memory | |
sA = cuda.shared.array(shape=shared_mem_size, dtype=float32) | |
sB = cuda.shared.array(shape=shared_mem_size, dtype=float32) | |
tx = cuda.threadIdx.x | |
ty = cuda.threadIdx.y | |
x, y = cuda.grid(2) | |
acc = 0 | |
for i in range(bpg): | |
if x < n and y < n: | |
# Prefill cache | |
sA[ty, tx] = A[y, tx + i * tpb] | |
sB[ty, tx] = B[ty + i * tpb, x] | |
# Synchronize all threads in the block | |
cuda.syncthreads() | |
if x < n and y < n: | |
# Compute product | |
for j in range(tpb): | |
acc += sA[ty, j] * sB[j, tx] | |
# Wait until all threads finish the computation | |
cuda.syncthreads() | |
if x < n and y < n: | |
C[y, x] = acc | |
# Prepare data on the CPU | |
A = np.array(np.random.random((n, n)), dtype=np.float32) | |
B = np.array(np.random.random((n, n)), dtype=np.float32) | |
print "(%d x %d) x (%d x %d)" % (n, n, n, n) | |
# Prepare data on the GPU | |
dA = cuda.to_device(A) | |
dB = cuda.to_device(B) | |
dC = cuda.device_array_like(A) | |
# Time the unoptimized version | |
s = timer() | |
naive_matrix_mult[griddim, blockdim](dA, dB, dC) | |
numba.cuda.synchronize() | |
e = timer() | |
unopt_ans = dC.copy_to_host() | |
tcuda_unopt = e - s | |
# Time the optimized version | |
s = timer() | |
optimized_matrix_mult[griddim, blockdim](dA, dB, dC) | |
numba.cuda.synchronize() | |
e = timer() | |
opt_ans = dC.copy_to_host() | |
tcuda_opt = e - s | |
# Time the cuBLAS version | |
blas = cublas.Blas() | |
s = timer() | |
blas.gemm('T', 'T', n, n, n, 1, dA, dB, 0, dC) | |
numba.cuda.synchronize() | |
e = timer() | |
tcuda_blas = e - s | |
blas_ans = dC.copy_to_host().T | |
assert np.allclose(unopt_ans, opt_ans) | |
assert np.allclose(unopt_ans, blas_ans) | |
print "CUDA without shared memory:", "%.3f" % tcuda_unopt, "s" | |
print "CUDA with shared memory :", "%.3f" % tcuda_opt, "s" | |
print "CUDA BLAS GEMM :", "%.3f" % tcuda_blas, "s" | |
s = timer() | |
np.dot(A,B) | |
e = timer() | |
npt=e-s | |
print "NumPy dot product :", "%.3f" % npt, "s" |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment