@sklam
Created December 8, 2014 19:04
from timeit import default_timer as timer
import numpy as np
import numbapro
from numbapro.cudalib import curand
import numba.cuda
# Use the builtin matrix_multiply in NumPy for CPU test
import numpy.core.umath_tests as ut
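
# Benchmark: 400000 independent 2x2 matrix products per iteration, comparing
# NumPy's matrix_multiply gufunc on the CPU against a guvectorize'd kernel
# running on the GPU. Random input generation is counted in both timings
# (np.random for b on the host, curand for db on the device); the CUDA timing
# also includes copying the result back to the host.
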
@numbapro.guvectorize(['void(float32[:,:], float32[:,:], float32[:,:])'],
                      '(m, n),(n, p)->(m, p)', target='gpu')
def batch_matrix_mult(a, b, c):
    for i in range(c.shape[0]):
        for j in range(c.shape[1]):
            tmp = 0
            for n in range(a.shape[1]):
                tmp += a[i, n] * b[n, j]
            c[i, j] = tmp


# batch_matrix_mult.max_blocksize = 512
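# Note: the gufunc signature '(m, n),(n, p)->(m, p)' makes the loops above
# operate on one (dim x dim) matrix per batch element; with target='gpu',
# NumbaPro schedules those per-element core computations across CUDA threads.
# As far as I understand the NumbaPro API, uncommenting max_blocksize above
# would cap the number of threads per block used for that launch.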
prng = curand.PRNG()
def main():
    n = 400000
    dim = 2

    sK = 0
    KN = 10

    # this array can be reused across iterations; curand refills it in place
    db = numba.cuda.device_array((n, dim, dim), dtype=np.float32)

    for K in range(KN):
        # Matrix Multiplication: c = a x b
        a = np.random.random(n * dim * dim).astype(np.float32).reshape(n, dim, dim)
        c = np.random.random(n * dim * dim).astype(np.float32).reshape(n, dim, dim)

        # NUMPY -------------------------------------------------------------
        start = timer()
        b = np.random.random(n * dim * dim).astype(np.float32).reshape(n, dim, dim)
        d = ut.matrix_multiply(a, b)
        np_time = timer() - start

        # CUDA --------------------------------------------------------------
        dc = numba.cuda.device_array_like(c)
        da = numba.cuda.to_device(a)

        start = timer()
        prng.uniform(db.ravel())  # generate random numbers directly on the device
        batch_matrix_mult(da, db, out=dc)
        numba.cuda.synchronize()
        dc.copy_to_host(c)
        cuda_time = timer() - start

        sK += np_time / cuda_time
        del da

    print("\nThe average CUDA speed-up: %.5fx" % (sK / KN))


if __name__ == '__main__':
    main()
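
# ---------------------------------------------------------------------------
# Second listing from the gist: a single (n x n) matrix multiply comparing a
# naive CUDA kernel, a shared-memory (tiled) kernel, cuBLAS GEMM, and
# NumPy's dot on the CPU.
# ---------------------------------------------------------------------------
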
import numpy as np
from numbapro import cuda
import numbapro.cudalib.cublas as cublas
import numba
from timeit import default_timer as timer
from numba import float32
bpg = 32
tpb = 32
n = bpg * tpb
shared_mem_size = (tpb, tpb)
griddim = bpg, bpg
blockdim = tpb, tpb
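
# Launch configuration: a 32 x 32 grid of 32 x 32 blocks, i.e. one thread per
# element of the n x n (1024 x 1024) result matrix. shared_mem_size is the
# shape of the per-block tiles staged in shared memory below.
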
@numba.cuda.jit("void(float32[:,:], float32[:,:], float32[:,:])")
def naive_matrix_mult(A, B, C):
    x, y = cuda.grid(2)
    if x >= n or y >= n:
        return

    C[y, x] = 0
    for i in range(n):
        C[y, x] += A[y, i] * B[i, x]
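# Each thread above reads an entire row of A and an entire column of B straight
# from global memory. The version below tiles the computation through shared
# memory so that each loaded element is reused by all threads in the block.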
@numba.cuda.jit("void(float32[:,:], float32[:,:], float32[:,:])")
def optimized_matrix_mult(A, B, C):
    # Declare shared memory
    sA = cuda.shared.array(shape=shared_mem_size, dtype=float32)
    sB = cuda.shared.array(shape=shared_mem_size, dtype=float32)

    tx = cuda.threadIdx.x
    ty = cuda.threadIdx.y
    x, y = cuda.grid(2)

    acc = 0
    for i in range(bpg):
        if x < n and y < n:
            # Prefill cache
            sA[ty, tx] = A[y, tx + i * tpb]
            sB[ty, tx] = B[ty + i * tpb, x]

        # Synchronize all threads in the block
        cuda.syncthreads()

        if x < n and y < n:
            # Compute product
            for j in range(tpb):
                acc += sA[ty, j] * sB[j, tx]

        # Wait until all threads finish the computation
        cuda.syncthreads()

    if x < n and y < n:
        C[y, x] = acc
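# The loop over i walks the bpg tiles of width tpb along the shared dimension:
# every iteration the block cooperatively loads one tpb x tpb tile of A and of
# B into shared memory, and each thread accumulates the partial dot product for
# its output element from those tiles. Both cuda.syncthreads() calls sit
# outside the boundary guard so that every thread in the block reaches them.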
# Prepare data on the CPU
A = np.array(np.random.random((n, n)), dtype=np.float32)
B = np.array(np.random.random((n, n)), dtype=np.float32)
print "(%d x %d) x (%d x %d)" % (n, n, n, n)
# Prepare data on the GPU
dA = cuda.to_device(A)
dB = cuda.to_device(B)
dC = cuda.device_array_like(A)
# Time the unoptimized version
s = timer()
naive_matrix_mult[griddim, blockdim](dA, dB, dC)
numba.cuda.synchronize()
e = timer()
unopt_ans = dC.copy_to_host()
tcuda_unopt = e - s
# Time the optimized version
s = timer()
optimized_matrix_mult[griddim, blockdim](dA, dB, dC)
numba.cuda.synchronize()
e = timer()
opt_ans = dC.copy_to_host()
tcuda_opt = e - s
# Time the cuBLAS version
blas = cublas.Blas()
s = timer()
blas.gemm('T', 'T', n, n, n, 1, dA, dB, 0, dC)
numba.cuda.synchronize()
e = timer()
tcuda_blas = e - s
blas_ans = dC.copy_to_host().T
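# cuBLAS GEMM operates on column-major (Fortran-order) data, while these NumPy
# arrays are C-contiguous, so cuBLAS effectively sees A.T and B.T; hence the
# 'T', 'T' arguments on the way in and the final .T on the copied-back result,
# which makes blas_ans comparable to the row-major kernels above.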
assert np.allclose(unopt_ans, opt_ans)
assert np.allclose(unopt_ans, blas_ans)
print "CUDA without shared memory:", "%.3f" % tcuda_unopt, "s"
print "CUDA with shared memory :", "%.3f" % tcuda_opt, "s"
print "CUDA BLAS GEMM :", "%.3f" % tcuda_blas, "s"
s = timer()
np.dot(A,B)
e = timer()
npt=e-s
print "NumPy dot product :", "%.3f" % npt, "s"