sklam/batch_matrix_mult.py

## batch_matrix_mult.py
from timeit import default_timer as timer

import numpy as np

import numbapro
from numbapro.cudalib import curand
import numba.cuda

# Use the builtin matrix_multiply in NumPy for CPU test
import numpy.core.umath_tests as ut


@numbapro.guvectorize(['void(float32[:,:], float32[:,:], float32[:,:])'],
                      '(m, n),(n, p)->(m, p)', target='gpu')
def batch_matrix_mult(a, b, c):
    for i in range(c.shape[0]):
        for j in range(c.shape[1]):
            tmp = 0
            for n in range(a.shape[1]):
                tmp += a[i, n] * b[n, j]
            c[i, j] = tmp

# batch_matrix_mult.max_blocksize = 512
prng = curand.PRNG()

def main():
    n = 400000
    dim = 2

    sK = 0
    KN = 10

    # this array can be reused
    db = numba.cuda.device_array((n, dim, dim), dtype=np.float32)

    for K in range(KN):
        # Matrix Multiplication:   c = a x b

        a = np.random.random(n * dim * dim).astype(np.float32).reshape(n, dim,
                                                                       dim)
        c = np.random.random(n * dim * dim).astype(np.float32).reshape(n, dim,
                                                                       dim)

        # NUMPY -------------------------------------------------------------
        start = timer()
        b = np.random.random(n * dim * dim).astype(np.float32).reshape(n, dim,
                                                                       dim)
        d = ut.matrix_multiply(a, b)
        np_time = timer() - start

        # CUDA --------------------------------------------------------------
        dc = numba.cuda.device_array_like(c)
        da = numba.cuda.to_device(a)

        start = timer()
        prng.uniform(db.ravel())  # generate random number on the device
        batch_matrix_mult(da, db, out=dc)
        numba.cuda.synchronize()
        dc.copy_to_host(c)
        cuda_time = timer() - start

        sK += np_time / cuda_time

        del da

    print("\nThe average CUDA speed-up: %.5fx") % (sK / KN)


if __name__ == '__main__':
    main()

## randomportfolios.py
import numpy as np
from numbapro import cuda
import numbapro.cudalib.cublas as cublas
import numba
from timeit import default_timer as timer
from numba import float32

bpg = 32
tpb = 32

n = bpg * tpb

shared_mem_size = (tpb, tpb)
griddim = bpg, bpg
blockdim = tpb, tpb

@numba.cuda.jit("void(float32[:,:], float32[:,:], float32[:,:])")
def naive_matrix_mult(A, B, C):
    x, y = cuda.grid(2)
    if x >= n or y >= n:
        return

    C[y, x] = 0
    for i in range(n):
        C[y, x] += A[y, i] * B[i, x]


@numba.cuda.jit("void(float32[:,:], float32[:,:], float32[:,:])")
def optimized_matrix_mult(A, B, C):

    # Declare shared memory
    sA = cuda.shared.array(shape=shared_mem_size, dtype=float32)
    sB = cuda.shared.array(shape=shared_mem_size, dtype=float32)

    tx = cuda.threadIdx.x
    ty = cuda.threadIdx.y
    x, y = cuda.grid(2)

    acc = 0
    for i in range(bpg):
        if x < n and y < n:
            # Prefill cache
            sA[ty, tx] = A[y, tx + i * tpb]
            sB[ty, tx] = B[ty + i * tpb, x]

        # Synchronize all threads in the block
        cuda.syncthreads()

        if x < n and y < n:
            # Compute product
            for j in range(tpb):
                acc += sA[ty, j] * sB[j, tx]

        # Wait until all threads finish the computation
        cuda.syncthreads()

    if x < n and y < n:
        C[y, x] = acc


# Prepare data on the CPU
A = np.array(np.random.random((n, n)), dtype=np.float32)
B = np.array(np.random.random((n, n)), dtype=np.float32)

print "(%d x %d) x (%d x %d)" % (n, n, n, n)

# Prepare data on the GPU
dA = cuda.to_device(A)
dB = cuda.to_device(B)
dC = cuda.device_array_like(A)

# Time the unoptimized version
s = timer()
naive_matrix_mult[griddim, blockdim](dA, dB, dC)
numba.cuda.synchronize()
e = timer()
unopt_ans = dC.copy_to_host()
tcuda_unopt = e - s

# Time the optimized version
s = timer()
optimized_matrix_mult[griddim, blockdim](dA, dB, dC)
numba.cuda.synchronize()
e = timer()
opt_ans = dC.copy_to_host()
tcuda_opt = e - s


# Time the cuBLAS version
blas = cublas.Blas()
s = timer()
blas.gemm('T', 'T', n, n, n, 1, dA, dB, 0, dC)
numba.cuda.synchronize()
e = timer()
tcuda_blas = e - s
blas_ans = dC.copy_to_host().T

assert np.allclose(unopt_ans, opt_ans)
assert np.allclose(unopt_ans, blas_ans)
print "CUDA without shared memory:", "%.3f" % tcuda_unopt, "s"
print "CUDA with shared memory   :", "%.3f" % tcuda_opt, "s"
print "CUDA BLAS GEMM            :", "%.3f" % tcuda_blas, "s"

s = timer()
np.dot(A,B)
e = timer()
npt=e-s
print "NumPy dot product         :", "%.3f" % npt, "s"
	from timeit import default_timer as timer

	import numpy as np

	import numbapro
	from numbapro.cudalib import curand
	import numba.cuda

	# Use the builtin matrix_multiply in NumPy for CPU test
	import numpy.core.umath_tests as ut


	@numbapro.guvectorize(['void(float32[:,:], float32[:,:], float32[:,:])'],
	'(m, n),(n, p)->(m, p)', target='gpu')
	def batch_matrix_mult(a, b, c):
	for i in range(c.shape[0]):
	for j in range(c.shape[1]):
	tmp = 0
	for n in range(a.shape[1]):
	tmp += a[i, n] * b[n, j]
	c[i, j] = tmp

	# batch_matrix_mult.max_blocksize = 512
	prng = curand.PRNG()

	def main():
	n = 400000
	dim = 2

	sK = 0
	KN = 10

	# this array can be reused
	db = numba.cuda.device_array((n, dim, dim), dtype=np.float32)

	for K in range(KN):
	# Matrix Multiplication: c = a x b

	a = np.random.random(n * dim * dim).astype(np.float32).reshape(n, dim,
	dim)
	c = np.random.random(n * dim * dim).astype(np.float32).reshape(n, dim,
	dim)

	# NUMPY -------------------------------------------------------------
	start = timer()
	b = np.random.random(n * dim * dim).astype(np.float32).reshape(n, dim,
	dim)
	d = ut.matrix_multiply(a, b)
	np_time = timer() - start

	# CUDA --------------------------------------------------------------
	dc = numba.cuda.device_array_like(c)
	da = numba.cuda.to_device(a)

	start = timer()
	prng.uniform(db.ravel()) # generate random number on the device
	batch_matrix_mult(da, db, out=dc)
	numba.cuda.synchronize()
	dc.copy_to_host(c)
	cuda_time = timer() - start

	sK += np_time / cuda_time

	del da

	print("\nThe average CUDA speed-up: %.5fx") % (sK / KN)


	if __name__ == '__main__':
	main()
	import numpy as np
	from numbapro import cuda
	import numbapro.cudalib.cublas as cublas
	import numba
	from timeit import default_timer as timer
	from numba import float32

	bpg = 32
	tpb = 32

	n = bpg * tpb

	shared_mem_size = (tpb, tpb)
	griddim = bpg, bpg
	blockdim = tpb, tpb

	@numba.cuda.jit("void(float32[:,:], float32[:,:], float32[:,:])")
	def naive_matrix_mult(A, B, C):
	x, y = cuda.grid(2)
	if x >= n or y >= n:
	return

	C[y, x] = 0
	for i in range(n):
	C[y, x] += A[y, i] * B[i, x]


	@numba.cuda.jit("void(float32[:,:], float32[:,:], float32[:,:])")
	def optimized_matrix_mult(A, B, C):

	# Declare shared memory
	sA = cuda.shared.array(shape=shared_mem_size, dtype=float32)
	sB = cuda.shared.array(shape=shared_mem_size, dtype=float32)

	tx = cuda.threadIdx.x
	ty = cuda.threadIdx.y
	x, y = cuda.grid(2)

	acc = 0
	for i in range(bpg):
	if x < n and y < n:
	# Prefill cache
	sA[ty, tx] = A[y, tx + i * tpb]
	sB[ty, tx] = B[ty + i * tpb, x]

	# Synchronize all threads in the block
	cuda.syncthreads()

	if x < n and y < n:
	# Compute product
	for j in range(tpb):
	acc += sA[ty, j] * sB[j, tx]

	# Wait until all threads finish the computation
	cuda.syncthreads()

	if x < n and y < n:
	C[y, x] = acc


	# Prepare data on the CPU
	A = np.array(np.random.random((n, n)), dtype=np.float32)
	B = np.array(np.random.random((n, n)), dtype=np.float32)

	print "(%d x %d) x (%d x %d)" % (n, n, n, n)

	# Prepare data on the GPU
	dA = cuda.to_device(A)
	dB = cuda.to_device(B)
	dC = cuda.device_array_like(A)

	# Time the unoptimized version
	s = timer()
	naive_matrix_mult[griddim, blockdim](dA, dB, dC)
	numba.cuda.synchronize()
	e = timer()
	unopt_ans = dC.copy_to_host()
	tcuda_unopt = e - s

	# Time the optimized version
	s = timer()
	optimized_matrix_mult[griddim, blockdim](dA, dB, dC)
	numba.cuda.synchronize()
	e = timer()
	opt_ans = dC.copy_to_host()
	tcuda_opt = e - s


	# Time the cuBLAS version
	blas = cublas.Blas()
	s = timer()
	blas.gemm('T', 'T', n, n, n, 1, dA, dB, 0, dC)
	numba.cuda.synchronize()
	e = timer()
	tcuda_blas = e - s
	blas_ans = dC.copy_to_host().T

	assert np.allclose(unopt_ans, opt_ans)
	assert np.allclose(unopt_ans, blas_ans)
	print "CUDA without shared memory:", "%.3f" % tcuda_unopt, "s"
	print "CUDA with shared memory :", "%.3f" % tcuda_opt, "s"
	print "CUDA BLAS GEMM :", "%.3f" % tcuda_blas, "s"

	s = timer()
	np.dot(A,B)
	e = timer()
	npt=e-s
	print "NumPy dot product :", "%.3f" % npt, "s"