Skip to content

Instantly share code, notes, and snippets.

@suminb
Created February 3, 2015 12:20
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save suminb/c6fb6df9cc357cf6f6eb to your computer and use it in GitHub Desktop.
Save suminb/c6fb6df9cc357cf6f6eb to your computer and use it in GitHub Desktop.
Matrix Multiplication
import numpy as np
import pyopencl as cl
# Matrix dimensions used by run() to size the random test matrices and the
# OpenCL kernel launch.
ROWS = 10
COLS = 10
# OpenCL C source for a naive matrix product C = A x B on row-major buffers.
# One work-item computes one element of C: global id 0 selects the column,
# global id 1 selects the row.
_MATMUL_KERNEL_SRC = """
__kernel void
mul(__global float* C,
    __global float* A,
    __global float* B,
    int wA, int wB)
{
    // 2D work-item ID: tx is the column of C, ty is the row of C
    int tx = get_global_id(0);
    int ty = get_global_id(1);

    // Accumulates the dot product of row ty of A and column tx of B
    float value = 0;
    for (int k = 0; k < wA; ++k)
    {
        float elementA = A[ty * wA + k];
        float elementB = B[k * wB + tx];
        value += elementA * elementB;
    }

    // C has wB columns, so its row stride is wB (not wA)
    C[ty * wB + tx] = value;
}
"""


def run():
    """Multiply two random float32 matrices with OpenCL and check the result.

    Builds A with shape (ROWS, COLS) and B with shape (COLS, ROWS), computes
    C = A x B on the first device of the first OpenCL platform, copies C back
    to the host, and prints the element-wise difference against NumPy's
    matrix product followed by the norm of that difference.

    Returns:
        None. The comparison is written to standard output.
    """
    # A is (n_rows x n_inner), B is (n_inner x n_cols), C is (n_rows x n_cols).
    # B uses the transposed shape of A so the product is defined even when
    # ROWS != COLS; for square sizes this is the same shape as before.
    n_rows, n_inner, n_cols = ROWS, COLS, ROWS

    a_np = np.random.rand(n_rows, n_inner).astype(np.float32)
    b_np = np.random.rand(n_inner, n_cols).astype(np.float32)
    res_np = np.empty((n_rows, n_cols), dtype=np.float32)

    # Pick the first available device explicitly instead of prompting.
    platform = cl.get_platforms()[0]
    device = platform.get_devices(device_type=cl.device_type.ALL)[0]
    ctx = cl.Context([device])
    queue = cl.CommandQueue(ctx)

    mf = cl.mem_flags
    a_g = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=a_np)
    b_g = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=b_np)
    res_g = cl.Buffer(ctx, mf.WRITE_ONLY, res_np.nbytes)

    prg = cl.Program(ctx, _MATMUL_KERNEL_SRC).build()

    # The kernel is launched several times; every launch writes the same
    # result into res_g.
    for _ in range(5):
        # Global size is (columns of C, rows of C) because the kernel maps
        # get_global_id(0) to the column index and get_global_id(1) to the row.
        event = prg.mul(queue, (n_cols, n_rows), None,
                        res_g, a_g, b_g,
                        np.int32(n_inner), np.int32(n_cols))
        event.wait()

    cl.enqueue_copy(queue, res_np, res_g)

    # Check on CPU with NumPy: the reference must be the matrix product,
    # not the element-wise product.
    error = res_np - np.dot(a_np, b_np)
    print(error)
    print(np.linalg.norm(error))
# Run the demo only when this file is executed as a script, not on import.
if __name__ == "__main__":
    run()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment