suminb/matmul.py

## matmul.py
import numpy as np
import pyopencl as cl


# Matrix dimensions used by run() when generating the random operands.
ROWS, COLS = 10, 10


def run():
    """Multiply two random float32 matrices with OpenCL and check with NumPy.

    Builds a ``ROWS x COLS`` matrix A and a ``COLS x ROWS`` matrix B, then
    computes ``C = A . B`` on the first device of the first OpenCL platform.
    The kernel is launched five times; after every launch the element-wise
    difference between the device result and NumPy's matrix product is
    printed, followed by the norm of that difference.  Both should be close
    to zero (up to float32 rounding).
    """
    a_np = np.random.rand(ROWS, COLS).astype(np.float32)
    # B must have as many rows as A has columns for the product to be defined.
    b_np = np.random.rand(COLS, ROWS).astype(np.float32)
    n_rows, inner = a_np.shape
    n_cols = b_np.shape[1]

    # Reference result: a true matrix product (not the element-wise `*`),
    # computed once because it does not change between launches.
    expected = np.dot(a_np, b_np)

    # Pick the first device of the first platform.
    platform = cl.get_platforms()[0]
    device = platform.get_devices(device_type=cl.device_type.ALL)[0]
    ctx = cl.Context([device])
    queue = cl.CommandQueue(ctx)

    mf = cl.mem_flags
    a_g = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=a_np)
    b_g = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=b_np)

    prg = cl.Program(ctx, """
        __kernel void
        mul(__global float* C,
                  __global float* A,
                  __global float* B,
                  int wA, int wB)
        {

           // 2D Thread ID
           int tx = get_global_id(0);
           int ty = get_global_id(1);

           // value stores the element
           // that is computed by the thread
           float value = 0;
           for (int k = 0; k < wA; ++k)
           {
              float elementA = A[ty * wA + k];
              float elementB = B[k * wB + tx];
              value += elementA * elementB;
           }

           // Write the matrix to device memory each
           // thread writes one element.
           // C has wB columns, so its row stride is wB.
           C[ty * wB + tx] = value;
        }
    """).build()

    # Fetch the kernel object once instead of on every loop iteration.
    mul_kernel = prg.mul

    res_np = np.empty((n_rows, n_cols), dtype=np.float32)
    res_g = cl.Buffer(ctx, mf.WRITE_ONLY, res_np.nbytes)

    for _ in range(5):
        # Global size is (columns, rows): dimension 0 feeds `tx` (column
        # index) and dimension 1 feeds `ty` (row index) in the kernel.
        event = mul_kernel(queue, (n_cols, n_rows), None,
                           res_g, a_g, b_g,
                           np.int32(inner), np.int32(n_cols))
        event.wait()

        cl.enqueue_copy(queue, res_np, res_g)

        # Check on CPU with NumPy.
        diff = res_np - expected
        print(diff)
        print(np.linalg.norm(diff))


# Script entry point: run the demo only when executed directly, not on import.
if __name__ == "__main__":
    run()
	import numpy as np
	import pyopencl as cl


	# Matrix dimensions used by run() when generating the random operands.
	ROWS, COLS = 10, 10


	def run():
		"""Multiply two random float32 matrices with OpenCL and check with NumPy.

		Builds a ``ROWS x COLS`` matrix A and a ``COLS x ROWS`` matrix B, then
		computes ``C = A . B`` on the first device of the first OpenCL platform.
		The kernel is launched five times; after every launch the element-wise
		difference between the device result and NumPy's matrix product is
		printed, followed by the norm of that difference.  Both should be
		close to zero (up to float32 rounding).
		"""
		a_np = np.random.rand(ROWS, COLS).astype(np.float32)
		# B must have as many rows as A has columns for the product to exist.
		b_np = np.random.rand(COLS, ROWS).astype(np.float32)
		n_rows, inner = a_np.shape
		n_cols = b_np.shape[1]

		# Reference result: a true matrix product (not the element-wise `*`),
		# computed once because it does not change between launches.
		expected = np.dot(a_np, b_np)

		# Pick the first device of the first platform.
		platform = cl.get_platforms()[0]
		device = platform.get_devices(device_type=cl.device_type.ALL)[0]
		ctx = cl.Context([device])
		queue = cl.CommandQueue(ctx)

		mf = cl.mem_flags
		a_g = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=a_np)
		b_g = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=b_np)

		prg = cl.Program(ctx, """
		__kernel void
		mul(__global float* C,
		    __global float* A,
		    __global float* B,
		    int wA, int wB)
		{
		   // 2D Thread ID
		   int tx = get_global_id(0);
		   int ty = get_global_id(1);

		   // value stores the element
		   // that is computed by the thread
		   float value = 0;
		   for (int k = 0; k < wA; ++k)
		   {
		      float elementA = A[ty * wA + k];
		      float elementB = B[k * wB + tx];
		      value += elementA * elementB;
		   }

		   // Write the matrix to device memory each
		   // thread writes one element.
		   // C has wB columns, so its row stride is wB.
		   C[ty * wB + tx] = value;
		}
		""").build()

		# Fetch the kernel object once instead of on every loop iteration.
		mul_kernel = prg.mul

		res_np = np.empty((n_rows, n_cols), dtype=np.float32)
		res_g = cl.Buffer(ctx, mf.WRITE_ONLY, res_np.nbytes)

		for _ in range(5):
			# Global size is (columns, rows): dimension 0 feeds `tx` (column
			# index) and dimension 1 feeds `ty` (row index) in the kernel.
			event = mul_kernel(queue, (n_cols, n_rows), None,
			                   res_g, a_g, b_g,
			                   np.int32(inner), np.int32(n_cols))
			event.wait()

			cl.enqueue_copy(queue, res_np, res_g)

			# Check on CPU with NumPy.
			diff = res_np - expected
			print(diff)
			print(np.linalg.norm(diff))


	# Script entry point: run the demo only when executed directly.
	if __name__ == '__main__':
		run()