# alexbw/opencl_test.py

## opencl_test.py
import pyopencl as cl
import pyopencl.array as cl_array
from pyopencl.elementwise import ElementwiseKernel
import numpy.linalg as la
import numpy as np
from time import clock
from pylab import *

# Benchmark z = a*x + b*y computed with NumPy on the CPU against the same
# operation run as an OpenCL elementwise kernel, over array sizes
# 10^0 .. 10^(n_log-1).  Four series are recorded, all in ms per iteration:
#   cpu_time / gpu_time                   -- compute only
#   cpu_time_accessed / gpu_time_accessed -- compute, then read one element
#                                            of the result on the host
#
# NOTE(review): `clock` is time.clock, which reports CPU time (not wall-clock
# time) on Unix and was removed in Python 3.8 -- consider a wall-clock timer.
ctx = cl.create_some_context()
queue = cl.CommandQueue(ctx)

do_times = 20 # how many times do you want to perform the computation?
a = 5  # scalar coefficient applied to x
b = 6  # scalar coefficient applied to y

lin_comb = ElementwiseKernel(ctx, "float a, float *x, float b, float *y, float *z",
                                    "z[i] = a*x[i] + b*y[i]",
                                    "linear_combination")

n_log = 8 # number of sizes tested: 10^0 .. 10^(n_log-1). My GPU (GeForce GT 330M) craps out at 10^8 floats.
cpu_time = np.empty((n_log,))
gpu_time = np.empty((n_log,))
cpu_time_accessed = np.empty((n_log,))
gpu_time_accessed = np.empty((n_log,))

# Exact integer powers of ten.  np.logspace returns floats, which NumPy
# does not accept as array shapes.
array_sizes = 10 ** np.arange(n_log)

print("\n\n")
for (counter, n) in enumerate(array_sizes):

    print(counter)

    a_cpu = np.ones((n,)).astype('float32')
    b_cpu = np.ones((n,)).astype('float32')
    # NOTE(review): to_device(ctx, queue, ary) is the legacy PyOpenCL
    # signature; newer releases take (queue, ary) -- confirm against the
    # installed version.
    a_gpu = cl_array.to_device(ctx, queue, a_cpu)
    b_gpu = cl_array.to_device(ctx, queue, b_cpu)
    c_gpu = cl_array.empty_like(a_gpu)

    # CPU, compute only (result discarded).
    t_cpu = clock()
    for i in range(do_times):
        a*a_cpu+b*b_cpu
    cpu_time[counter] = 1000.0 * (clock() - t_cpu) / do_times

    # GPU, compute only.  Kernel launches are only enqueued, so drain the
    # queue before starting the timer (pending uploads) and again before
    # stopping it (the kernels themselves); otherwise only the enqueue
    # overhead is measured.
    queue.finish()
    t_gpu = clock()
    for i in range(do_times):
        lin_comb(a, a_gpu, b, b_gpu, c_gpu)
    queue.finish()
    gpu_time[counter] = 1000.0 * (clock() - t_gpu) / do_times

    # CPU, compute and touch one element of the result.
    t_cpu = clock()
    for i in range(do_times):
        c_cpu = a*a_cpu+b*b_cpu
        c_cpu[0]
    cpu_time_accessed[counter] = 1000.0 * (clock() - t_cpu) / do_times

    # GPU, compute and fetch the result to the host; .get() copies the
    # device array back, so the kernel must have finished by then.
    t_gpu = clock()
    for i in range(do_times):
        lin_comb(a, a_gpu, b, b_gpu, c_gpu)
        c_gpu.get()[0]
    gpu_time_accessed[counter] = 1000.0 * (clock() - t_gpu) / do_times


# Draw the four timing series on one log-log plot.
# Green = CPU, red = GPU; 'x' markers = compute only, 'o' markers = compute
# plus reading the result back.
cla()
for timings, line_fmt in (
        (cpu_time, '-gx'),
        (gpu_time, '-rx'),
        (cpu_time_accessed, '-go'),
        (gpu_time_accessed, '-ro')):
    loglog(array_sizes, timings, line_fmt)

xlabel('array size')
ylabel('ms / iteration')
title('Execution time for a*x[] + b*y[] operation')

# Legend entries are listed in the same order the series were plotted above.
series_labels = ('CPU, w/out retrieval', 'GPU, w/out retrieval', 'CPU w/ retrieval', 'GPU w/ retrieval')
legend(series_labels, loc='best')
	# Second copy of the CPU-vs-OpenCL benchmark for z = a*x + b*y.
# This copy had been mangled in transit: every line carried a stray leading
# tab, loop bodies had lost their indentation, and every '*' had been
# stripped (pointer declarations, multiplications, and the plot title).
# It is restored here as a runnable script.
import pyopencl as cl
import pyopencl.array as cl_array
from pyopencl.elementwise import ElementwiseKernel
import numpy.linalg as la
import numpy as np
from time import clock
from pylab import *

# NOTE(review): `clock` is time.clock, which reports CPU time (not wall-clock
# time) on Unix and was removed in Python 3.8 -- consider a wall-clock timer.
ctx = cl.create_some_context()
queue = cl.CommandQueue(ctx)

do_times = 20 # how many times do you want to perform the computation?
a = 5  # scalar coefficient applied to x
b = 6  # scalar coefficient applied to y

lin_comb = ElementwiseKernel(ctx, "float a, float *x, float b, float *y, float *z",
                             "z[i] = a*x[i] + b*y[i]",
                             "linear_combination")

n_log = 8 # number of sizes tested: 10^0 .. 10^(n_log-1). My GPU (GeForce GT 330M) craps out at 10^8 floats.
cpu_time = np.empty((n_log,))
gpu_time = np.empty((n_log,))
cpu_time_accessed = np.empty((n_log,))
gpu_time_accessed = np.empty((n_log,))

# Exact integer powers of ten.  np.logspace returns floats, which NumPy
# does not accept as array shapes.
array_sizes = 10 ** np.arange(n_log)

print("\n\n")
for (counter, n) in enumerate(array_sizes):

    print(counter)

    a_cpu = np.ones((n,)).astype('float32')
    b_cpu = np.ones((n,)).astype('float32')
    # NOTE(review): to_device(ctx, queue, ary) is the legacy PyOpenCL
    # signature; newer releases take (queue, ary) -- confirm against the
    # installed version.
    a_gpu = cl_array.to_device(ctx, queue, a_cpu)
    b_gpu = cl_array.to_device(ctx, queue, b_cpu)
    c_gpu = cl_array.empty_like(a_gpu)

    # CPU, compute only (result discarded).
    t_cpu = clock()
    for i in range(do_times):
        a*a_cpu+b*b_cpu
    cpu_time[counter] = 1000.0 * (clock() - t_cpu) / do_times

    # GPU, compute only.  Kernel launches are only enqueued, so drain the
    # queue before starting and before stopping the timer; otherwise only
    # the enqueue overhead is measured.
    queue.finish()
    t_gpu = clock()
    for i in range(do_times):
        lin_comb(a, a_gpu, b, b_gpu, c_gpu)
    queue.finish()
    gpu_time[counter] = 1000.0 * (clock() - t_gpu) / do_times

    # CPU, compute and touch one element of the result.
    t_cpu = clock()
    for i in range(do_times):
        c_cpu = a*a_cpu+b*b_cpu
        c_cpu[0]
    cpu_time_accessed[counter] = 1000.0 * (clock() - t_cpu) / do_times

    # GPU, compute and fetch the result to the host via .get().
    t_gpu = clock()
    for i in range(do_times):
        lin_comb(a, a_gpu, b, b_gpu, c_gpu)
        c_gpu.get()[0]
    gpu_time_accessed[counter] = 1000.0 * (clock() - t_gpu) / do_times


# Green = CPU, red = GPU; 'x' = compute only, 'o' = compute plus retrieval.
cla()
loglog(array_sizes, cpu_time, '-gx')
loglog(array_sizes, gpu_time, '-rx')
loglog(array_sizes, cpu_time_accessed, '-go')
loglog(array_sizes, gpu_time_accessed, '-ro')
ylabel('ms / iteration')
xlabel('array size')
title('Execution time for a*x[] + b*y[] operation')
legend(('CPU, w/out retrieval', 'GPU, w/out retrieval', 'CPU w/ retrieval', 'GPU w/ retrieval'), loc='best')