fasiha/test_pinned_register_v2.py

## test_pinned_register_v2.py
#!/usr/bin/env python

"""
Compare the performance of host-registered (pinned) host memory against
ordinary pageable (unpinned) host memory. The grid uses more than one block
so that larger arrays are covered, and the unpinned case is timed first.
"""

import numpy as np

import pycuda.autoinit
import pycuda.driver as drv
from pycuda.compiler import SourceModule

from time import time

# CUDA source for the test kernel: every thread computes a global index from
# its block and thread ids and adds 1 to a[idx]; threads whose index is not
# below N do nothing.
_INCREMENT_SOURCE = """
__global__ void increment(double *a, int N)
{
    int idx = threadIdx.x + blockIdx.x*blockDim.x;
    if (idx < N)
        a[idx] = a[idx]+1;
}
"""
increment_mod = SourceModule(_INCREMENT_SOURCE)
increment = increment_mod.get_function("increment")

# N is the side length of the square test array (N*N doubles); M is the number
# of timed repetitions per memory type.
# Author's note on N = 23: "breaks. Works if <= 22". 23*23 = 529 elements is
# the first size that needs more than one 512-thread block (22*22 = 484).
N, M = 23, 3

# Time use of pageable host memory.
# x is an ordinary NumPy array handed to the kernel through drv.InOut, which
# marks it as both input and output of the call (per the PyCUDA docs it is
# copied to the device before the launch and copied back afterwards), so the
# measured interval includes those transfers.
x = np.empty((N, N), np.float64)

# Launch geometry: 512 threads per block, and enough blocks to cover N*N.
threads_per_block = 512
num_blocks = int(np.ceil(N*N/float(threads_per_block)))

times = np.empty(M)
for trial in range(M):
    # Fresh random input each repetition; the expected result is input + 1.
    x[:, :] = np.random.rand(N, N)
    expected = x + 1
    t0 = time()
    increment(
        drv.InOut(x),
        np.uint32(x.size),
        block=(threads_per_block, 1, 1),
        grid=(num_blocks, 1, 1),
    )
    times[trial] = time() - t0
    assert np.allclose(expected, x)

print("Average kernel execution time with pageable memory: %3.7f" % np.mean(times))

# Time use of pinned host memory.
# Allocate an aligned host array, register it with the DEVICEMAP flag, and
# fetch the device-side pointer of that same buffer. The pointer is passed to
# the kernel directly instead of wrapping the array in drv.InOut.
x = drv.aligned_empty((N, N), dtype=np.float64, order='C')
x = drv.register_host_memory(x, flags=drv.mem_host_register_flags.DEVICEMAP)
x_gpu_ptr = np.intp(x.base.get_device_pointer())

# Enough 512-thread blocks to cover all N*N elements.
pinned_grid = (int(np.ceil(N*N/float(512))), 1, 1)

times = np.empty(M)
for i in range(M):
    # Fresh random input each repetition; keep a copy to check against.
    x[:, :] = np.random.rand(N, N)
    x_orig = x.copy()
    start = time()
    increment(x_gpu_ptr, np.uint32(x.size), block=(512, 1, 1), grid=pinned_grid)
    # A kernel launch returns before the kernel has finished, and with a raw
    # pointer argument there is no copy-back that would wait for it. Block
    # until the device is idle so that the timer covers the kernel's execution
    # and x is completely written before it is compared below.
    drv.Context.synchronize()
    times[i] = time()-start
    assert np.allclose(x_orig + 1, x)
print("Average kernel execution time with pinned memory:   %3.7f" % np.mean(times))
	#!/usr/bin/env python

	"""
	Compare the performance of host-registered (pinned) host memory against
	ordinary pageable (unpinned) host memory. The grid uses more than one block
	so that larger arrays are covered, and the unpinned case is timed first.
	"""

	import numpy as np

	import pycuda.autoinit
	import pycuda.driver as drv
	from pycuda.compiler import SourceModule

	from time import time

	# CUDA source for the test kernel: every thread computes a global index from
	# its block and thread ids and adds 1 to a[idx]; threads whose index is not
	# below N do nothing.
	_INCREMENT_SOURCE = """
	__global__ void increment(double *a, int N)
	{
	int idx = threadIdx.x + blockIdx.x*blockDim.x;
	if (idx < N)
	a[idx] = a[idx]+1;
	}
	"""
	increment_mod = SourceModule(_INCREMENT_SOURCE)
	increment = increment_mod.get_function("increment")

	# N is the side length of the square test array (N*N doubles); M is the
	# number of timed repetitions per memory type.
	# Author's note on N = 23: "breaks. Works if <= 22". 23*23 = 529 elements is
	# the first size that needs more than one 512-thread block (22*22 = 484).
	N, M = 23, 3

	# Time use of pageable host memory.
	# x is an ordinary NumPy array handed to the kernel through drv.InOut, which
	# marks it as both input and output of the call (per the PyCUDA docs it is
	# copied to the device before the launch and copied back afterwards), so the
	# measured interval includes those transfers.
	x = np.empty((N, N), np.float64)

	# Enough 512-thread blocks to cover all N*N elements.
	pageable_grid = (int(np.ceil(N*N/float(512))), 1, 1)

	times = np.empty(M)
	for i in range(M):
		# Loop body re-indented: these statements run once per repetition.
		x[:, :] = np.random.rand(N, N)
		x_orig = x.copy()
		start = time()
		increment(drv.InOut(x), np.uint32(x.size), block=(512, 1, 1), grid=pageable_grid)
		times[i] = time()-start
		assert np.allclose(x_orig + 1, x)

	print("Average kernel execution time with pageable memory: %3.7f" % np.mean(times))

	# Time use of pinned host memory.
	# Allocate an aligned host array, register it with the DEVICEMAP flag, and
	# fetch the device-side pointer of that same buffer. The pointer is passed to
	# the kernel directly instead of wrapping the array in drv.InOut.
	x = drv.aligned_empty((N, N), dtype=np.float64, order='C')
	x = drv.register_host_memory(x, flags=drv.mem_host_register_flags.DEVICEMAP)
	x_gpu_ptr = np.intp(x.base.get_device_pointer())

	# Enough 512-thread blocks to cover all N*N elements.
	pinned_grid = (int(np.ceil(N*N/float(512))), 1, 1)

	times = np.empty(M)
	for i in range(M):
		# Loop body re-indented: these statements run once per repetition.
		x[:, :] = np.random.rand(N, N)
		x_orig = x.copy()
		start = time()
		increment(x_gpu_ptr, np.uint32(x.size), block=(512, 1, 1), grid=pinned_grid)
		# A kernel launch returns before the kernel has finished, and with a raw
		# pointer argument there is no copy-back that would wait for it. Block
		# until the device is idle so that the timer covers the kernel's
		# execution and x is completely written before it is compared below.
		drv.Context.synchronize()
		times[i] = time()-start
		assert np.allclose(x_orig + 1, x)
	print("Average kernel execution time with pinned memory: %3.7f" % np.mean(times))