#!/usr/bin/env python
"""
Compare speed of several methods of copying data between two GPU devices.
"""
import atexit, re, time
import numpy as np
import pycuda.driver as drv
import pycuda.gpuarray as gpuarray

def func_timer(f):
    """Decorator that prints the execution time of the wrapped function."""
    def wrapper(*args, **kwargs):
        start = time.time()
        res = f(*args, **kwargs)
        stop = time.time()
        print('execution time = %.5f s' % (stop-start))
        return res
    return wrapper
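
# Note that time.time() measures wall-clock time on the host, so the
# timings reported below include driver and context-switching overhead
# as well as the actual transfer time.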

# Expose the default (UVA-based) memory copy function; with CUDA 4.0 or
# later, unified virtual addressing lets an ordinary device-to-device
# copy span two GPUs (get_driver_version() returns 4000 for CUDA 4.0):
if drv.get_driver_version() >= 4000:
    def memcpy_peer_uva(dest, src, size):
        drv.memcpy_dtod(dest, src, size)

def memcpy_peer_host(dest, src, size, dest_ctx, src_ctx):
    # Create a pinned host memory buffer; copying to and from it is
    # slightly faster than with an ordinary array as the size of the
    # copied data increases:
    host_buffer = drv.pagelocked_empty(size, np.byte)

    # Make src_ctx current and copy the data to the host:
    src_ctx.push()
    drv.memcpy_dtoh(host_buffer, src)
    src_ctx.pop()

    # Make dest_ctx current and copy the data from the host:
    dest_ctx.push()
    drv.memcpy_htod(dest, host_buffer)
    dest_ctx.pop()
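
# With peer access enabled between the two contexts, memcpy_peer copies
# data directly between the devices without staging it through host memory: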
def memcpy_peer_peer(dest, src, size, dest_ctx=None, src_ctx=None):
    drv.memcpy_peer(dest, src, size, dest_ctx, src_ctx)

if __name__ == '__main__':
    # Set up devices:
    drv.init()
    dev0 = drv.Device(0)
    if drv.Device.count() < 2:
        raise ValueError('need at least two GPUs to run')
    dev1 = drv.Device(1)
    ctx0 = dev0.make_context()
    ctx1 = dev1.make_context()
    atexit.register(ctx0.pop)
    atexit.register(ctx1.pop)

    # Allocate the source array on device 0:
    ctx1.pop()
    ctx0.push()
    x = np.random.rand(5*10**5)
    x_gpu = gpuarray.to_gpu(x)
    ctx0.pop()

    # Allocate the destination array on device 1 and time the copy
    # staged through pinned host memory:
    ctx1.push()
    y_gpu = gpuarray.zeros_like(x_gpu)
    func_timer(memcpy_peer_host)(y_gpu.ptr, x_gpu.ptr,
                                 x_gpu.dtype.itemsize*x_gpu.size,
                                 ctx1, ctx0)
    if not np.allclose(x, y_gpu.get()):
        print('host copy failed')

    # Time the UVA-based device-to-device copy:
    if drv.get_driver_version() < 4000:
        print('need CUDA 4.0 or later to test UVA copy')
    else:
        y_gpu = gpuarray.zeros_like(x_gpu)
        func_timer(memcpy_peer_uva)(y_gpu.ptr, x_gpu.ptr,
                                    x_gpu.dtype.itemsize*x_gpu.size)
        if not np.allclose(x, y_gpu.get()):
            print('UVA copy failed')

    # Time the direct peer-to-peer copy; peer access is only supported
    # on Tesla-class devices:
    if not (re.match('Tesla', dev0.name()) and
            re.match('Tesla', dev1.name())):
        print('not testing peer-to-peer copy on non-Tesla devices')
    else:
        # Enable peer access in both directions between the two contexts:
        ctx1.enable_peer_access(ctx0)
        ctx1.pop()
        ctx0.push()
        ctx0.enable_peer_access(ctx1)
        ctx0.pop()
        ctx1.push()
        y_gpu = gpuarray.zeros_like(x_gpu)
        func_timer(memcpy_peer_peer)(y_gpu.ptr, x_gpu.ptr,
                                     x_gpu.dtype.itemsize*x_gpu.size,
                                     ctx1, ctx0)
        if not np.allclose(x, y_gpu.get()):
            print('peer-to-peer copy failed')
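
# To compare the methods by effective bandwidth rather than raw time,
# the number of bytes copied can be divided by the measured interval;
# a minimal sketch (this helper is not part of the original script):
#
#   def bandwidth_gb_s(nbytes, seconds):
#       return nbytes/seconds/1e9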