Skip to content

Instantly share code, notes, and snippets.

@lebedov
Last active April 29, 2023 21:10
Show Gist options
  • Save lebedov/3078644 to your computer and use it in GitHub Desktop.
Save lebedov/3078644 to your computer and use it in GitHub Desktop.
Compare speed of several methods of copying data between two GPU devices
#!/usr/bin/env python
"""
Compare speed of several methods of copying data between two GPU devices.
"""
import atexit
import re
import time
from functools import wraps

import numpy as np

import pycuda.driver as drv
import pycuda.gpuarray as gpuarray
def func_timer(f):
    """Decorator: print the wall-clock execution time of each call to `f`.

    Prints the elapsed time in seconds and returns `f`'s result unchanged.
    """
    # functools.wraps preserves f's __name__/__doc__ on the wrapper so the
    # timed function is still identifiable (the original clobbered them).
    @wraps(f)
    def wrapper(*args, **kwargs):
        start = time.time()
        res = f(*args, **kwargs)
        stop = time.time()
        # Parenthesized print works under both Python 2 and Python 3;
        # the original bare print statement is a SyntaxError on Python 3.
        print('execution time = %.5f s' % (stop - start))
        return res
    return wrapper
# Expose default memory copy function:
if drv.get_driver_version() >= 4000:
    def memcpy_peer_uva(dest, src, size):
        """Copy `size` bytes from device pointer `src` to `dest`.

        Relies on Unified Virtual Addressing (CUDA >= 4.0), under which
        a plain device-to-device memcpy can resolve pointers that live
        on different devices.
        """
        drv.memcpy_dtod(dest, src, size)
def memcpy_peer_host(dest, src, size, dest_ctx, src_ctx):
    """Copy `size` bytes between two GPUs by staging through host memory.

    Parameters
    ----------
    dest, src : int
        Destination and source device pointers.
    size : int
        Number of bytes to copy.
    dest_ctx, src_ctx : pycuda.driver.Context
        Contexts owning `dest` and `src`, respectively; each is pushed
        while its side of the copy runs.
    """
    # Create pinned host memory buffer; copying to and from it is
    # slightly faster than with an ordinary array as the size of the
    # data copied increases:
    host_buffer = drv.pagelocked_empty(size, np.byte)
    # Make src_context current and copy data to the host:
    src_ctx.push()
    drv.memcpy_dtoh(host_buffer, src)
    src_ctx.pop()
    # Make dest_context current and copy data from the host:
    dest_ctx.push()
    drv.memcpy_htod(dest, host_buffer)
    dest_ctx.pop()
def memcpy_peer_peer(dest, src, size, dest_ctx=None, src_ctx=None):
    """Copy `size` bytes directly between two GPUs (peer-to-peer).

    Thin wrapper around drv.memcpy_peer; requires CUDA >= 4.0 and peer
    access enabled between the devices beforehand. The contexts default
    to None — presumably UVA then resolves the pointers, but the caller
    below always passes both explicitly; TODO confirm.
    """
    drv.memcpy_peer(dest, src, size, dest_ctx, src_ctx)
if __name__ == '__main__':

    # Set up devices:
    drv.init()
    dev0 = drv.Device(0)
    if dev0.count() < 2:
        raise ValueError('need more than one GPU to run')
    dev1 = drv.Device(1)
    ctx0 = dev0.make_context()
    ctx1 = dev1.make_context()
    # Ensure both contexts are popped at interpreter exit:
    atexit.register(ctx0.pop)
    atexit.register(ctx1.pop)
    ctx1.pop()

    # Allocate and fill the source array on device 0:
    ctx0.push()
    x = np.random.rand(5*10**5)
    x_gpu = gpuarray.to_gpu(x)
    ctx0.pop()

    # Benchmark the host-staged copy (works on any driver version):
    ctx1.push()
    y_gpu = gpuarray.zeros_like(x_gpu)
    func_timer(memcpy_peer_host)(y_gpu.ptr, x_gpu.ptr,
                                 x_gpu.dtype.itemsize*x_gpu.size,
                                 ctx1, ctx0)
    if not np.allclose(x, y_gpu.get()):
        # print() form is valid under both Python 2 and 3; the original
        # print statements were Python 2-only.
        print('host copy failed')

    # UVA and peer-to-peer copies both require CUDA >= 4.0, so the
    # Tesla/peer test is nested under this branch (reconstructed from a
    # whitespace-mangled paste — confirm against the original layout):
    if drv.get_driver_version() < 4000:
        print('need CUDA 4.0 or later to test UVA copy')
    else:
        # Benchmark the UVA-based device-to-device copy:
        y_gpu = gpuarray.zeros_like(x_gpu)
        func_timer(memcpy_peer_uva)(y_gpu.ptr, x_gpu.ptr,
                                    x_gpu.dtype.itemsize*x_gpu.size)
        if not np.allclose(x, y_gpu.get()):
            print('UVA copy failed')

        # Peer-to-peer access is only available on Tesla-class devices:
        if not (re.match('Tesla', dev0.name()) and
                re.match('Tesla', dev1.name())):
            print('not testing peer-to-peer copy on non-Tesla devices')
        else:
            # Enable peer access in both directions, pushing each
            # context before enabling access from it:
            ctx1.enable_peer_access(ctx0)
            ctx1.pop()
            ctx0.push()
            ctx0.enable_peer_access(ctx1)
            ctx0.pop()
            ctx1.push()

            # Benchmark the direct peer-to-peer copy:
            y_gpu = gpuarray.zeros_like(x_gpu)
            func_timer(memcpy_peer_peer)(y_gpu.ptr, x_gpu.ptr,
                                         x_gpu.dtype.itemsize*x_gpu.size,
                                         ctx1, ctx0)
            if not np.allclose(x, y_gpu.get()):
                print('Peer-to-peer copy failed')
@MAbdelatti
Copy link

Thanks for your reply. The CuPy function looks helpful, but I couldn't figure out how to use it since there's no example, and I don't know of a workaround for using raw pointers in Python (I use Numba or CuPy arrays).

The memcpy_peer() seems to be an intermediate step for future development.

@lebedov
Copy link
Author

lebedov commented Jun 16, 2021

@MarwanAbdelatti Sounds plausible - wouldn't hurt to ask the cupy developers about it.

@MAbdelatti
Copy link

Will do, thanks.
Have a good one.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment