Skip to content

Instantly share code, notes, and snippets.

@rjpower
Forked from lebedov/p2p_mem_copy.py
Last active December 19, 2015 15:49
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save rjpower/5979059 to your computer and use it in GitHub Desktop.
#!/usr/bin/env python
"""
Compare speed of several methods of copying data between two GPU devices.
"""
from __future__ import print_function

import atexit
import ctypes
import re
import sys
import time

import numpy as np
import pycuda.driver as drv
import pycuda.gpuarray as gpuarray
import pycuda.autoinit
# Number of bytes to copy per transfer; overridable via the first
# command-line argument.
# NOTE: `bytes` shadows the builtin of the same name; kept as-is because
# the rest of the script refers to this global.
if len(sys.argv) > 1:
    bytes = int(sys.argv[1])
else:
    bytes = 100 * 1000 * 1000

# Page-locked staging buffer for the host-bounce copy path; allocated
# later, once a context exists (see test_devices).
host_buffer = None
# Host-side source data that is uploaded to the first device.
x = np.ones((bytes,), dtype=np.byte)
def func_timer(f, a, b):
    """Return a wrapper that calls *f* 10 times and prints the total elapsed time.

    The printed line is "<bytes> <a> <b> <f.__name__> <seconds>", where *a*
    and *b* are the device ordinals being benchmarked.  The wrapper returns
    the result of the final call to *f*.
    """
    def wrapper(*args, **kwargs):
        res = None  # defined even if the loop body never runs
        start = time.time()
        for _ in range(10):
            res = f(*args, **kwargs)
        stop = time.time()
        # print_function form (see __future__ import) so the script also
        # runs under Python 3; the original py2 print statement did not.
        print(bytes, a, b, f.__name__, '%.5f' % (stop - start))
        return res
    return wrapper
def sync_ctx(ctx):
    """Make *ctx* current, wait for all its pending work, then restore the stack."""
    ctx.push()
    try:
        ctx.synchronize()
    finally:
        # Always pop, so a failure inside synchronize() does not leave
        # *ctx* current for subsequent operations (matches the
        # try/finally discipline used by enable_peer_access).
        ctx.pop()
def force_sync(dest):
    """Block until pending work on GPU array *dest* has completed.

    Reading an element back to the host forces a device-to-host transfer,
    which waits for outstanding operations on the array.  Read the first
    element rather than element 100 so that arrays with 100 or fewer
    entries do not raise IndexError (the byte count is user-supplied).
    """
    dest[:1].get()[0]
# Expose the driver's unified-virtual-addressing memcpy (CUDA >= 4.0 only):
if drv.get_driver_version() >= 4000:
    cuda = ctypes.cdll.LoadLibrary('libcuda.so')
    cuda.cuMemcpy.restype = ctypes.c_int  # CUresult is an int enum
    # cuMemcpy(CUdeviceptr dst, CUdeviceptr src, size_t ByteCount):
    # the byte count is size_t, not int -- c_int would truncate copies
    # larger than 2 GB.
    cuda.cuMemcpy.argtypes = [ctypes.c_void_p, ctypes.c_void_p, ctypes.c_size_t]

    def memcpy_peer_uva(dest, src, size, dest_ctx=None, src_ctx=None):
        """Copy *size* bytes from *src* to *dest* with a single UVA cuMemcpy.

        The context arguments are accepted (and ignored) so the signature
        matches the other memcpy_peer_* benchmark functions.
        """
        res = cuda.cuMemcpy(dest.ptr, src.ptr, size)
        assert res == 0, res  # CUDA_SUCCESS == 0
        force_sync(dest)
def memcpy_peer_host(dest, src, size, dest_ctx, src_ctx):
    """Copy from *src* (in *src_ctx*) to *dest* (in *dest_ctx*) via a host bounce buffer.

    Uses the module-level page-locked `host_buffer` as the staging area.
    *size* is unused; the transfer size is determined by `host_buffer`.
    """
    # Make the source context current and copy device -> host.
    src_ctx.push()
    try:
        drv.memcpy_dtoh(host_buffer, src.ptr)
    finally:
        src_ctx.pop()  # restore the stack even if the copy fails
    # Make the destination context current and copy host -> device.
    dest_ctx.push()
    try:
        drv.memcpy_htod(dest.ptr, host_buffer)
    finally:
        dest_ctx.pop()
    force_sync(dest)
def memcpy_peer_peer(dest, src, size, dest_ctx=None, src_ctx=None):
    """Copy *size* bytes from *src* to *dest* using the driver's direct
    peer-to-peer copy between *src_ctx* and *dest_ctx*."""
    dst_ptr = dest.ptr
    src_ptr = src.ptr
    drv.memcpy_peer(dst_ptr, src_ptr, size, dest_ctx, src_ctx)
    force_sync(dest)
def enable_peer_access(a, b):
    """Make context *b*'s memory directly accessible from context *a*.

    Raises whatever the driver raises when peer access between the two
    devices is not supported; the caller handles that case.
    """
    # Push before entering the try block: if push() itself fails, the
    # original code's finally clause would pop() a context that was never
    # made current.
    a.push()
    try:
        a.enable_peer_access(b)
    finally:
        a.pop()
def test_devices(a, b):
    """Benchmark the three copy strategies between devices *a* and *b*.

    Creates one context per device, allocates the source array on device
    *a* and the destination array on device *b*, then times the
    host-staged, UVA, and peer-to-peer copy paths.  Prints one result
    line per strategy (see func_timer).
    """
    global host_buffer
    dev0 = drv.Device(a)
    dev1 = drv.Device(b)
    ctx0 = dev0.make_context(drv.ctx_flags.SCHED_BLOCKING_SYNC)
    # Page-locked host memory speeds up the dtoh/htod staging copies.
    host_buffer = drv.pagelocked_empty(bytes, np.byte)
    x_gpu = gpuarray.to_gpu(x)  # source array lives in ctx0 (device a)
    ctx1 = dev1.make_context(drv.ctx_flags.SCHED_BLOCKING_SYNC)
    y_gpu = gpuarray.zeros((bytes,), dtype=np.byte)  # dest lives in ctx1 (device b)
    nbytes = x_gpu.dtype.itemsize * x_gpu.size  # hoisted: same value for every call
    func_timer(memcpy_peer_host, a, b)(y_gpu, x_gpu, nbytes, ctx1, ctx0)
    func_timer(memcpy_peer_uva, a, b)(y_gpu, x_gpu, nbytes, ctx1, ctx0)
    try:
        enable_peer_access(ctx0, ctx1)
        enable_peer_access(ctx1, ctx0)
        func_timer(memcpy_peer_peer, a, b)(y_gpu, x_gpu, nbytes, ctx1, ctx0)
    except Exception:
        # Peer access is not supported between every device pair; report a
        # zero time instead of aborting the sweep.  Narrowed from a bare
        # `except:`, which also swallowed KeyboardInterrupt/SystemExit.
        print(bytes, a, b, 'memcpy_peer_peer', 0)
    ctx1.pop()
    ctx0.pop()
if __name__ == '__main__':
    drv.init()
    # Benchmark every unordered pair of distinct devices.
    n_devices = drv.Device.count()
    for first in range(n_devices):
        for second in range(first + 1, n_devices):
            test_devices(first, second)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment