Skip to content

Instantly share code, notes, and snippets.

@lebedov
Last active April 29, 2023 21:10
Show Gist options
  • Save lebedov/3078644 to your computer and use it in GitHub Desktop.
Save lebedov/3078644 to your computer and use it in GitHub Desktop.
Compare speed of several methods of copying data between two GPU devices
#!/usr/bin/env python
"""
Compare speed of several methods of copying data between two GPU devices.
"""
import atexit
import re
import time
from functools import wraps

import numpy as np

import pycuda.driver as drv
import pycuda.gpuarray as gpuarray
def func_timer(f):
    """Decorator: print the wall-clock execution time of each call to `f`.

    Prints the elapsed time in seconds and returns `f`'s result unchanged.
    """
    # functools.wraps preserves f's __name__/__doc__ on the wrapper so the
    # timed function is still identifiable (the original clobbered them).
    @wraps(f)
    def wrapper(*args, **kwargs):
        start = time.time()
        res = f(*args, **kwargs)
        stop = time.time()
        # Parenthesized print works under both Python 2 and Python 3;
        # the original bare print statement is a SyntaxError on Python 3.
        print('execution time = %.5f s' % (stop - start))
        return res
    return wrapper
# Expose default memory copy function:
if drv.get_driver_version() >= 4000:
    def memcpy_peer_uva(dest, src, size):
        """Copy `size` bytes from device pointer `src` to `dest`.

        Relies on Unified Virtual Addressing (CUDA >= 4.0), under which
        a plain device-to-device memcpy can resolve pointers that live
        on different devices.
        """
        drv.memcpy_dtod(dest, src, size)
def memcpy_peer_host(dest, src, size, dest_ctx, src_ctx):
    """Copy `size` bytes between two GPUs by staging through host memory.

    Parameters
    ----------
    dest, src : int
        Destination and source device pointers.
    size : int
        Number of bytes to copy.
    dest_ctx, src_ctx : pycuda.driver.Context
        Contexts owning `dest` and `src`, respectively; each is pushed
        while its side of the copy runs.
    """
    # Create pinned host memory buffer; copying to and from it is
    # slightly faster than with an ordinary array as the size of the
    # data copied increases:
    host_buffer = drv.pagelocked_empty(size, np.byte)
    # Make src_context current and copy data to the host:
    src_ctx.push()
    drv.memcpy_dtoh(host_buffer, src)
    src_ctx.pop()
    # Make dest_context current and copy data from the host:
    dest_ctx.push()
    drv.memcpy_htod(dest, host_buffer)
    dest_ctx.pop()
def memcpy_peer_peer(dest, src, size, dest_ctx=None, src_ctx=None):
    """Copy `size` bytes directly between two GPUs (peer-to-peer).

    Thin wrapper around drv.memcpy_peer; requires CUDA >= 4.0 and peer
    access enabled between the devices beforehand. The contexts default
    to None — presumably UVA then resolves the pointers, but the caller
    below always passes both explicitly; TODO confirm.
    """
    drv.memcpy_peer(dest, src, size, dest_ctx, src_ctx)
if __name__ == '__main__':

    # Set up devices:
    drv.init()
    dev0 = drv.Device(0)
    if dev0.count() < 2:
        raise ValueError('need more than one GPU to run')
    dev1 = drv.Device(1)
    ctx0 = dev0.make_context()
    ctx1 = dev1.make_context()
    # Ensure both contexts are popped at interpreter exit:
    atexit.register(ctx0.pop)
    atexit.register(ctx1.pop)
    ctx1.pop()

    # Allocate and fill the source array on device 0:
    ctx0.push()
    x = np.random.rand(5*10**5)
    x_gpu = gpuarray.to_gpu(x)
    ctx0.pop()

    # Benchmark the host-staged copy (works on any driver version):
    ctx1.push()
    y_gpu = gpuarray.zeros_like(x_gpu)
    func_timer(memcpy_peer_host)(y_gpu.ptr, x_gpu.ptr,
                                 x_gpu.dtype.itemsize*x_gpu.size,
                                 ctx1, ctx0)
    if not np.allclose(x, y_gpu.get()):
        # print() form is valid under both Python 2 and 3; the original
        # print statements were Python 2-only.
        print('host copy failed')

    # UVA and peer-to-peer copies both require CUDA >= 4.0, so the
    # Tesla/peer test is nested under this branch (reconstructed from a
    # whitespace-mangled paste — confirm against the original layout):
    if drv.get_driver_version() < 4000:
        print('need CUDA 4.0 or later to test UVA copy')
    else:
        # Benchmark the UVA-based device-to-device copy:
        y_gpu = gpuarray.zeros_like(x_gpu)
        func_timer(memcpy_peer_uva)(y_gpu.ptr, x_gpu.ptr,
                                    x_gpu.dtype.itemsize*x_gpu.size)
        if not np.allclose(x, y_gpu.get()):
            print('UVA copy failed')

        # Peer-to-peer access is only available on Tesla-class devices:
        if not (re.match('Tesla', dev0.name()) and
                re.match('Tesla', dev1.name())):
            print('not testing peer-to-peer copy on non-Tesla devices')
        else:
            # Enable peer access in both directions, pushing each
            # context before enabling access from it:
            ctx1.enable_peer_access(ctx0)
            ctx1.pop()
            ctx0.push()
            ctx0.enable_peer_access(ctx1)
            ctx0.pop()
            ctx1.push()

            # Benchmark the direct peer-to-peer copy:
            y_gpu = gpuarray.zeros_like(x_gpu)
            func_timer(memcpy_peer_peer)(y_gpu.ptr, x_gpu.ptr,
                                         x_gpu.dtype.itemsize*x_gpu.size,
                                         ctx1, ctx0)
            if not np.allclose(x, y_gpu.get()):
                print('Peer-to-peer copy failed')
@MAbdelatti
Copy link

Thanks for your reply. The CuPy function looks helpful, but I couldn't figure out how to use it since there's no example, and I don't know of a workaround for using raw pointers in Python (I use Numba or CuPy arrays).

The memcpy_peer() seems to be an intermediate step for future development.

@lebedov
Copy link
Author

lebedov commented Jun 16, 2021

@MarwanAbdelatti Sounds plausible - wouldn't hurt to ask the cupy developers about it.

@MAbdelatti
Copy link

Will do, thanks.
Have a good one.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment