Skip to content

Instantly share code, notes, and snippets.

@geohot
Created November 21, 2023 19:21
Show Gist options
  • Star 4 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save geohot/3b23bb6be846d3097834d07806ca6563 to your computer and use it in GitHub Desktop.
Fast memcpy using GPUs
# tiny@tiny9:~/tinygrad$ python3 examples/benchmark_copies.py
# CPU copy 6.18 ms, 16.28 GB/s
# GPU copy 4.38 ms, 23.00 GB/s
# GPU 6x 1.85 ms, 54.54 GB/s
import time
def timeit(fxn, repeats=10):
    """Return the best (minimum) wall-clock time of *repeats* calls to fxn, in seconds.

    The minimum is reported rather than the mean so that one-off noise
    (page faults, scheduler hiccups) does not inflate the benchmark number.
    `repeats` defaults to 10, matching the original hard-coded count.
    """
    tms = []
    for _ in range(repeats):
        st = time.perf_counter()
        fxn()
        tms.append(time.perf_counter() - st)
    return min(tms)
import ctypes
sz_bytes = 4096 * 4096 * 6
import extra.hip_wrapper as hip

# Host-side buffers allocated via hipHostMalloc (page-locked, so the GPU
# can DMA to/from them directly in the sections below).
inp = hip.hipHostMalloc(sz_bytes)
out = hip.hipHostMalloc(sz_bytes)

# ***** CPU timing *****
def cpu_memcpy():
    # Baseline: one libc memmove of the whole buffer, inp -> out.
    ctypes.memmove(out, inp, sz_bytes)

print(f"CPU copy {(tm:=timeit(cpu_memcpy))*1000:.2f} ms, {sz_bytes*1e-9/tm:.2f} GB/s")
# ***** GPU timing *****
STREAMS = 16
sz_bytes_chunk = sz_bytes // STREAMS

# One device buffer and one stream per chunk.
buf = [hip.hipMalloc(sz_bytes_chunk) for _ in range(STREAMS)]
streams = [hip.hipStreamCreate() for _ in range(STREAMS)]

def gpu_roundtrip():
    # Bounce every chunk host -> device -> host on its own stream, so the
    # transfers of different chunks can overlap each other.
    for i in range(STREAMS):
        src = ctypes.c_void_p(inp + sz_bytes_chunk * i)
        dst = ctypes.c_void_p(out + sz_bytes_chunk * i)
        hip.hipMemcpyAsync(buf[i], src, sz_bytes_chunk, hip.hipMemcpyHostToDevice, streams[i])
        hip.hipMemcpyAsync(dst, buf[i], sz_bytes_chunk, hip.hipMemcpyDeviceToHost, streams[i])
    # Drain every stream before the timer stops.
    for i in range(STREAMS):
        hip.hipStreamSynchronize(streams[i])

print(f"GPU copy {(tm:=timeit(gpu_roundtrip))*1000:.2f} ms, {sz_bytes*1e-9/tm:.2f} GB/s")
# ***** multiGPU timing *****
STREAMS = 4
GPUS = 6
sz_bytes_chunk = sz_bytes // (STREAMS * GPUS)

# Build per-GPU buffer/stream lists with explicit loops rather than the
# `hip.hipSetDevice(j) or [...]` trick: if hipSetDevice ever returned a
# truthy status code, the old expression would silently store that status
# in place of the list and crash much later at indexing time.
buf = []
for j in range(GPUS):
    hip.hipSetDevice(j)
    buf.append([hip.hipMalloc(sz_bytes_chunk) for _ in range(STREAMS)])
streams = []
for j in range(GPUS):
    hip.hipSetDevice(j)
    streams.append([hip.hipStreamCreate() for _ in range(STREAMS)])

def multigpu_roundtrip():
    """Round-trip the whole buffer split across GPUS devices x STREAMS streams each."""
    for i in range(STREAMS):
        for j in range(GPUS):
            hip.hipSetDevice(j)
            # Chunk (j, i) owns a contiguous slice of the pinned host buffers.
            offset = sz_bytes_chunk * (j*STREAMS + i)
            hip.hipMemcpyAsync(buf[j][i], ctypes.c_void_p(inp+offset), sz_bytes_chunk, hip.hipMemcpyHostToDevice, streams[j][i])
            hip.hipMemcpyAsync(ctypes.c_void_p(out+offset), buf[j][i], sz_bytes_chunk, hip.hipMemcpyDeviceToHost, streams[j][i])
    # Wait for every stream on every device before the timer stops.
    for i in range(STREAMS):
        for j in range(GPUS):
            hip.hipSetDevice(j)
            hip.hipStreamSynchronize(streams[j][i])

print(f"GPU 6x {(tm:=timeit(multigpu_roundtrip))*1000:.2f} ms, {sz_bytes*1e-9/tm:.2f} GB/s")
@antferdom
Copy link

antferdom commented Dec 1, 2023

# CPU copy 6.20 ms, 16.24 GB/s
# GPU copy 5.55 ms, 18.15 GB/s
# env: 2xA100-SXM4-40GB
import time
import ctypes
from cuda import cuda, cudart
# common -> https://github.com/NVIDIA/cuda-python/tree/dfd31fa609b9c81bcff925824f38531ab3c96706/examples/common
from common import common
from common.helper_cuda import checkCudaErrors, findCudaDeviceDRV


def timeit(fxn):
    """Run fxn 10 times and return the fastest wall-clock duration, in seconds."""
    best = float("inf")
    for _ in range(10):
        start = time.perf_counter()
        fxn()
        elapsed = time.perf_counter() - start
        if elapsed < best:
            best = elapsed
    return best

sz_bytes = 4096 * 4096 * 2

# Initialize the driver API and create a context on the detected device.
checkCudaErrors(cuda.cuInit(0))
cuDevice = findCudaDeviceDRV()
cuContext = checkCudaErrors(cuda.cuCtxCreate(0, cuDevice))

# Page-locked, device-mapped host buffers. Route the allocations through
# checkCudaErrors (as every other call in this file does) instead of
# binding and discarding the raw CUresult status codes.
inp_ptr_raw = checkCudaErrors(cuda.cuMemHostAlloc(sz_bytes, cuda.CU_MEMHOSTALLOC_DEVICEMAP))
out_ptr_raw = checkCudaErrors(cuda.cuMemHostAlloc(sz_bytes, cuda.CU_MEMHOSTALLOC_DEVICEMAP))
inp = ctypes.c_void_p(inp_ptr_raw)
out = ctypes.c_void_p(out_ptr_raw)
# ***** CPU timing *****

def cpu_memcpy():
    """Baseline: one libc memmove of the whole pinned buffer, inp -> out.

    ctypes.memmove takes (dst, src, count); the original call had the
    arguments reversed (copying out into inp), which moves the same number
    of bytes but in the opposite direction from the GPU round-trip below.
    """
    ctypes.memmove(out, inp, sz_bytes)
print(f"CPU copy {(tm:=timeit(cpu_memcpy))*1000:.2f} ms, {sz_bytes*1e-9/tm:.2f} GB/s")

# ***** GPU timing *****

STREAMS = 16
sz_bytes_chunk = sz_bytes // STREAMS
# One device buffer and one stream per chunk. Use checkCudaErrors to unpack
# the (CUresult, value) tuples — the original's bare `[1]` indexing silently
# ignored any failure status from cuMemAlloc / cuStreamCreate.
buf = [checkCudaErrors(cuda.cuMemAlloc(sz_bytes_chunk)) for _ in range(STREAMS)]
streams = [checkCudaErrors(cuda.cuStreamCreate(0)) for _ in range(STREAMS)]

def gpu_roundtrip():
    """Round-trip each chunk host -> device -> host, one stream per chunk."""
    for i in range(STREAMS):
        # Byte offset of this chunk inside the pinned host buffers.
        offset_inp = inp.value + sz_bytes_chunk * i
        offset_out = out.value + sz_bytes_chunk * i
        # Check the async launch status as well — the original discarded the
        # CUresult here while checking it on the synchronize below.
        checkCudaErrors(cuda.cuMemcpyHtoDAsync(buf[i], offset_inp, sz_bytes_chunk, streams[i]))
        checkCudaErrors(cuda.cuMemcpyDtoHAsync(offset_out, buf[i], sz_bytes_chunk, streams[i]))

    # Wait for all queued copies before the timer stops.
    for stream in streams:
        checkCudaErrors(cuda.cuStreamSynchronize(stream))
# exit(0)
print(f"GPU copy {(tm:=timeit(gpu_roundtrip))*1000:.2f} ms, {sz_bytes*1e-9/tm:.2f} GB/s")

# Cleanup
# NOTE(review): this teardown runs BEFORE the multiGPU section below, yet
# that section still dereferences inp.value/out.value (the pinned host
# memory freed here) and the file's single-GPU context is destroyed here.
# That use-after-free is a plausible cause of the segfault noted below —
# consider moving this cleanup to the end of the script.
for buf_mem in buf:
    checkCudaErrors(cuda.cuMemFree(buf_mem))

checkCudaErrors(cuda.cuMemFreeHost(inp_ptr_raw))
checkCudaErrors(cuda.cuMemFreeHost(out_ptr_raw))
checkCudaErrors(cuda.cuCtxDestroy(cuContext))

# ***** multiGPU timing *****

STREAMS = 4
NUM_DEVICES = checkCudaErrors(cuda.cuDeviceGetCount())

sz_bytes_chunk = sz_bytes // (STREAMS * NUM_DEVICES)

# Create each device's driver context FIRST, then allocate that device's
# buffers and streams while the context is current. The original allocated
# under the runtime API's primary contexts (cudart.cudaSetDevice) and only
# afterwards created fresh driver contexts with cuCtxCreate, so the handles
# used in multigpu_roundtrip did not belong to the contexts it made current
# — a plausible cause of the reported segfault. Also note cuStreamCreate
# takes creation FLAGS, not a device id; the original passed device_id
# there, so only device 0 got default-flag streams.
device_contexts = []
buf = []
streams = []
for device_id in range(NUM_DEVICES):
    device = checkCudaErrors(cuda.cuDeviceGet(device_id))
    ctx = checkCudaErrors(cuda.cuCtxCreate(0, device))  # also makes ctx current
    device_contexts.append(ctx)
    buf.append([checkCudaErrors(cuda.cuMemAlloc(sz_bytes_chunk)) for _ in range(STREAMS)])
    streams.append([checkCudaErrors(cuda.cuStreamCreate(0)) for _ in range(STREAMS)])

# not working: segmentation fault
# NOTE(review): two likely culprits, neither fixable inside this function:
#   1. inp/out were released by cuMemFreeHost in the cleanup above, so
#      inp.value/out.value here point into freed host memory;
#   2. buf/streams were created under the runtime's primary contexts, but
#      cuCtxSetCurrent switches to the separately created driver contexts,
#      so those handles may not be valid in the now-current context.
# Also: the print() calls inside the timed loop will dominate the measured
# time if this ever runs — remove them before trusting the GB/s number.
def multigpu_roundtrip():
    """Split the copy across all devices, STREAMS streams per device."""
    for i in range(STREAMS):
        for device_id in range(NUM_DEVICES):
            # Set the current device and use its pre-created context
            cudart.cudaSetDevice(device_id)
            print(checkCudaErrors(cudart.cudaGetDevice()))
            checkCudaErrors(cuda.cuCtxSetCurrent(device_contexts[device_id]))
            print(cuda.cuCtxGetCurrent())
            # Chunk (device_id, i) owns a contiguous slice of the host buffers.
            offset = sz_bytes_chunk * (device_id*STREAMS + i)
            cuda.cuMemcpyHtoDAsync(buf[device_id][i], inp.value + offset, sz_bytes_chunk, streams[device_id][i])
            cuda.cuMemcpyDtoHAsync(out.value + offset, buf[device_id][i], sz_bytes_chunk, streams[device_id][i])
    # Drain every stream on every device before the timer stops.
    for i in range(STREAMS):
        for device_id in range(NUM_DEVICES):
            cudart.cudaSetDevice(device_id)
            checkCudaErrors(cuda.cuStreamSynchronize(streams[device_id][i]))
print(f"GPU  {NUM_DEVICES}x  {(tm:=timeit(multigpu_roundtrip))*1000:.2f} ms, {sz_bytes*1e-9/tm:.2f} GB/s")

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment