Skip to content

Instantly share code, notes, and snippets.

@scott-gray
Created August 19, 2020 04:40
Show Gist options
  • Save scott-gray/567ba5fe7c0f6ff70e72daea79a54100 to your computer and use it in GitHub Desktop.
Save scott-gray/567ba5fe7c0f6ff70e72daea79a54100 to your computer and use it in GitHub Desktop.
#!/usr/bin/env python
import pycuda.driver as drv
from pycuda.autoinit import context, device
from pycuda.compiler import SourceModule
# Device-memory bandwidth micro-benchmark.
#
# Each timed iteration clears a device buffer with a 32-bit memset and then
# runs a copy kernel that reads and writes that same buffer.  The average
# time per iteration and the resulting bandwidth figure are printed.

# Number of streaming multiprocessors on the autoinit device; used to scale
# both the buffer size and the launch grid.
SMs = device.get_attribute(drv.device_attribute.MULTIPROCESSOR_COUNT)
print(device.name())

# Threads per block for the kernel launch.  The kernel derives its indexing
# from blockDim.x, so this is the only place the value needs to be set.
threads = 1024

code = r"""
// use 128 bit loads for maximum efficiency
__global__ void test(float4* Y, float4* X, uint size4)
{
    uint tid = threadIdx.x;
    uint bid = blockIdx.x;
    // start at this thread's global index and advance by the total number
    // of launched threads until all size4 float4 elements are covered
    for (uint i = bid*blockDim.x + tid; i < size4; i += gridDim.x*blockDim.x)
        Y[i] = X[i];
}
"""
kernel = SourceModule(code).get_function("test")
# Argument layout: two device pointers followed by one unsigned 32-bit int.
kernel.prepare("PPI")

# `size` counts 32-bit elements; the allocation is therefore size*4 bytes.
size = 1024**2 * SMs
X = drv.mem_alloc(size*4)
# Touch the whole buffer once before the timed region starts.
drv.memset_d32(X, 0, size)

start = drv.Event()
end = drv.Event()
repeat = 1000

start.record()
for _ in range(repeat):
    # One write pass over the buffer ...
    drv.memset_d32(X, 0, size)
    # ... then one read pass and one write pass: the kernel copies X onto
    # itself, size//4 float4 elements, using two blocks per SM.
    kernel.prepared_call((SMs*2, 1, 1), (threads, 1, 1), X, X, size//4)
end.record()
end.synchronize()

# Average milliseconds per iteration (memset + kernel).
ms = end.time_since(start) / repeat
# 4x for float, 1x in memset + 2x for round trip in kernel.
# bytes / (ms * 1e6) turns bytes-per-millisecond into GB/s (1e9 bytes/s).
gbps = size*4*3 / (ms * 1e6)
print(ms, gbps)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment