scott-gray/memset_bench.py

## memset_bench.py
#!/usr/bin/env python

import pycuda.driver as drv
from pycuda.autoinit import context, device
from pycuda.compiler import SourceModule

# Multiprocessor count of the device behind the current context; both the
# buffer size and the launch grid below are scaled by it.
sm_count_attr = drv.device_attribute.MULTIPROCESSOR_COUNT
SMs = drv.Context.get_device().get_attribute(sm_count_attr)

print(device.name())

# CUDA source for the copy kernel: each thread copies float4 elements from X
# to Y, starting at bid*1024 + tid and advancing by gridDim.x*1024 until it
# reaches size4.  The literal 1024 matches the block width used at launch.
code = r"""
// use 128 bit loads for maximum efficiency
__global__ void test(float4* Y, float4* X, uint size4)
{
    uint tid = threadIdx.x;
    uint bid = blockIdx.x;

    for (uint i = bid*1024 + tid; i < size4; i += gridDim.x*1024)
        Y[i] = X[i];
}
"""

module = SourceModule(code)
kernel = module.get_function("test")
# Argument layout for prepared calls: two pointers and one unsigned int,
# mirroring the kernel signature (Y, X, size4).
kernel.prepare("PPI")

# `size` counts 32-bit elements; the allocation is therefore 4 bytes each.
size = SMs * 1024**2
X = drv.mem_alloc(4 * size)
drv.memset_d32(X, 0, size)  # touch the buffer once before timing starts

start, end = drv.Event(), drv.Event()

repeat = 1000

# Launch configuration and kernel argument, computed once up front.
grid = (2 * SMs, 1, 1)
block = (1024, 1, 1)
size4 = size // 4  # the kernel indexes float4 elements, i.e. 4 floats each

start.record()

for _ in range(repeat):
    # One iteration = clear the buffer, then copy it onto itself.
    drv.memset_d32(X, 0, size)
    kernel.prepared_call(grid, block, X, X, size4)

end.record()
end.synchronize()

# Average elapsed milliseconds per iteration.
ms = end.time_since(start) / repeat

# Bytes moved per iteration: 4 bytes per float, counted three times
# (one write by the memset, plus one read and one write by the kernel).
bytes_per_iter = 3 * 4 * size
gbps = bytes_per_iter / (ms * 1e6)

print(ms, gbps)
	#!/usr/bin/env python

	import pycuda.driver as drv
	from pycuda.autoinit import context, device
	from pycuda.compiler import SourceModule

	# Multiprocessor count of the device behind the current context; both the
	# buffer size and the launch grid below are scaled by it.
	SMs = drv.Context.get_device().get_attributes()[drv.device_attribute.MULTIPROCESSOR_COUNT]

	print(device.name())

	# CUDA source for the copy kernel: each thread copies float4 elements from
	# X to Y, starting at bid*1024 + tid and advancing by gridDim.x*1024 until
	# it reaches size4.  The literal 1024 matches the block width used at launch.
	code = r"""
	// use 128 bit loads for maximum efficiency
	__global__ void test(float4* Y, float4* X, uint size4)
	{
	    uint tid = threadIdx.x;
	    uint bid = blockIdx.x;

	    for (uint i = bid*1024 + tid; i < size4; i += gridDim.x*1024)
	        Y[i] = X[i];
	}
	"""

	kernel = SourceModule(code).get_function("test")
	# Argument layout for prepared calls: two pointers and one unsigned int,
	# mirroring the kernel signature (Y, X, size4).
	kernel.prepare("PPI")

	# `size` counts 32-bit elements; the allocation is therefore 4 bytes each.
	size = 1024**2 * SMs
	X = drv.mem_alloc(size*4)
	drv.memset_d32(X, 0, size)  # touch the buffer once before timing starts

	start = drv.Event()
	end = drv.Event()

	repeat = 1000
	start.record()

	for _ in range(repeat):
	    # One iteration = clear the buffer, then copy it onto itself.
	    drv.memset_d32(X, 0, size)
	    kernel.prepared_call((SMs*2,1,1), (1024,1,1), X, X, size//4)

	end.record()
	end.synchronize()
	# Average elapsed milliseconds per iteration.
	ms = end.time_since(start) / repeat

	gbps = size*4*3 / (ms * 1e6) # 4x for float, 1x in memset + 2x for round trip in kernel

	print(ms, gbps)