Skip to content

Instantly share code, notes, and snippets.

@asi1024
Last active April 21, 2021 17:39
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save asi1024/06b3481ec90fdd40c23e99ca97f410c0 to your computer and use it in GitHub Desktop.
Save asi1024/06b3481ec90fdd40c23e99ca97f410c0 to your computer and use it in GitHub Desktop.
import numpy
import cupy
from cupyx import jit
# https://developer.download.nvidia.com/assets/cuda/files/reduction.pdf (page #7)
@jit.rawkernel()
def reduce0(idata, odata):
sdata = jit.shared_memory(numpy.int32, size=None)
tid = jit.threadIdx.x
i = jit.blockIdx.x * jit.blockDim.x + jit.threadIdx.x
sdata[tid] = idata[i]
jit.syncthreads()
s = 1
while s < jit.blockDim.x:
if tid % (2 * s) == 0:
sdata[tid] += sdata[tid + s]
jit.syncthreads()
s *= 2
if tid == 0:
odata[jit.blockIdx.x] = sdata[0]
grid_size = 16
block_size = 1024
idata = cupy.arange(grid_size * block_size, dtype=numpy.int32)
odata = cupy.zeros(grid_size, dtype=numpy.int32)
reduce0[grid_size, block_size](idata, odata, shared_mem=block_size * 32)
print(odata)
# The above kernel is equivalent to the following one.
expected = idata.reshape(grid_size, block_size).sum(axis=1)
assert (odata == expected).all()
# Print generated CUDA code.
print(reduce0.cached_code)
[ 523776 1572352 2620928 3669504 4718080 5766656 6815232 7863808
8912384 9960960 11009536 12058112 13106688 14155264 15203840 16252416]
// Generated CUDA code.
template <typename T0, typename T1>
__device__ unsigned int cupy_remainder_uint32(T0 in0, T1 in1) {
return (in0 - _floor_divide(in0, in1) * in1) * (in1 != 0);
}
extern "C" __global__ void reduce0(CArray<int, 1, true, true> idata, CArray<int, 1, true, true> odata) {
extern __shared__ int _smem1[];
int* sdata;
unsigned int tid;
unsigned int i;
int s;
sdata = _smem1;
tid = threadIdx.x;
i = ((blockIdx.x * blockDim.x) + threadIdx.x);
sdata[tid] = idata[i];
__syncthreads();
s = 1;
while (((unsigned int)(s) < blockDim.x)) {
if ((cupy_remainder_uint32(tid, (unsigned int)((2 * s))) == 0u)) {
sdata[tid] = (sdata[tid] + sdata[(tid + (unsigned int)(s))]);
}
else {
}
__syncthreads();
s = (s * 2);
}
if ((tid == 0u)) {
odata[blockIdx.x] = sdata[0];
}
else {
}
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment