-
-
Save asi1024/06b3481ec90fdd40c23e99ca97f410c0 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import numpy | |
import cupy | |
from cupyx import jit | |
# https://developer.download.nvidia.com/assets/cuda/files/reduction.pdf (page #7) | |
@jit.rawkernel()
def reduce0(idata, odata):
    """Per-block sum reduction ("reduce0" from the NVIDIA reduction slides).

    Each block of ``blockDim.x`` threads sums ``blockDim.x`` consecutive
    int32 elements of ``idata`` and writes the block total to
    ``odata[blockIdx.x]``.  Assumes the launch supplies dynamic shared
    memory for ``blockDim.x`` int32 values via the ``shared_mem`` argument,
    and that ``len(idata) == gridDim.x * blockDim.x`` — TODO confirm callers
    guarantee this; there is no bounds check.
    """
    # Dynamic (extern) shared memory: size=None means the byte count is
    # taken from the kernel launch's ``shared_mem`` argument.
    sdata = jit.shared_memory(numpy.int32, size=None)
    tid = jit.threadIdx.x
    # Global index of the single element this thread loads.
    i = jit.blockIdx.x * jit.blockDim.x + jit.threadIdx.x
    sdata[tid] = idata[i]
    # All loads must land in shared memory before any thread reduces.
    jit.syncthreads()
    # Tree reduction with interleaved addressing: the stride ``s`` doubles
    # each pass, and only threads with tid % (2*s) == 0 do work — the
    # deliberately naive variant from page 7 of the linked slide deck
    # (divergent branches; kept for pedagogy, not speed).
    s = 1
    while s < jit.blockDim.x:
        if tid % (2 * s) == 0:
            sdata[tid] += sdata[tid + s]
        jit.syncthreads()
        s *= 2
    # Thread 0 ends up holding the whole block's sum.
    if tid == 0:
        odata[jit.blockIdx.x] = sdata[0]
# --- Demo: launch the kernel over 16 blocks of 1024 threads and verify. ---
grid_size = 16
block_size = 1024

idata = cupy.arange(grid_size * block_size, dtype=numpy.int32)
odata = cupy.zeros(grid_size, dtype=numpy.int32)

# ``shared_mem`` is a size in BYTES: the kernel needs one int32 (4 bytes)
# per thread.  The original passed ``block_size * 32`` — bits, not bytes —
# over-allocating dynamic shared memory 8x; derive the size from the dtype
# so it stays correct if the element type ever changes.
reduce0[grid_size, block_size](
    idata, odata, shared_mem=block_size * idata.dtype.itemsize)
print(odata)

# The kernel computes a per-block sum; this host-side reduction is its
# reference implementation.
expected = idata.reshape(grid_size, block_size).sum(axis=1)
assert (odata == expected).all()

# Print the CUDA source that cupyx.jit generated for the kernel.
print(reduce0.cached_code)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
[  523776  1572352  2620928  3669504  4718080  5766656  6815232  7863808
  8912384  9960960 11009536 12058112 13106688 14155264 15203840 16252416]
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// Generated CUDA code.

// Helper emitted by CuPy for Python's `%` on unsigned operands: the
// floor-division remainder, with the `(in1 != 0)` factor forcing the
// result to 0 (instead of faulting) when the divisor is zero, matching
// Python/NumPy remainder semantics.  `_floor_divide` is provided by
// CuPy's kernel preamble (not visible here).
template <typename T0, typename T1>
__device__ unsigned int cupy_remainder_uint32(T0 in0, T1 in1) {
    return (in0 - _floor_divide(in0, in1) * in1) * (in1 != 0);
}
// CUDA translation of the Python reduce0 kernel above: each block sums
// blockDim.x consecutive elements of `idata` into `odata[blockIdx.x]`
// via a tree reduction in dynamic shared memory.  `CArray` is CuPy's
// device-side array wrapper (declared in the generated preamble).
extern "C" __global__ void reduce0(CArray<int, 1, true, true> idata, CArray<int, 1, true, true> odata) {
    // Dynamic shared buffer; its byte size comes from the launch's
    // shared_mem argument.
    extern __shared__ int _smem1[];
    int* sdata;
    unsigned int tid;
    unsigned int i;
    int s;
    sdata = _smem1;
    tid = threadIdx.x;
    // Global index of the element this thread loads.
    i = ((blockIdx.x * blockDim.x) + threadIdx.x);
    sdata[tid] = idata[i];
    // All loads must complete before any thread starts reducing.
    __syncthreads();
    s = 1;
    // Stride doubles each pass; only threads with tid % (2*s) == 0 add.
    while (((unsigned int)(s) < blockDim.x)) {
        // cupy_remainder_uint32 implements Python's `%` (see helper above).
        if ((cupy_remainder_uint32(tid, (unsigned int)((2 * s))) == 0u)) {
            sdata[tid] = (sdata[tid] + sdata[(tid + (unsigned int)(s))]);
        }
        else {
            // Empty branch: artifact of the code generator, intentional.
        }
        __syncthreads();
        s = (s * 2);
    }
    // Thread 0 holds the block total.
    if ((tid == 0u)) {
        odata[blockIdx.x] = sdata[0];
    }
    else {
        // Empty branch: artifact of the code generator, intentional.
    }
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment