asi1024/cupy_jit_example_full.py Secret

## cupy_jit_example_full.py
import numpy
import cupy
from cupyx import jit


# https://developer.download.nvidia.com/assets/cuda/files/reduction.pdf (page #7)

@jit.rawkernel()
def reduce0(idata, odata):
    sdata = jit.shared_memory(numpy.int32, size=None)

    tid = jit.threadIdx.x
    i = jit.blockIdx.x * jit.blockDim.x + jit.threadIdx.x
    sdata[tid] = idata[i]
    jit.syncthreads()

    s = 1
    while s < jit.blockDim.x:
        if tid % (2 * s) == 0:
            sdata[tid] += sdata[tid + s]
        jit.syncthreads()
        s *= 2

    if tid == 0:
        odata[jit.blockIdx.x] = sdata[0]


grid_size = 16
block_size = 1024
idata = cupy.arange(grid_size * block_size, dtype=numpy.int32)
odata = cupy.zeros(grid_size, dtype=numpy.int32)

reduce0[grid_size, block_size](idata, odata, shared_mem=block_size * 32)

print(odata)

# The above kernel is equivalent to the following one.
expected = idata.reshape(grid_size, block_size).sum(axis=1)
assert (odata == expected).all()

# Print generated CUDA code.
print(reduce0.cached_code)

## out.txt
[  523776  1572352  2620928  3669504  4718080  5766656  6815232  7863808
  8912384  9960960 11009536 12058112 13106688 14155264 15203840 16252416]

## reduce0.cu
// Generated CUDA code.

template <typename T0, typename T1>
__device__ unsigned int cupy_remainder_uint32(T0 in0, T1 in1) {
    return (in0 - _floor_divide(in0, in1) * in1) * (in1 != 0);
}

extern "C" __global__ void reduce0(CArray<int, 1, true, true> idata, CArray<int, 1, true, true> odata) {
  extern __shared__ int _smem1[];
  int* sdata;
  unsigned int tid;
  unsigned int i;
  int s;
  sdata = _smem1;
  tid = threadIdx.x;
  i = ((blockIdx.x * blockDim.x) + threadIdx.x);
  sdata[tid] = idata[i];
  __syncthreads();
  s = 1;
  while (((unsigned int)(s) < blockDim.x)) {
    if ((cupy_remainder_uint32(tid, (unsigned int)((2 * s))) == 0u)) {
      sdata[tid] = (sdata[tid] + sdata[(tid + (unsigned int)(s))]);
    }
    else {
    }
    __syncthreads();
    s = (s * 2);
  }
  if ((tid == 0u)) {
    odata[blockIdx.x] = sdata[0];
  }
  else {
  }
}
	import numpy
	import cupy
	from cupyx import jit


	# https://developer.download.nvidia.com/assets/cuda/files/reduction.pdf (page #7)

	@jit.rawkernel()
	def reduce0(idata, odata):
	sdata = jit.shared_memory(numpy.int32, size=None)

	tid = jit.threadIdx.x
	i = jit.blockIdx.x * jit.blockDim.x + jit.threadIdx.x
	sdata[tid] = idata[i]
	jit.syncthreads()

	s = 1
	while s < jit.blockDim.x:
	if tid % (2 * s) == 0:
	sdata[tid] += sdata[tid + s]
	jit.syncthreads()
	s *= 2

	if tid == 0:
	odata[jit.blockIdx.x] = sdata[0]


	grid_size = 16
	block_size = 1024
	idata = cupy.arange(grid_size * block_size, dtype=numpy.int32)
	odata = cupy.zeros(grid_size, dtype=numpy.int32)

	reduce0[grid_size, block_size](idata, odata, shared_mem=block_size * 32)

	print(odata)

	# The above kernel is equivalent to the following one.
	expected = idata.reshape(grid_size, block_size).sum(axis=1)
	assert (odata == expected).all()

	# Print generated CUDA code.
	print(reduce0.cached_code)
	[ 523776 1572352 2620928 3669504 4718080 5766656 6815232 7863808
	8912384 9960960 11009536 12058112 13106688 14155264 15203840 16252416]
	// Generated CUDA code.

	template <typename T0, typename T1>
	__device__ unsigned int cupy_remainder_uint32(T0 in0, T1 in1) {
	return (in0 - _floor_divide(in0, in1) * in1) * (in1 != 0);
	}

	extern "C" __global__ void reduce0(CArray<int, 1, true, true> idata, CArray<int, 1, true, true> odata) {
	extern __shared__ int _smem1[];
	int* sdata;
	unsigned int tid;
	unsigned int i;
	int s;
	sdata = _smem1;
	tid = threadIdx.x;
	i = ((blockIdx.x * blockDim.x) + threadIdx.x);
	sdata[tid] = idata[i];
	__syncthreads();
	s = 1;
	while (((unsigned int)(s) < blockDim.x)) {
	if ((cupy_remainder_uint32(tid, (unsigned int)((2 * s))) == 0u)) {
	sdata[tid] = (sdata[tid] + sdata[(tid + (unsigned int)(s))]);
	}
	else {
	}
	__syncthreads();
	s = (s * 2);
	}
	if ((tid == 0u)) {
	odata[blockIdx.x] = sdata[0];
	}
	else {
	}
	}