Last active
January 9, 2019 14:42
-
-
Save daxiongshu/310957de43ef0b16c8b532c0c02cf814 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
@cuda.jit(device=True)
def compute_skew_with_mean(array, skew, mean):
    """Compute the adjusted sample skewness of ``array`` on the GPU.

    Parameters
    ----------
    array : device array whose skewness is computed.
    skew : shared-memory scratch array with ``len(skew) == TPB + 1``,
        where TPB is the compile-time constant threads-per-block
        (32 in this case). ``skew[TPB]`` temporarily holds the 2nd
        moment; the final skewness is left in ``skew[0]``.
    mean : scalar — the precomputed mean of ``array``.

    Notes
    -----
    The launching kernel uses a single thread block, so block-level
    ``cuda.syncthreads()`` barriers suffice; no global sync is required.
    All ``syncthreads()`` calls are at function level (never inside the
    divergent ``if tid == 0`` branches), since a barrier in a divergent
    branch is undefined behavior in CUDA.
    """
    tid = cuda.threadIdx.x

    # --- 2nd central moment: block-strided sum of squared deviations ---
    initialize(skew, 0, len(skew))
    cuda.syncthreads()
    for i in range(tid, len(array), cuda.blockDim.x):
        skew[tid] += (array[i] - mean) ** 2
    cuda.syncthreads()
    reduction_sum(skew)
    if tid == 0:
        # Stash m2 in the extra slot so skew[0:TPB] can be reused below.
        skew[TPB] = skew[0] / len(array)
    cuda.syncthreads()

    # --- 3rd central moment, reusing skew[0:TPB] (skew[TPB] preserved) ---
    initialize(skew, 0, TPB)
    cuda.syncthreads()
    for i in range(tid, len(array), cuda.blockDim.x):
        skew[tid] += (array[i] - mean) ** 3
    cuda.syncthreads()
    reduction_sum(skew)
    if tid == 0:
        n = len(array)
        m3 = skew[0] / n   # 3rd central moment
        m2 = skew[TPB]     # 2nd central moment (biased variance)
        if m2 > 0 and n > 2:
            # Adjusted Fisher–Pearson sample skewness:
            #   G1 = sqrt(n*(n-1)) / (n-2) * m3 / m2**1.5
            skew[0] = math.sqrt((n - 1.0) * n) / (n - 2.0) * m3 / m2 ** 1.5
        else:
            # Degenerate cases: constant input (m2 == 0) or n <= 2.
            skew[0] = 0
    cuda.syncthreads()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment