@randompast
Last active October 22, 2020 08:23
Third-order convolution. Mini sanity check with benchmark: cusignal vs njit vs numba.cuda
# (cusignal-dev) ~/Desktop/code/cs_fork/cusignal$ python3 discourse_2.py
# [-0.06169884 -0.07237074 0.08575766 -0.03845544 0.37396236]
# [-0.06169884 -0.07237074 0.08575766 -0.03845544 0.37396236]
# [-0.06169884 -0.07237074 0.08575766 -0.03845544 0.37396236]
# [-0.06169884 -0.07237074 0.08575766 -0.03845544 0.37396236]
# True True True
# 481 -135.8448081001215
# 481 -135.84480810012172
# 481 -135.8448081001215
# 481 -135.84480810012175
# init 0.22148 0.25764 0.15601 0.18100
# size 1k-ops cusig njit cuda_1 cuda_2
# 10 491 0.02558 0.07133 0.12641 0.11735
# 20 3848 0.02641 0.51370 0.34600 0.31686
# 30 12717 0.03262 1.67938 0.88682 0.86864
# 40 29504 0.04509 4.28684 1.99569 2.01400
# 50 56375 0.06790 7.96369 3.92720 3.92942
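#
# What the kernels below compute (a sketch of the intent, inferred from the
# loops): a third-order (Volterra-style) 1-D convolution with a kernel k of
# shape (K, K, K), producing x.size - K + 1 valid output samples:
#
#   y[n] = sum_{i,j,l} k[i,j,l] * x[d-i] * x[d-j] * x[d-l],  where d = n + K - 1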
import time
import warnings
from numba import cuda, njit
import numpy as np
import cupy as cp
import cusignal as cs
warnings.filterwarnings('ignore')
@cuda.jit
def conv_1d3o_cuda_1(x, k, y):  # convolution, 1 dimension, 3rd order: one thread per output
    n = cuda.grid(1)
    if 0 <= n < y.size:
        d = n + k.shape[0] - 1
        for i in range(k.shape[0]):
            for j in range(k.shape[1]):
                for l in range(k.shape[2]):
                    y[n] += x[d-i] * x[d-j] * x[d-l] * k[i, j, l]
@cuda.jit
def conv_1d3o_cuda_2(x, k, y):  # grid-stride loop variant with atomic accumulation
    start = cuda.grid(1)
    stride = cuda.gridsize(1)
    for n in range(start, y.size, stride):  # loop variable must not be shadowed below
        d = n + k.shape[0] - 1
        for i in range(k.shape[0]):
            for j in range(k.shape[1]):
                for l in range(k.shape[2]):
                    cuda.atomic.add(y, n, x[d-i] * x[d-j] * x[d-l] * k[i, j, l])
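# Note: the four implementations sum products in different orders, and cuda_2's
# atomic adds additionally commit in a nondeterministic order, so the checksums
# printed by check_consistency (see the sample output above) agree only in
# roughly their first 13 digits.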
@njit
def conv_1d3o_njit(x, k, y):  # convolution, 1 dimension, 3rd order: CPU reference
    for n in range(y.size):
        d = n + k.shape[0] - 1
        for i in range(k.shape[0]):
            for j in range(k.shape[1]):
                for l in range(k.shape[2]):
                    y[n] += x[d-i] * x[d-j] * x[d-l] * k[i, j, l]
def conv_1d3o_cj(f, x, k):  # copy inputs to the device, launch kernel f, return device output
    xc = cuda.to_device(x)
    kc = cuda.to_device(k)
    device_id = cp.cuda.Device()
    numSM = device_id.attributes["MultiProcessorCount"]
    th = (128,)
    b = (numSM * 20,)
    # y = cuda.device_array_like(x)
    y = cp.zeros(x.size - k.shape[0] + 1)
    f[b, th](xc, kc, y)
    # return y.copy_to_host()[:-k.shape[0]+1]
    return y
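# Launch-config note: (numSM * 20) blocks of 128 threads oversubscribes the GPU,
# and for the sizes tested here (at most 491 outputs) the grid should cover
# y.size outright, so each thread handles at most one output index in both the
# bounds-checked and the grid-stride kernel.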
def conv_1d3o_nj(x, k):  # allocate the output, then run the njit kernel in place
    y = np.zeros(x.size - k.shape[0] + 1)
    conv_1d3o_njit(x, k, y)
    return y
def make_xKs(d, xsize):  # reproducible random signal and d*d*d kernel
    np.random.seed(0)
    x = np.random.uniform(-1, 1, xsize)
    k = np.random.uniform(-1, 1, (d, d, d))
    return x, k
def test_time_1d_conv(n, f, x, k):  # time n calls of f(x, k)
    start = time.time()
    for i in range(n):
        y = f(x, k)
    elapsed = time.time() - start
    return y, elapsed
def test_time_1d_conv_cuda(n, f, x, k):  # time n launches of cuda kernel f
    # caveat: no explicit cuda.synchronize(), so launches are asynchronous and
    # the measured time may not include the last kernel's completion
    start = time.time()
    for i in range(n):
        y = conv_1d3o_cj(f, x, k)
    elapsed = time.time() - start
    return y, elapsed
def prime():  # warm-up pass: trigger JIT compilation so later timings exclude it
    args = make_xKs(2, 5)
    n = 1
    y0, t0 = test_time_1d_conv(n, cs.convolve1d3o, *args)
    y1, t1 = test_time_1d_conv(n, conv_1d3o_nj, *args)
    y2, t2 = test_time_1d_conv_cuda(n, conv_1d3o_cuda_1, *args)
    y3, t3 = test_time_1d_conv_cuda(n, conv_1d3o_cuda_2, *args)
    return t0, t1, t2, t3
def benchmark(n):
    print("{:s}\t{:s}\t{:s}\t{:s}\t{:s}\t{:s}".format("size", "1k-ops", "cusig", "njit", "cuda_1", "cuda_2"))
    for d in range(10, 60, 10):
        args = make_xKs(d, 500)
        x, k = args
        # inner-loop multiply-accumulates per call, in thousands
        ops = k.shape[0] * k.shape[1] * k.shape[2] * (x.size - k.shape[0] + 1) // 1000
        y0, t0 = test_time_1d_conv(n, cs.convolve1d3o, *args)
        y1, t1 = test_time_1d_conv(n, conv_1d3o_nj, *args)
        y2, t2 = test_time_1d_conv_cuda(n, conv_1d3o_cuda_1, *args)
        y3, t3 = test_time_1d_conv_cuda(n, conv_1d3o_cuda_2, *args)
        print("{:4d}\t{:4d}\t{:0.5f}\t{:0.5f}\t{:0.5f}\t{:0.5f}".format(d, ops, t0, t1, t2, t3))
def check_simple():  # compare all four implementations on a small input
    # args = x, k = np.arange(5), np.arange(8).reshape(2, 2, 2)  # alternate tiny test case
    args = x, k = make_xKs(4, 8)
    y0 = cs.convolve1d3o(*args)
    y1 = conv_1d3o_nj(*args)
    y2 = conv_1d3o_cj(conv_1d3o_cuda_1, *args)
    y3 = conv_1d3o_cj(conv_1d3o_cuda_2, *args)
    # print(args[0])
    # print(args[1])
    print(y0)
    print(y1)
    print(y2)
    print(y3)
    c1 = np.all(np.isclose(y0, y1))
    c2 = np.all(np.isclose(y0, y2))
    c3 = np.all(np.isclose(y0, y3))
    print(c1, c2, c3)
def check_consistency():  # larger case: compare output sizes and checksums
    args = make_xKs(20, 500)
    y0 = cs.convolve1d3o(*args)
    y1 = conv_1d3o_nj(*args)
    y2 = conv_1d3o_cj(conv_1d3o_cuda_1, *args)
    y3 = conv_1d3o_cj(conv_1d3o_cuda_2, *args)
    print(y0.size, np.sum(y0))
    print(y1.size, np.sum(y1))
    print(y2.size, np.sum(y2))
    print(y3.size, np.sum(y3))
def check_sanity():  # prime, verify correctness, then report compile/init times
    t0, t1, t2, t3 = prime()
    check_simple()
    check_consistency()
    print("init\t\t{:0.5f}\t{:0.5f}\t{:0.5f}\t{:0.5f}".format(t0, t1, t2, t3))
if __name__ == '__main__':
    check_sanity()
    benchmark(100)
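# Minimal standalone usage sketch, assuming the cusignal fork above exposes
# convolve1d3o (the sizes here are illustrative only):
#
#   import numpy as np
#   import cusignal as cs
#   x = np.random.uniform(-1, 1, 100)
#   k = np.random.uniform(-1, 1, (5, 5, 5))
#   y = cs.convolve1d3o(x, k)  # y.size == x.size - k.shape[0] + 1 == 96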