Skip to content

Instantly share code, notes, and snippets.

@albop
Last active January 19, 2016 15:35
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save albop/eaf622ca88847af46d8a to your computer and use it in GitHub Desktop.
Save albop/eaf622ca88847af46d8a to your computer and use it in GitHub Desktop.
interpolation with numba.cuda
from __future__ import division
from numba import double, int64
from numba import jit, njit
import numpy as np
from numpy import zeros, array
from math import floor
from numpy import empty
Ad = array([
# t^3 t^2 t 1
[-1.0/6.0, 3.0/6.0, -3.0/6.0, 1.0/6.0],
[ 3.0/6.0, -6.0/6.0, 0.0/6.0, 4.0/6.0],
[-3.0/6.0, 3.0/6.0, 3.0/6.0, 1.0/6.0],
[ 1.0/6.0, 0.0/6.0, 0.0/6.0, 0.0/6.0]
])
dAd = zeros((4,4))
for i in range(1,4):
dAd[:,i] = Ad[:,i-1]*(4-i)
d2Ad = zeros((4,4))
for i in range(1,4):
d2Ad[:,i] = dAd[:,i-1]*(4-i)
from numba import cuda
def vec_eval_cubic_spline_3(a, b, orders, coefs, points, out, Ad, dAd):
d = a.shape[0]
N = points.shape[0]
n = cuda.grid(1)
x0 = points[n,0]
x1 = points[n,1]
x2 = points[n,2]
M0 = orders[0]
start0 = a[0]
dinv0 = (orders[0]-1.0)/(b[0]-a[0])
M1 = orders[1]
start1 = a[1]
dinv1 = (orders[1]-1.0)/(b[1]-a[1])
M2 = orders[2]
start2 = a[2]
dinv2 = (orders[2]-1.0)/(b[2]-a[2])
# normalize coordinates to t \in [0,1]
u0 = (x0 - start0)*dinv0
i0 = int( floor( u0 ) )
i0 = max( min(i0,M0-2), 0 )
t0 = u0-i0
u1 = (x1 - start1)*dinv1
i1 = int( floor( u1 ) )
i1 = max( min(i1,M1-2), 0 )
t1 = u1-i1
u2 = (x2 - start2)*dinv2
i2 = int( floor( u2 ) )
i2 = max( min(i2,M2-2), 0 )
t2 = u2-i2
tp0_0 = t0*t0*t0; tp0_1 = t0*t0; tp0_2 = t0; tp0_3 = 1.0;
tp1_0 = t1*t1*t1; tp1_1 = t1*t1; tp1_2 = t1; tp1_3 = 1.0;
tp2_0 = t2*t2*t2; tp2_1 = t2*t2; tp2_2 = t2; tp2_3 = 1.0;
Phi0_0 = 0
Phi0_1 = 0
Phi0_2 = 0
Phi0_3 = 0
if t0 < 0:
Phi0_0 = dAd[0,3]*t0 + Ad[0,3]
Phi0_1 = dAd[1,3]*t0 + Ad[1,3]
Phi0_2 = dAd[2,3]*t0 + Ad[2,3]
Phi0_3 = dAd[3,3]*t0 + Ad[3,3]
elif t0 > 1:
Phi0_0 = (3*Ad[0,0] + 2*Ad[0,1] + Ad[0,2])*(t0-1) + (Ad[0,0]+Ad[0,1]+Ad[0,2]+Ad[0,3])
Phi0_1 = (3*Ad[1,0] + 2*Ad[1,1] + Ad[1,2])*(t0-1) + (Ad[1,0]+Ad[1,1]+Ad[1,2]+Ad[1,3])
Phi0_2 = (3*Ad[2,0] + 2*Ad[2,1] + Ad[2,2])*(t0-1) + (Ad[2,0]+Ad[2,1]+Ad[2,2]+Ad[2,3])
Phi0_3 = (3*Ad[3,0] + 2*Ad[3,1] + Ad[3,2])*(t0-1) + (Ad[3,0]+Ad[3,1]+Ad[3,2]+Ad[3,3])
else:
Phi0_0 = (Ad[0,0]*tp0_0 + Ad[0,1]*tp0_1 + Ad[0,2]*tp0_2 + Ad[0,3]*tp0_3)
Phi0_1 = (Ad[1,0]*tp0_0 + Ad[1,1]*tp0_1 + Ad[1,2]*tp0_2 + Ad[1,3]*tp0_3)
Phi0_2 = (Ad[2,0]*tp0_0 + Ad[2,1]*tp0_1 + Ad[2,2]*tp0_2 + Ad[2,3]*tp0_3)
Phi0_3 = (Ad[3,0]*tp0_0 + Ad[3,1]*tp0_1 + Ad[3,2]*tp0_2 + Ad[3,3]*tp0_3)
Phi1_0 = 0
Phi1_1 = 0
Phi1_2 = 0
Phi1_3 = 0
if t1 < 0:
Phi1_0 = dAd[0,3]*t1 + Ad[0,3]
Phi1_1 = dAd[1,3]*t1 + Ad[1,3]
Phi1_2 = dAd[2,3]*t1 + Ad[2,3]
Phi1_3 = dAd[3,3]*t1 + Ad[3,3]
elif t1 > 1:
Phi1_0 = (3*Ad[0,0] + 2*Ad[0,1] + Ad[0,2])*(t1-1) + (Ad[0,0]+Ad[0,1]+Ad[0,2]+Ad[0,3])
Phi1_1 = (3*Ad[1,0] + 2*Ad[1,1] + Ad[1,2])*(t1-1) + (Ad[1,0]+Ad[1,1]+Ad[1,2]+Ad[1,3])
Phi1_2 = (3*Ad[2,0] + 2*Ad[2,1] + Ad[2,2])*(t1-1) + (Ad[2,0]+Ad[2,1]+Ad[2,2]+Ad[2,3])
Phi1_3 = (3*Ad[3,0] + 2*Ad[3,1] + Ad[3,2])*(t1-1) + (Ad[3,0]+Ad[3,1]+Ad[3,2]+Ad[3,3])
else:
Phi1_0 = (Ad[0,0]*tp1_0 + Ad[0,1]*tp1_1 + Ad[0,2]*tp1_2 + Ad[0,3]*tp1_3)
Phi1_1 = (Ad[1,0]*tp1_0 + Ad[1,1]*tp1_1 + Ad[1,2]*tp1_2 + Ad[1,3]*tp1_3)
Phi1_2 = (Ad[2,0]*tp1_0 + Ad[2,1]*tp1_1 + Ad[2,2]*tp1_2 + Ad[2,3]*tp1_3)
Phi1_3 = (Ad[3,0]*tp1_0 + Ad[3,1]*tp1_1 + Ad[3,2]*tp1_2 + Ad[3,3]*tp1_3)
Phi2_0 = 0
Phi2_1 = 0
Phi2_2 = 0
Phi2_3 = 0
if t2 < 0:
Phi2_0 = dAd[0,3]*t2 + Ad[0,3]
Phi2_1 = dAd[1,3]*t2 + Ad[1,3]
Phi2_2 = dAd[2,3]*t2 + Ad[2,3]
Phi2_3 = dAd[3,3]*t2 + Ad[3,3]
elif t2 > 1:
Phi2_0 = (3*Ad[0,0] + 2*Ad[0,1] + Ad[0,2])*(t2-1) + (Ad[0,0]+Ad[0,1]+Ad[0,2]+Ad[0,3])
Phi2_1 = (3*Ad[1,0] + 2*Ad[1,1] + Ad[1,2])*(t2-1) + (Ad[1,0]+Ad[1,1]+Ad[1,2]+Ad[1,3])
Phi2_2 = (3*Ad[2,0] + 2*Ad[2,1] + Ad[2,2])*(t2-1) + (Ad[2,0]+Ad[2,1]+Ad[2,2]+Ad[2,3])
Phi2_3 = (3*Ad[3,0] + 2*Ad[3,1] + Ad[3,2])*(t2-1) + (Ad[3,0]+Ad[3,1]+Ad[3,2]+Ad[3,3])
else:
Phi2_0 = (Ad[0,0]*tp2_0 + Ad[0,1]*tp2_1 + Ad[0,2]*tp2_2 + Ad[0,3]*tp2_3)
Phi2_1 = (Ad[1,0]*tp2_0 + Ad[1,1]*tp2_1 + Ad[1,2]*tp2_2 + Ad[1,3]*tp2_3)
Phi2_2 = (Ad[2,0]*tp2_0 + Ad[2,1]*tp2_1 + Ad[2,2]*tp2_2 + Ad[2,3]*tp2_3)
Phi2_3 = (Ad[3,0]*tp2_0 + Ad[3,1]*tp2_1 + Ad[3,2]*tp2_2 + Ad[3,3]*tp2_3)
out[n] = Phi0_0*(Phi1_0*(Phi2_0*(coefs[i0+0,i1+0,i2+0]) + Phi2_1*(coefs[i0+0,i1+0,i2+1]) + Phi2_2*(coefs[i0+0,i1+0,i2+2]) + Phi2_3*(coefs[i0+0,i1+0,i2+3])) + Phi1_1*(Phi2_0*(coefs[i0+0,i1+1,i2+0]) + Phi2_1*(coefs[i0+0,i1+1,i2+1]) + Phi2_2*(coefs[i0+0,i1+1,i2+2]) + Phi2_3*(coefs[i0+0,i1+1,i2+3])) + Phi1_2*(Phi2_0*(coefs[i0+0,i1+2,i2+0]) + Phi2_1*(coefs[i0+0,i1+2,i2+1]) + Phi2_2*(coefs[i0+0,i1+2,i2+2]) + Phi2_3*(coefs[i0+0,i1+2,i2+3])) + Phi1_3*(Phi2_0*(coefs[i0+0,i1+3,i2+0]) + Phi2_1*(coefs[i0+0,i1+3,i2+1]) + Phi2_2*(coefs[i0+0,i1+3,i2+2]) + Phi2_3*(coefs[i0+0,i1+3,i2+3]))) + Phi0_1*(Phi1_0*(Phi2_0*(coefs[i0+1,i1+0,i2+0]) + Phi2_1*(coefs[i0+1,i1+0,i2+1]) + Phi2_2*(coefs[i0+1,i1+0,i2+2]) + Phi2_3*(coefs[i0+1,i1+0,i2+3])) + Phi1_1*(Phi2_0*(coefs[i0+1,i1+1,i2+0]) + Phi2_1*(coefs[i0+1,i1+1,i2+1]) + Phi2_2*(coefs[i0+1,i1+1,i2+2]) + Phi2_3*(coefs[i0+1,i1+1,i2+3])) + Phi1_2*(Phi2_0*(coefs[i0+1,i1+2,i2+0]) + Phi2_1*(coefs[i0+1,i1+2,i2+1]) + Phi2_2*(coefs[i0+1,i1+2,i2+2]) + Phi2_3*(coefs[i0+1,i1+2,i2+3])) + Phi1_3*(Phi2_0*(coefs[i0+1,i1+3,i2+0]) + Phi2_1*(coefs[i0+1,i1+3,i2+1]) + Phi2_2*(coefs[i0+1,i1+3,i2+2]) + Phi2_3*(coefs[i0+1,i1+3,i2+3]))) + Phi0_2*(Phi1_0*(Phi2_0*(coefs[i0+2,i1+0,i2+0]) + Phi2_1*(coefs[i0+2,i1+0,i2+1]) + Phi2_2*(coefs[i0+2,i1+0,i2+2]) + Phi2_3*(coefs[i0+2,i1+0,i2+3])) + Phi1_1*(Phi2_0*(coefs[i0+2,i1+1,i2+0]) + Phi2_1*(coefs[i0+2,i1+1,i2+1]) + Phi2_2*(coefs[i0+2,i1+1,i2+2]) + Phi2_3*(coefs[i0+2,i1+1,i2+3])) + Phi1_2*(Phi2_0*(coefs[i0+2,i1+2,i2+0]) + Phi2_1*(coefs[i0+2,i1+2,i2+1]) + Phi2_2*(coefs[i0+2,i1+2,i2+2]) + Phi2_3*(coefs[i0+2,i1+2,i2+3])) + Phi1_3*(Phi2_0*(coefs[i0+2,i1+3,i2+0]) + Phi2_1*(coefs[i0+2,i1+3,i2+1]) + Phi2_2*(coefs[i0+2,i1+3,i2+2]) + Phi2_3*(coefs[i0+2,i1+3,i2+3]))) + Phi0_3*(Phi1_0*(Phi2_0*(coefs[i0+3,i1+0,i2+0]) + Phi2_1*(coefs[i0+3,i1+0,i2+1]) + Phi2_2*(coefs[i0+3,i1+0,i2+2]) + Phi2_3*(coefs[i0+3,i1+0,i2+3])) + Phi1_1*(Phi2_0*(coefs[i0+3,i1+1,i2+0]) + Phi2_1*(coefs[i0+3,i1+1,i2+1]) + Phi2_2*(coefs[i0+3,i1+1,i2+2]) + Phi2_3*(coefs[i0+3,i1+1,i2+3])) + Phi1_2*(Phi2_0*(coefs[i0+3,i1+2,i2+0]) + Phi2_1*(coefs[i0+3,i1+2,i2+1]) + Phi2_2*(coefs[i0+3,i1+2,i2+2]) + Phi2_3*(coefs[i0+3,i1+2,i2+3])) + Phi1_3*(Phi2_0*(coefs[i0+3,i1+3,i2+0]) + Phi2_1*(coefs[i0+3,i1+3,i2+1]) + Phi2_2*(coefs[i0+3,i1+3,i2+2]) + Phi2_3*(coefs[i0+3,i1+3,i2+3])))
import time
import numpy
## prepare the problem
d = 3 # number of dimensions
K = 50 # number of points along each dimension
N = 10**6 # number of points at which to evaluate splines
T = 10 # number of repetitions (to cancel effect of transfer times)
dtype = numpy.float64
a = numpy.array([0.0,0.0,0.0],dtype=dtype)
b = numpy.array([1.0,1.0,1.0],dtype=dtype)
orders = numpy.array([K,K,K],dtype=numpy.int32)
#
# random coefficients
C = numpy.random.random(orders+2)
C = C.astype(dtype)
X = numpy.random.random((N,3))*0.5+0.5
X = X.astype(dtype)
res = numpy.zeros(N,dtype=dtype)
res2 = res.copy()
from numba import cuda
from eval_cubic_cuda import vec_eval_cubic_spline_3 as original
from eval_cubic_cuda import Ad,dAd
Ad = Ad.astype(dtype)
dAd = dAd.astype(dtype)
jitted = cuda.jit(original)
# warmup
res_cuda = numpy.zeros_like(res)
jitted[(N,1)](a,b,orders,C,X,res_cuda,Ad,dAd)
cuda.synchronize()
t1 = time.time()
d_a = cuda.to_device(a)
d_b = cuda.to_device(b)
d_orders = cuda.to_device(orders)
d_C = cuda.to_device(C)
d_Ad = cuda.to_device(Ad)
d_dAd = cuda.to_device(dAd)
d_X = cuda.to_device(X)
d_res_cuda = cuda.to_device(res_cuda)
for t in range(T):
jitted[(N,1)](d_a,d_b,d_orders,d_C,d_X,d_res_cuda,d_Ad,d_dAd)
res_cuda = d_res_cuda.copy_to_host()
cuda.synchronize()
t2 = time.time()
print("CUDA: {}" .format((t2-t1)/T))
# Compare with CPU impplementations (requires interpolation.py)
from interpolation.splines.eval_cubic_numba import vec_eval_cubic_spline_3
vec_eval_cubic_spline_3(a,b,orders,C,X,res)
t1 = time.time()
for t in range(T):
vec_eval_cubic_spline_3(a,b,orders,C,X,res)
t2 = time.time()
print("Cubic (CPU): {}".format((t2-t1)/T))
# print("Linear (CPU): {}".format((t3-t2)/T))
# check that cuda and CPU give the same result
assert( abs(res_cuda - res).max()<1e-8 )
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment