Skip to content

Instantly share code, notes, and snippets.

@Nicholaswogan
Last active May 26, 2023 23:21
Show Gist options
  • Star 2 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save Nicholaswogan/ca156adb065cb598bd3903b3eaab2381 to your computer and use it in GitHub Desktop.
Save Nicholaswogan/ca156adb065cb598bd3903b3eaab2381 to your computer and use it in GitHub Desktop.
Benchmark of a common radiative transfer algorithm with the Mojo programming language

Radiative transfer with Mojo

Lots of scientific codes need to estimate how light passes through an atmosphere. For example, to quantify the greenhouse effect, climate models need to know how an atmosphere absorbs and scatters sunlight and how efficiently infrared radiation emitted by the atmosphere escapes to space. Also, researchers need models of radiative transfer to interpret telescope observations of atmospheres on planets orbiting distant stars. By simulating what the telescope should see under different atmospheric compositions, a researcher can ultimately back out the composition that best agrees with the telescope observations.

A problem is that many scientific codes are bottlenecked by doing radiative transfer. Lots of computation is required because gases (e.g., H2O) absorb in very complicated ways as a function of the wavelength of light, so many radiative transfer calculations are required to revolve this structure.

I was excited to see initial release of the Mojo programming language, because it might allow for faster radiative transfer calculations for science applications. In particular, Mojo seems to make it easy to write SIMD instructions, which could potentially speed up radiative transfer by a lot.

To test if Mojo might speed up scientific codes, I coded up a popular "two stream" algorithm for approximately solving the radiative transfer equation. There are many different "two stream" methods. Below, I code up the Quadrature two stream method described in Toon et al. (1989).

Here is a summary of the results comparing the Mojo implementations with and without SIMD ("vectorized" means with SIMD) to a Numba implementation.

time (microseconds)
Mojo (scalar) 12.0
Mojo (vectorized) 8.4
Numba 19.2

The vectorized (i.e., SIMD) Mojo calculation is a factor of 1.4 faster than scalar Mojo code and a factor of 2.3 faster than Numba. A big caveat is that I have very little experience with SIMD. Maybe there are better ways to improve my SIMD implementation.

Mojo

from Benchmark import Benchmark
from DType import DType
from Pointer import DTypePointer
from Random import rand
from Memory import memset_zero
from Math import sqrt, exp, pow
from Functional import vectorize
from TargetInfo import dtype_simd_width
struct Vector:
    var data: DTypePointer[DType.f64]
    var size: Int

    fn __init__(inout self, size: Int):
        self.data = DTypePointer[DType.f64].alloc(size)
        self.size = size

    fn __del__(owned self):
        self.data.free()
        
    fn random(inout self):
        rand(self.data, self.size)

    fn zero(inout self):
        memset_zero(self.data, self.size)

    @always_inline
    fn __getitem__(self, i: Int) -> F64:
        return self.load[1](i)

    @always_inline
    fn load[nelts:Int](self, i: Int) -> SIMD[DType.f64, nelts]:
        return self.data.simd_load[nelts](i)

    @always_inline
    fn __setitem__(self, i: Int, val: F64):
        return self.store[1](i, val)

    @always_inline
    fn store[nelts:Int](self, i: Int, val: SIMD[DType.f64, nelts]):
        self.data.simd_store[nelts](i, val)
struct WorkSpace:
    var tauc: Vector
    var e1: Vector
    var e2: Vector
    var e3: Vector
    var e4: Vector
    var direct: Vector
    var cp0: Vector
    var cpb: Vector
    var cm0: Vector
    var cmb: Vector
    var A: Vector; var B: Vector; var D: Vector; var E: Vector;
    var A_even: Vector; var B_even: Vector; var D_even: Vector; var E_even: Vector;
    var A_odd: Vector; var B_odd: Vector; var D_odd: Vector; var E_odd: Vector;
    var y1: Vector
    var y2: Vector
        
    def __init__(inout self, nz: Int):
        self.tauc = Vector(nz+1)
        self.e1 = Vector(nz)
        self.e2 = Vector(nz)
        self.e3 = Vector(nz)
        self.e4 = Vector(nz)
        self.direct = Vector(nz+1)
        self.cp0 = Vector(nz)
        self.cpb = Vector(nz)
        self.cm0 = Vector(nz)
        self.cmb = Vector(nz)
        self.A = Vector(2*nz)
        self.B = Vector(2*nz)
        self.D = Vector(2*nz)
        self.E = Vector(2*nz)
        
        self.A_even = Vector(nz-1)
        self.B_even = Vector(nz-1)
        self.D_even = Vector(nz-1)
        self.E_even = Vector(nz-1)
        self.A_odd = Vector(nz-1)
        self.B_odd = Vector(nz-1)
        self.D_odd = Vector(nz-1)
        self.E_odd = Vector(nz-1)
        
        self.y1 = Vector(nz)
        self.y2 = Vector(nz)
        
fn twostream[nelts: Int](tau: Vector, w0: Vector, gt: Vector, u0: F64, Rsfc: F64, inout amean: Vector, inout w: WorkSpace):
    
    alias Fs_pi = 1.0
    let u1 = 1.0/sqrt[1,DType.f64](3.0)
    let nz = tau.size
    
    # cumulative optical depth at the top of each layer
    w.tauc[0] = 0.0
    for i in range(1,nz+1):
        w.tauc[i] = w.tauc[i-1] + tau[i-1]
    
    @parameter
    fn set_e_and_c[nelts : Int](i : Int):
        
        # Quadrature Two-Stream coefficients (Table 1)
        let gam1 = sqrt[1,DType.f64](3.0)*(2.0 - w0.load[nelts](i)*(1.0 - gt.load[nelts](i)))/2.0
        let gam2 = sqrt[1,DType.f64](3.0)*w0.load[nelts](i)*(1.0-gt.load[nelts](i))/2.0
        let gam3 = (1.0 - sqrt[1,DType.f64](3.0)*gt.load[nelts](i)*u0)/2.0
        let gam4 = 1.0 - gam3
        # u1 = 1.0_dp/sqrt(3.0_dp) (parameter)
        
        # lambda, and capital Gamma (Equations 21, 22)
        # let gam12_minus_gam22 = gam1**2 - gam2**2
        let gam12_minus_gam22 = pow[nelts,DType.f64,DType.f64](gam1, 2.0) - pow[nelts,DType.f64,DType.f64](gam2, 2.0)
        let lambd = sqrt[nelts,DType.f64](gam12_minus_gam22)
        let cap_gam = gam2 / (gam1 + lambd)
        
        # e's (Equation 44)
        let wrk_real = exp[nelts,DType.f64](-lambd*tau.load[nelts](i))
        w.e1.store[nelts](i, 1.0 + cap_gam*wrk_real)
        w.e2.store[nelts](i, 1.0 - cap_gam*wrk_real)
        w.e3.store[nelts](i, cap_gam + wrk_real)
        w.e4.store[nelts](i, cap_gam - wrk_real)
        
        # C+ and C- (Equation 23, 24)
        # Fs_pi = 1.0_dp (parameter) 
        # We take the solar flux at the top of the atmosphere to be = 1
        # Note: Toon et al. 1989 defines Fs * pi = solar flux. So Fs = solar flux / pi
        let facp = w0.load[nelts](i)*Fs_pi*((gam1-1.0/u0)*gam3 + gam4*gam2)
        let facm = w0.load[nelts](i)*Fs_pi*((gam1+1.0/u0)*gam4 + gam2*gam3) 
        let et0 = exp[nelts,DType.f64](-w.tauc.load[nelts](i)/u0)
        let etb = et0*exp(-tau.load[nelts](i)/u0)
        let denom = gam12_minus_gam22 - 1.0/(pow[nelts,DType.f64,DType.f64](u0,2.0))
        
        w.direct.store[nelts](i+1, u0*Fs_pi*etb)
        w.cp0.store[nelts](i, et0*facp/denom)
        w.cpb.store[nelts](i, etb*facp/denom)
        w.cm0.store[nelts](i, et0*facm/denom)
        w.cmb.store[nelts](i, etb*facm/denom)
    vectorize[nelts, set_e_and_c](nz)
    
    w.direct[0] = u0*Fs_pi
    let Ssfc = Rsfc*w.direct[nz]
    
    # Coefficients of tridiagonal linear system (Equations 39 - 43)
    w.A[0] = 0.0
    w.B[0] = w.e1[0]
    w.D[0] = -w.e2[0]
    w.E[0] = 0.0 - w.cm0[0] # assumes no downward diffuse flux at top-of-atmosphere
    
    # Odd coefficients (Equation 41)
    @parameter
    fn set_ABDE_odd[nelts : Int](i : Int):
        w.A_odd.store[nelts](i, w.e2.load[nelts](i)*w.e3.load[nelts](i) - w.e4.load[nelts](i)*w.e1.load[nelts](i))
        w.B_odd.store[nelts](i, w.e1.load[nelts](i)*w.e1.load[nelts](i+1) - w.e3.load[nelts](i)*w.e3.load[nelts](i+1))
        w.D_odd.store[nelts](i, w.e3.load[nelts](i)*w.e4.load[nelts](i+1) - w.e1.load[nelts](i)*w.e2.load[nelts](i+1))
        w.E_odd.store[nelts](i, w.e3.load[nelts](i)*(w.cp0.load[nelts](i+1) - w.cpb.load[nelts](i)) + w.e1.load[nelts](i)*(w.cmb.load[nelts](i) - w.cm0.load[nelts](i+1)))
    vectorize[nelts, set_ABDE_odd](nz-1)
    
    # Even coefficients (Equation 42)
    @parameter
    fn set_ABDE_even[nelts : Int](i : Int):
        w.A_even.store[nelts](i, w.e2.load[nelts](i+1)*w.e1.load[nelts](i) - w.e3.load[nelts](i)*w.e4.load[nelts](i+1))
        w.B_even.store[nelts](i, w.e2.load[nelts](i)*w.e2.load[nelts](i+1) - w.e4.load[nelts](i)*w.e4.load[nelts](i+1))
        w.D_even.store[nelts](i, w.e1.load[nelts](i+1)*w.e4.load[nelts](i+1) - w.e2.load[nelts](i+1)*w.e3.load[nelts](i+1))
        w.E_even.store[nelts](i, w.e2.load[nelts](i+1)*(w.cp0.load[nelts](i+1) - w.cpb.load[nelts](i)) - w.e4.load[nelts](i+1)*(w.cm0.load[nelts](i+1) - w.cmb.load[nelts](i)))
    vectorize[nelts, set_ABDE_even](nz-1)
    
    var l: Int
    for i in range(nz-1):
        l = 2*i + 2
        w.A[l] = w.A_odd[i]
        w.B[l] = w.B_odd[i]
        w.D[l] = w.D_odd[i]
        w.E[l] = w.E_odd[i]
        
        l = 2*i + 1
        w.A[l] = w.A_even[i]
        w.B[l] = w.B_even[i]
        w.D[l] = w.D_even[i]
        w.E[l] = w.E_even[i]
        
    l = 2*nz - 1
    w.A[l] = w.e1[nz-1] - Rsfc*w.e3[nz-1];
    w.B[l] = w.e2[nz-1] - Rsfc*w.e4[nz-1];
    w.D[l] = 0.0;
    w.E[l] = Ssfc - w.cpb[nz-1] + Rsfc*w.cmb[nz-1];
    
    solve_tridiag(w.A, w.B, w.D, w.E)
    
    for i in range(nz):
        l = 2*i + 1
        w.y1[i] = w.E[l-1]
        w.y2[i] = w.E[l]
        
    # amean = integral(J_n d_Omega) = J_n*4*pi
    # Above is an integration of J_n over a complete sphere, 
    # and has units of flux density, or irradiance.
    # amean(i) is for at top of layer i. amean(nz + 1) is the ground.
  
    # very top edge of atmosphere (Not in paper. Derive from Equation 17 and 31. bit confusing)
    amean[0] = (1.0/u1)*(w.y1[0]*w.e3[0]-w.y2[0]*w.e4[0]+w.cp0[0]) + w.direct[0]/u0;
    # J_n*4*pi = amean (Equation 49)
    @parameter
    fn set_amean[nelts : Int](i : Int):
        amean.store[nelts](i+1, (1.0/u1)*(w.y1.load[nelts](i)*(w.e1.load[nelts](i)+w.e3.load[nelts](i))+w.y2.load[nelts](i)*(w.e2.load[nelts](i)+w.e4.load[nelts](i)) 
                                + w.cpb.load[nelts](i)+w.cmb.load[nelts](i)) + w.direct.load[nelts](i+1)/u0)
    vectorize[nelts, set_amean](nz)
    
@always_inline
fn solve_tridiag(a: Vector, b: Vector, inout c: Vector, inout d: Vector):
    let n = a.size - 1
    c[0] /= b[0];
    d[0] /= b[0];
    
    for i in range(1,n):
        c[i] /= b[i] - a[i]*c[i-1];
        d[i] = (d[i] - a[i]*d[i-1]) / (b[i] - a[i]*c[i-1])
        
    d[n] = (d[n] - a[n]*d[n-1]) / (b[n] - a[n]*c[n-1])
    
    for j in range(n):
        let i = n-j-1
        d[i] -= c[i]*d[i+1]
alias nz = 200
var tau = Vector(nz)
var w0 = Vector(nz)
var gt = Vector(nz)
gt.zero()
let u0: F64 = 0.6427876096865394
let Rsfc: F64 = 0.25
var amean = Vector(nz+1)
var w = WorkSpace(nz)

tau[0] = 8.7767996682530329e-08; tau[1] = 9.5947097904378163e-08; tau[2] = 1.0489052756541996e-07; 
tau[3] = 1.1466995675270959e-07; tau[4] = 1.2536380811254373e-07; tau[5] = 1.3705789618437358e-07; 
tau[6] = 1.4984612914194580e-07; tau[7] = 1.6383127794765110e-07; tau[8] = 1.7912581953502761e-07; 
tau[9] = 1.9585286130848492e-07; tau[10] = 2.1414715499671573e-07; tau[11] = 2.3415620872566158e-07; 
tau[12] = 2.5604150709720415e-07; tau[13] = 2.7997985008321631e-07; tau[14] = 3.0616482268293842e-07; 
tau[15] = 3.3480840855869273e-07; tau[16] = 3.6614276227859870e-07; tau[17] = 4.0042215637110061e-07; 
tau[18] = 4.3792512115823633e-07; tau[19] = 4.7895679730553810e-07; tau[20] = 5.2385152323406823e-07; 
tau[21] = 5.7297568201604804e-07; tau[22] = 6.2673083515698063e-07; tau[23] = 6.8555717379606160e-07; 
tau[24] = 7.4993732137969107e-07; tau[25] = 8.2040052583949341e-07; tau[26] = 8.9752728379739709e-07; 
tau[27] = 9.8195444440443219e-07; tau[28] = 1.0743808461797366e-06; tau[29] = 1.1755735467535040e-06; 
tau[30] = 1.2863747128512830e-06; tau[31] = 1.4077092463227407e-06; tau[32] = 1.5405932316575096e-06; 
tau[33] = 1.6861433014556164e-06; tau[34] = 1.8455870289203520e-06; tau[35] = 2.0202744708601724e-06; 
tau[36] = 2.2116910012294060e-06; tau[37] = 2.4214715942473803e-06; tau[38] = 2.6514167379642028e-06; 
tau[39] = 2.9035101842693509e-06; tau[40] = 3.1799387703266947e-06; tau[41] = 3.4831145798177533e-06; 
tau[42] = 3.8156997509793911e-06; tau[43] = 4.1806342830157372e-06; tau[44] = 4.5811672439896014e-06; 
tau[45] = 5.0208918431149617e-06; tau[46] = 5.5037848994039188e-06; tau[47] = 6.0342513187615696e-06; 
tau[48] = 6.6171742844996624e-06; tau[49] = 7.2579719738055458e-06; tau[50] = 7.9626617375794997e-06; 
tau[51] = 8.7379328255527379e-06; tau[52] = 9.5912289061901981e-06; tau[53] = 1.0530841825193256e-05; 
tau[54] = 1.1566018270795857e-05; tau[55] = 1.2707081274015154e-05; tau[56] = 1.3965568772106545e-05; 
tau[57] = 1.5354391809370636e-05; tau[58] = 1.6888015348134820e-05; tau[59] = 1.8582665120003893e-05; 
tau[60] = 2.0456564472667114e-05; tau[61] = 2.2530205765175080e-05; tau[62] = 2.4826661548100218e-05; 
tau[63] = 2.7371941534785026e-05; tau[64] = 3.0195402243950013e-05; tau[65] = 3.3330217163257071e-05; 
tau[66] = 3.6813916372382030e-05; tau[67] = 4.0689005750403127e-05; tau[68] = 4.5003677189107140e-05; 
tau[69] = 4.9812622624297257e-05; tau[70] = 5.5177966142078792e-05; tau[71] = 6.1170329907152607e-05; 
tau[72] = 6.7870051104862677e-05; tau[73] = 7.5368568415506559e-05; tau[74] = 8.3769997605174037e-05; 
tau[75] = 9.3192916477134497e-05; tau[76] = 1.0377237936370605e-04; tau[77] = 1.1566218028227589e-04; 
tau[78] = 1.2903738130325400e-04; tau[79] = 1.4409711804014798e-04; tau[80] = 1.6106768650787847e-04; 
tau[81] = 1.8020590401221682e-04; tau[82] = 2.0180271951203158e-04; tau[83] = 2.2618702435261593e-04; 
tau[84] = 2.5372957971436598e-04; tau[85] = 2.8499085847225129e-04; tau[86] = 3.2072793130441182e-04; 
tau[87] = 3.6172038216249729e-04; tau[88] = 4.0883409334140487e-04; tau[89] = 4.6308361487628643e-04; 
tau[90] = 5.2565578019456703e-04; tau[91] = 5.9793596197029830e-04; tau[92] = 6.8153685512867973e-04; 
tau[93] = 7.7832951627810915e-04; tau[94] = 8.9047619550709984e-04; tau[95] = 1.0204642828402439e-03; 
tau[96] = 1.1711404714380698e-03; tau[97] = 1.3457440390547924e-03; tau[98] = 1.5479380130098432e-03; 
tau[99] = 1.7818369502875632e-03; tau[100] = 2.0520301990285964e-03; tau[101] = 2.3635998383980863e-03; 
tau[102] = 2.7221330470807528e-03; tau[103] = 3.1337294374715365e-03; tau[104] = 3.6050225687279105e-03; 
tau[105] = 4.1432174515158250e-03; tau[106] = 4.7560587646881024e-03; tau[107] = 5.4518194898090918e-03; 
tau[108] = 6.2393740872849577e-03; tau[109] = 7.1282605624539697e-03; tau[110] = 8.1287730355002370e-03; 
tau[111] = 9.2520919156331774e-03; tau[112] = 1.0510459289421320e-02; tau[113] = 1.1917408459878433e-02; 
tau[114] = 1.3488058899910772e-02; tau[115] = 1.5239490954294801e-02; tau[116] = 1.7191215895174722e-02; 
tau[117] = 1.9365747959630791e-02; tau[118] = 2.1789244185028790e-02; tau[119] = 2.4492051273927565e-02; 
tau[120] = 2.7508662976724094e-02; tau[121] = 3.0875812746695153e-02; tau[122] = 3.4625933661229222e-02; 
tau[123] = 3.8771507194934468e-02; tau[124] = 4.3278351607027789e-02; tau[125] = 4.8046866543172910e-02; 
tau[126] = 5.2968730144676401e-02; tau[127] = 5.8080080318497186e-02; tau[128] = 6.3512117073305424e-02; 
tau[129] = 6.9334818787612357e-02; tau[130] = 7.5580500565664441e-02; tau[131] = 8.2268694684091204e-02; 
tau[132] = 8.9410407413410012e-02; tau[133] = 9.7014366545305167e-02; tau[134] = 1.0508676551553092e-01; 
tau[135] = 1.1363074698597439e-01; tau[136] = 1.2264676085190102e-01; tau[137] = 1.3213313885917077e-01; 
tau[138] = 1.4208431117677991e-01; tau[139] = 1.5248507419772675e-01; tau[140] = 1.6331818759153677e-01; 
tau[141] = 1.7457363049899460e-01; tau[142] = 1.8624471255477631e-01; tau[143] = 1.9833035537844179e-01; 
tau[144] = 2.1083726583856535e-01; tau[145] = 2.2378158933184314e-01; tau[146] = 2.3718960522404994e-01; 
tau[147] = 2.5109715445037328e-01; tau[148] = 2.6554778373896060e-01; tau[149] = 2.8058996227549782e-01; 
tau[150] = 2.9627401857870805e-01; tau[151] = 3.1264948596979114e-01; tau[152] = 3.2976328683030964e-01; 
tau[153] = 3.4765878733307898e-01; tau[154] = 3.6637545533146748e-01; tau[155] = 3.8594878862798920e-01; 
tau[156] = 4.0641030224751162e-01; tau[157] = 4.2778750516236580e-01; tau[158] = 4.5010382615701916e-01; 
tau[159] = 4.7337839740328619e-01; tau[160] = 4.9762555188214369e-01; tau[161] = 5.2276004558359734e-01; 
tau[162] = 5.4862232621183815e-01; tau[163] = 5.7510375299943239e-01; tau[164] = 6.0224859454892643e-01; 
tau[165] = 6.3079868128618721e-01; tau[166] = 6.6150503383107084e-01; tau[167] = 6.9479262019749455e-01; 
tau[168] = 7.3135971057268112e-01; tau[169] = 7.7207900416531505e-01; tau[170] = 8.1777260097037741e-01; 
tau[171] = 8.7003464761828964e-01; tau[172] = 9.3024357036853700e-01; tau[173] = 9.9877360370577395e-01; 
tau[174] = 1.0762228460543470e+00; tau[175] = 1.1562951730417654e+00; tau[176] = 1.2377641657793577e+00; 
tau[177] = 1.3216565090158796e+00; tau[178] = 2.6574734564569380e+00; tau[179] = 4.2105772058622346e+00; 
tau[180] = 7.0339868009456881e+00; tau[181] = 1.1521276456355757e+01; tau[182] = 1.8285964859512521e+01; 
tau[183] = 2.8784368028594791e+01; tau[184] = 4.4808994648392989e+01; tau[185] = 6.8880927399198839e+01; 
tau[186] = 1.0449251595375671e+02; tau[187] = 1.5641266261187351e+02; tau[188] = 2.3106587764604379e+02; 
tau[189] = 3.3699697083889566e+02; tau[190] = 4.8543396220701010e+02; tau[191] = 6.9096229688139806e+02; 
tau[192] = 9.7232374589713959e+02; tau[193] = 1.3811178079094689e+03; tau[194] = 1.9771447087450604e+03; 
tau[195] = 2.7426755572022894e+03; tau[196] = 3.6957662267266865e+03; tau[197] = 4.9351475964671836e+03; 
tau[198] = 6.5332896118304561e+03; tau[199] = 8.5775143354619722e+03; 

w0[0] = 9.9711179180048182e-01; w0[1] = 9.9710577604125894e-01; w0[2] = 9.9709335053701875e-01; 
w0[3] = 9.9707409718189177e-01; w0[4] = 9.9704756934937544e-01; w0[5] = 9.9701328938498046e-01; 
w0[6] = 9.9697074581156886e-01; w0[7] = 9.9691939021948628e-01; w0[8] = 9.9685863381171902e-01; 
w0[9] = 9.9678784357544925e-01; w0[10] = 9.9670633805145825e-01; w0[11] = 9.9661338267233490e-01; 
w0[12] = 9.9650818464346891e-01; w0[13] = 9.9638988734055600e-01; w0[14] = 9.9625756419873834e-01; 
w0[15] = 9.9611021207159700e-01; w0[16] = 9.9594674403681771e-01; w0[17] = 9.9576598162879704e-01; 
w0[18] = 9.9556664647758897e-01; w0[19] = 9.9534735133343222e-01; w0[20] = 9.9510659045563765e-01; 
w0[21] = 9.9484272934368789e-01; w0[22] = 9.9455399378589959e-01; w0[23] = 9.9423845819787393e-01; 
w0[24] = 9.9389403322209602e-01; w0[25] = 9.9351845255550364e-01; w0[26] = 9.9310925897056246e-01; 
w0[27] = 9.9266378949192857e-01; w0[28] = 9.9217915969101766e-01; w0[29] = 9.9165224706199229e-01; 
w0[30] = 9.9107967344303005e-01; w0[31] = 9.9045778644803517e-01; w0[32] = 9.8978263988756232e-01; 
w0[33] = 9.8904997315915044e-01; w0[34] = 9.8825518959925840e-01; w0[35] = 9.8739333380968586e-01; 
w0[36] = 9.8645906798846772e-01; w0[37] = 9.8544664731122855e-01; w0[38] = 9.8434989444384591e-01; 
w0[39] = 9.8316217330321953e-01; w0[40] = 9.8187636221329821e-01; w0[41] = 9.8048482665722558e-01; 
w0[42] = 9.7897939187322636e-01; w0[43] = 9.7735131560593647e-01; w0[44] = 9.7559126140833230e-01; 
w0[45] = 9.7368927293970564e-01; w0[46] = 9.7163474982622933e-01; w0[47] = 9.6941642574050779e-01; 
w0[48] = 9.6702234946428145e-01; w0[49] = 9.6443986983423624e-01; w0[50] = 9.6165562558909190e-01; 
w0[51] = 9.5865554130176256e-01; w0[52] = 9.5542483072735251e-01; w0[53] = 9.5194800903411125e-01; 
w0[54] = 9.4820891557954556e-01; w0[55] = 9.4419074902690803e-01; w0[56] = 9.3987611674461291e-01; 
w0[57] = 9.3524710057171967e-01; w0[58] = 9.3028534110971439e-01; w0[59] = 9.2497214278425166e-01; 
w0[60] = 9.1928860187296246e-01; w0[61] = 9.1321575969783042e-01; w0[62] = 9.0673478291743770e-01; 
w0[63] = 8.9982717269670254e-01; w0[64] = 8.9247500397279889e-01; w0[65] = 8.8466119568641544e-01; 
w0[66] = 8.7636981190996832e-01; w0[67] = 8.6758639310529395e-01; w0[68] = 8.5829831560583947e-01; 
w0[69] = 8.4849517611710745e-01; w0[70] = 8.3816919684997526e-01; w0[71] = 8.2731564522719481e-01; 
w0[72] = 8.1593326067846506e-01; w0[73] = 8.0402467952201917e-01; w0[74] = 7.9159684763866422e-01; 
w0[75] = 7.7866140928303185e-01; w0[76] = 7.6523505988054275e-01; w0[77] = 7.5133985023733718e-01; 
w0[78] = 7.3700343012197278e-01; w0[79] = 7.2225922012228905e-01; w0[80] = 7.0714650311348992e-01; 
w0[81] = 6.9171042928311888e-01; w0[82] = 6.7600193312752554e-01; w0[83] = 6.6007756565534514e-01; 
w0[84] = 6.4399925112087075e-01; w0[85] = 6.2751690913313019e-01; w0[86] = 6.1027499828015230e-01; 
w0[87] = 5.9224510437414302e-01; w0[88] = 5.7351602373384358e-01; w0[89] = 5.5418855325877014e-01; 
w0[90] = 5.3437475255737310e-01; w0[91] = 5.1419661917235160e-01; w0[92] = 4.9378417655716828e-01; 
w0[93] = 4.7327302015086270e-01; w0[94] = 4.5280141715326816e-01; w0[95] = 4.3250709824656874e-01; 
w0[96] = 4.1252391349300427e-01; w0[97] = 3.9297854411973260e-01; w0[98] = 3.7398746095910612e-01; 
w0[99] = 3.5565430274683579e-01; w0[100] = 3.3806781061485630e-01; w0[101] = 3.2130040985921160e-01; 
w0[102] = 3.0540747904392268e-01; w0[103] = 2.9042729373237675e-01; w0[104] = 2.7638022880356511e-01; 
w0[105] = 2.6326880017521570e-01; w0[106] = 2.5108351465026180e-01; w0[107] = 2.3980507108518678e-01; 
w0[108] = 2.2940313887670982e-01; w0[109] = 2.1983837091461964e-01; w0[110] = 2.1106434856035647e-01; 
w0[111] = 2.0302938150311162e-01; w0[112] = 1.9567810006274122e-01; w0[113] = 1.8895279358968187e-01; 
w0[114] = 1.8279446934896854e-01; w0[115] = 1.7714363349486195e-01; w0[116] = 1.7194084623673653e-01; 
w0[117] = 1.6712721159390778e-01; w0[118] = 1.6264518453364277e-01; w0[119] = 1.5844053282765996e-01; 
w0[120] = 1.5446717275166658e-01; w0[121] = 1.5069805821385496e-01; w0[122] = 1.4714688950237698e-01; 
w0[123] = 1.4390390146402926e-01; w0[124] = 1.4117355924585473e-01; w0[125] = 1.3925281217141075e-01; 
w0[126] = 1.3832497295389018e-01; w0[127] = 1.3814978008396606e-01; w0[128] = 1.3835160646341094e-01; 
w0[129] = 1.3879024386623634e-01; w0[130] = 1.3943642059052910e-01; w0[131] = 1.4029209031588918e-01; 
w0[132] = 1.4137338559055690e-01; w0[133] = 1.4269664294338003e-01; w0[134] = 1.4427862559127622e-01; 
w0[135] = 1.4613710893515686e-01; w0[136] = 1.4829031288790176e-01; w0[137] = 1.5075639573344454e-01; 
w0[138] = 1.5355577328662448e-01; w0[139] = 1.5671691155491141e-01; w0[140] = 1.6026759551587053e-01; 
w0[141] = 1.6422711196183729e-01; w0[142] = 1.6861217530896319e-01; w0[143] = 1.7343599664804954e-01; 
w0[144] = 1.7870749622029697e-01; w0[145] = 1.8443089822995290e-01; w0[146] = 1.9060594959733176e-01; 
w0[147] = 1.9722890743114718e-01; w0[148] = 2.0429424128179388e-01; w0[149] = 2.1179676211688642e-01; 
w0[150] = 2.1973372569350255e-01; w0[151] = 2.2810647413984569e-01; w0[152] = 2.3692139199104423e-01; 
w0[153] = 2.4619023520258562e-01; w0[154] = 2.5593007712435828e-01; w0[155] = 2.6616312793182034e-01; 
w0[156] = 2.7691657556564053e-01; w0[157] = 2.8822248978446646e-01; w0[158] = 3.0011781136252774e-01; 
w0[159] = 3.1264448345127460e-01; w0[160] = 3.2584981513994765e-01; w0[161] = 3.3984816250379990e-01; 
w0[162] = 3.5480294275155777e-01; w0[163] = 3.7084641223360615e-01; w0[164] = 3.8801659473095779e-01; 
w0[165] = 4.0590776426938302e-01; w0[166] = 4.2411463421746515e-01; w0[167] = 4.4245155797750657e-01; 
w0[168] = 4.6057530551628978e-01; w0[169] = 4.7806498804785041e-01; w0[170] = 4.9458321307491998e-01; 
w0[171] = 5.0940696342852609e-01; w0[172] = 5.2208413946637822e-01; w0[173] = 5.3285877380442748e-01; 
w0[174] = 5.4190756796308170e-01; w0[175] = 5.4874444561683677e-01; w0[176] = 5.5343636086921844e-01; 
w0[177] = 5.5665571130037195e-01; w0[178] = 2.9574890756478689e-01; w0[179] = 1.9886942554336376e-01; 
w0[180] = 1.2649443133824198e-01; w0[181] = 8.2055381707757405e-02; w0[182] = 5.4932432103361739e-02; 
w0[183] = 3.7036615277279070e-02; w0[184] = 2.5222279402791422e-02; w0[185] = 1.7375975057806702e-02; 
w0[186] = 1.2117538023085468e-02; w0[187] = 8.5555194333282857e-03; w0[188] = 6.1147950360911198e-03; 
w0[189] = 4.4226660277904978e-03; w0[190] = 3.2357616929942499e-03; w0[191] = 2.3936627467879821e-03; 
w0[192] = 1.7895336731204119e-03; w0[193] = 1.3229150832381939e-03; w0[194] = 9.6852078283902500e-04; 
w0[195] = 7.3181934878072249e-04; w0[196] = 5.6933457177442079e-04; w0[197] = 4.4659287344306218e-04; 
w0[198] = 3.5307409636260321e-04; w0[199] = 2.8123294322360731e-04; 
@always_inline
fn benchmark_twostream[nelts: Int](tau: Vector, w0: Vector, gt: Vector, 
                                   u0: F64, Rsfc: F64, inout amean: Vector, 
                                   inout w: WorkSpace) -> F64:
    
    @always_inline
    @parameter
    fn test_fn():
        twostream[nelts](tau, w0, gt, u0, Rsfc, amean, w)
        
    let us: F64 = F64(Benchmark().run[test_fn]())/1.0e3
    
    return us
let us_scalar = benchmark_twostream[1](tau, w0, gt, u0, Rsfc, amean, w)
print('Mojo scalar twostream runtime =', us_scalar,'us')
Mojo scalar twostream runtime = 12.026999999999999 us
alias nelts = dtype_simd_width[DType.f64]()
let us_vectorized = benchmark_twostream[nelts](tau, w0, gt, u0, Rsfc, amean, w)
print('Mojo vectorized twostream runtime =', us_vectorized,'us')
Mojo vectorized twostream runtime = 8.4299999999999997 us
print('scalar runtime / vectorized runtime =',(us_scalar/us_vectorized))
scalar runtime / vectorized runtime = 1.4266903914590747

Numba

%%python

import numpy as np
import numba as nb
import timeit

@nb.njit
def two_stream(tau, w0, gt, u0, Rsfc):
    nz = len(tau)
    
    gam1 = np.empty((nz),np.float64)
    gam2 = np.empty((nz),np.float64)
    gam3 = np.empty((nz),np.float64)
    gam4 = np.empty((nz),np.float64)
    lamda = np.empty((nz),np.float64)
    cap_gam = np.empty((nz),np.float64)
    e1 = np.empty((nz),np.float64)
    e2 = np.empty((nz),np.float64)
    e3 = np.empty((nz),np.float64)
    e4 = np.empty((nz),np.float64)
    tauc = np.empty((nz+1),np.float64)
    direct = np.empty((nz+1),np.float64)
    cp0 = np.empty((nz),np.float64)
    cpb = np.empty((nz),np.float64)
    cm0 = np.empty((nz),np.float64)
    cmb = np.empty((nz),np.float64)
    A = np.empty((nz*2),np.float64)
    B = np.empty((nz*2),np.float64)
    D = np.empty((nz*2),np.float64)
    E = np.empty((nz*2),np.float64)
    y1 = np.empty((nz),np.float64)
    y2 = np.empty((nz),np.float64)
    
    gam1 = np.sqrt(3.0)*(2.0-w0*(1+gt))/2.0
    gam2 = np.sqrt(3.0)*w0*(1.0-gt)/2.0
    gam3 = (1.0-np.sqrt(3.0)*gt*u0)/2.0
    gam4 = 1.0 - gam3
    u1 = 1.0/np.sqrt(3.0)

    lamda = (gam1**2.0 - gam2**2.0)**(0.50)
    
    cap_gam = gam2 / (gam1 + lamda)
    
    e1 = 1.0 + cap_gam*np.exp(-lamda*tau)
    e2 = 1.0 - cap_gam*np.exp(-lamda*tau)
    e3 = cap_gam + np.exp(-lamda*tau)
    e4 = cap_gam - np.exp(-lamda*tau)
    
    tauc[0] = 0.0
    for i in range(1,nz+1):
        tauc[i] = tauc[i-1] + tau[i-1]
        
    Fs_pi = 1.0
    direct[0] = u0*Fs_pi
    
    for i in range(nz):
        facp = w0[i]*Fs_pi*((gam1[i]-1.0/u0)*gam3[i]+gam4[i]*gam2[i])
        facm = w0[i]*Fs_pi*((gam1[i]+1.0/u0)*gam4[i]+gam2[i]*gam3[i])
        et0 = np.exp(-tauc[i]/u0)
        etb = et0*np.exp(-tau[i]/u0)
        denom = lamda[i]**2.0 - 1.0/u0**2.0

        direct[i+1] = u0*Fs_pi*etb
        cp0[i] = et0*facp/denom
        cpb[i] = etb*facp/denom
        cm0[i] = et0*facm/denom
        cmb[i] = etb*facm/denom
        
    Ssfc = Rsfc*direct[nz]
        
    A[0] = 0.0
    B[0] = e1[0]
    D[0] = -e2[0]
    E[0] = 0.0 - cm0[0]
    
    # Odd coeficients (Equation 41)
    for i in range(nz-1):
        l = 2*i + 2
        A[l] = e2[i]*e3[i] - e4[i]*e1[i]
        B[l] = e1[i]*e1[i+1] - e3[i]*e3[i+1]
        D[l] = e3[i]*e4[i+1] - e1[i]*e2[i+1]
        E[l] = e3[i]*(cp0[i+1] - cpb[i]) + e1[i]*(cmb[i] - cm0[i+1])
    
    # Even coefficients (Equation 42)
    for i in range(nz-1):
        l = 2*i + 1
        A[l] = e2[i+1]*e1[i] - e3[i]*e4[i+1]
        B[l] = e2[i]*e2[i+1] - e4[i]*e4[i+1]
        D[l] = e1[i+1]*e4[i+1] - e2[i+1]*e3[i+1]
        E[l] = e2[i+1]*(cp0[i+1] - cpb[i]) - e4[i+1]*(cm0[i+1] - cmb[i])
    
    l = 2*nz - 1
    A[l] = e1[nz-1] - Rsfc*e3[nz-1]
    B[l] = e2[nz-1] - Rsfc*e4[nz-1]
    D[l] = 0.0
    E[l] = Ssfc - cpb[nz-1] + Rsfc*cmb[nz-1]
    
    solve_tridiag(A, B, D, E)
    
    for i in range(nz):
        l = 2*i + 1
        y1[i] = E[l-1]
        y2[i] = E[l]
    
    amean = np.empty((nz+1),np.float64)
    amean[0] = (1.0/u1)*(y1[0]*e3[0]-y2[0]*e4[0]+cp0[0]) + direct[0]/u0;
    for i in range(nz):
        # J_n*4*pi = amean (Equation 49)
        amean[i+1] = (1.0/u1)*(y1[i]*(e1[i]+e3[i])+y2[i]*(e2[i]+e4[i]) + cpb[i]+cmb[i]) + direct[i+1]/u0

    return amean
    
@nb.njit
def solve_tridiag(a, b, c, d):
    n = len(a)
    n = n - 1
    
    c[0] /= b[0]
    d[0] /= b[0]

    for i in range(1,n):
        c[i] /= b[i] - a[i]*c[i-1]
        d[i] = (d[i] - a[i]*d[i-1]) / (b[i] - a[i]*c[i-1])

    d[n] = (d[n] - a[n]*d[n-1]) / (b[n] - a[n]*c[n-1])

    for i in range(n-1, -1, -1):
        d[i] -= c[i]*d[i+1]

@nb.njit
def make_inputs():
    u0 = 0.6427876096865394
    Rsfc = 0.25

    nz = 200
    tau = np.empty(nz)
    w0 = np.empty(nz)
    gt = np.zeros(nz)

    tau[0] = 8.7767996682530329e-08; tau[1] = 9.5947097904378163e-08; tau[2] = 1.0489052756541996e-07; 
    tau[3] = 1.1466995675270959e-07; tau[4] = 1.2536380811254373e-07; tau[5] = 1.3705789618437358e-07; 
    tau[6] = 1.4984612914194580e-07; tau[7] = 1.6383127794765110e-07; tau[8] = 1.7912581953502761e-07; 
    tau[9] = 1.9585286130848492e-07; tau[10] = 2.1414715499671573e-07; tau[11] = 2.3415620872566158e-07; 
    tau[12] = 2.5604150709720415e-07; tau[13] = 2.7997985008321631e-07; tau[14] = 3.0616482268293842e-07; 
    tau[15] = 3.3480840855869273e-07; tau[16] = 3.6614276227859870e-07; tau[17] = 4.0042215637110061e-07; 
    tau[18] = 4.3792512115823633e-07; tau[19] = 4.7895679730553810e-07; tau[20] = 5.2385152323406823e-07; 
    tau[21] = 5.7297568201604804e-07; tau[22] = 6.2673083515698063e-07; tau[23] = 6.8555717379606160e-07; 
    tau[24] = 7.4993732137969107e-07; tau[25] = 8.2040052583949341e-07; tau[26] = 8.9752728379739709e-07; 
    tau[27] = 9.8195444440443219e-07; tau[28] = 1.0743808461797366e-06; tau[29] = 1.1755735467535040e-06; 
    tau[30] = 1.2863747128512830e-06; tau[31] = 1.4077092463227407e-06; tau[32] = 1.5405932316575096e-06; 
    tau[33] = 1.6861433014556164e-06; tau[34] = 1.8455870289203520e-06; tau[35] = 2.0202744708601724e-06; 
    tau[36] = 2.2116910012294060e-06; tau[37] = 2.4214715942473803e-06; tau[38] = 2.6514167379642028e-06; 
    tau[39] = 2.9035101842693509e-06; tau[40] = 3.1799387703266947e-06; tau[41] = 3.4831145798177533e-06; 
    tau[42] = 3.8156997509793911e-06; tau[43] = 4.1806342830157372e-06; tau[44] = 4.5811672439896014e-06; 
    tau[45] = 5.0208918431149617e-06; tau[46] = 5.5037848994039188e-06; tau[47] = 6.0342513187615696e-06; 
    tau[48] = 6.6171742844996624e-06; tau[49] = 7.2579719738055458e-06; tau[50] = 7.9626617375794997e-06; 
    tau[51] = 8.7379328255527379e-06; tau[52] = 9.5912289061901981e-06; tau[53] = 1.0530841825193256e-05; 
    tau[54] = 1.1566018270795857e-05; tau[55] = 1.2707081274015154e-05; tau[56] = 1.3965568772106545e-05; 
    tau[57] = 1.5354391809370636e-05; tau[58] = 1.6888015348134820e-05; tau[59] = 1.8582665120003893e-05; 
    tau[60] = 2.0456564472667114e-05; tau[61] = 2.2530205765175080e-05; tau[62] = 2.4826661548100218e-05; 
    tau[63] = 2.7371941534785026e-05; tau[64] = 3.0195402243950013e-05; tau[65] = 3.3330217163257071e-05; 
    tau[66] = 3.6813916372382030e-05; tau[67] = 4.0689005750403127e-05; tau[68] = 4.5003677189107140e-05; 
    tau[69] = 4.9812622624297257e-05; tau[70] = 5.5177966142078792e-05; tau[71] = 6.1170329907152607e-05; 
    tau[72] = 6.7870051104862677e-05; tau[73] = 7.5368568415506559e-05; tau[74] = 8.3769997605174037e-05; 
    tau[75] = 9.3192916477134497e-05; tau[76] = 1.0377237936370605e-04; tau[77] = 1.1566218028227589e-04; 
    tau[78] = 1.2903738130325400e-04; tau[79] = 1.4409711804014798e-04; tau[80] = 1.6106768650787847e-04; 
    tau[81] = 1.8020590401221682e-04; tau[82] = 2.0180271951203158e-04; tau[83] = 2.2618702435261593e-04; 
    tau[84] = 2.5372957971436598e-04; tau[85] = 2.8499085847225129e-04; tau[86] = 3.2072793130441182e-04; 
    tau[87] = 3.6172038216249729e-04; tau[88] = 4.0883409334140487e-04; tau[89] = 4.6308361487628643e-04; 
    tau[90] = 5.2565578019456703e-04; tau[91] = 5.9793596197029830e-04; tau[92] = 6.8153685512867973e-04; 
    tau[93] = 7.7832951627810915e-04; tau[94] = 8.9047619550709984e-04; tau[95] = 1.0204642828402439e-03; 
    tau[96] = 1.1711404714380698e-03; tau[97] = 1.3457440390547924e-03; tau[98] = 1.5479380130098432e-03; 
    tau[99] = 1.7818369502875632e-03; tau[100] = 2.0520301990285964e-03; tau[101] = 2.3635998383980863e-03; 
    tau[102] = 2.7221330470807528e-03; tau[103] = 3.1337294374715365e-03; tau[104] = 3.6050225687279105e-03; 
    tau[105] = 4.1432174515158250e-03; tau[106] = 4.7560587646881024e-03; tau[107] = 5.4518194898090918e-03; 
    tau[108] = 6.2393740872849577e-03; tau[109] = 7.1282605624539697e-03; tau[110] = 8.1287730355002370e-03; 
    tau[111] = 9.2520919156331774e-03; tau[112] = 1.0510459289421320e-02; tau[113] = 1.1917408459878433e-02; 
    tau[114] = 1.3488058899910772e-02; tau[115] = 1.5239490954294801e-02; tau[116] = 1.7191215895174722e-02; 
    tau[117] = 1.9365747959630791e-02; tau[118] = 2.1789244185028790e-02; tau[119] = 2.4492051273927565e-02; 
    tau[120] = 2.7508662976724094e-02; tau[121] = 3.0875812746695153e-02; tau[122] = 3.4625933661229222e-02; 
    tau[123] = 3.8771507194934468e-02; tau[124] = 4.3278351607027789e-02; tau[125] = 4.8046866543172910e-02; 
    tau[126] = 5.2968730144676401e-02; tau[127] = 5.8080080318497186e-02; tau[128] = 6.3512117073305424e-02; 
    tau[129] = 6.9334818787612357e-02; tau[130] = 7.5580500565664441e-02; tau[131] = 8.2268694684091204e-02; 
    tau[132] = 8.9410407413410012e-02; tau[133] = 9.7014366545305167e-02; tau[134] = 1.0508676551553092e-01; 
    tau[135] = 1.1363074698597439e-01; tau[136] = 1.2264676085190102e-01; tau[137] = 1.3213313885917077e-01; 
    tau[138] = 1.4208431117677991e-01; tau[139] = 1.5248507419772675e-01; tau[140] = 1.6331818759153677e-01; 
    tau[141] = 1.7457363049899460e-01; tau[142] = 1.8624471255477631e-01; tau[143] = 1.9833035537844179e-01; 
    tau[144] = 2.1083726583856535e-01; tau[145] = 2.2378158933184314e-01; tau[146] = 2.3718960522404994e-01; 
    tau[147] = 2.5109715445037328e-01; tau[148] = 2.6554778373896060e-01; tau[149] = 2.8058996227549782e-01; 
    tau[150] = 2.9627401857870805e-01; tau[151] = 3.1264948596979114e-01; tau[152] = 3.2976328683030964e-01; 
    tau[153] = 3.4765878733307898e-01; tau[154] = 3.6637545533146748e-01; tau[155] = 3.8594878862798920e-01; 
    tau[156] = 4.0641030224751162e-01; tau[157] = 4.2778750516236580e-01; tau[158] = 4.5010382615701916e-01; 
    tau[159] = 4.7337839740328619e-01; tau[160] = 4.9762555188214369e-01; tau[161] = 5.2276004558359734e-01; 
    tau[162] = 5.4862232621183815e-01; tau[163] = 5.7510375299943239e-01; tau[164] = 6.0224859454892643e-01; 
    tau[165] = 6.3079868128618721e-01; tau[166] = 6.6150503383107084e-01; tau[167] = 6.9479262019749455e-01; 
    tau[168] = 7.3135971057268112e-01; tau[169] = 7.7207900416531505e-01; tau[170] = 8.1777260097037741e-01; 
    tau[171] = 8.7003464761828964e-01; tau[172] = 9.3024357036853700e-01; tau[173] = 9.9877360370577395e-01; 
    tau[174] = 1.0762228460543470e+00; tau[175] = 1.1562951730417654e+00; tau[176] = 1.2377641657793577e+00; 
    tau[177] = 1.3216565090158796e+00; tau[178] = 2.6574734564569380e+00; tau[179] = 4.2105772058622346e+00; 
    tau[180] = 7.0339868009456881e+00; tau[181] = 1.1521276456355757e+01; tau[182] = 1.8285964859512521e+01; 
    tau[183] = 2.8784368028594791e+01; tau[184] = 4.4808994648392989e+01; tau[185] = 6.8880927399198839e+01; 
    tau[186] = 1.0449251595375671e+02; tau[187] = 1.5641266261187351e+02; tau[188] = 2.3106587764604379e+02; 
    tau[189] = 3.3699697083889566e+02; tau[190] = 4.8543396220701010e+02; tau[191] = 6.9096229688139806e+02; 
    tau[192] = 9.7232374589713959e+02; tau[193] = 1.3811178079094689e+03; tau[194] = 1.9771447087450604e+03; 
    tau[195] = 2.7426755572022894e+03; tau[196] = 3.6957662267266865e+03; tau[197] = 4.9351475964671836e+03; 
    tau[198] = 6.5332896118304561e+03; tau[199] = 8.5775143354619722e+03; 

    w0[0] = 9.9711179180048182e-01; w0[1] = 9.9710577604125894e-01; w0[2] = 9.9709335053701875e-01; 
    w0[3] = 9.9707409718189177e-01; w0[4] = 9.9704756934937544e-01; w0[5] = 9.9701328938498046e-01; 
    w0[6] = 9.9697074581156886e-01; w0[7] = 9.9691939021948628e-01; w0[8] = 9.9685863381171902e-01; 
    w0[9] = 9.9678784357544925e-01; w0[10] = 9.9670633805145825e-01; w0[11] = 9.9661338267233490e-01; 
    w0[12] = 9.9650818464346891e-01; w0[13] = 9.9638988734055600e-01; w0[14] = 9.9625756419873834e-01; 
    w0[15] = 9.9611021207159700e-01; w0[16] = 9.9594674403681771e-01; w0[17] = 9.9576598162879704e-01; 
    w0[18] = 9.9556664647758897e-01; w0[19] = 9.9534735133343222e-01; w0[20] = 9.9510659045563765e-01; 
    w0[21] = 9.9484272934368789e-01; w0[22] = 9.9455399378589959e-01; w0[23] = 9.9423845819787393e-01; 
    w0[24] = 9.9389403322209602e-01; w0[25] = 9.9351845255550364e-01; w0[26] = 9.9310925897056246e-01; 
    w0[27] = 9.9266378949192857e-01; w0[28] = 9.9217915969101766e-01; w0[29] = 9.9165224706199229e-01; 
    w0[30] = 9.9107967344303005e-01; w0[31] = 9.9045778644803517e-01; w0[32] = 9.8978263988756232e-01; 
    w0[33] = 9.8904997315915044e-01; w0[34] = 9.8825518959925840e-01; w0[35] = 9.8739333380968586e-01; 
    w0[36] = 9.8645906798846772e-01; w0[37] = 9.8544664731122855e-01; w0[38] = 9.8434989444384591e-01; 
    w0[39] = 9.8316217330321953e-01; w0[40] = 9.8187636221329821e-01; w0[41] = 9.8048482665722558e-01; 
    w0[42] = 9.7897939187322636e-01; w0[43] = 9.7735131560593647e-01; w0[44] = 9.7559126140833230e-01; 
    w0[45] = 9.7368927293970564e-01; w0[46] = 9.7163474982622933e-01; w0[47] = 9.6941642574050779e-01; 
    w0[48] = 9.6702234946428145e-01; w0[49] = 9.6443986983423624e-01; w0[50] = 9.6165562558909190e-01; 
    w0[51] = 9.5865554130176256e-01; w0[52] = 9.5542483072735251e-01; w0[53] = 9.5194800903411125e-01; 
    w0[54] = 9.4820891557954556e-01; w0[55] = 9.4419074902690803e-01; w0[56] = 9.3987611674461291e-01; 
    w0[57] = 9.3524710057171967e-01; w0[58] = 9.3028534110971439e-01; w0[59] = 9.2497214278425166e-01; 
    w0[60] = 9.1928860187296246e-01; w0[61] = 9.1321575969783042e-01; w0[62] = 9.0673478291743770e-01; 
    w0[63] = 8.9982717269670254e-01; w0[64] = 8.9247500397279889e-01; w0[65] = 8.8466119568641544e-01; 
    w0[66] = 8.7636981190996832e-01; w0[67] = 8.6758639310529395e-01; w0[68] = 8.5829831560583947e-01; 
    w0[69] = 8.4849517611710745e-01; w0[70] = 8.3816919684997526e-01; w0[71] = 8.2731564522719481e-01; 
    w0[72] = 8.1593326067846506e-01; w0[73] = 8.0402467952201917e-01; w0[74] = 7.9159684763866422e-01; 
    w0[75] = 7.7866140928303185e-01; w0[76] = 7.6523505988054275e-01; w0[77] = 7.5133985023733718e-01; 
    w0[78] = 7.3700343012197278e-01; w0[79] = 7.2225922012228905e-01; w0[80] = 7.0714650311348992e-01; 
    w0[81] = 6.9171042928311888e-01; w0[82] = 6.7600193312752554e-01; w0[83] = 6.6007756565534514e-01; 
    w0[84] = 6.4399925112087075e-01; w0[85] = 6.2751690913313019e-01; w0[86] = 6.1027499828015230e-01; 
    w0[87] = 5.9224510437414302e-01; w0[88] = 5.7351602373384358e-01; w0[89] = 5.5418855325877014e-01; 
    w0[90] = 5.3437475255737310e-01; w0[91] = 5.1419661917235160e-01; w0[92] = 4.9378417655716828e-01; 
    w0[93] = 4.7327302015086270e-01; w0[94] = 4.5280141715326816e-01; w0[95] = 4.3250709824656874e-01; 
    w0[96] = 4.1252391349300427e-01; w0[97] = 3.9297854411973260e-01; w0[98] = 3.7398746095910612e-01; 
    w0[99] = 3.5565430274683579e-01; w0[100] = 3.3806781061485630e-01; w0[101] = 3.2130040985921160e-01; 
    w0[102] = 3.0540747904392268e-01; w0[103] = 2.9042729373237675e-01; w0[104] = 2.7638022880356511e-01; 
    w0[105] = 2.6326880017521570e-01; w0[106] = 2.5108351465026180e-01; w0[107] = 2.3980507108518678e-01; 
    w0[108] = 2.2940313887670982e-01; w0[109] = 2.1983837091461964e-01; w0[110] = 2.1106434856035647e-01; 
    w0[111] = 2.0302938150311162e-01; w0[112] = 1.9567810006274122e-01; w0[113] = 1.8895279358968187e-01; 
    w0[114] = 1.8279446934896854e-01; w0[115] = 1.7714363349486195e-01; w0[116] = 1.7194084623673653e-01; 
    w0[117] = 1.6712721159390778e-01; w0[118] = 1.6264518453364277e-01; w0[119] = 1.5844053282765996e-01; 
    w0[120] = 1.5446717275166658e-01; w0[121] = 1.5069805821385496e-01; w0[122] = 1.4714688950237698e-01; 
    w0[123] = 1.4390390146402926e-01; w0[124] = 1.4117355924585473e-01; w0[125] = 1.3925281217141075e-01; 
    w0[126] = 1.3832497295389018e-01; w0[127] = 1.3814978008396606e-01; w0[128] = 1.3835160646341094e-01; 
    w0[129] = 1.3879024386623634e-01; w0[130] = 1.3943642059052910e-01; w0[131] = 1.4029209031588918e-01; 
    w0[132] = 1.4137338559055690e-01; w0[133] = 1.4269664294338003e-01; w0[134] = 1.4427862559127622e-01; 
    w0[135] = 1.4613710893515686e-01; w0[136] = 1.4829031288790176e-01; w0[137] = 1.5075639573344454e-01; 
    w0[138] = 1.5355577328662448e-01; w0[139] = 1.5671691155491141e-01; w0[140] = 1.6026759551587053e-01; 
    w0[141] = 1.6422711196183729e-01; w0[142] = 1.6861217530896319e-01; w0[143] = 1.7343599664804954e-01; 
    w0[144] = 1.7870749622029697e-01; w0[145] = 1.8443089822995290e-01; w0[146] = 1.9060594959733176e-01; 
    w0[147] = 1.9722890743114718e-01; w0[148] = 2.0429424128179388e-01; w0[149] = 2.1179676211688642e-01; 
    w0[150] = 2.1973372569350255e-01; w0[151] = 2.2810647413984569e-01; w0[152] = 2.3692139199104423e-01; 
    w0[153] = 2.4619023520258562e-01; w0[154] = 2.5593007712435828e-01; w0[155] = 2.6616312793182034e-01; 
    w0[156] = 2.7691657556564053e-01; w0[157] = 2.8822248978446646e-01; w0[158] = 3.0011781136252774e-01; 
    w0[159] = 3.1264448345127460e-01; w0[160] = 3.2584981513994765e-01; w0[161] = 3.3984816250379990e-01; 
    w0[162] = 3.5480294275155777e-01; w0[163] = 3.7084641223360615e-01; w0[164] = 3.8801659473095779e-01; 
    w0[165] = 4.0590776426938302e-01; w0[166] = 4.2411463421746515e-01; w0[167] = 4.4245155797750657e-01; 
    w0[168] = 4.6057530551628978e-01; w0[169] = 4.7806498804785041e-01; w0[170] = 4.9458321307491998e-01; 
    w0[171] = 5.0940696342852609e-01; w0[172] = 5.2208413946637822e-01; w0[173] = 5.3285877380442748e-01; 
    w0[174] = 5.4190756796308170e-01; w0[175] = 5.4874444561683677e-01; w0[176] = 5.5343636086921844e-01; 
    w0[177] = 5.5665571130037195e-01; w0[178] = 2.9574890756478689e-01; w0[179] = 1.9886942554336376e-01; 
    w0[180] = 1.2649443133824198e-01; w0[181] = 8.2055381707757405e-02; w0[182] = 5.4932432103361739e-02; 
    w0[183] = 3.7036615277279070e-02; w0[184] = 2.5222279402791422e-02; w0[185] = 1.7375975057806702e-02; 
    w0[186] = 1.2117538023085468e-02; w0[187] = 8.5555194333282857e-03; w0[188] = 6.1147950360911198e-03; 
    w0[189] = 4.4226660277904978e-03; w0[190] = 3.2357616929942499e-03; w0[191] = 2.3936627467879821e-03; 
    w0[192] = 1.7895336731204119e-03; w0[193] = 1.3229150832381939e-03; w0[194] = 9.6852078283902500e-04; 
    w0[195] = 7.3181934878072249e-04; w0[196] = 5.6933457177442079e-04; w0[197] = 4.4659287344306218e-04; 
    w0[198] = 3.5307409636260321e-04; w0[199] = 2.8123294322360731e-04; 

    return tau, w0, gt, u0, Rsfc

# make inputs
tau, w0, gt, u0, Rsfc = make_inputs()
@nb.njit
def benchmark_twostream():
    amean = two_stream(tau, w0, gt, u0, Rsfc)

# compile
benchmark_twostream()

# time
iters = 1000
time_us = timeit.Timer(benchmark_twostream).timeit(number=iters)/iters*1e6
print('Numba twostream runtime = %.1f us'%time_us)
Numba twostream runtime = 19.2 us
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment