Lots of scientific codes need to estimate how light passes through an atmosphere. For example, to quantify the greenhouse effect, climate models need to know how an atmosphere absorbs and scatters sunlight and how efficiently infrared radiation emitted by the atmosphere escapes to space. Also, researchers need models of radiative transfer to interpret telescope observations of atmospheres on planets orbiting distant stars. By simulating what the telescope should see under different atmospheric compositions, a researcher can ultimately back out the composition that best agrees with the telescope observations.
A problem is that many scientific codes are bottlenecked by doing radiative transfer. Lots of computation is required because gases (e.g., H2O) absorb in very complicated ways as a function of the wavelength of light, so many radiative transfer calculations are required to revolve this structure.
I was excited to see initial release of the Mojo programming language, because it might allow for faster radiative transfer calculations for science applications. In particular, Mojo seems to make it easy to write SIMD instructions, which could potentially speed up radiative transfer by a lot.
To test if Mojo might speed up scientific codes, I coded up a popular "two stream" algorithm for approximately solving the radiative transfer equation. There are many different "two stream" methods. Below, I code up the Quadrature two stream method described in Toon et al. (1989).
Here is a summary of the results comparing the Mojo implementations with and without SIMD ("vectorized" means with SIMD) to a Numba implementation.
time (microseconds) | |
---|---|
Mojo (scalar) | 12.0 |
Mojo (vectorized) | 8.4 |
Numba | 19.2 |
The vectorized (i.e., SIMD) Mojo calculation is a factor of 1.4 faster than scalar Mojo code and a factor of 2.3 faster than Numba. A big caveat is that I have very little experience with SIMD. Maybe there are better ways to improve my SIMD implementation.
from Benchmark import Benchmark
from DType import DType
from Pointer import DTypePointer
from Random import rand
from Memory import memset_zero
from Math import sqrt, exp, pow
from Functional import vectorize
from TargetInfo import dtype_simd_width
struct Vector:
var data: DTypePointer[DType.f64]
var size: Int
fn __init__(inout self, size: Int):
self.data = DTypePointer[DType.f64].alloc(size)
self.size = size
fn __del__(owned self):
self.data.free()
fn random(inout self):
rand(self.data, self.size)
fn zero(inout self):
memset_zero(self.data, self.size)
@always_inline
fn __getitem__(self, i: Int) -> F64:
return self.load[1](i)
@always_inline
fn load[nelts:Int](self, i: Int) -> SIMD[DType.f64, nelts]:
return self.data.simd_load[nelts](i)
@always_inline
fn __setitem__(self, i: Int, val: F64):
return self.store[1](i, val)
@always_inline
fn store[nelts:Int](self, i: Int, val: SIMD[DType.f64, nelts]):
self.data.simd_store[nelts](i, val)
struct WorkSpace:
var tauc: Vector
var e1: Vector
var e2: Vector
var e3: Vector
var e4: Vector
var direct: Vector
var cp0: Vector
var cpb: Vector
var cm0: Vector
var cmb: Vector
var A: Vector; var B: Vector; var D: Vector; var E: Vector;
var A_even: Vector; var B_even: Vector; var D_even: Vector; var E_even: Vector;
var A_odd: Vector; var B_odd: Vector; var D_odd: Vector; var E_odd: Vector;
var y1: Vector
var y2: Vector
def __init__(inout self, nz: Int):
self.tauc = Vector(nz+1)
self.e1 = Vector(nz)
self.e2 = Vector(nz)
self.e3 = Vector(nz)
self.e4 = Vector(nz)
self.direct = Vector(nz+1)
self.cp0 = Vector(nz)
self.cpb = Vector(nz)
self.cm0 = Vector(nz)
self.cmb = Vector(nz)
self.A = Vector(2*nz)
self.B = Vector(2*nz)
self.D = Vector(2*nz)
self.E = Vector(2*nz)
self.A_even = Vector(nz-1)
self.B_even = Vector(nz-1)
self.D_even = Vector(nz-1)
self.E_even = Vector(nz-1)
self.A_odd = Vector(nz-1)
self.B_odd = Vector(nz-1)
self.D_odd = Vector(nz-1)
self.E_odd = Vector(nz-1)
self.y1 = Vector(nz)
self.y2 = Vector(nz)
fn twostream[nelts: Int](tau: Vector, w0: Vector, gt: Vector, u0: F64, Rsfc: F64, inout amean: Vector, inout w: WorkSpace):
alias Fs_pi = 1.0
let u1 = 1.0/sqrt[1,DType.f64](3.0)
let nz = tau.size
# cumulative optical depth at the top of each layer
w.tauc[0] = 0.0
for i in range(1,nz+1):
w.tauc[i] = w.tauc[i-1] + tau[i-1]
@parameter
fn set_e_and_c[nelts : Int](i : Int):
# Quadrature Two-Stream coefficients (Table 1)
let gam1 = sqrt[1,DType.f64](3.0)*(2.0 - w0.load[nelts](i)*(1.0 - gt.load[nelts](i)))/2.0
let gam2 = sqrt[1,DType.f64](3.0)*w0.load[nelts](i)*(1.0-gt.load[nelts](i))/2.0
let gam3 = (1.0 - sqrt[1,DType.f64](3.0)*gt.load[nelts](i)*u0)/2.0
let gam4 = 1.0 - gam3
# u1 = 1.0_dp/sqrt(3.0_dp) (parameter)
# lambda, and capital Gamma (Equations 21, 22)
# let gam12_minus_gam22 = gam1**2 - gam2**2
let gam12_minus_gam22 = pow[nelts,DType.f64,DType.f64](gam1, 2.0) - pow[nelts,DType.f64,DType.f64](gam2, 2.0)
let lambd = sqrt[nelts,DType.f64](gam12_minus_gam22)
let cap_gam = gam2 / (gam1 + lambd)
# e's (Equation 44)
let wrk_real = exp[nelts,DType.f64](-lambd*tau.load[nelts](i))
w.e1.store[nelts](i, 1.0 + cap_gam*wrk_real)
w.e2.store[nelts](i, 1.0 - cap_gam*wrk_real)
w.e3.store[nelts](i, cap_gam + wrk_real)
w.e4.store[nelts](i, cap_gam - wrk_real)
# C+ and C- (Equation 23, 24)
# Fs_pi = 1.0_dp (parameter)
# We take the solar flux at the top of the atmosphere to be = 1
# Note: Toon et al. 1989 defines Fs * pi = solar flux. So Fs = solar flux / pi
let facp = w0.load[nelts](i)*Fs_pi*((gam1-1.0/u0)*gam3 + gam4*gam2)
let facm = w0.load[nelts](i)*Fs_pi*((gam1+1.0/u0)*gam4 + gam2*gam3)
let et0 = exp[nelts,DType.f64](-w.tauc.load[nelts](i)/u0)
let etb = et0*exp(-tau.load[nelts](i)/u0)
let denom = gam12_minus_gam22 - 1.0/(pow[nelts,DType.f64,DType.f64](u0,2.0))
w.direct.store[nelts](i+1, u0*Fs_pi*etb)
w.cp0.store[nelts](i, et0*facp/denom)
w.cpb.store[nelts](i, etb*facp/denom)
w.cm0.store[nelts](i, et0*facm/denom)
w.cmb.store[nelts](i, etb*facm/denom)
vectorize[nelts, set_e_and_c](nz)
w.direct[0] = u0*Fs_pi
let Ssfc = Rsfc*w.direct[nz]
# Coefficients of tridiagonal linear system (Equations 39 - 43)
w.A[0] = 0.0
w.B[0] = w.e1[0]
w.D[0] = -w.e2[0]
w.E[0] = 0.0 - w.cm0[0] # assumes no downward diffuse flux at top-of-atmosphere
# Odd coefficients (Equation 41)
@parameter
fn set_ABDE_odd[nelts : Int](i : Int):
w.A_odd.store[nelts](i, w.e2.load[nelts](i)*w.e3.load[nelts](i) - w.e4.load[nelts](i)*w.e1.load[nelts](i))
w.B_odd.store[nelts](i, w.e1.load[nelts](i)*w.e1.load[nelts](i+1) - w.e3.load[nelts](i)*w.e3.load[nelts](i+1))
w.D_odd.store[nelts](i, w.e3.load[nelts](i)*w.e4.load[nelts](i+1) - w.e1.load[nelts](i)*w.e2.load[nelts](i+1))
w.E_odd.store[nelts](i, w.e3.load[nelts](i)*(w.cp0.load[nelts](i+1) - w.cpb.load[nelts](i)) + w.e1.load[nelts](i)*(w.cmb.load[nelts](i) - w.cm0.load[nelts](i+1)))
vectorize[nelts, set_ABDE_odd](nz-1)
# Even coefficients (Equation 42)
@parameter
fn set_ABDE_even[nelts : Int](i : Int):
w.A_even.store[nelts](i, w.e2.load[nelts](i+1)*w.e1.load[nelts](i) - w.e3.load[nelts](i)*w.e4.load[nelts](i+1))
w.B_even.store[nelts](i, w.e2.load[nelts](i)*w.e2.load[nelts](i+1) - w.e4.load[nelts](i)*w.e4.load[nelts](i+1))
w.D_even.store[nelts](i, w.e1.load[nelts](i+1)*w.e4.load[nelts](i+1) - w.e2.load[nelts](i+1)*w.e3.load[nelts](i+1))
w.E_even.store[nelts](i, w.e2.load[nelts](i+1)*(w.cp0.load[nelts](i+1) - w.cpb.load[nelts](i)) - w.e4.load[nelts](i+1)*(w.cm0.load[nelts](i+1) - w.cmb.load[nelts](i)))
vectorize[nelts, set_ABDE_even](nz-1)
var l: Int
for i in range(nz-1):
l = 2*i + 2
w.A[l] = w.A_odd[i]
w.B[l] = w.B_odd[i]
w.D[l] = w.D_odd[i]
w.E[l] = w.E_odd[i]
l = 2*i + 1
w.A[l] = w.A_even[i]
w.B[l] = w.B_even[i]
w.D[l] = w.D_even[i]
w.E[l] = w.E_even[i]
l = 2*nz - 1
w.A[l] = w.e1[nz-1] - Rsfc*w.e3[nz-1];
w.B[l] = w.e2[nz-1] - Rsfc*w.e4[nz-1];
w.D[l] = 0.0;
w.E[l] = Ssfc - w.cpb[nz-1] + Rsfc*w.cmb[nz-1];
solve_tridiag(w.A, w.B, w.D, w.E)
for i in range(nz):
l = 2*i + 1
w.y1[i] = w.E[l-1]
w.y2[i] = w.E[l]
# amean = integral(J_n d_Omega) = J_n*4*pi
# Above is an integration of J_n over a complete sphere,
# and has units of flux density, or irradiance.
# amean(i) is for at top of layer i. amean(nz + 1) is the ground.
# very top edge of atmosphere (Not in paper. Derive from Equation 17 and 31. bit confusing)
amean[0] = (1.0/u1)*(w.y1[0]*w.e3[0]-w.y2[0]*w.e4[0]+w.cp0[0]) + w.direct[0]/u0;
# J_n*4*pi = amean (Equation 49)
@parameter
fn set_amean[nelts : Int](i : Int):
amean.store[nelts](i+1, (1.0/u1)*(w.y1.load[nelts](i)*(w.e1.load[nelts](i)+w.e3.load[nelts](i))+w.y2.load[nelts](i)*(w.e2.load[nelts](i)+w.e4.load[nelts](i))
+ w.cpb.load[nelts](i)+w.cmb.load[nelts](i)) + w.direct.load[nelts](i+1)/u0)
vectorize[nelts, set_amean](nz)
@always_inline
fn solve_tridiag(a: Vector, b: Vector, inout c: Vector, inout d: Vector):
let n = a.size - 1
c[0] /= b[0];
d[0] /= b[0];
for i in range(1,n):
c[i] /= b[i] - a[i]*c[i-1];
d[i] = (d[i] - a[i]*d[i-1]) / (b[i] - a[i]*c[i-1])
d[n] = (d[n] - a[n]*d[n-1]) / (b[n] - a[n]*c[n-1])
for j in range(n):
let i = n-j-1
d[i] -= c[i]*d[i+1]
alias nz = 200
var tau = Vector(nz)
var w0 = Vector(nz)
var gt = Vector(nz)
gt.zero()
let u0: F64 = 0.6427876096865394
let Rsfc: F64 = 0.25
var amean = Vector(nz+1)
var w = WorkSpace(nz)
tau[0] = 8.7767996682530329e-08; tau[1] = 9.5947097904378163e-08; tau[2] = 1.0489052756541996e-07;
tau[3] = 1.1466995675270959e-07; tau[4] = 1.2536380811254373e-07; tau[5] = 1.3705789618437358e-07;
tau[6] = 1.4984612914194580e-07; tau[7] = 1.6383127794765110e-07; tau[8] = 1.7912581953502761e-07;
tau[9] = 1.9585286130848492e-07; tau[10] = 2.1414715499671573e-07; tau[11] = 2.3415620872566158e-07;
tau[12] = 2.5604150709720415e-07; tau[13] = 2.7997985008321631e-07; tau[14] = 3.0616482268293842e-07;
tau[15] = 3.3480840855869273e-07; tau[16] = 3.6614276227859870e-07; tau[17] = 4.0042215637110061e-07;
tau[18] = 4.3792512115823633e-07; tau[19] = 4.7895679730553810e-07; tau[20] = 5.2385152323406823e-07;
tau[21] = 5.7297568201604804e-07; tau[22] = 6.2673083515698063e-07; tau[23] = 6.8555717379606160e-07;
tau[24] = 7.4993732137969107e-07; tau[25] = 8.2040052583949341e-07; tau[26] = 8.9752728379739709e-07;
tau[27] = 9.8195444440443219e-07; tau[28] = 1.0743808461797366e-06; tau[29] = 1.1755735467535040e-06;
tau[30] = 1.2863747128512830e-06; tau[31] = 1.4077092463227407e-06; tau[32] = 1.5405932316575096e-06;
tau[33] = 1.6861433014556164e-06; tau[34] = 1.8455870289203520e-06; tau[35] = 2.0202744708601724e-06;
tau[36] = 2.2116910012294060e-06; tau[37] = 2.4214715942473803e-06; tau[38] = 2.6514167379642028e-06;
tau[39] = 2.9035101842693509e-06; tau[40] = 3.1799387703266947e-06; tau[41] = 3.4831145798177533e-06;
tau[42] = 3.8156997509793911e-06; tau[43] = 4.1806342830157372e-06; tau[44] = 4.5811672439896014e-06;
tau[45] = 5.0208918431149617e-06; tau[46] = 5.5037848994039188e-06; tau[47] = 6.0342513187615696e-06;
tau[48] = 6.6171742844996624e-06; tau[49] = 7.2579719738055458e-06; tau[50] = 7.9626617375794997e-06;
tau[51] = 8.7379328255527379e-06; tau[52] = 9.5912289061901981e-06; tau[53] = 1.0530841825193256e-05;
tau[54] = 1.1566018270795857e-05; tau[55] = 1.2707081274015154e-05; tau[56] = 1.3965568772106545e-05;
tau[57] = 1.5354391809370636e-05; tau[58] = 1.6888015348134820e-05; tau[59] = 1.8582665120003893e-05;
tau[60] = 2.0456564472667114e-05; tau[61] = 2.2530205765175080e-05; tau[62] = 2.4826661548100218e-05;
tau[63] = 2.7371941534785026e-05; tau[64] = 3.0195402243950013e-05; tau[65] = 3.3330217163257071e-05;
tau[66] = 3.6813916372382030e-05; tau[67] = 4.0689005750403127e-05; tau[68] = 4.5003677189107140e-05;
tau[69] = 4.9812622624297257e-05; tau[70] = 5.5177966142078792e-05; tau[71] = 6.1170329907152607e-05;
tau[72] = 6.7870051104862677e-05; tau[73] = 7.5368568415506559e-05; tau[74] = 8.3769997605174037e-05;
tau[75] = 9.3192916477134497e-05; tau[76] = 1.0377237936370605e-04; tau[77] = 1.1566218028227589e-04;
tau[78] = 1.2903738130325400e-04; tau[79] = 1.4409711804014798e-04; tau[80] = 1.6106768650787847e-04;
tau[81] = 1.8020590401221682e-04; tau[82] = 2.0180271951203158e-04; tau[83] = 2.2618702435261593e-04;
tau[84] = 2.5372957971436598e-04; tau[85] = 2.8499085847225129e-04; tau[86] = 3.2072793130441182e-04;
tau[87] = 3.6172038216249729e-04; tau[88] = 4.0883409334140487e-04; tau[89] = 4.6308361487628643e-04;
tau[90] = 5.2565578019456703e-04; tau[91] = 5.9793596197029830e-04; tau[92] = 6.8153685512867973e-04;
tau[93] = 7.7832951627810915e-04; tau[94] = 8.9047619550709984e-04; tau[95] = 1.0204642828402439e-03;
tau[96] = 1.1711404714380698e-03; tau[97] = 1.3457440390547924e-03; tau[98] = 1.5479380130098432e-03;
tau[99] = 1.7818369502875632e-03; tau[100] = 2.0520301990285964e-03; tau[101] = 2.3635998383980863e-03;
tau[102] = 2.7221330470807528e-03; tau[103] = 3.1337294374715365e-03; tau[104] = 3.6050225687279105e-03;
tau[105] = 4.1432174515158250e-03; tau[106] = 4.7560587646881024e-03; tau[107] = 5.4518194898090918e-03;
tau[108] = 6.2393740872849577e-03; tau[109] = 7.1282605624539697e-03; tau[110] = 8.1287730355002370e-03;
tau[111] = 9.2520919156331774e-03; tau[112] = 1.0510459289421320e-02; tau[113] = 1.1917408459878433e-02;
tau[114] = 1.3488058899910772e-02; tau[115] = 1.5239490954294801e-02; tau[116] = 1.7191215895174722e-02;
tau[117] = 1.9365747959630791e-02; tau[118] = 2.1789244185028790e-02; tau[119] = 2.4492051273927565e-02;
tau[120] = 2.7508662976724094e-02; tau[121] = 3.0875812746695153e-02; tau[122] = 3.4625933661229222e-02;
tau[123] = 3.8771507194934468e-02; tau[124] = 4.3278351607027789e-02; tau[125] = 4.8046866543172910e-02;
tau[126] = 5.2968730144676401e-02; tau[127] = 5.8080080318497186e-02; tau[128] = 6.3512117073305424e-02;
tau[129] = 6.9334818787612357e-02; tau[130] = 7.5580500565664441e-02; tau[131] = 8.2268694684091204e-02;
tau[132] = 8.9410407413410012e-02; tau[133] = 9.7014366545305167e-02; tau[134] = 1.0508676551553092e-01;
tau[135] = 1.1363074698597439e-01; tau[136] = 1.2264676085190102e-01; tau[137] = 1.3213313885917077e-01;
tau[138] = 1.4208431117677991e-01; tau[139] = 1.5248507419772675e-01; tau[140] = 1.6331818759153677e-01;
tau[141] = 1.7457363049899460e-01; tau[142] = 1.8624471255477631e-01; tau[143] = 1.9833035537844179e-01;
tau[144] = 2.1083726583856535e-01; tau[145] = 2.2378158933184314e-01; tau[146] = 2.3718960522404994e-01;
tau[147] = 2.5109715445037328e-01; tau[148] = 2.6554778373896060e-01; tau[149] = 2.8058996227549782e-01;
tau[150] = 2.9627401857870805e-01; tau[151] = 3.1264948596979114e-01; tau[152] = 3.2976328683030964e-01;
tau[153] = 3.4765878733307898e-01; tau[154] = 3.6637545533146748e-01; tau[155] = 3.8594878862798920e-01;
tau[156] = 4.0641030224751162e-01; tau[157] = 4.2778750516236580e-01; tau[158] = 4.5010382615701916e-01;
tau[159] = 4.7337839740328619e-01; tau[160] = 4.9762555188214369e-01; tau[161] = 5.2276004558359734e-01;
tau[162] = 5.4862232621183815e-01; tau[163] = 5.7510375299943239e-01; tau[164] = 6.0224859454892643e-01;
tau[165] = 6.3079868128618721e-01; tau[166] = 6.6150503383107084e-01; tau[167] = 6.9479262019749455e-01;
tau[168] = 7.3135971057268112e-01; tau[169] = 7.7207900416531505e-01; tau[170] = 8.1777260097037741e-01;
tau[171] = 8.7003464761828964e-01; tau[172] = 9.3024357036853700e-01; tau[173] = 9.9877360370577395e-01;
tau[174] = 1.0762228460543470e+00; tau[175] = 1.1562951730417654e+00; tau[176] = 1.2377641657793577e+00;
tau[177] = 1.3216565090158796e+00; tau[178] = 2.6574734564569380e+00; tau[179] = 4.2105772058622346e+00;
tau[180] = 7.0339868009456881e+00; tau[181] = 1.1521276456355757e+01; tau[182] = 1.8285964859512521e+01;
tau[183] = 2.8784368028594791e+01; tau[184] = 4.4808994648392989e+01; tau[185] = 6.8880927399198839e+01;
tau[186] = 1.0449251595375671e+02; tau[187] = 1.5641266261187351e+02; tau[188] = 2.3106587764604379e+02;
tau[189] = 3.3699697083889566e+02; tau[190] = 4.8543396220701010e+02; tau[191] = 6.9096229688139806e+02;
tau[192] = 9.7232374589713959e+02; tau[193] = 1.3811178079094689e+03; tau[194] = 1.9771447087450604e+03;
tau[195] = 2.7426755572022894e+03; tau[196] = 3.6957662267266865e+03; tau[197] = 4.9351475964671836e+03;
tau[198] = 6.5332896118304561e+03; tau[199] = 8.5775143354619722e+03;
w0[0] = 9.9711179180048182e-01; w0[1] = 9.9710577604125894e-01; w0[2] = 9.9709335053701875e-01;
w0[3] = 9.9707409718189177e-01; w0[4] = 9.9704756934937544e-01; w0[5] = 9.9701328938498046e-01;
w0[6] = 9.9697074581156886e-01; w0[7] = 9.9691939021948628e-01; w0[8] = 9.9685863381171902e-01;
w0[9] = 9.9678784357544925e-01; w0[10] = 9.9670633805145825e-01; w0[11] = 9.9661338267233490e-01;
w0[12] = 9.9650818464346891e-01; w0[13] = 9.9638988734055600e-01; w0[14] = 9.9625756419873834e-01;
w0[15] = 9.9611021207159700e-01; w0[16] = 9.9594674403681771e-01; w0[17] = 9.9576598162879704e-01;
w0[18] = 9.9556664647758897e-01; w0[19] = 9.9534735133343222e-01; w0[20] = 9.9510659045563765e-01;
w0[21] = 9.9484272934368789e-01; w0[22] = 9.9455399378589959e-01; w0[23] = 9.9423845819787393e-01;
w0[24] = 9.9389403322209602e-01; w0[25] = 9.9351845255550364e-01; w0[26] = 9.9310925897056246e-01;
w0[27] = 9.9266378949192857e-01; w0[28] = 9.9217915969101766e-01; w0[29] = 9.9165224706199229e-01;
w0[30] = 9.9107967344303005e-01; w0[31] = 9.9045778644803517e-01; w0[32] = 9.8978263988756232e-01;
w0[33] = 9.8904997315915044e-01; w0[34] = 9.8825518959925840e-01; w0[35] = 9.8739333380968586e-01;
w0[36] = 9.8645906798846772e-01; w0[37] = 9.8544664731122855e-01; w0[38] = 9.8434989444384591e-01;
w0[39] = 9.8316217330321953e-01; w0[40] = 9.8187636221329821e-01; w0[41] = 9.8048482665722558e-01;
w0[42] = 9.7897939187322636e-01; w0[43] = 9.7735131560593647e-01; w0[44] = 9.7559126140833230e-01;
w0[45] = 9.7368927293970564e-01; w0[46] = 9.7163474982622933e-01; w0[47] = 9.6941642574050779e-01;
w0[48] = 9.6702234946428145e-01; w0[49] = 9.6443986983423624e-01; w0[50] = 9.6165562558909190e-01;
w0[51] = 9.5865554130176256e-01; w0[52] = 9.5542483072735251e-01; w0[53] = 9.5194800903411125e-01;
w0[54] = 9.4820891557954556e-01; w0[55] = 9.4419074902690803e-01; w0[56] = 9.3987611674461291e-01;
w0[57] = 9.3524710057171967e-01; w0[58] = 9.3028534110971439e-01; w0[59] = 9.2497214278425166e-01;
w0[60] = 9.1928860187296246e-01; w0[61] = 9.1321575969783042e-01; w0[62] = 9.0673478291743770e-01;
w0[63] = 8.9982717269670254e-01; w0[64] = 8.9247500397279889e-01; w0[65] = 8.8466119568641544e-01;
w0[66] = 8.7636981190996832e-01; w0[67] = 8.6758639310529395e-01; w0[68] = 8.5829831560583947e-01;
w0[69] = 8.4849517611710745e-01; w0[70] = 8.3816919684997526e-01; w0[71] = 8.2731564522719481e-01;
w0[72] = 8.1593326067846506e-01; w0[73] = 8.0402467952201917e-01; w0[74] = 7.9159684763866422e-01;
w0[75] = 7.7866140928303185e-01; w0[76] = 7.6523505988054275e-01; w0[77] = 7.5133985023733718e-01;
w0[78] = 7.3700343012197278e-01; w0[79] = 7.2225922012228905e-01; w0[80] = 7.0714650311348992e-01;
w0[81] = 6.9171042928311888e-01; w0[82] = 6.7600193312752554e-01; w0[83] = 6.6007756565534514e-01;
w0[84] = 6.4399925112087075e-01; w0[85] = 6.2751690913313019e-01; w0[86] = 6.1027499828015230e-01;
w0[87] = 5.9224510437414302e-01; w0[88] = 5.7351602373384358e-01; w0[89] = 5.5418855325877014e-01;
w0[90] = 5.3437475255737310e-01; w0[91] = 5.1419661917235160e-01; w0[92] = 4.9378417655716828e-01;
w0[93] = 4.7327302015086270e-01; w0[94] = 4.5280141715326816e-01; w0[95] = 4.3250709824656874e-01;
w0[96] = 4.1252391349300427e-01; w0[97] = 3.9297854411973260e-01; w0[98] = 3.7398746095910612e-01;
w0[99] = 3.5565430274683579e-01; w0[100] = 3.3806781061485630e-01; w0[101] = 3.2130040985921160e-01;
w0[102] = 3.0540747904392268e-01; w0[103] = 2.9042729373237675e-01; w0[104] = 2.7638022880356511e-01;
w0[105] = 2.6326880017521570e-01; w0[106] = 2.5108351465026180e-01; w0[107] = 2.3980507108518678e-01;
w0[108] = 2.2940313887670982e-01; w0[109] = 2.1983837091461964e-01; w0[110] = 2.1106434856035647e-01;
w0[111] = 2.0302938150311162e-01; w0[112] = 1.9567810006274122e-01; w0[113] = 1.8895279358968187e-01;
w0[114] = 1.8279446934896854e-01; w0[115] = 1.7714363349486195e-01; w0[116] = 1.7194084623673653e-01;
w0[117] = 1.6712721159390778e-01; w0[118] = 1.6264518453364277e-01; w0[119] = 1.5844053282765996e-01;
w0[120] = 1.5446717275166658e-01; w0[121] = 1.5069805821385496e-01; w0[122] = 1.4714688950237698e-01;
w0[123] = 1.4390390146402926e-01; w0[124] = 1.4117355924585473e-01; w0[125] = 1.3925281217141075e-01;
w0[126] = 1.3832497295389018e-01; w0[127] = 1.3814978008396606e-01; w0[128] = 1.3835160646341094e-01;
w0[129] = 1.3879024386623634e-01; w0[130] = 1.3943642059052910e-01; w0[131] = 1.4029209031588918e-01;
w0[132] = 1.4137338559055690e-01; w0[133] = 1.4269664294338003e-01; w0[134] = 1.4427862559127622e-01;
w0[135] = 1.4613710893515686e-01; w0[136] = 1.4829031288790176e-01; w0[137] = 1.5075639573344454e-01;
w0[138] = 1.5355577328662448e-01; w0[139] = 1.5671691155491141e-01; w0[140] = 1.6026759551587053e-01;
w0[141] = 1.6422711196183729e-01; w0[142] = 1.6861217530896319e-01; w0[143] = 1.7343599664804954e-01;
w0[144] = 1.7870749622029697e-01; w0[145] = 1.8443089822995290e-01; w0[146] = 1.9060594959733176e-01;
w0[147] = 1.9722890743114718e-01; w0[148] = 2.0429424128179388e-01; w0[149] = 2.1179676211688642e-01;
w0[150] = 2.1973372569350255e-01; w0[151] = 2.2810647413984569e-01; w0[152] = 2.3692139199104423e-01;
w0[153] = 2.4619023520258562e-01; w0[154] = 2.5593007712435828e-01; w0[155] = 2.6616312793182034e-01;
w0[156] = 2.7691657556564053e-01; w0[157] = 2.8822248978446646e-01; w0[158] = 3.0011781136252774e-01;
w0[159] = 3.1264448345127460e-01; w0[160] = 3.2584981513994765e-01; w0[161] = 3.3984816250379990e-01;
w0[162] = 3.5480294275155777e-01; w0[163] = 3.7084641223360615e-01; w0[164] = 3.8801659473095779e-01;
w0[165] = 4.0590776426938302e-01; w0[166] = 4.2411463421746515e-01; w0[167] = 4.4245155797750657e-01;
w0[168] = 4.6057530551628978e-01; w0[169] = 4.7806498804785041e-01; w0[170] = 4.9458321307491998e-01;
w0[171] = 5.0940696342852609e-01; w0[172] = 5.2208413946637822e-01; w0[173] = 5.3285877380442748e-01;
w0[174] = 5.4190756796308170e-01; w0[175] = 5.4874444561683677e-01; w0[176] = 5.5343636086921844e-01;
w0[177] = 5.5665571130037195e-01; w0[178] = 2.9574890756478689e-01; w0[179] = 1.9886942554336376e-01;
w0[180] = 1.2649443133824198e-01; w0[181] = 8.2055381707757405e-02; w0[182] = 5.4932432103361739e-02;
w0[183] = 3.7036615277279070e-02; w0[184] = 2.5222279402791422e-02; w0[185] = 1.7375975057806702e-02;
w0[186] = 1.2117538023085468e-02; w0[187] = 8.5555194333282857e-03; w0[188] = 6.1147950360911198e-03;
w0[189] = 4.4226660277904978e-03; w0[190] = 3.2357616929942499e-03; w0[191] = 2.3936627467879821e-03;
w0[192] = 1.7895336731204119e-03; w0[193] = 1.3229150832381939e-03; w0[194] = 9.6852078283902500e-04;
w0[195] = 7.3181934878072249e-04; w0[196] = 5.6933457177442079e-04; w0[197] = 4.4659287344306218e-04;
w0[198] = 3.5307409636260321e-04; w0[199] = 2.8123294322360731e-04;
@always_inline
fn benchmark_twostream[nelts: Int](tau: Vector, w0: Vector, gt: Vector,
u0: F64, Rsfc: F64, inout amean: Vector,
inout w: WorkSpace) -> F64:
@always_inline
@parameter
fn test_fn():
twostream[nelts](tau, w0, gt, u0, Rsfc, amean, w)
let us: F64 = F64(Benchmark().run[test_fn]())/1.0e3
return us
let us_scalar = benchmark_twostream[1](tau, w0, gt, u0, Rsfc, amean, w)
print('Mojo scalar twostream runtime =', us_scalar,'us')
Mojo scalar twostream runtime = 12.026999999999999 us
alias nelts = dtype_simd_width[DType.f64]()
let us_vectorized = benchmark_twostream[nelts](tau, w0, gt, u0, Rsfc, amean, w)
print('Mojo vectorized twostream runtime =', us_vectorized,'us')
Mojo vectorized twostream runtime = 8.4299999999999997 us
print('scalar runtime / vectorized runtime =',(us_scalar/us_vectorized))
scalar runtime / vectorized runtime = 1.4266903914590747
%%python
import numpy as np
import numba as nb
import timeit
@nb.njit
def two_stream(tau, w0, gt, u0, Rsfc):
nz = len(tau)
gam1 = np.empty((nz),np.float64)
gam2 = np.empty((nz),np.float64)
gam3 = np.empty((nz),np.float64)
gam4 = np.empty((nz),np.float64)
lamda = np.empty((nz),np.float64)
cap_gam = np.empty((nz),np.float64)
e1 = np.empty((nz),np.float64)
e2 = np.empty((nz),np.float64)
e3 = np.empty((nz),np.float64)
e4 = np.empty((nz),np.float64)
tauc = np.empty((nz+1),np.float64)
direct = np.empty((nz+1),np.float64)
cp0 = np.empty((nz),np.float64)
cpb = np.empty((nz),np.float64)
cm0 = np.empty((nz),np.float64)
cmb = np.empty((nz),np.float64)
A = np.empty((nz*2),np.float64)
B = np.empty((nz*2),np.float64)
D = np.empty((nz*2),np.float64)
E = np.empty((nz*2),np.float64)
y1 = np.empty((nz),np.float64)
y2 = np.empty((nz),np.float64)
gam1 = np.sqrt(3.0)*(2.0-w0*(1+gt))/2.0
gam2 = np.sqrt(3.0)*w0*(1.0-gt)/2.0
gam3 = (1.0-np.sqrt(3.0)*gt*u0)/2.0
gam4 = 1.0 - gam3
u1 = 1.0/np.sqrt(3.0)
lamda = (gam1**2.0 - gam2**2.0)**(0.50)
cap_gam = gam2 / (gam1 + lamda)
e1 = 1.0 + cap_gam*np.exp(-lamda*tau)
e2 = 1.0 - cap_gam*np.exp(-lamda*tau)
e3 = cap_gam + np.exp(-lamda*tau)
e4 = cap_gam - np.exp(-lamda*tau)
tauc[0] = 0.0
for i in range(1,nz+1):
tauc[i] = tauc[i-1] + tau[i-1]
Fs_pi = 1.0
direct[0] = u0*Fs_pi
for i in range(nz):
facp = w0[i]*Fs_pi*((gam1[i]-1.0/u0)*gam3[i]+gam4[i]*gam2[i])
facm = w0[i]*Fs_pi*((gam1[i]+1.0/u0)*gam4[i]+gam2[i]*gam3[i])
et0 = np.exp(-tauc[i]/u0)
etb = et0*np.exp(-tau[i]/u0)
denom = lamda[i]**2.0 - 1.0/u0**2.0
direct[i+1] = u0*Fs_pi*etb
cp0[i] = et0*facp/denom
cpb[i] = etb*facp/denom
cm0[i] = et0*facm/denom
cmb[i] = etb*facm/denom
Ssfc = Rsfc*direct[nz]
A[0] = 0.0
B[0] = e1[0]
D[0] = -e2[0]
E[0] = 0.0 - cm0[0]
# Odd coeficients (Equation 41)
for i in range(nz-1):
l = 2*i + 2
A[l] = e2[i]*e3[i] - e4[i]*e1[i]
B[l] = e1[i]*e1[i+1] - e3[i]*e3[i+1]
D[l] = e3[i]*e4[i+1] - e1[i]*e2[i+1]
E[l] = e3[i]*(cp0[i+1] - cpb[i]) + e1[i]*(cmb[i] - cm0[i+1])
# Even coefficients (Equation 42)
for i in range(nz-1):
l = 2*i + 1
A[l] = e2[i+1]*e1[i] - e3[i]*e4[i+1]
B[l] = e2[i]*e2[i+1] - e4[i]*e4[i+1]
D[l] = e1[i+1]*e4[i+1] - e2[i+1]*e3[i+1]
E[l] = e2[i+1]*(cp0[i+1] - cpb[i]) - e4[i+1]*(cm0[i+1] - cmb[i])
l = 2*nz - 1
A[l] = e1[nz-1] - Rsfc*e3[nz-1]
B[l] = e2[nz-1] - Rsfc*e4[nz-1]
D[l] = 0.0
E[l] = Ssfc - cpb[nz-1] + Rsfc*cmb[nz-1]
solve_tridiag(A, B, D, E)
for i in range(nz):
l = 2*i + 1
y1[i] = E[l-1]
y2[i] = E[l]
amean = np.empty((nz+1),np.float64)
amean[0] = (1.0/u1)*(y1[0]*e3[0]-y2[0]*e4[0]+cp0[0]) + direct[0]/u0;
for i in range(nz):
# J_n*4*pi = amean (Equation 49)
amean[i+1] = (1.0/u1)*(y1[i]*(e1[i]+e3[i])+y2[i]*(e2[i]+e4[i]) + cpb[i]+cmb[i]) + direct[i+1]/u0
return amean
@nb.njit
def solve_tridiag(a, b, c, d):
n = len(a)
n = n - 1
c[0] /= b[0]
d[0] /= b[0]
for i in range(1,n):
c[i] /= b[i] - a[i]*c[i-1]
d[i] = (d[i] - a[i]*d[i-1]) / (b[i] - a[i]*c[i-1])
d[n] = (d[n] - a[n]*d[n-1]) / (b[n] - a[n]*c[n-1])
for i in range(n-1, -1, -1):
d[i] -= c[i]*d[i+1]
@nb.njit
def make_inputs():
u0 = 0.6427876096865394
Rsfc = 0.25
nz = 200
tau = np.empty(nz)
w0 = np.empty(nz)
gt = np.zeros(nz)
tau[0] = 8.7767996682530329e-08; tau[1] = 9.5947097904378163e-08; tau[2] = 1.0489052756541996e-07;
tau[3] = 1.1466995675270959e-07; tau[4] = 1.2536380811254373e-07; tau[5] = 1.3705789618437358e-07;
tau[6] = 1.4984612914194580e-07; tau[7] = 1.6383127794765110e-07; tau[8] = 1.7912581953502761e-07;
tau[9] = 1.9585286130848492e-07; tau[10] = 2.1414715499671573e-07; tau[11] = 2.3415620872566158e-07;
tau[12] = 2.5604150709720415e-07; tau[13] = 2.7997985008321631e-07; tau[14] = 3.0616482268293842e-07;
tau[15] = 3.3480840855869273e-07; tau[16] = 3.6614276227859870e-07; tau[17] = 4.0042215637110061e-07;
tau[18] = 4.3792512115823633e-07; tau[19] = 4.7895679730553810e-07; tau[20] = 5.2385152323406823e-07;
tau[21] = 5.7297568201604804e-07; tau[22] = 6.2673083515698063e-07; tau[23] = 6.8555717379606160e-07;
tau[24] = 7.4993732137969107e-07; tau[25] = 8.2040052583949341e-07; tau[26] = 8.9752728379739709e-07;
tau[27] = 9.8195444440443219e-07; tau[28] = 1.0743808461797366e-06; tau[29] = 1.1755735467535040e-06;
tau[30] = 1.2863747128512830e-06; tau[31] = 1.4077092463227407e-06; tau[32] = 1.5405932316575096e-06;
tau[33] = 1.6861433014556164e-06; tau[34] = 1.8455870289203520e-06; tau[35] = 2.0202744708601724e-06;
tau[36] = 2.2116910012294060e-06; tau[37] = 2.4214715942473803e-06; tau[38] = 2.6514167379642028e-06;
tau[39] = 2.9035101842693509e-06; tau[40] = 3.1799387703266947e-06; tau[41] = 3.4831145798177533e-06;
tau[42] = 3.8156997509793911e-06; tau[43] = 4.1806342830157372e-06; tau[44] = 4.5811672439896014e-06;
tau[45] = 5.0208918431149617e-06; tau[46] = 5.5037848994039188e-06; tau[47] = 6.0342513187615696e-06;
tau[48] = 6.6171742844996624e-06; tau[49] = 7.2579719738055458e-06; tau[50] = 7.9626617375794997e-06;
tau[51] = 8.7379328255527379e-06; tau[52] = 9.5912289061901981e-06; tau[53] = 1.0530841825193256e-05;
tau[54] = 1.1566018270795857e-05; tau[55] = 1.2707081274015154e-05; tau[56] = 1.3965568772106545e-05;
tau[57] = 1.5354391809370636e-05; tau[58] = 1.6888015348134820e-05; tau[59] = 1.8582665120003893e-05;
tau[60] = 2.0456564472667114e-05; tau[61] = 2.2530205765175080e-05; tau[62] = 2.4826661548100218e-05;
tau[63] = 2.7371941534785026e-05; tau[64] = 3.0195402243950013e-05; tau[65] = 3.3330217163257071e-05;
tau[66] = 3.6813916372382030e-05; tau[67] = 4.0689005750403127e-05; tau[68] = 4.5003677189107140e-05;
tau[69] = 4.9812622624297257e-05; tau[70] = 5.5177966142078792e-05; tau[71] = 6.1170329907152607e-05;
tau[72] = 6.7870051104862677e-05; tau[73] = 7.5368568415506559e-05; tau[74] = 8.3769997605174037e-05;
tau[75] = 9.3192916477134497e-05; tau[76] = 1.0377237936370605e-04; tau[77] = 1.1566218028227589e-04;
tau[78] = 1.2903738130325400e-04; tau[79] = 1.4409711804014798e-04; tau[80] = 1.6106768650787847e-04;
tau[81] = 1.8020590401221682e-04; tau[82] = 2.0180271951203158e-04; tau[83] = 2.2618702435261593e-04;
tau[84] = 2.5372957971436598e-04; tau[85] = 2.8499085847225129e-04; tau[86] = 3.2072793130441182e-04;
tau[87] = 3.6172038216249729e-04; tau[88] = 4.0883409334140487e-04; tau[89] = 4.6308361487628643e-04;
tau[90] = 5.2565578019456703e-04; tau[91] = 5.9793596197029830e-04; tau[92] = 6.8153685512867973e-04;
tau[93] = 7.7832951627810915e-04; tau[94] = 8.9047619550709984e-04; tau[95] = 1.0204642828402439e-03;
tau[96] = 1.1711404714380698e-03; tau[97] = 1.3457440390547924e-03; tau[98] = 1.5479380130098432e-03;
tau[99] = 1.7818369502875632e-03; tau[100] = 2.0520301990285964e-03; tau[101] = 2.3635998383980863e-03;
tau[102] = 2.7221330470807528e-03; tau[103] = 3.1337294374715365e-03; tau[104] = 3.6050225687279105e-03;
tau[105] = 4.1432174515158250e-03; tau[106] = 4.7560587646881024e-03; tau[107] = 5.4518194898090918e-03;
tau[108] = 6.2393740872849577e-03; tau[109] = 7.1282605624539697e-03; tau[110] = 8.1287730355002370e-03;
tau[111] = 9.2520919156331774e-03; tau[112] = 1.0510459289421320e-02; tau[113] = 1.1917408459878433e-02;
tau[114] = 1.3488058899910772e-02; tau[115] = 1.5239490954294801e-02; tau[116] = 1.7191215895174722e-02;
tau[117] = 1.9365747959630791e-02; tau[118] = 2.1789244185028790e-02; tau[119] = 2.4492051273927565e-02;
tau[120] = 2.7508662976724094e-02; tau[121] = 3.0875812746695153e-02; tau[122] = 3.4625933661229222e-02;
tau[123] = 3.8771507194934468e-02; tau[124] = 4.3278351607027789e-02; tau[125] = 4.8046866543172910e-02;
tau[126] = 5.2968730144676401e-02; tau[127] = 5.8080080318497186e-02; tau[128] = 6.3512117073305424e-02;
tau[129] = 6.9334818787612357e-02; tau[130] = 7.5580500565664441e-02; tau[131] = 8.2268694684091204e-02;
tau[132] = 8.9410407413410012e-02; tau[133] = 9.7014366545305167e-02; tau[134] = 1.0508676551553092e-01;
tau[135] = 1.1363074698597439e-01; tau[136] = 1.2264676085190102e-01; tau[137] = 1.3213313885917077e-01;
tau[138] = 1.4208431117677991e-01; tau[139] = 1.5248507419772675e-01; tau[140] = 1.6331818759153677e-01;
tau[141] = 1.7457363049899460e-01; tau[142] = 1.8624471255477631e-01; tau[143] = 1.9833035537844179e-01;
tau[144] = 2.1083726583856535e-01; tau[145] = 2.2378158933184314e-01; tau[146] = 2.3718960522404994e-01;
tau[147] = 2.5109715445037328e-01; tau[148] = 2.6554778373896060e-01; tau[149] = 2.8058996227549782e-01;
tau[150] = 2.9627401857870805e-01; tau[151] = 3.1264948596979114e-01; tau[152] = 3.2976328683030964e-01;
tau[153] = 3.4765878733307898e-01; tau[154] = 3.6637545533146748e-01; tau[155] = 3.8594878862798920e-01;
tau[156] = 4.0641030224751162e-01; tau[157] = 4.2778750516236580e-01; tau[158] = 4.5010382615701916e-01;
tau[159] = 4.7337839740328619e-01; tau[160] = 4.9762555188214369e-01; tau[161] = 5.2276004558359734e-01;
tau[162] = 5.4862232621183815e-01; tau[163] = 5.7510375299943239e-01; tau[164] = 6.0224859454892643e-01;
tau[165] = 6.3079868128618721e-01; tau[166] = 6.6150503383107084e-01; tau[167] = 6.9479262019749455e-01;
tau[168] = 7.3135971057268112e-01; tau[169] = 7.7207900416531505e-01; tau[170] = 8.1777260097037741e-01;
tau[171] = 8.7003464761828964e-01; tau[172] = 9.3024357036853700e-01; tau[173] = 9.9877360370577395e-01;
tau[174] = 1.0762228460543470e+00; tau[175] = 1.1562951730417654e+00; tau[176] = 1.2377641657793577e+00;
tau[177] = 1.3216565090158796e+00; tau[178] = 2.6574734564569380e+00; tau[179] = 4.2105772058622346e+00;
tau[180] = 7.0339868009456881e+00; tau[181] = 1.1521276456355757e+01; tau[182] = 1.8285964859512521e+01;
tau[183] = 2.8784368028594791e+01; tau[184] = 4.4808994648392989e+01; tau[185] = 6.8880927399198839e+01;
tau[186] = 1.0449251595375671e+02; tau[187] = 1.5641266261187351e+02; tau[188] = 2.3106587764604379e+02;
tau[189] = 3.3699697083889566e+02; tau[190] = 4.8543396220701010e+02; tau[191] = 6.9096229688139806e+02;
tau[192] = 9.7232374589713959e+02; tau[193] = 1.3811178079094689e+03; tau[194] = 1.9771447087450604e+03;
tau[195] = 2.7426755572022894e+03; tau[196] = 3.6957662267266865e+03; tau[197] = 4.9351475964671836e+03;
tau[198] = 6.5332896118304561e+03; tau[199] = 8.5775143354619722e+03;
w0[0] = 9.9711179180048182e-01; w0[1] = 9.9710577604125894e-01; w0[2] = 9.9709335053701875e-01;
w0[3] = 9.9707409718189177e-01; w0[4] = 9.9704756934937544e-01; w0[5] = 9.9701328938498046e-01;
w0[6] = 9.9697074581156886e-01; w0[7] = 9.9691939021948628e-01; w0[8] = 9.9685863381171902e-01;
w0[9] = 9.9678784357544925e-01; w0[10] = 9.9670633805145825e-01; w0[11] = 9.9661338267233490e-01;
w0[12] = 9.9650818464346891e-01; w0[13] = 9.9638988734055600e-01; w0[14] = 9.9625756419873834e-01;
w0[15] = 9.9611021207159700e-01; w0[16] = 9.9594674403681771e-01; w0[17] = 9.9576598162879704e-01;
w0[18] = 9.9556664647758897e-01; w0[19] = 9.9534735133343222e-01; w0[20] = 9.9510659045563765e-01;
w0[21] = 9.9484272934368789e-01; w0[22] = 9.9455399378589959e-01; w0[23] = 9.9423845819787393e-01;
w0[24] = 9.9389403322209602e-01; w0[25] = 9.9351845255550364e-01; w0[26] = 9.9310925897056246e-01;
w0[27] = 9.9266378949192857e-01; w0[28] = 9.9217915969101766e-01; w0[29] = 9.9165224706199229e-01;
w0[30] = 9.9107967344303005e-01; w0[31] = 9.9045778644803517e-01; w0[32] = 9.8978263988756232e-01;
w0[33] = 9.8904997315915044e-01; w0[34] = 9.8825518959925840e-01; w0[35] = 9.8739333380968586e-01;
w0[36] = 9.8645906798846772e-01; w0[37] = 9.8544664731122855e-01; w0[38] = 9.8434989444384591e-01;
w0[39] = 9.8316217330321953e-01; w0[40] = 9.8187636221329821e-01; w0[41] = 9.8048482665722558e-01;
w0[42] = 9.7897939187322636e-01; w0[43] = 9.7735131560593647e-01; w0[44] = 9.7559126140833230e-01;
w0[45] = 9.7368927293970564e-01; w0[46] = 9.7163474982622933e-01; w0[47] = 9.6941642574050779e-01;
w0[48] = 9.6702234946428145e-01; w0[49] = 9.6443986983423624e-01; w0[50] = 9.6165562558909190e-01;
w0[51] = 9.5865554130176256e-01; w0[52] = 9.5542483072735251e-01; w0[53] = 9.5194800903411125e-01;
w0[54] = 9.4820891557954556e-01; w0[55] = 9.4419074902690803e-01; w0[56] = 9.3987611674461291e-01;
w0[57] = 9.3524710057171967e-01; w0[58] = 9.3028534110971439e-01; w0[59] = 9.2497214278425166e-01;
w0[60] = 9.1928860187296246e-01; w0[61] = 9.1321575969783042e-01; w0[62] = 9.0673478291743770e-01;
w0[63] = 8.9982717269670254e-01; w0[64] = 8.9247500397279889e-01; w0[65] = 8.8466119568641544e-01;
w0[66] = 8.7636981190996832e-01; w0[67] = 8.6758639310529395e-01; w0[68] = 8.5829831560583947e-01;
w0[69] = 8.4849517611710745e-01; w0[70] = 8.3816919684997526e-01; w0[71] = 8.2731564522719481e-01;
w0[72] = 8.1593326067846506e-01; w0[73] = 8.0402467952201917e-01; w0[74] = 7.9159684763866422e-01;
w0[75] = 7.7866140928303185e-01; w0[76] = 7.6523505988054275e-01; w0[77] = 7.5133985023733718e-01;
w0[78] = 7.3700343012197278e-01; w0[79] = 7.2225922012228905e-01; w0[80] = 7.0714650311348992e-01;
w0[81] = 6.9171042928311888e-01; w0[82] = 6.7600193312752554e-01; w0[83] = 6.6007756565534514e-01;
w0[84] = 6.4399925112087075e-01; w0[85] = 6.2751690913313019e-01; w0[86] = 6.1027499828015230e-01;
w0[87] = 5.9224510437414302e-01; w0[88] = 5.7351602373384358e-01; w0[89] = 5.5418855325877014e-01;
w0[90] = 5.3437475255737310e-01; w0[91] = 5.1419661917235160e-01; w0[92] = 4.9378417655716828e-01;
w0[93] = 4.7327302015086270e-01; w0[94] = 4.5280141715326816e-01; w0[95] = 4.3250709824656874e-01;
w0[96] = 4.1252391349300427e-01; w0[97] = 3.9297854411973260e-01; w0[98] = 3.7398746095910612e-01;
w0[99] = 3.5565430274683579e-01; w0[100] = 3.3806781061485630e-01; w0[101] = 3.2130040985921160e-01;
w0[102] = 3.0540747904392268e-01; w0[103] = 2.9042729373237675e-01; w0[104] = 2.7638022880356511e-01;
w0[105] = 2.6326880017521570e-01; w0[106] = 2.5108351465026180e-01; w0[107] = 2.3980507108518678e-01;
w0[108] = 2.2940313887670982e-01; w0[109] = 2.1983837091461964e-01; w0[110] = 2.1106434856035647e-01;
w0[111] = 2.0302938150311162e-01; w0[112] = 1.9567810006274122e-01; w0[113] = 1.8895279358968187e-01;
w0[114] = 1.8279446934896854e-01; w0[115] = 1.7714363349486195e-01; w0[116] = 1.7194084623673653e-01;
w0[117] = 1.6712721159390778e-01; w0[118] = 1.6264518453364277e-01; w0[119] = 1.5844053282765996e-01;
w0[120] = 1.5446717275166658e-01; w0[121] = 1.5069805821385496e-01; w0[122] = 1.4714688950237698e-01;
w0[123] = 1.4390390146402926e-01; w0[124] = 1.4117355924585473e-01; w0[125] = 1.3925281217141075e-01;
w0[126] = 1.3832497295389018e-01; w0[127] = 1.3814978008396606e-01; w0[128] = 1.3835160646341094e-01;
w0[129] = 1.3879024386623634e-01; w0[130] = 1.3943642059052910e-01; w0[131] = 1.4029209031588918e-01;
w0[132] = 1.4137338559055690e-01; w0[133] = 1.4269664294338003e-01; w0[134] = 1.4427862559127622e-01;
w0[135] = 1.4613710893515686e-01; w0[136] = 1.4829031288790176e-01; w0[137] = 1.5075639573344454e-01;
w0[138] = 1.5355577328662448e-01; w0[139] = 1.5671691155491141e-01; w0[140] = 1.6026759551587053e-01;
w0[141] = 1.6422711196183729e-01; w0[142] = 1.6861217530896319e-01; w0[143] = 1.7343599664804954e-01;
w0[144] = 1.7870749622029697e-01; w0[145] = 1.8443089822995290e-01; w0[146] = 1.9060594959733176e-01;
w0[147] = 1.9722890743114718e-01; w0[148] = 2.0429424128179388e-01; w0[149] = 2.1179676211688642e-01;
w0[150] = 2.1973372569350255e-01; w0[151] = 2.2810647413984569e-01; w0[152] = 2.3692139199104423e-01;
w0[153] = 2.4619023520258562e-01; w0[154] = 2.5593007712435828e-01; w0[155] = 2.6616312793182034e-01;
w0[156] = 2.7691657556564053e-01; w0[157] = 2.8822248978446646e-01; w0[158] = 3.0011781136252774e-01;
w0[159] = 3.1264448345127460e-01; w0[160] = 3.2584981513994765e-01; w0[161] = 3.3984816250379990e-01;
w0[162] = 3.5480294275155777e-01; w0[163] = 3.7084641223360615e-01; w0[164] = 3.8801659473095779e-01;
w0[165] = 4.0590776426938302e-01; w0[166] = 4.2411463421746515e-01; w0[167] = 4.4245155797750657e-01;
w0[168] = 4.6057530551628978e-01; w0[169] = 4.7806498804785041e-01; w0[170] = 4.9458321307491998e-01;
w0[171] = 5.0940696342852609e-01; w0[172] = 5.2208413946637822e-01; w0[173] = 5.3285877380442748e-01;
w0[174] = 5.4190756796308170e-01; w0[175] = 5.4874444561683677e-01; w0[176] = 5.5343636086921844e-01;
w0[177] = 5.5665571130037195e-01; w0[178] = 2.9574890756478689e-01; w0[179] = 1.9886942554336376e-01;
w0[180] = 1.2649443133824198e-01; w0[181] = 8.2055381707757405e-02; w0[182] = 5.4932432103361739e-02;
w0[183] = 3.7036615277279070e-02; w0[184] = 2.5222279402791422e-02; w0[185] = 1.7375975057806702e-02;
w0[186] = 1.2117538023085468e-02; w0[187] = 8.5555194333282857e-03; w0[188] = 6.1147950360911198e-03;
w0[189] = 4.4226660277904978e-03; w0[190] = 3.2357616929942499e-03; w0[191] = 2.3936627467879821e-03;
w0[192] = 1.7895336731204119e-03; w0[193] = 1.3229150832381939e-03; w0[194] = 9.6852078283902500e-04;
w0[195] = 7.3181934878072249e-04; w0[196] = 5.6933457177442079e-04; w0[197] = 4.4659287344306218e-04;
w0[198] = 3.5307409636260321e-04; w0[199] = 2.8123294322360731e-04;
return tau, w0, gt, u0, Rsfc
# make inputs
tau, w0, gt, u0, Rsfc = make_inputs()
@nb.njit
def benchmark_twostream():
amean = two_stream(tau, w0, gt, u0, Rsfc)
# compile
benchmark_twostream()
# time
iters = 1000
time_us = timeit.Timer(benchmark_twostream).timeit(number=iters)/iters*1e6
print('Numba twostream runtime = %.1f us'%time_us)
Numba twostream runtime = 19.2 us