// AnyDSL/Artic runtime intrinsics declarations.
// Source: gist by @michael-kenzel (7ebbb6f1fc1dda7835cd055a3014f723), created June 13, 2024.
// One work item (thread) in an nd-range launch.
// Every field is an accessor closure so each backend can bind it to the
// appropriate hardware register/constant (see the CUDA wrappers below and
// the constant closures in `hls_accelerator`).
struct WorkItem {
    tidx : fn() -> i32,  // thread index within its block, x/y/z
    tidy : fn() -> i32,
    tidz : fn() -> i32,
    bidx : fn() -> i32,  // block index within the grid, x/y/z
    bidy : fn() -> i32,
    bidz : fn() -> i32,
    gidx : fn() -> i32,  // global thread index, x/y/z — presumably bid*bdim+tid; confirm per backend
    gidy : fn() -> i32,
    gidz : fn() -> i32,
    bdimx : fn() -> i32, // block extent (threads per block), x/y/z
    bdimy : fn() -> i32,
    bdimz : fn() -> i32,
    gdimx : fn() -> i32, // grid extent, x/y/z — NOTE(review): units (threads vs blocks) depend on backend; confirm
    gdimy : fn() -> i32,
    gdimz : fn() -> i32,
    nblkx : fn() -> i32, // number of blocks in the grid, x/y/z
    nblky : fn() -> i32,
    nblkz : fn() -> i32
}
// Backend-agnostic handle to a compute device.
struct Accelerator {
    // exec(body) returns a launcher taking (grid, block) extents: fn(grid, block)->()
    exec : fn(fn(WorkItem) -> ()) -> fn((i32, i32, i32), (i32, i32, i32)) -> (), // fn(grid, block)->()
    sync : fn() -> (),                 // wait for outstanding device work to finish
    alloc : fn(i64) -> Buffer,         // allocate device memory (size presumably in bytes — confirm)
    alloc_unified : fn(i64) -> Buffer, // allocate host/device-accessible memory
    barrier : fn() -> ()               // barrier usable inside kernels
}
// Table of math intrinsics; each backend fills one instance
// (see `cpu_intrinsics` and `hls_intrinsics` below).
struct Intrinsics {
    // single precision (f32)
    expf : fn(f32) -> f32,
    exp2f : fn(f32) -> f32,
    logf : fn(f32) -> f32,
    log2f : fn(f32) -> f32,
    powf : fn(f32, f32) -> f32,
    rsqrtf : fn(f32) -> f32,
    sqrtf : fn(f32) -> f32,
    fabsf : fn(f32) -> f32,
    sinf : fn(f32) -> f32,
    cosf : fn(f32) -> f32,
    tanf : fn(f32) -> f32,
    asinf : fn(f32) -> f32,
    acosf : fn(f32) -> f32,
    atanf : fn(f32) -> f32,
    erff : fn(f32) -> f32,
    atan2f : fn(f32, f32) -> f32,
    copysignf : fn(f32, f32) -> f32,
    fmaf : fn(f32, f32, f32) -> f32,
    fmaxf : fn(f32, f32) -> f32,
    fminf : fn(f32, f32) -> f32,
    fmodf : fn(f32, f32) -> f32,
    floorf : fn(f32) -> f32,
    // classification: i32 results follow the C convention (nonzero = true)
    isinff : fn(f32) -> i32,
    isnanf : fn(f32) -> i32,
    isfinitef : fn(f32) -> i32,
    // double precision (f64)
    exp : fn(f64) -> f64,
    exp2 : fn(f64) -> f64,
    log : fn(f64) -> f64,
    log2 : fn(f64) -> f64,
    pow : fn(f64, f64) -> f64,
    rsqrt : fn(f64) -> f64,
    sqrt : fn(f64) -> f64,
    fabs : fn(f64) -> f64,
    sin : fn(f64) -> f64,
    cos : fn(f64) -> f64,
    tan : fn(f64) -> f64,
    asin : fn(f64) -> f64,
    acos : fn(f64) -> f64,
    atan : fn(f64) -> f64,
    erf : fn(f64) -> f64,
    atan2 : fn(f64, f64) -> f64,
    copysign : fn(f64, f64) -> f64,
    fma : fn(f64, f64, f64) -> f64,
    fmax : fn(f64, f64) -> f64,
    fmin : fn(f64, f64) -> f64,
    fmod : fn(f64, f64) -> f64,
    floor : fn(f64) -> f64,
    isinf : fn(f64) -> i32,
    isnan : fn(f64) -> i32,
    isfinite : fn(f64) -> i32,
    // integer helpers
    min : fn(i32, i32) -> i32,
    max : fn(i32, i32) -> i32,
}
// Vector-programming intrinsics (rv_* — presumably resolved against the RV
// region vectorizer on the CPU backend; confirm against the build setup).
#[import(cc = "C")] fn rv_mask() -> bool;                              // per-lane active mask
#[import(cc = "C")] fn rv_any(_: bool) -> bool;                        // true if any active lane is true
#[import(cc = "C")] fn rv_all(_: bool) -> bool;                        // true if all active lanes are true
#[import(cc = "C")] fn rv_ballot(_: bool) -> i32;                      // bitmask of lanes where the condition holds
#[import(cc = "C")] fn rv_extract(_: f32, _: i32) -> f32;              // read a value from a given lane
#[import(cc = "C")] fn rv_insert(_: f32, _: i32, _: f32) -> f32;       // write a value into a given lane
#[import(cc = "C")] fn rv_load(_: &f32, _: i32) -> f32;
#[import(cc = "C")] fn rv_store(_: &mut f32, _: i32, _: f32) -> ();
#[import(cc = "C")] fn rv_shuffle(_: f32, _: i32) -> f32;              // exchange values between lanes
#[import(cc = "C")] fn rv_align(_: &i8, _: i32)-> &i8;
#[import(cc = "C")] fn rv_compact(_: f32, _: bool) -> f32;
#[import(cc = "C")] fn rv_lane_id() -> i32;
#[import(cc = "C")] fn rv_num_lanes() -> i32;
// CPU math imports resolved against the C math library; the anydsl_* entries
// are runtime-provided wrappers for the classification macros (isinf/isnan/
// isfinite are macros in C and cannot be imported directly).
// sinf/cosf/sin/cos are taken from LLVM intrinsics below instead:
//#[import(cc = "C", name = "sinf")] fn cpu_sinf(f32) -> f32;
//#[import(cc = "C", name = "cosf")] fn cpu_cosf(f32) -> f32;
#[import(cc = "C", name = "tanf")] fn cpu_tanf(_: f32) -> f32;
#[import(cc = "C", name = "asinf")] fn cpu_asinf(_: f32) -> f32;
#[import(cc = "C", name = "acosf")] fn cpu_acosf(_: f32) -> f32;
#[import(cc = "C", name = "atanf")] fn cpu_atanf(_: f32) -> f32;
#[import(cc = "C", name = "erff")] fn cpu_erff(_: f32) -> f32;
#[import(cc = "C", name = "fmodf")] fn cpu_fmodf(_: f32, _: f32) -> f32;
#[import(cc = "C", name = "atan2f")] fn cpu_atan2f(_: f32, _: f32) -> f32;
#[import(cc = "C", name = "anydsl_isinff")] fn cpu_isinff(_: f32) -> i32;
#[import(cc = "C", name = "anydsl_isnanf")] fn cpu_isnanf(_: f32) -> i32;
#[import(cc = "C", name = "anydsl_isfinitef")] fn cpu_isfinitef(_: f32) -> i32;
//#[import(cc = "C", name = "sin")] fn cpu_sin(f64) -> f64;
//#[import(cc = "C", name = "cos")] fn cpu_cos(f64) -> f64;
#[import(cc = "C", name = "tan")] fn cpu_tan(_: f64) -> f64;
#[import(cc = "C", name = "asin")] fn cpu_asin(_: f64) -> f64;
#[import(cc = "C", name = "acos")] fn cpu_acos(_: f64) -> f64;
#[import(cc = "C", name = "atan")] fn cpu_atan(_: f64) -> f64;
#[import(cc = "C", name = "erf")] fn cpu_erf(_: f64) -> f64;
#[import(cc = "C", name = "fmod")] fn cpu_fmod(_: f64, _: f64) -> f64;
#[import(cc = "C", name = "atan2")] fn cpu_atan2(_: f64, _: f64) -> f64;
#[import(cc = "C", name = "anydsl_isinf")] fn cpu_isinf(_: f64) -> i32;
#[import(cc = "C", name = "anydsl_isnan")] fn cpu_isnan(_: f64) -> i32;
#[import(cc = "C", name = "anydsl_isfinite")] fn cpu_isfinite(_: f64) -> i32;
// CPU math imports resolved directly to LLVM intrinsics
// (cc = "device": no external declaration is emitted).
#[import(cc = "device", name = "llvm.exp.f32")] fn cpu_expf(_: f32) -> f32;
#[import(cc = "device", name = "llvm.exp2.f32")] fn cpu_exp2f(_: f32) -> f32;
#[import(cc = "device", name = "llvm.log.f32")] fn cpu_logf(_: f32) -> f32;
#[import(cc = "device", name = "llvm.log2.f32")] fn cpu_log2f(_: f32) -> f32;
#[import(cc = "device", name = "llvm.pow.f32")] fn cpu_powf(_: f32, _: f32) -> f32;
#[import(cc = "device", name = "llvm.sqrt.f32")] fn cpu_sqrtf(_: f32) -> f32;
#[import(cc = "device", name = "llvm.fabs.f32")] fn cpu_fabsf(_: f32) -> f32;
#[import(cc = "device", name = "llvm.sin.f32")] fn cpu_sinf(_: f32) -> f32;
#[import(cc = "device", name = "llvm.cos.f32")] fn cpu_cosf(_: f32) -> f32;
#[import(cc = "device", name = "llvm.floor.f32")] fn cpu_floorf(_: f32) -> f32;
#[import(cc = "device", name = "llvm.fma.f32")] fn cpu_fmaf(_: f32, _: f32, _: f32) -> f32;
// fmuladd may fuse or fall back to mul+add, unlike llvm.fma which always fuses
#[import(cc = "device", name = "llvm.fmuladd.f32")] fn cpu_madf(_: f32, _: f32, _: f32) -> f32;
#[import(cc = "device", name = "llvm.copysign.f32")] fn cpu_copysignf(_: f32, _: f32) -> f32;
#[import(cc = "device", name = "llvm.minnum.f32")] fn cpu_fminf(_: f32, _: f32) -> f32;
#[import(cc = "device", name = "llvm.maxnum.f32")] fn cpu_fmaxf(_: f32, _: f32) -> f32;
#[import(cc = "device", name = "llvm.exp.f64")] fn cpu_exp(_: f64) -> f64;
#[import(cc = "device", name = "llvm.exp2.f64")] fn cpu_exp2(_: f64) -> f64;
#[import(cc = "device", name = "llvm.log.f64")] fn cpu_log(_: f64) -> f64;
#[import(cc = "device", name = "llvm.log2.f64")] fn cpu_log2(_: f64) -> f64;
#[import(cc = "device", name = "llvm.pow.f64")] fn cpu_pow(_: f64, _: f64) -> f64;
#[import(cc = "device", name = "llvm.sqrt.f64")] fn cpu_sqrt(_: f64) -> f64;
#[import(cc = "device", name = "llvm.fabs.f64")] fn cpu_fabs(_: f64) -> f64;
#[import(cc = "device", name = "llvm.sin.f64")] fn cpu_sin(_: f64) -> f64;
#[import(cc = "device", name = "llvm.cos.f64")] fn cpu_cos(_: f64) -> f64;
#[import(cc = "device", name = "llvm.floor.f64")] fn cpu_floor(_: f64) -> f64;
#[import(cc = "device", name = "llvm.fma.f64")] fn cpu_fma(_: f64, _: f64, _: f64) -> f64;
#[import(cc = "device", name = "llvm.fmuladd.f64")] fn cpu_mad(_: f64, _: f64, _: f64) -> f64;
#[import(cc = "device", name = "llvm.copysign.f64")] fn cpu_copysign(_: f64, _: f64) -> f64;
#[import(cc = "device", name = "llvm.minnum.f64")] fn cpu_fmin(_: f64, _: f64) -> f64;
#[import(cc = "device", name = "llvm.maxnum.f64")] fn cpu_fmax(_: f64, _: f64) -> f64;
// bit manipulation; the bool on ctlz/cttz is LLVM's is_zero_poison flag
#[import(cc = "device", name = "llvm.ctpop.i32")] fn cpu_popcount32(_: i32) -> i32;
#[import(cc = "device", name = "llvm.ctpop.i64")] fn cpu_popcount64(_: i64) -> i64;
#[import(cc = "device", name = "llvm.ctlz.i32")] fn cpu_clz32(_: i32, _: bool) -> i32;
#[import(cc = "device", name = "llvm.ctlz.i64")] fn cpu_clz64(_: i64, _: bool) -> i64;
#[import(cc = "device", name = "llvm.cttz.i32")] fn cpu_ctz32(_: i32, _: bool) -> i32;
#[import(cc = "device", name = "llvm.cttz.i64")] fn cpu_ctz64(_: i64, _: bool) -> i64;
// x86-only: BMI2 parallel bit extract/deposit
#[import(cc = "device", name = "llvm.x86.bmi.pext.32")] fn cpu_pext32(_: i32, _: i32) -> i32;
#[import(cc = "device", name = "llvm.x86.bmi.pext.64")] fn cpu_pext64(_: i64, _: i64) -> i64;
#[import(cc = "device", name = "llvm.x86.bmi.pdep.32")] fn cpu_pdep32(_: i32, _: i32) -> i32;
#[import(cc = "device", name = "llvm.x86.bmi.pdep.64")] fn cpu_pdep64(_: i64, _: i64) -> i64;
// args: address, rw (0=read,1=write), locality (0-3), cache type (1=data)
// consistency: parameter style now matches the `_: T` form used by every other import in this run
#[import(cc = "device", name = "llvm.prefetch.p0")] fn cpu_prefetch(_: &u8, _: i32, _: i32, _: i32) -> ();
//
// atomics — arguments to the built-in `atomic[T](binop, ptr, val, ordering, scope)`:
//
// binop:     0    1   2   3   4    5  6   7   8   9    10   11   12
//            Xchg Add Sub And Nand Or Xor Max Min UMax UMin FAdd FSub
//
// ordering:  0=NotAtomic 1=Unordered 2=Monotonic 4=Acquire 5=Release
//            6=AcquireRelease 7=SequentiallyConsistent
//            (3 is skipped — it is an unused slot in LLVM's AtomicOrdering enum)
//
// syncscope: "singlethread", or "" for the default (system-wide) scope
//
// Sequentially-consistent, system-scope atomic read-modify-write wrappers for i32.
// Each returns the value stored at `a` *before* the operation.
// atomic[i32](binop, ptr, val, ordering=7 (SeqCst), syncscope="" (system)).
fn @cpu_atomic_xchg(a: &mut i32, b: i32) -> i32 = atomic[i32](0, a, b, 7, ""); // binop 0 = Xchg
fn @cpu_atomic_add(a: &mut i32, b: i32) -> i32 = atomic[i32](1, a, b, 7, "");  // binop 1 = Add
fn @cpu_atomic_sub(a: &mut i32, b: i32) -> i32 = atomic[i32](2, a, b, 7, "");  // binop 2 = Sub
fn @cpu_atomic_max(a: &mut i32, b: i32) -> i32 = atomic[i32](7, a, b, 7, "");  // binop 7 = Max (signed)
fn @cpu_atomic_min(a: &mut i32, b: i32) -> i32 = atomic[i32](8, a, b, 7, "");  // binop 8 = Min (signed)
// Intrinsics table for the CPU backend: LLVM intrinsics where available,
// libm otherwise. rsqrt has no LLVM/libm counterpart and is synthesized
// as 1/sqrt.
static cpu_intrinsics = Intrinsics {
    expf = cpu_expf,
    exp2f = cpu_exp2f,
    logf = cpu_logf,
    log2f = cpu_log2f,
    powf = cpu_powf,
    rsqrtf = @|a| { 1:f32 / cpu_sqrtf(a) },
    sqrtf = cpu_sqrtf,
    fabsf = cpu_fabsf,
    sinf = cpu_sinf,
    cosf = cpu_cosf,
    tanf = cpu_tanf,
    asinf = cpu_asinf,
    acosf = cpu_acosf,
    atanf = cpu_atanf,
    erff = cpu_erff,
    atan2f = cpu_atan2f,
    copysignf = cpu_copysignf,
    fmaf = cpu_fmaf,
    fmaxf = cpu_fmaxf,
    fminf = cpu_fminf,
    fmodf = cpu_fmodf,
    floorf = cpu_floorf,
    isinff = cpu_isinff,
    isnanf = cpu_isnanf,
    isfinitef = cpu_isfinitef,
    exp = cpu_exp,
    exp2 = cpu_exp2,
    log = cpu_log,
    log2 = cpu_log2,
    pow = cpu_pow,
    // consistency: typed literal `1:f64` matches the `1:f32` style used by rsqrtf above
    rsqrt = @|a| { 1:f64 / cpu_sqrt(a) },
    sqrt = cpu_sqrt,
    fabs = cpu_fabs,
    sin = cpu_sin,
    cos = cpu_cos,
    tan = cpu_tan,
    asin = cpu_asin,
    acos = cpu_acos,
    atan = cpu_atan,
    erf = cpu_erf,
    atan2 = cpu_atan2,
    copysign = cpu_copysign,
    fma = cpu_fma,
    fmax = cpu_fmax,
    fmin = cpu_fmin,
    fmod = cpu_fmod,
    floor = cpu_floor,
    isinf = cpu_isinf,
    isnan = cpu_isnan,
    isfinite = cpu_isfinite,
    min = @|a, b| { if a < b { a } else { b } },
    max = @|a, b| { if a > b { a } else { b } },
};
// no declarations are emitted for "device" functions
// HLS math imports. The f32 variants bind to the *unsuffixed* C names
// (exp, log, …) — presumably relying on the HLS toolchain's overloaded
// math library; confirm against the HLS backend.
#[import(cc = "C", name = "exp")] fn hls_expf(f32) -> f32;
#[import(cc = "C", name = "exp2")] fn hls_exp2f(f32) -> f32;
#[import(cc = "C", name = "log")] fn hls_logf(f32) -> f32;
#[import(cc = "C", name = "log2")] fn hls_log2f(f32) -> f32;
#[import(cc = "C", name = "pow")] fn hls_powf(f32, f32) -> f32;
#[import(cc = "C", name = "rsqrt")] fn hls_rsqrtf(f32) -> f32;
#[import(cc = "C", name = "sqrt")] fn hls_sqrtf(f32) -> f32;
#[import(cc = "C", name = "fabs")] fn hls_fabsf(f32) -> f32;
#[import(cc = "C", name = "sin")] fn hls_sinf(f32) -> f32;
#[import(cc = "C", name = "cos")] fn hls_cosf(f32) -> f32;
#[import(cc = "C", name = "tan")] fn hls_tanf(f32) -> f32;
#[import(cc = "C", name = "asin")] fn hls_asinf(f32) -> f32;
#[import(cc = "C", name = "acos")] fn hls_acosf(f32) -> f32;
#[import(cc = "C", name = "atan")] fn hls_atanf(f32) -> f32;
#[import(cc = "C", name = "erf")] fn hls_erff(f32) -> f32;
#[import(cc = "C", name = "atan2")] fn hls_atan2f(f32, f32) -> f32;
#[import(cc = "C", name = "fmod")] fn hls_fmodf(f32, f32) -> f32;
#[import(cc = "C", name = "floor")] fn hls_floorf(f32) -> f32;
#[import(cc = "C", name = "isinf")] fn hls_isinff(f32) -> i32;
#[import(cc = "C", name = "isnan")] fn hls_isnanf(f32) -> i32;
#[import(cc = "C", name = "isfinite")] fn hls_isfinitef(f32) -> i32;
#[import(cc = "C", name = "fma")] fn hls_fmaf(f32, f32, f32) -> f32;
#[import(cc = "C", name = "mad")] fn hls_madf(f32, f32, f32) -> f32;
#[import(cc = "C", name = "copysign")] fn hls_copysignf(f32, f32) -> f32;
// double-precision variants (same C names, resolved by overload)
#[import(cc = "C", name = "exp")] fn hls_exp(f64) -> f64;
#[import(cc = "C", name = "exp2")] fn hls_exp2(f64) -> f64;
#[import(cc = "C", name = "log")] fn hls_log(f64) -> f64;
#[import(cc = "C", name = "log2")] fn hls_log2(f64) -> f64;
#[import(cc = "C", name = "pow")] fn hls_pow(f64, f64) -> f64;
#[import(cc = "C", name = "rsqrt")] fn hls_rsqrt(f64) -> f64;
#[import(cc = "C", name = "sqrt")] fn hls_sqrt(f64) -> f64;
#[import(cc = "C", name = "fabs")] fn hls_fabs(f64) -> f64;
#[import(cc = "C", name = "sin")] fn hls_sin(f64) -> f64;
#[import(cc = "C", name = "cos")] fn hls_cos(f64) -> f64;
#[import(cc = "C", name = "tan")] fn hls_tan(f64) -> f64;
#[import(cc = "C", name = "asin")] fn hls_asin(f64) -> f64;
#[import(cc = "C", name = "acos")] fn hls_acos(f64) -> f64;
#[import(cc = "C", name = "atan")] fn hls_atan(f64) -> f64;
#[import(cc = "C", name = "erf")] fn hls_erf(f64) -> f64;
#[import(cc = "C", name = "atan2")] fn hls_atan2(f64, f64) -> f64;
#[import(cc = "C", name = "fmod")] fn hls_fmod(f64, f64) -> f64;
#[import(cc = "C", name = "floor")] fn hls_floor(f64) -> f64;
#[import(cc = "C", name = "isinf")] fn hls_isinf(f64) -> i32;
#[import(cc = "C", name = "isnan")] fn hls_isnan(f64) -> i32;
#[import(cc = "C", name = "isfinite")] fn hls_isfinite(f64) -> i32;
#[import(cc = "C", name = "fma")] fn hls_fma(f64, f64, f64) -> f64;
#[import(cc = "C", name = "mad")] fn hls_mad(f64, f64, f64) -> f64;
#[import(cc = "C", name = "copysign")] fn hls_copysign(f64, f64) -> f64;
#[import(cc = "C", name = "fmin")] fn hls_fminf(f32, f32) -> f32;
#[import(cc = "C", name = "fmax")] fn hls_fmaxf(f32, f32) -> f32;
#[import(cc = "C", name = "fmin")] fn hls_fmin(f64, f64) -> f64;
#[import(cc = "C", name = "fmax")] fn hls_fmax(f64, f64) -> f64;
#[import(cc = "C", name = "min")] fn hls_min(i32, i32) -> i32;
#[import(cc = "C", name = "max")] fn hls_max(i32, i32) -> i32;
// emits an HLS pragma string into the generated source
#[import(cc = "device")] fn print_pragma(&[u8]) -> ();
// channel scalar types (HLS streaming channels carrying one element)
struct channel[T] { data : T }
// channel array types — fixed-width channels carrying N elements per transfer
struct channel1[T] { data : [T * 1 ] }
struct channel2[T] { data : [T * 2 ] }
struct channel4[T] { data : [T * 4 ] }
struct channel8[T] { data : [T * 8 ] }
struct channel16[T] { data : [T * 16 ] }
struct channel32[T] { data : [T * 32 ] }
// NOTE(review): widths 64/128 are declared but no read/write imports exist for them below
struct channel64[T] { data : [T * 64 ] }
struct channel128[T] { data : [T * 128] }
// read and write on scalar channels; every import maps to the backend's
// read_channel/write_channel primitives, monomorphized per width
#[import(cc = "device", name = "read_channel")] fn read_channel[T] (&mut channel[T]) -> T;
#[import(cc = "device", name = "write_channel")] fn write_channel[T] (&mut channel[T], T ) -> ();
// read and write on array channels
#[import(cc = "device", name = "read_channel")] fn read_channel1[T] ( &mut channel1[T] ) -> [T * 1 ];
#[import(cc = "device", name = "read_channel")] fn read_channel2[T] ( &mut channel2[T] ) -> [T * 2 ];
#[import(cc = "device", name = "read_channel")] fn read_channel4[T] ( &mut channel4[T] ) -> [T * 4 ];
#[import(cc = "device", name = "read_channel")] fn read_channel8[T] ( &mut channel8[T] ) -> [T * 8 ];
#[import(cc = "device", name = "read_channel")] fn read_channel16[T]( &mut channel16[T]) -> [T * 16];
#[import(cc = "device", name = "read_channel")] fn read_channel32[T]( &mut channel32[T]) -> [T * 32];
#[import(cc = "device", name = "write_channel")] fn write_channel1[T] ( &mut channel1[T], [T * 1 ]) -> ();
#[import(cc = "device", name = "write_channel")] fn write_channel2[T] ( &mut channel2[T], [T * 2 ]) -> ();
#[import(cc = "device", name = "write_channel")] fn write_channel4[T] ( &mut channel4[T], [T * 4 ]) -> ();
#[import(cc = "device", name = "write_channel")] fn write_channel8[T] ( &mut channel8[T], [T * 8 ]) -> ();
#[import(cc = "device", name = "write_channel")] fn write_channel16[T]( &mut channel16[T], [T * 16]) -> ();
#[import(cc = "device", name = "write_channel")] fn write_channel32[T]( &mut channel32[T], [T * 32]) -> ();
// NOTE(review): the blank import name (" ") looks intentional — presumably the
// backend special-cases this reinterpreting access; confirm before changing
#[import(cc = "device", name = " ")] fn bitcast_channel[T]( &mut channel1[T]) -> [T * 2];
// Accelerator implementation for the HLS backend, device index `dev`.
// HLS kernels are single-work-item: the launcher ignores the grid/block
// extents and hands the body a WorkItem whose ids are all 0 and whose
// extents are all 1.
fn @hls_accelerator(dev: i32) = Accelerator {
    exec = @|body| |_grid, _block| {
        let work_item = WorkItem {
            tidx = @|| 0, tidy = @|| 0, tidz = @|| 0,
            bidx = @|| 0, bidy = @|| 0, bidz = @|| 0,
            gidx = @|| 0, gidy = @|| 0, gidz = @|| 0,
            bdimx = @|| 1, bdimy = @|| 1, bdimz = @|| 1,
            gdimx = @|| 1, gdimy = @|| 1, gdimz = @|| 1,
            nblkx = @|| 1, nblky = @|| 1, nblkz = @|| 1
        };
        // `hls` is the backend launch primitive (defined elsewhere in the runtime)
        hls(dev, || @body(work_item));
    },
    sync = @|| synchronize_hls(dev),
    alloc = @|size| alloc_hls(dev, size),
    alloc_unified = @|size| alloc_hls_unified(dev, size),
    barrier = @|| () // single work item: barrier is a no-op
};
// Intrinsics table for the HLS backend: a 1:1 mapping onto the hls_* imports above.
static hls_intrinsics = Intrinsics {
    expf = hls_expf,
    exp2f = hls_exp2f,
    logf = hls_logf,
    log2f = hls_log2f,
    powf = hls_powf,
    rsqrtf = hls_rsqrtf,
    sqrtf = hls_sqrtf,
    fabsf = hls_fabsf,
    sinf = hls_sinf,
    cosf = hls_cosf,
    tanf = hls_tanf,
    asinf = hls_asinf,
    acosf = hls_acosf,
    atanf = hls_atanf,
    erff = hls_erff,
    atan2f = hls_atan2f,
    copysignf = hls_copysignf,
    fmaf = hls_fmaf,
    fmaxf = hls_fmaxf,
    fminf = hls_fminf,
    fmodf = hls_fmodf,
    floorf = hls_floorf,
    isinff = hls_isinff,
    isnanf = hls_isnanf,
    isfinitef = hls_isfinitef,
    exp = hls_exp,
    exp2 = hls_exp2,
    log = hls_log,
    log2 = hls_log2,
    pow = hls_pow,
    rsqrt = hls_rsqrt,
    sqrt = hls_sqrt,
    fabs = hls_fabs,
    sin = hls_sin,
    cos = hls_cos,
    tan = hls_tan,
    asin = hls_asin,
    acos = hls_acos,
    atan = hls_atan,
    erf = hls_erf,
    atan2 = hls_atan2,
    copysign = hls_copysign,
    fma = hls_fma,
    fmax = hls_fmax,
    fmin = hls_fmin,
    fmod = hls_fmod,
    floor = hls_floor,
    isinf = hls_isinf,
    isnan = hls_isnan,
    isfinite = hls_isfinite,
    min = hls_min,
    max = hls_max,
};
// no declarations are emitted for "device" functions
// CUDA device math imports — resolved against CUDA's device math library.
#[import(cc = "device", name = "__syncthreads")] fn cuda_syncthreads() -> ();
#[import(cc = "device", name = "expf")] fn cuda_expf(f32) -> f32;
#[import(cc = "device", name = "exp2f")] fn cuda_exp2f(f32) -> f32;
#[import(cc = "device", name = "logf")] fn cuda_logf(f32) -> f32;
#[import(cc = "device", name = "log2f")] fn cuda_log2f(f32) -> f32;
#[import(cc = "device", name = "powf")] fn cuda_powf(f32, f32) -> f32;
#[import(cc = "device", name = "rsqrtf")] fn cuda_rsqrtf(f32) -> f32;
#[import(cc = "device", name = "sqrtf")] fn cuda_sqrtf(f32) -> f32;
#[import(cc = "device", name = "fabsf")] fn cuda_fabsf(f32) -> f32;
#[import(cc = "device", name = "sinf")] fn cuda_sinf(f32) -> f32;
#[import(cc = "device", name = "cosf")] fn cuda_cosf(f32) -> f32;
#[import(cc = "device", name = "tanf")] fn cuda_tanf(f32) -> f32;
#[import(cc = "device", name = "asinf")] fn cuda_asinf(f32) -> f32;
#[import(cc = "device", name = "acosf")] fn cuda_acosf(f32) -> f32;
#[import(cc = "device", name = "atanf")] fn cuda_atanf(f32) -> f32;
#[import(cc = "device", name = "erff")] fn cuda_erff(f32) -> f32;
#[import(cc = "device", name = "atan2f")] fn cuda_atan2f(f32, f32) -> f32;
#[import(cc = "device", name = "copysignf")] fn cuda_copysignf(f32, f32) -> f32;
#[import(cc = "device", name = "fmaf")] fn cuda_fmaf(f32, f32, f32) -> f32;
#[import(cc = "device", name = "fmaxf")] fn cuda_fmaxf(f32, f32) -> f32;
#[import(cc = "device", name = "fminf")] fn cuda_fminf(f32, f32) -> f32;
#[import(cc = "device", name = "fmodf")] fn cuda_fmodf(f32, f32) -> f32;
#[import(cc = "device", name = "floorf")] fn cuda_floorf(f32) -> f32;
#[import(cc = "device", name = "isinf")] fn cuda_isinff(f32) -> i32;
#[import(cc = "device", name = "isnan")] fn cuda_isnanf(f32) -> i32;
#[import(cc = "device", name = "isfinite")] fn cuda_isfinitef(f32) -> i32;
#[import(cc = "device", name = "exp")] fn cuda_exp(f64) -> f64;
#[import(cc = "device", name = "exp2")] fn cuda_exp2(f64) -> f64;
#[import(cc = "device", name = "log")] fn cuda_log(f64) -> f64;
#[import(cc = "device", name = "log2")] fn cuda_log2(f64) -> f64;
#[import(cc = "device", name = "pow")] fn cuda_pow(f64, f64) -> f64;
#[import(cc = "device", name = "rsqrt")] fn cuda_rsqrt(f64) -> f64;
#[import(cc = "device", name = "sqrt")] fn cuda_sqrt(f64) -> f64;
#[import(cc = "device", name = "fabs")] fn cuda_fabs(f64) -> f64;
#[import(cc = "device", name = "sin")] fn cuda_sin(f64) -> f64;
#[import(cc = "device", name = "cos")] fn cuda_cos(f64) -> f64;
#[import(cc = "device", name = "tan")] fn cuda_tan(f64) -> f64;
#[import(cc = "device", name = "asin")] fn cuda_asin(f64) -> f64;
#[import(cc = "device", name = "acos")] fn cuda_acos(f64) -> f64;
#[import(cc = "device", name = "atan")] fn cuda_atan(f64) -> f64;
#[import(cc = "device", name = "erf")] fn cuda_erf(f64) -> f64;
#[import(cc = "device", name = "atan2")] fn cuda_atan2(f64, f64) -> f64;
#[import(cc = "device", name = "copysign")] fn cuda_copysign(f64, f64) -> f64;
#[import(cc = "device", name = "fma")] fn cuda_fma(f64, f64, f64) -> f64;
#[import(cc = "device", name = "fmax")] fn cuda_fmax(f64, f64) -> f64;
#[import(cc = "device", name = "fmin")] fn cuda_fmin(f64, f64) -> f64;
#[import(cc = "device", name = "fmod")] fn cuda_fmod(f64, f64) -> f64;
#[import(cc = "device", name = "floor")] fn cuda_floor(f64) -> f64;
#[import(cc = "device", name = "isinf")] fn cuda_isinf(f64) -> i32;
#[import(cc = "device", name = "isnan")] fn cuda_isnan(f64) -> i32;
#[import(cc = "device", name = "isfinite")] fn cuda_isfinite(f64) -> i32;
// NOTE(review): these four duplicate the typed cuda_atomic_*_global_i32 imports further below
#[import(cc = "device", name = "atomicAdd")] fn cuda_atomic_add_global(&mut addrspace(1)i32, i32) -> i32;
#[import(cc = "device", name = "atomicAdd")] fn cuda_atomic_add_shared(&mut addrspace(3)i32, i32) -> i32;
#[import(cc = "device", name = "atomicMin")] fn cuda_atomic_min_global(&mut addrspace(1)i32, i32) -> i32;
#[import(cc = "device", name = "atomicMin")] fn cuda_atomic_min_shared(&mut addrspace(3)i32, i32) -> i32;
// __ldg: loads through the read-only data cache (addrspace(1) = global memory)
#[import(cc = "device", name = "__ldg")] fn cuda_ldg_f32(&addrspace(1)f32) -> f32;
#[import(cc = "device", name = "__ldg")] fn cuda_ldg_i32(&addrspace(1)i32) -> i32;
#[import(cc = "device", name = "__ldg")] fn cuda_ldg_u8(&addrspace(1)u8) -> u8;
#[import(cc = "device", name = "__ldg")] fn cuda_ldg4_f32(&addrspace(1)simd[f32 * 4]) -> simd[f32 * 4];
#[import(cc = "device", name = "__ldg")] fn cuda_ldg4_i32(&addrspace(1)simd[i32 * 4]) -> simd[i32 * 4];
#[import(cc = "device", name = "min")] fn cuda_min(i32, i32) -> i32;
#[import(cc = "device", name = "max")] fn cuda_max(i32, i32) -> i32;
// wrappers for threadIdx.x etc. are provided by the CUDA backend
#[import(cc = "device", name = "threadIdx_x")] fn cuda_threadIdx_x() -> i32;
#[import(cc = "device", name = "threadIdx_y")] fn cuda_threadIdx_y() -> i32;
#[import(cc = "device", name = "threadIdx_z")] fn cuda_threadIdx_z() -> i32;
#[import(cc = "device", name = "blockIdx_x")] fn cuda_blockIdx_x() -> i32;
#[import(cc = "device", name = "blockIdx_y")] fn cuda_blockIdx_y() -> i32;
#[import(cc = "device", name = "blockIdx_z")] fn cuda_blockIdx_z() -> i32;
#[import(cc = "device", name = "blockDim_x")] fn cuda_blockDim_x() -> i32;
#[import(cc = "device", name = "blockDim_y")] fn cuda_blockDim_y() -> i32;
#[import(cc = "device", name = "blockDim_z")] fn cuda_blockDim_z() -> i32;
#[import(cc = "device", name = "gridDim_x")] fn cuda_gridDim_x() -> i32;
#[import(cc = "device", name = "gridDim_y")] fn cuda_gridDim_y() -> i32;
#[import(cc = "device", name = "gridDim_z")] fn cuda_gridDim_z() -> i32;
// abort the kernel
#[import(cc = "device", name = "__trap")] fn cuda_trap() -> ();
// bit operations (CUDA integer intrinsics)
#[import(cc = "device", name = "__clz")] fn cuda_clz_u32(u32) -> i32;
#[import(cc = "device", name = "__clzll")] fn cuda_clz_u64(u64) -> i32;
#[import(cc = "device", name = "__ffs")] fn cuda_ffs_u32(u32) -> i32;
#[import(cc = "device", name = "__ffsll")] fn cuda_ffs_u64(u64) -> i32;
#[import(cc = "device", name = "__popc")] fn cuda_popc_u32(u32) -> i32;
#[import(cc = "device", name = "__popcll")] fn cuda_popc_u64(u64) -> i32;
#[import(cc = "device", name = "__brev")] fn cuda_brev_u32(u32) -> u32;
#[import(cc = "device", name = "__brevll")] fn cuda_brev_u64(u64) -> u64;
// typed atomics on global memory (addrspace(1)); each returns the prior value
#[import(cc = "device", name = "atomicAdd")] fn cuda_atomic_add_global_i32(&mut addrspace(1)i32, i32) -> i32;
#[import(cc = "device", name = "atomicAdd")] fn cuda_atomic_add_global_u32(&mut addrspace(1)u32, u32) -> u32;
#[import(cc = "device", name = "atomicAdd")] fn cuda_atomic_add_global_u64(&mut addrspace(1)u64, u64) -> u64;
#[import(cc = "device", name = "atomicAdd")] fn cuda_atomic_add_global_f32(&mut addrspace(1)f32, f32) -> f32;
#[import(cc = "device", name = "atomicAdd")] fn cuda_atomic_add_global_f64(&mut addrspace(1)f64, f64) -> f64;
#[import(cc = "device", name = "atomicSub")] fn cuda_atomic_sub_global_i32(&mut addrspace(1)i32, i32) -> i32;
#[import(cc = "device", name = "atomicSub")] fn cuda_atomic_sub_global_u32(&mut addrspace(1)u32, u32) -> u32;
// NOTE(review): CUDA documents atomicSub only for int/unsigned int — confirm
// the 64-bit and float variants are actually resolvable on the target
#[import(cc = "device", name = "atomicSub")] fn cuda_atomic_sub_global_u64(&mut addrspace(1)u64, u64) -> u64;
#[import(cc = "device", name = "atomicSub")] fn cuda_atomic_sub_global_f32(&mut addrspace(1)f32, f32) -> f32;
#[import(cc = "device", name = "atomicSub")] fn cuda_atomic_sub_global_f64(&mut addrspace(1)f64, f64) -> f64;
#[import(cc = "device", name = "atomicAnd")] fn cuda_atomic_and_global_i32(&mut addrspace(1)i32, i32) -> i32;
#[import(cc = "device", name = "atomicAnd")] fn cuda_atomic_and_global_u32(&mut addrspace(1)u32, u32) -> u32;
#[import(cc = "device", name = "atomicAnd")] fn cuda_atomic_and_global_u64(&mut addrspace(1)u64, u64) -> u64;
#[import(cc = "device", name = "atomicOr")] fn cuda_atomic_or_global_i32(&mut addrspace(1)i32, i32) -> i32;
#[import(cc = "device", name = "atomicOr")] fn cuda_atomic_or_global_u32(&mut addrspace(1)u32, u32) -> u32;
#[import(cc = "device", name = "atomicOr")] fn cuda_atomic_or_global_u64(&mut addrspace(1)u64, u64) -> u64;
#[import(cc = "device", name = "atomicXor")] fn cuda_atomic_xor_global_i32(&mut addrspace(1)i32, i32) -> i32;
#[import(cc = "device", name = "atomicXor")] fn cuda_atomic_xor_global_u32(&mut addrspace(1)u32, u32) -> u32;
#[import(cc = "device", name = "atomicXor")] fn cuda_atomic_xor_global_u64(&mut addrspace(1)u64, u64) -> u64;
#[import(cc = "device", name = "atomicExch")] fn cuda_atomic_exch_global_i32(&mut addrspace(1)i32, i32) -> i32;
#[import(cc = "device", name = "atomicExch")] fn cuda_atomic_exch_global_u32(&mut addrspace(1)u32, u32) -> u32;
#[import(cc = "device", name = "atomicExch")] fn cuda_atomic_exch_global_u64(&mut addrspace(1)u64, u64) -> u64;
#[import(cc = "device", name = "atomicExch")] fn cuda_atomic_exch_global_f32(&mut addrspace(1)f32, f32) -> f32;
#[import(cc = "device", name = "atomicMin")] fn cuda_atomic_min_global_i32(&mut addrspace(1)i32, i32) -> i32;
#[import(cc = "device", name = "atomicMin")] fn cuda_atomic_min_global_u32(&mut addrspace(1)u32, u32) -> u32;
#[import(cc = "device", name = "atomicMin")] fn cuda_atomic_min_global_u64(&mut addrspace(1)u64, u64) -> u64;
#[import(cc = "device", name = "atomicMax")] fn cuda_atomic_max_global_i32(&mut addrspace(1)i32, i32) -> i32;
#[import(cc = "device", name = "atomicMax")] fn cuda_atomic_max_global_u32(&mut addrspace(1)u32, u32) -> u32;
#[import(cc = "device", name = "atomicMax")] fn cuda_atomic_max_global_u64(&mut addrspace(1)u64, u64) -> u64;
// compare-and-swap: (address, compare, value) -> old value
#[import(cc = "device", name = "atomicCAS")] fn cuda_atomic_cas_global_u16(&mut addrspace(1)u16, u16, u16) -> u16;
#[import(cc = "device", name = "atomicCAS")] fn cuda_atomic_cas_global_i32(&mut addrspace(1)i32, i32, i32) -> i32;
#[import(cc = "device", name = "atomicCAS")] fn cuda_atomic_cas_global_u32(&mut addrspace(1)u32, u32, u32) -> u32;
#[import(cc = "device", name = "atomicCAS")] fn cuda_atomic_cas_global_u64(&mut addrspace(1)u64, u64, u64) -> u64;
// wrapping increment/decrement bounded by the second argument
#[import(cc = "device", name = "atomicInc")] fn cuda_atomic_inc_global_u32(&mut addrspace(1)u32, u32) -> u32;
#[import(cc = "device", name = "atomicDec")] fn cuda_atomic_dec_global_u32(&mut addrspace(1)u32, u32) -> u32;
// memory fences: device-wide / block-wide / system-wide
#[import(cc = "device", name = "__threadfence")] fn cuda_threadfence() -> ();
#[import(cc = "device", name = "__threadfence_block")] fn cuda_threadfence_block() -> ();
#[import(cc = "device", name = "__threadfence_system")] fn cuda_threadfence_system() -> ();
// block-level synchronization and vote (cuda_block_sync binds the same
// __syncthreads as cuda_syncthreads declared earlier)
#[import(cc = "device", name = "__syncthreads")] fn cuda_block_sync() -> ();
#[import(cc = "device", name = "__syncthreads_count")] fn cuda_block_sync_count(i32) -> i32;
#[import(cc = "device", name = "__syncthreads_and")] fn cuda_block_sync_all(i32) -> i32;
#[import(cc = "device", name = "__syncthreads_or")] fn cuda_block_sync_any(i32) -> i32;
// warp-level sync/vote; the leading u32 is always the participating-lane mask
#[import(cc = "device", name = "__syncwarp")] fn cuda_warp_sync(u32) -> ();
#[import(cc = "device", name = "__all_sync")] fn cuda_warp_sync_all(u32, i32) -> i32;
#[import(cc = "device", name = "__any_sync")] fn cuda_warp_sync_any(u32, i32) -> i32;
#[import(cc = "device", name = "__ballot_sync")] fn cuda_warp_sync_vote(u32, i32) -> u32;
#[import(cc = "device", name = "__activemask")] fn cuda_warp_activemask() -> u32;
// warp shuffles: (mask, value, src-lane/delta, width)
#[import(cc = "device", name = "__shfl_sync")] fn cuda_warp_shfl_i32(u32, i32, i32, i32) -> i32;
#[import(cc = "device", name = "__shfl_sync")] fn cuda_warp_shfl_u32(u32, u32, i32, i32) -> u32;
#[import(cc = "device", name = "__shfl_sync")] fn cuda_warp_shfl_i64(u32, i64, i32, i32) -> i64;
#[import(cc = "device", name = "__shfl_sync")] fn cuda_warp_shfl_u64(u32, u64, i32, i32) -> u64;
#[import(cc = "device", name = "__shfl_sync")] fn cuda_warp_shfl_f32(u32, f32, i32, i32) -> f32;
#[import(cc = "device", name = "__shfl_sync")] fn cuda_warp_shfl_f64(u32, f64, i32, i32) -> f64;
#[import(cc = "device", name = "__shfl_up_sync")] fn cuda_warp_shfl_up_i32(u32, i32, u32, i32) -> i32;
#[import(cc = "device", name = "__shfl_up_sync")] fn cuda_warp_shfl_up_u32(u32, u32, u32, i32) -> u32;
#[import(cc = "device", name = "__shfl_up_sync")] fn cuda_warp_shfl_up_i64(u32, i64, u32, i32) -> i64;
#[import(cc = "device", name = "__shfl_up_sync")] fn cuda_warp_shfl_up_u64(u32, u64, u32, i32) -> u64;
#[import(cc = "device", name = "__shfl_up_sync")] fn cuda_warp_shfl_up_f32(u32, f32, u32, i32) -> f32;
#[import(cc = "device", name = "__shfl_up_sync")] fn cuda_warp_shfl_up_f64(u32, f64, u32, i32) -> f64;
#[import(cc = "device", name = "__shfl_down_sync")] fn cuda_warp_shfl_down_i32(u32, i32, u32, i32) -> i32;
#[import(cc = "device", name = "__shfl_down_sync")] fn cuda_warp_shfl_down_u32(u32, u32, u32, i32) -> u32;
#[import(cc = "device", name = "__shfl_down_sync")] fn cuda_warp_shfl_down_i64(u32, i64, u32, i32) -> i64;
#[import(cc = "device", name = "__shfl_down_sync")] fn cuda_warp_shfl_down_u64(u32, u64, u32, i32) -> u64;
#[import(cc = "device", name = "__shfl_down_sync")] fn cuda_warp_shfl_down_f32(u32, f32, u32, i32) -> f32;
#[import(cc = "device", name = "__shfl_down_sync")] fn cuda_warp_shfl_down_f64(u32, f64, u32, i32) -> f64;
#[import(cc = "device", name = "__shfl_xor_sync")] fn cuda_warp_shfl_xor_i32(u32, i32, i32, i32) -> i32;
#[import(cc = "device", name = "__shfl_xor_sync")] fn cuda_warp_shfl_xor_u32(u32, u32, i32, i32) -> u32;
#[import(cc = "device", name = "__shfl_xor_sync")] fn cuda_warp_shfl_xor_i64(u32, i64, i32, i32) -> i64;
#[import(cc = "device", name = "__shfl_xor_sync")] fn cuda_warp_shfl_xor_u64(u32, u64, i32, i32) -> u64;
#[import(cc = "device", name = "__shfl_xor_sync")] fn cuda_warp_shfl_xor_f32(u32, f32, i32, i32) -> f32;
#[import(cc = "device", name = "__shfl_xor_sync")] fn cuda_warp_shfl_xor_f64(u32, f64, i32, i32) -> f64;
// warp match: mask of lanes holding the same value
#[import(cc = "device", name = "__match_any_sync")] fn cuda_warp_match_any_i32(u32, i32) -> u32;
#[import(cc = "device", name = "__match_any_sync")] fn cuda_warp_match_any_u32(u32, u32) -> u32;
#[import(cc = "device", name = "__match_any_sync")] fn cuda_warp_match_any_i64(u32, i64) -> u32;
#[import(cc = "device", name = "__match_any_sync")] fn cuda_warp_match_any_u64(u32, u64) -> u32;
#[import(cc = "device", name = "__match_any_sync")] fn cuda_warp_match_any_f32(u32, f32) -> u32;
#[import(cc = "device", name = "__match_any_sync")] fn cuda_warp_match_any_f64(u32, f64) -> u32;
#[import(cc = "device", name = "__match_all_sync")] fn cuda_warp_match_all_i32(u32, i32, &mut i32) -> u32;
#[import(cc = "device", name = "__match_all_sync")] fn cuda_warp_match_all_u32(u32, u32, &mut i32) -> u32;
#[import(cc = "device", name = "__match_all_sync")] fn cuda_warp_match_all_i64(u32, i64, &mut i32) -> u32;
#[import(cc = "device", name = "__match_all_sync")] fn cuda_warp_match_all_u64(u32, u64, &mut i32) -> u32;
#[import(cc = "device", name = "__match_all_sync")] fn cuda_warp_match_all_f32(u32, f32, &mut i32) -> u32;
#[import(cc = "device", name = "__match_all_sync")] fn cuda_warp_match_all_f64(u32, f64, &mut i32) -> u32;
#[import(cc = "device", name = "__nanosleep")] fn cuda_nanosleep(u32) -> ();
// Device-side printf: calls the PTX runtime function `vprintf`.
// `fmt` is the format string; `args` points to a packed argument buffer
// (vprintf's calling convention), NOT a C varargs list.
// Returns vprintf's i32 result. The wrapper marshals both pointers through
// .param space as required by the PTX call ABI.
fn @cuda_vprintf(fmt: &[u8], args: &[u8]) -> i32 {
    let mut res: i32;
    asm(
        "{\n"
        ".param.b64 param0;\n"
        ".param.b64 param1;\n"
        ".param.b32 retval;\n"
        "st.param.b64 [param0], %1;\n"
        "st.param.b64 [param1], %2;\n"
        "call.uni (retval), vprintf, (param0, param1);\n"
        "ld.param.b32 %0, [retval];\n"
        "}" : "=r"(res) : "l"(fmt), "l"(args) : "memory" : "volatile"
    );
    res
}
// Lane index of the calling thread within its warp (PTX special register %laneid).
fn @cuda_laneid() -> u32 {
	let mut lane: u32;
	asm("mov.u32 %0, %laneid;" : "=r"(lane));
	lane
}
// Identifier of the calling thread's warp within its CTA (PTX %warpid).
fn @cuda_warpid() -> u32 {
	let mut warp: u32;
	asm("mov.u32 %0, %warpid;" : "=r"(warp));
	warp
}
// Maximum number of warp identifiers (PTX %nwarpid).
fn @cuda_nwarpid() -> u32 {
	let mut num_warps: u32;
	asm("mov.u32 %0, %nwarpid;" : "=r"(num_warps));
	num_warps
}
// Identifier of the streaming multiprocessor executing this thread (PTX %smid).
fn @cuda_smid() -> u32 {
	let mut sm: u32;
	asm("mov.u32 %0, %smid;" : "=r"(sm));
	sm
}
// Maximum number of SM identifiers (PTX %nsmid).
fn @cuda_nsmid() -> u32 {
	let mut num_sms: u32;
	asm("mov.u32 %0, %nsmid;" : "=r"(num_sms));
	num_sms
}
// 32-bit warp mask with exactly the calling lane's bit set (PTX %lanemask_eq).
fn @cuda_lanemask() -> u32 {
	let mut eq_mask: u32;
	asm("mov.u32 %0, %lanemask_eq;" : "=r"(eq_mask));
	eq_mask
}
// Warp mask of lanes with index <= the calling lane (PTX %lanemask_le).
fn @cuda_lanemask_le() -> u32 {
	let mut le_mask: u32;
	asm("mov.u32 %0, %lanemask_le;" : "=r"(le_mask));
	le_mask
}
// Warp mask of lanes with index < the calling lane (PTX %lanemask_lt).
fn @cuda_lanemask_lt() -> u32 {
	let mut lt_mask: u32;
	asm("mov.u32 %0, %lanemask_lt;" : "=r"(lt_mask));
	lt_mask
}
// Warp mask of lanes with index >= the calling lane (PTX %lanemask_ge).
fn @cuda_lanemask_ge() -> u32 {
	let mut ge_mask: u32;
	asm("mov.u32 %0, %lanemask_ge;" : "=r"(ge_mask));
	ge_mask
}
// Warp mask of lanes with index > the calling lane (PTX %lanemask_gt).
fn @cuda_lanemask_gt() -> u32 {
	let mut gt_mask: u32;
	asm("mov.u32 %0, %lanemask_gt;" : "=r"(gt_mask));
	gt_mask
}
// Low 32 bits of the per-SM cycle counter (PTX %clock); "volatile" prevents
// the read from being reordered or merged with a neighbouring read.
fn @cuda_clock() -> u32 {
	let mut cycles: u32;
	asm("mov.u32 %0, %clock;" : "=r"(cycles) ::: "volatile");
	cycles
}
// High 32 bits of the per-SM cycle counter (PTX %clock_hi); volatile read.
fn @cuda_clock_hi() -> u32 {
	let mut cycles_hi: u32;
	asm("mov.u32 %0, %clock_hi;" : "=r"(cycles_hi) ::: "volatile");
	cycles_hi
}
// Full 64-bit per-SM cycle counter (PTX %clock64); volatile read.
fn @cuda_clock64() -> u64 {
	let mut cycles64: u64;
	asm("mov.u64 %0, %clock64;" : "=l"(cycles64) ::: "volatile");
	cycles64
}
// 64-bit global nanosecond timer (PTX %globaltimer); volatile read.
fn @cuda_globaltimer() -> u64 {
	let mut t: u64;
	asm("mov.u64 %0, %globaltimer;" : "=l"(t) ::: "volatile");
	t
}
// Low 32 bits of the global timer (PTX %globaltimer_lo); volatile read.
fn @cuda_globaltimer_lo() -> u32 {
	let mut t_lo: u32;
	asm("mov.u32 %0, %globaltimer_lo;" : "=r"(t_lo) ::: "volatile");
	t_lo
}
// High 32 bits of the global timer (PTX %globaltimer_hi); volatile read.
fn @cuda_globaltimer_hi() -> u32 {
	let mut t_hi: u32;
	asm("mov.u32 %0, %globaltimer_hi;" : "=r"(t_hi) ::: "volatile");
	t_hi
}
// Smallest of three i32 values: min(min(a, b), c).
fn @cuda_minmin(a: i32, b: i32, c: i32) -> i32 { cuda_min(cuda_min(a, b), c) }
// Largest of three i32 values: max(max(a, b), c).
fn @cuda_maxmax(a: i32, b: i32, c: i32) -> i32 { cuda_max(cuda_max(a, b), c) }
// max(min(a, b), c) — i.e. min of a and b, bounded below by c.
fn @cuda_minmax(a: i32, b: i32, c: i32) -> i32 { cuda_max(cuda_min(a, b), c) }
// min(max(a, b), c) — i.e. max of a and b, bounded above by c.
fn @cuda_maxmin(a: i32, b: i32, c: i32) -> i32 { cuda_min(cuda_max(a, b), c) }
// Single-precision multiply-add a*b + c, delegated to the fused cuda_fmaf.
fn @cuda_madf(a: f32, b: f32, c: f32) -> f32 { cuda_fmaf(a, b, c) }
// Double-precision multiply-add a*b + c, delegated to the fused cuda_fma.
fn @cuda_mad(a: f64, b: f64, c: f64) -> f64 { cuda_fma(a, b, c) }
// Builds an Accelerator backed by the CUDA device with index `dev`.
// `exec` stages a kernel body and returns a launcher taking (grid, block)
// configurations, which are forwarded to the `cuda` runtime launch call.
fn @cuda_accelerator(dev: i32) = Accelerator {
	exec = @|body| |grid, block| {
		// Wire every WorkItem accessor to the corresponding CUDA index
		// intrinsic; global ids/dims are derived from block and grid values.
		let work_item = WorkItem {
			tidx = cuda_threadIdx_x,
			tidy = cuda_threadIdx_y,
			tidz = cuda_threadIdx_z,
			bidx = cuda_blockIdx_x,
			bidy = cuda_blockIdx_y,
			bidz = cuda_blockIdx_z,
			// Global thread index = thread index within block + block offset.
			gidx = @|| cuda_threadIdx_x() + cuda_blockDim_x() * cuda_blockIdx_x(),
			gidy = @|| cuda_threadIdx_y() + cuda_blockDim_y() * cuda_blockIdx_y(),
			gidz = @|| cuda_threadIdx_z() + cuda_blockDim_z() * cuda_blockIdx_z(),
			bdimx = cuda_blockDim_x,
			bdimy = cuda_blockDim_y,
			bdimz = cuda_blockDim_z,
			// Total threads per dimension = blocks per grid * threads per block.
			gdimx = @|| cuda_gridDim_x() * cuda_blockDim_x(),
			gdimy = @|| cuda_gridDim_y() * cuda_blockDim_y(),
			gdimz = @|| cuda_gridDim_z() * cuda_blockDim_z(),
			nblkx = cuda_gridDim_x,
			nblky = cuda_gridDim_y,
			nblkz = cuda_gridDim_z
		};
		cuda(dev, grid, block, || @body(work_item))
	},
	// Host-side synchronization and allocation are delegated to the runtime;
	// `alloc_unified` returns memory accessible from both host and device.
	sync = @|| synchronize_cuda(dev),
	alloc = @|size| alloc_cuda(dev, size),
	alloc_unified = @|size| alloc_cuda_unified(dev, size),
	// Device-side barrier across the thread block.
	barrier = cuda_syncthreads
};
// Intrinsics table for the CUDA backend: maps each generic math entry point
// to its cuda_* device implementation.
static cuda_intrinsics = Intrinsics {
	// Single-precision (f32) entries.
	expf = cuda_expf,
	exp2f = cuda_exp2f,
	logf = cuda_logf,
	log2f = cuda_log2f,
	powf = cuda_powf,
	rsqrtf = cuda_rsqrtf,
	sqrtf = cuda_sqrtf,
	fabsf = cuda_fabsf,
	sinf = cuda_sinf,
	cosf = cuda_cosf,
	tanf = cuda_tanf,
	asinf = cuda_asinf,
	acosf = cuda_acosf,
	atanf = cuda_atanf,
	erff = cuda_erff,
	atan2f = cuda_atan2f,
	copysignf = cuda_copysignf,
	fmaf = cuda_fmaf,
	fmaxf = cuda_fmaxf,
	fminf = cuda_fminf,
	fmodf = cuda_fmodf,
	floorf = cuda_floorf,
	isinff = cuda_isinff,
	isnanf = cuda_isnanf,
	isfinitef = cuda_isfinitef,
	// Double-precision (f64) entries.
	exp = cuda_exp,
	exp2 = cuda_exp2,
	log = cuda_log,
	log2 = cuda_log2,
	pow = cuda_pow,
	rsqrt = cuda_rsqrt,
	sqrt = cuda_sqrt,
	fabs = cuda_fabs,
	sin = cuda_sin,
	cos = cuda_cos,
	tan = cuda_tan,
	asin = cuda_asin,
	acos = cuda_acos,
	atan = cuda_atan,
	erf = cuda_erf,
	atan2 = cuda_atan2,
	copysign = cuda_copysign,
	fma = cuda_fma,
	fmax = cuda_fmax,
	fmin = cuda_fmin,
	fmod = cuda_fmod,
	floor = cuda_floor,
	isinf = cuda_isinf,
	isnan = cuda_isnan,
	isfinite = cuda_isfinite,
	// Integer min/max.
	min = cuda_min,
	max = cuda_max,
};
#[import(cc = "device", name = "llvm.nvvm.read.ptx.sreg.ctaid.x")] fn nvvm_read_ptx_sreg_ctaid_x() -> i32;
#[import(cc = "device", name = "llvm.nvvm.read.ptx.sreg.ctaid.y")] fn nvvm_read_ptx_sreg_ctaid_y() -> i32;
#[import(cc = "device", name = "llvm.nvvm.read.ptx.sreg.ctaid.z")] fn nvvm_read_ptx_sreg_ctaid_z() -> i32;
#[import(cc = "device", name = "llvm.nvvm.read.ptx.sreg.nctaid.x")] fn nvvm_read_ptx_sreg_nctaid_x() -> i32;
#[import(cc = "device", name = "llvm.nvvm.read.ptx.sreg.nctaid.y")] fn nvvm_read_ptx_sreg_nctaid_y() -> i32;
#[import(cc = "device", name = "llvm.nvvm.read.ptx.sreg.nctaid.z")] fn nvvm_read_ptx_sreg_nctaid_z() -> i32;
#[import(cc = "device", name = "llvm.nvvm.read.ptx.sreg.ntid.x")] fn nvvm_read_ptx_sreg_ntid_x() -> i32;
#[import(cc = "device", name = "llvm.nvvm.read.ptx.sreg.ntid.y")] fn nvvm_read_ptx_sreg_ntid_y() -> i32;
#[import(cc = "device", name = "llvm.nvvm.read.ptx.sreg.ntid.z")] fn nvvm_read_ptx_sreg_ntid_z() -> i32;
#[import(cc = "device", name = "llvm.nvvm.read.ptx.sreg.tid.x")] fn nvvm_read_ptx_sreg_tid_x() -> i32;
#[import(cc = "device", name = "llvm.nvvm.read.ptx.sreg.tid.y")] fn nvvm_read_ptx_sreg_tid_y() -> i32;
#[import(cc = "device", name = "llvm.nvvm.read.ptx.sreg.tid.z")] fn nvvm_read_ptx_sreg_tid_z() -> i32;
#[import(cc = "device", name = "llvm.nvvm.read.ptx.sreg.warpsize")] fn nvvm_read_ptx_sreg_warpsize() -> i32;
#[import(cc = "device", name = "llvm.nvvm.read.ptx.sreg.laneid")] fn nvvm_laneid() -> u32;
#[import(cc = "device", name = "llvm.nvvm.read.ptx.sreg.warpid")] fn nvvm_warpid() -> u32;
#[import(cc = "device", name = "llvm.nvvm.read.ptx.sreg.nwarpid")] fn nvvm_nwarpid() -> u32;
#[import(cc = "device", name = "llvm.nvvm.read.ptx.sreg.smid")] fn nvvm_smid() -> u32;
#[import(cc = "device", name = "llvm.nvvm.read.ptx.sreg.nsmid")] fn nvvm_nsmid() -> u32;
#[import(cc = "device", name = "llvm.nvvm.read.ptx.sreg.lanemask_eq")] fn nvvm_lanemask() -> u32;
#[import(cc = "device", name = "llvm.nvvm.read.ptx.sreg.lanemask_le")] fn nvvm_lanemask_le() -> u32;
#[import(cc = "device", name = "llvm.nvvm.read.ptx.sreg.lanemask_lt")] fn nvvm_lanemask_lt() -> u32;
#[import(cc = "device", name = "llvm.nvvm.read.ptx.sreg.lanemask_ge")] fn nvvm_lanemask_ge() -> u32;
#[import(cc = "device", name = "llvm.nvvm.read.ptx.sreg.lanemask_gt")] fn nvvm_lanemask_gt() -> u32;
#[import(cc = "device", name = "llvm.nvvm.barrier0")] fn nvvm_barrier() -> ();
#[import(cc = "device", name = "llvm.nvvm.abs.i")] fn nvvm_abs_i(i32) -> i32;
#[import(cc = "device", name = "llvm.nvvm.abs.ll")] fn nvvm_abs_ll(i64) -> i64;
#[import(cc = "device", name = "llvm.nvvm.add.rm.d")] fn nvvm_add_rm_d(f64, f64) -> f64;
#[import(cc = "device", name = "llvm.nvvm.add.rm.f")] fn nvvm_add_rm_f(f32, f32) -> f32;
#[import(cc = "device", name = "llvm.nvvm.add.rm.ftz.f")] fn nvvm_add_rm_ftz_f(f32, f32) -> f32;
#[import(cc = "device", name = "llvm.nvvm.add.rn.d")] fn nvvm_add_rn_d(f64, f64) -> f64;
#[import(cc = "device", name = "llvm.nvvm.add.rn.f")] fn nvvm_add_rn_f(f32, f32) -> f32;
#[import(cc = "device", name = "llvm.nvvm.add.rn.ftz.f")] fn nvvm_add_rn_ftz_f(f32, f32) -> f32;
#[import(cc = "device", name = "llvm.nvvm.add.rp.d")] fn nvvm_add_rp_d(f64, f64) -> f64;
#[import(cc = "device", name = "llvm.nvvm.add.rp.f")] fn nvvm_add_rp_f(f32, f32) -> f32;
#[import(cc = "device", name = "llvm.nvvm.add.rp.ftz.f")] fn nvvm_add_rp_ftz_f(f32, f32) -> f32;
#[import(cc = "device", name = "llvm.nvvm.add.rz.d")] fn nvvm_add_rz_d(f64, f64) -> f64;
#[import(cc = "device", name = "llvm.nvvm.add.rz.f")] fn nvvm_add_rz_f(f32, f32) -> f32;
#[import(cc = "device", name = "llvm.nvvm.add.rz.ftz.f")] fn nvvm_add_rz_ftz_f(f32, f32) -> f32;
#[import(cc = "device", name = "llvm.nvvm.barrier0.and")] fn nvvm_barrier0_and(i32) -> i32;
#[import(cc = "device", name = "llvm.nvvm.barrier0.or")] fn nvvm_barrier0_or(i32) -> i32;
#[import(cc = "device", name = "llvm.nvvm.barrier0.popc")] fn nvvm_barrier0_popc(i32) -> i32;
#[import(cc = "device", name = "llvm.nvvm.bitcast.d2ll")] fn nvvm_bitcast_d2ll(f64) -> i64;
#[import(cc = "device", name = "llvm.nvvm.bitcast.f2i")] fn nvvm_bitcast_f2i(f32) -> i32;
#[import(cc = "device", name = "llvm.nvvm.bitcast.i2f")] fn nvvm_bitcast_i2f(i32) -> f32;
#[import(cc = "device", name = "llvm.nvvm.bitcast.ll2d")] fn nvvm_bitcast_ll2d(i64) -> f64;
#[import(cc = "device", name = "llvm.nvvm.brev32")] fn nvvm_brev32(i32) -> i32;
#[import(cc = "device", name = "llvm.nvvm.brev64")] fn nvvm_brev64(i64) -> i64;
#[import(cc = "device", name = "llvm.nvvm.ceil.d")] fn nvvm_ceil_d(f64) -> f64;
#[import(cc = "device", name = "llvm.nvvm.ceil.f")] fn nvvm_ceil_f(f32) -> f32;
#[import(cc = "device", name = "llvm.nvvm.ceil.ftz.f")] fn nvvm_ceil_ftz_f(f32) -> f32;
#[import(cc = "device", name = "llvm.nvvm.clz.i")] fn nvvm_clz_i(i32) -> i32;
#[import(cc = "device", name = "llvm.nvvm.clz.ll")] fn nvvm_clz_ll(i64) -> i32;
#[import(cc = "device", name = "llvm.nvvm.cos.approx.f")] fn nvvm_cos_approx_f(f32) -> f32;
#[import(cc = "device", name = "llvm.nvvm.cos.approx.ftz.f")] fn nvvm_cos_approx_ftz_f(f32) -> f32;
#[import(cc = "device", name = "llvm.nvvm.d2f.rm")] fn nvvm_d2f_rm(f64) -> f32;
#[import(cc = "device", name = "llvm.nvvm.d2f.rm.ftz")] fn nvvm_d2f_rm_ftz(f64) -> f32;
#[import(cc = "device", name = "llvm.nvvm.d2f.rn")] fn nvvm_d2f_rn(f64) -> f32;
#[import(cc = "device", name = "llvm.nvvm.d2f.rn.ftz")] fn nvvm_d2f_rn_ftz(f64) -> f32;
#[import(cc = "device", name = "llvm.nvvm.d2f.rp")] fn nvvm_d2f_rp(f64) -> f32;
#[import(cc = "device", name = "llvm.nvvm.d2f.rp.ftz")] fn nvvm_d2f_rp_ftz(f64) -> f32;
#[import(cc = "device", name = "llvm.nvvm.d2f.rz")] fn nvvm_d2f_rz(f64) -> f32;
#[import(cc = "device", name = "llvm.nvvm.d2f.rz.ftz")] fn nvvm_d2f_rz_ftz(f64) -> f32;
#[import(cc = "device", name = "llvm.nvvm.d2i.hi")] fn nvvm_d2i_hi(f64) -> i32;
#[import(cc = "device", name = "llvm.nvvm.d2i.lo")] fn nvvm_d2i_lo(f64) -> i32;
#[import(cc = "device", name = "llvm.nvvm.d2i.rm")] fn nvvm_d2i_rm(f64) -> i32;
#[import(cc = "device", name = "llvm.nvvm.d2i.rn")] fn nvvm_d2i_rn(f64) -> i32;
#[import(cc = "device", name = "llvm.nvvm.d2i.rp")] fn nvvm_d2i_rp(f64) -> i32;
#[import(cc = "device", name = "llvm.nvvm.d2i.rz")] fn nvvm_d2i_rz(f64) -> i32;
#[import(cc = "device", name = "llvm.nvvm.d2ll.rm")] fn nvvm_d2ll_rm(f64) -> i64;
#[import(cc = "device", name = "llvm.nvvm.d2ll.rn")] fn nvvm_d2ll_rn(f64) -> i64;
#[import(cc = "device", name = "llvm.nvvm.d2ll.rp")] fn nvvm_d2ll_rp(f64) -> i64;
#[import(cc = "device", name = "llvm.nvvm.d2ll.rz")] fn nvvm_d2ll_rz(f64) -> i64;
#[import(cc = "device", name = "llvm.nvvm.d2ui.rm")] fn nvvm_d2ui_rm(f64) -> i32;
#[import(cc = "device", name = "llvm.nvvm.d2ui.rn")] fn nvvm_d2ui_rn(f64) -> i32;
#[import(cc = "device", name = "llvm.nvvm.d2ui.rp")] fn nvvm_d2ui_rp(f64) -> i32;
#[import(cc = "device", name = "llvm.nvvm.d2ui.rz")] fn nvvm_d2ui_rz(f64) -> i32;
#[import(cc = "device", name = "llvm.nvvm.d2ull.rm")] fn nvvm_d2ull_rm(f64) -> i64;
#[import(cc = "device", name = "llvm.nvvm.d2ull.rn")] fn nvvm_d2ull_rn(f64) -> i64;
#[import(cc = "device", name = "llvm.nvvm.d2ull.rp")] fn nvvm_d2ull_rp(f64) -> i64;
#[import(cc = "device", name = "llvm.nvvm.d2ull.rz")] fn nvvm_d2ull_rz(f64) -> i64;
#[import(cc = "device", name = "llvm.nvvm.div.approx.f")] fn nvvm_div_approx_f(f32, f32) -> f32;
#[import(cc = "device", name = "llvm.nvvm.div.approx.ftz.f")] fn nvvm_div_approx_ftz_f(f32, f32) -> f32;
#[import(cc = "device", name = "llvm.nvvm.div.rm.d")] fn nvvm_div_rm_d(f64, f64) -> f64;
#[import(cc = "device", name = "llvm.nvvm.div.rm.f")] fn nvvm_div_rm_f(f32, f32) -> f32;
#[import(cc = "device", name = "llvm.nvvm.div.rm.ftz.f")] fn nvvm_div_rm_ftz_f(f32, f32) -> f32;
#[import(cc = "device", name = "llvm.nvvm.div.rn.d")] fn nvvm_div_rn_d(f64, f64) -> f64;
#[import(cc = "device", name = "llvm.nvvm.div.rn.f")] fn nvvm_div_rn_f(f32, f32) -> f32;
#[import(cc = "device", name = "llvm.nvvm.div.rn.ftz.f")] fn nvvm_div_rn_ftz_f(f32, f32) -> f32;
#[import(cc = "device", name = "llvm.nvvm.div.rp.d")] fn nvvm_div_rp_d(f64, f64) -> f64;
#[import(cc = "device", name = "llvm.nvvm.div.rp.f")] fn nvvm_div_rp_f(f32, f32) -> f32;
#[import(cc = "device", name = "llvm.nvvm.div.rp.ftz.f")] fn nvvm_div_rp_ftz_f(f32, f32) -> f32;
#[import(cc = "device", name = "llvm.nvvm.div.rz.d")] fn nvvm_div_rz_d(f64, f64) -> f64;
#[import(cc = "device", name = "llvm.nvvm.div.rz.f")] fn nvvm_div_rz_f(f32, f32) -> f32;
#[import(cc = "device", name = "llvm.nvvm.div.rz.ftz.f")] fn nvvm_div_rz_ftz_f(f32, f32) -> f32;
#[import(cc = "device", name = "llvm.nvvm.ex2.approx.d")] fn nvvm_ex2_approx_d(f64) -> f64;
#[import(cc = "device", name = "llvm.nvvm.ex2.approx.f")] fn nvvm_ex2_approx_f(f32) -> f32;
#[import(cc = "device", name = "llvm.nvvm.ex2.approx.ftz.f")] fn nvvm_ex2_approx_ftz_f(f32) -> f32;
#[import(cc = "device", name = "llvm.nvvm.f2h.rn")] fn nvvm_f2h_rn(f32) -> i16;
#[import(cc = "device", name = "llvm.nvvm.f2h.rn.ftz")] fn nvvm_f2h_rn_ftz(f32) -> i16;
#[import(cc = "device", name = "llvm.nvvm.f2i.rm")] fn nvvm_f2i_rm(f32) -> i32;
#[import(cc = "device", name = "llvm.nvvm.f2i.rm.ftz")] fn nvvm_f2i_rm_ftz(f32) -> i32;
#[import(cc = "device", name = "llvm.nvvm.f2i.rn")] fn nvvm_f2i_rn(f32) -> i32;
#[import(cc = "device", name = "llvm.nvvm.f2i.rn.ftz")] fn nvvm_f2i_rn_ftz(f32) -> i32;
#[import(cc = "device", name = "llvm.nvvm.f2i.rp")] fn nvvm_f2i_rp(f32) -> i32;
#[import(cc = "device", name = "llvm.nvvm.f2i.rp.ftz")] fn nvvm_f2i_rp_ftz(f32) -> i32;
#[import(cc = "device", name = "llvm.nvvm.f2i.rz")] fn nvvm_f2i_rz(f32) -> i32;
#[import(cc = "device", name = "llvm.nvvm.f2i.rz.ftz")] fn nvvm_f2i_rz_ftz(f32) -> i32;
#[import(cc = "device", name = "llvm.nvvm.f2ll.rm")] fn nvvm_f2ll_rm(f32) -> i64;
#[import(cc = "device", name = "llvm.nvvm.f2ll.rm.ftz")] fn nvvm_f2ll_rm_ftz(f32) -> i64;
#[import(cc = "device", name = "llvm.nvvm.f2ll.rn")] fn nvvm_f2ll_rn(f32) -> i64;
#[import(cc = "device", name = "llvm.nvvm.f2ll.rn.ftz")] fn nvvm_f2ll_rn_ftz(f32) -> i64;
#[import(cc = "device", name = "llvm.nvvm.f2ll.rp")] fn nvvm_f2ll_rp(f32) -> i64;
#[import(cc = "device", name = "llvm.nvvm.f2ll.rp.ftz")] fn nvvm_f2ll_rp_ftz(f32) -> i64;
#[import(cc = "device", name = "llvm.nvvm.f2ll.rz")] fn nvvm_f2ll_rz(f32) -> i64;
#[import(cc = "device", name = "llvm.nvvm.f2ll.rz.ftz")] fn nvvm_f2ll_rz_ftz(f32) -> i64;
#[import(cc = "device", name = "llvm.nvvm.f2ui.rm")] fn nvvm_f2ui_rm(f32) -> i32;
#[import(cc = "device", name = "llvm.nvvm.f2ui.rm.ftz")] fn nvvm_f2ui_rm_ftz(f32) -> i32;
#[import(cc = "device", name = "llvm.nvvm.f2ui.rn")] fn nvvm_f2ui_rn(f32) -> i32;
#[import(cc = "device", name = "llvm.nvvm.f2ui.rn.ftz")] fn nvvm_f2ui_rn_ftz(f32) -> i32;
#[import(cc = "device", name = "llvm.nvvm.f2ui.rp")] fn nvvm_f2ui_rp(f32) -> i32;
#[import(cc = "device", name = "llvm.nvvm.f2ui.rp.ftz")] fn nvvm_f2ui_rp_ftz(f32) -> i32;
#[import(cc = "device", name = "llvm.nvvm.f2ui.rz")] fn nvvm_f2ui_rz(f32) -> i32;
#[import(cc = "device", name = "llvm.nvvm.f2ui.rz.ftz")] fn nvvm_f2ui_rz_ftz(f32) -> i32;
#[import(cc = "device", name = "llvm.nvvm.f2ull.rm")] fn nvvm_f2ull_rm(f32) -> i64;
#[import(cc = "device", name = "llvm.nvvm.f2ull.rm.ftz")] fn nvvm_f2ull_rm_ftz(f32) -> i64;
#[import(cc = "device", name = "llvm.nvvm.f2ull.rn")] fn nvvm_f2ull_rn(f32) -> i64;
#[import(cc = "device", name = "llvm.nvvm.f2ull.rn.ftz")] fn nvvm_f2ull_rn_ftz(f32) -> i64;
#[import(cc = "device", name = "llvm.nvvm.f2ull.rp")] fn nvvm_f2ull_rp(f32) -> i64;
#[import(cc = "device", name = "llvm.nvvm.f2ull.rp.ftz")] fn nvvm_f2ull_rp_ftz(f32) -> i64;
#[import(cc = "device", name = "llvm.nvvm.f2ull.rz")] fn nvvm_f2ull_rz(f32) -> i64;
#[import(cc = "device", name = "llvm.nvvm.f2ull.rz.ftz")] fn nvvm_f2ull_rz_ftz(f32) -> i64;
#[import(cc = "device", name = "llvm.nvvm.fabs.d")] fn nvvm_fabs_d(f64) -> f64;
#[import(cc = "device", name = "llvm.nvvm.fabs.f")] fn nvvm_fabs_f(f32) -> f32;
#[import(cc = "device", name = "llvm.nvvm.fabs.ftz.f")] fn nvvm_fabs_ftz_f(f32) -> f32;
#[import(cc = "device", name = "llvm.nvvm.floor.d")] fn nvvm_floor_d(f64) -> f64;
#[import(cc = "device", name = "llvm.nvvm.floor.f")] fn nvvm_floor_f(f32) -> f32;
#[import(cc = "device", name = "llvm.nvvm.floor.ftz.f")] fn nvvm_floor_ftz_f(f32) -> f32;
#[import(cc = "device", name = "llvm.nvvm.fma.rm.d")] fn nvvm_fma_rm_d(f64, f64, f64) -> f64;
#[import(cc = "device", name = "llvm.nvvm.fma.rm.f")] fn nvvm_fma_rm_f(f32, f32, f32) -> f32;
#[import(cc = "device", name = "llvm.nvvm.fma.rm.ftz.f")] fn nvvm_fma_rm_ftz_f(f32, f32, f32) -> f32;
#[import(cc = "device", name = "llvm.nvvm.fma.rn.d")] fn nvvm_fma_rn_d(f64, f64, f64) -> f64;
#[import(cc = "device", name = "llvm.nvvm.fma.rn.f")] fn nvvm_fma_rn_f(f32, f32, f32) -> f32;
#[import(cc = "device", name = "llvm.nvvm.fma.rn.ftz.f")] fn nvvm_fma_rn_ftz_f(f32, f32, f32) -> f32;
#[import(cc = "device", name = "llvm.nvvm.fma.rp.d")] fn nvvm_fma_rp_d(f64, f64, f64) -> f64;
#[import(cc = "device", name = "llvm.nvvm.fma.rp.f")] fn nvvm_fma_rp_f(f32, f32, f32) -> f32;
#[import(cc = "device", name = "llvm.nvvm.fma.rp.ftz.f")] fn nvvm_fma_rp_ftz_f(f32, f32, f32) -> f32;
#[import(cc = "device", name = "llvm.nvvm.fma.rz.d")] fn nvvm_fma_rz_d(f64, f64, f64) -> f64;
#[import(cc = "device", name = "llvm.nvvm.fma.rz.f")] fn nvvm_fma_rz_f(f32, f32, f32) -> f32;
#[import(cc = "device", name = "llvm.nvvm.fma.rz.ftz.f")] fn nvvm_fma_rz_ftz_f(f32, f32, f32) -> f32;
//#[import(cc = "device", name = "llvm.nvvm.fmax.d")] fn nvvm_fmax(f64, f64) -> f64;
//#[import(cc = "device", name = "llvm.nvvm.fmax.f")] fn nvvm_fmaxf(f32, f32) -> f32;
#[import(cc = "device", name = "llvm.nvvm.fmax.ftz.f")] fn nvvm_fmax_ftz_f(f32, f32) -> f32;
//#[import(cc = "device", name = "llvm.nvvm.fmin.d")] fn nvvm_fmin(f64, f64) -> f64;
//#[import(cc = "device", name = "llvm.nvvm.fmin.f")] fn nvvm_fminf(f32, f32) -> f32;
#[import(cc = "device", name = "llvm.nvvm.fmin.ftz.f")] fn nvvm_fmin_ftz_f(f32, f32) -> f32;
#[import(cc = "device", name = "llvm.nvvm.h2f")] fn nvvm_h2f(i16) -> f32;
#[import(cc = "device", name = "llvm.nvvm.i2d.rm")] fn nvvm_i2d_rm(i32) -> f64;
#[import(cc = "device", name = "llvm.nvvm.i2d.rn")] fn nvvm_i2d_rn(i32) -> f64;
#[import(cc = "device", name = "llvm.nvvm.i2d.rp")] fn nvvm_i2d_rp(i32) -> f64;
#[import(cc = "device", name = "llvm.nvvm.i2d.rz")] fn nvvm_i2d_rz(i32) -> f64;
#[import(cc = "device", name = "llvm.nvvm.i2f.rm")] fn nvvm_i2f_rm(i32) -> f32;
#[import(cc = "device", name = "llvm.nvvm.i2f.rn")] fn nvvm_i2f_rn(i32) -> f32;
#[import(cc = "device", name = "llvm.nvvm.i2f.rp")] fn nvvm_i2f_rp(i32) -> f32;
#[import(cc = "device", name = "llvm.nvvm.i2f.rz")] fn nvvm_i2f_rz(i32) -> f32;
#[import(cc = "device", name = "llvm.nvvm.lg2.approx.d")] fn nvvm_lg2_approx_d(f64) -> f64;
#[import(cc = "device", name = "llvm.nvvm.lg2.approx.f")] fn nvvm_lg2_approx_f(f32) -> f32;
#[import(cc = "device", name = "llvm.nvvm.lg2.approx.ftz.f")] fn nvvm_lg2_approx_ftz_f(f32) -> f32;
#[import(cc = "device", name = "llvm.nvvm.ll2d.rm")] fn nvvm_ll2d_rm(i64) -> f64;
#[import(cc = "device", name = "llvm.nvvm.ll2d.rn")] fn nvvm_ll2d_rn(i64) -> f64;
#[import(cc = "device", name = "llvm.nvvm.ll2d.rp")] fn nvvm_ll2d_rp(i64) -> f64;
#[import(cc = "device", name = "llvm.nvvm.ll2d.rz")] fn nvvm_ll2d_rz(i64) -> f64;
#[import(cc = "device", name = "llvm.nvvm.ll2f.rm")] fn nvvm_ll2f_rm(i64) -> f32;
#[import(cc = "device", name = "llvm.nvvm.ll2f.rn")] fn nvvm_ll2f_rn(i64) -> f32;
#[import(cc = "device", name = "llvm.nvvm.ll2f.rp")] fn nvvm_ll2f_rp(i64) -> f32;
#[import(cc = "device", name = "llvm.nvvm.ll2f.rz")] fn nvvm_ll2f_rz(i64) -> f32;
#[import(cc = "device", name = "llvm.nvvm.lohi.i2d")] fn nvvm_lohi_i2d(i32, i32) -> f64;
#[import(cc = "device", name = "llvm.nvvm.max.i")] fn nvvm_max(i32, i32) -> i32;
#[import(cc = "device", name = "llvm.nvvm.max.ll")] fn nvvm_max_ll(i64, i64) -> i64;
#[import(cc = "device", name = "llvm.nvvm.max.ui")] fn nvvm_max_ui(i32, i32) -> i32;
#[import(cc = "device", name = "llvm.nvvm.max.ull")] fn nvvm_max_ull(i64, i64) -> i64;
#[import(cc = "device", name = "llvm.nvvm.membar.cta")] fn nvvm_membar_cta() -> ();
#[import(cc = "device", name = "llvm.nvvm.membar.gl")] fn nvvm_membar_gl() -> ();
#[import(cc = "device", name = "llvm.nvvm.membar.sys")] fn nvvm_membar_sys() -> ();
#[import(cc = "device", name = "llvm.nvvm.min.i")] fn nvvm_min(i32, i32) -> i32;
#[import(cc = "device", name = "llvm.nvvm.min.ll")] fn nvvm_min_ll(i64, i64) -> i64;
#[import(cc = "device", name = "llvm.nvvm.min.ui")] fn nvvm_min_ui(i32, i32) -> i32;
#[import(cc = "device", name = "llvm.nvvm.min.ull")] fn nvvm_min_ull(i64, i64) -> i64;
#[import(cc = "device", name = "llvm.nvvm.move.f64")] fn nvvm_move_f64(f64) -> f64;
#[import(cc = "device", name = "llvm.nvvm.move.f32")] fn nvvm_move_f32(f32) -> f32;
#[import(cc = "device", name = "llvm.nvvm.move.i16")] fn nvvm_move_i16(i16) -> i16;
#[import(cc = "device", name = "llvm.nvvm.move.i32")] fn nvvm_move_i32(i32) -> i32;
#[import(cc = "device", name = "llvm.nvvm.move.i64")] fn nvvm_move_i64(i64) -> i64;
#[import(cc = "device", name = "llvm.nvvm.mul24.i")] fn nvvm_mul24_i(i32, i32) -> i32;
#[import(cc = "device", name = "llvm.nvvm.mul24.ui")] fn nvvm_mul24_ui(i32, i32) -> i32;
#[import(cc = "device", name = "llvm.nvvm.mul.rm.d")] fn nvvm_mul_rm_d(f64, f64) -> f64;
#[import(cc = "device", name = "llvm.nvvm.mul.rm.f")] fn nvvm_mul_rm_f(f32, f32) -> f32;
#[import(cc = "device", name = "llvm.nvvm.mul.rm.ftz.f")] fn nvvm_mul_rm_ftz_f(f32, f32) -> f32;
#[import(cc = "device", name = "llvm.nvvm.mul.rn.d")] fn nvvm_mul_rn_d(f64, f64) -> f64;
#[import(cc = "device", name = "llvm.nvvm.mul.rn.f")] fn nvvm_mul_rn_f(f32, f32) -> f32;
#[import(cc = "device", name = "llvm.nvvm.mul.rn.ftz.f")] fn nvvm_mul_rn_ftz_f(f32, f32) -> f32;
#[import(cc = "device", name = "llvm.nvvm.mul.rp.d")] fn nvvm_mul_rp_d(f64, f64) -> f64;
#[import(cc = "device", name = "llvm.nvvm.mul.rp.f")] fn nvvm_mul_rp_f(f32, f32) -> f32;
#[import(cc = "device", name = "llvm.nvvm.mul.rp.ftz.f")] fn nvvm_mul_rp_ftz_f(f32, f32) -> f32;
#[import(cc = "device", name = "llvm.nvvm.mul.rz.d")] fn nvvm_mul_rz_d(f64, f64) -> f64;
#[import(cc = "device", name = "llvm.nvvm.mul.rz.f")] fn nvvm_mul_rz_f(f32, f32) -> f32;
#[import(cc = "device", name = "llvm.nvvm.mul.rz.ftz.f")] fn nvvm_mul_rz_ftz_f(f32, f32) -> f32;
#[import(cc = "device", name = "llvm.nvvm.mulhi.i")] fn nvvm_mulhi_i(i32, i32) -> i32;
#[import(cc = "device", name = "llvm.nvvm.mulhi.ll")] fn nvvm_mulhi_ll(i64, i64) -> i64;
#[import(cc = "device", name = "llvm.nvvm.mulhi.ui")] fn nvvm_mulhi_ui(i32, i32) -> i32;
#[import(cc = "device", name = "llvm.nvvm.mulhi.ull")] fn nvvm_mulhi_ull(i64, i64) -> i64;
#[import(cc = "device", name = "llvm.nvvm.popc.i")] fn nvvm_popc_i(i32) -> i32;
#[import(cc = "device", name = "llvm.nvvm.popc.ll")] fn nvvm_popc_ll(i64) -> i32;
#[import(cc = "device", name = "llvm.nvvm.prmt")] fn nvvm_prmt(i32, i32, i32) -> i32;
#[import(cc = "device", name = "llvm.nvvm.rcp.approx.ftz.d")] fn nvvm_rcp_approx_ftz_d(f64) -> f64;
#[import(cc = "device", name = "llvm.nvvm.rcp.rm.d")] fn nvvm_rcp_rm_d(f64) -> f64;
#[import(cc = "device", name = "llvm.nvvm.rcp.rm.f")] fn nvvm_rcp_rm_f(f32) -> f32;
#[import(cc = "device", name = "llvm.nvvm.rcp.rm.ftz.f")] fn nvvm_rcp_rm_ftz_f(f32) -> f32;
#[import(cc = "device", name = "llvm.nvvm.rcp.rn.d")] fn nvvm_rcp_rn_d(f64) -> f64;
#[import(cc = "device", name = "llvm.nvvm.rcp.rn.f")] fn nvvm_rcp_rn_f(f32) -> f32;
#[import(cc = "device", name = "llvm.nvvm.rcp.rn.ftz.f")] fn nvvm_rcp_rn_ftz_f(f32) -> f32;
#[import(cc = "device", name = "llvm.nvvm.rcp.rp.d")] fn nvvm_rcp_rp_d(f64) -> f64;
#[import(cc = "device", name = "llvm.nvvm.rcp.rp.f")] fn nvvm_rcp_rp_f(f32) -> f32;
#[import(cc = "device", name = "llvm.nvvm.rcp.rp.ftz.f")] fn nvvm_rcp_rp_ftz_f(f32) -> f32;
#[import(cc = "device", name = "llvm.nvvm.rcp.rz.d")] fn nvvm_rcp_rz_d(f64) -> f64;
#[import(cc = "device", name = "llvm.nvvm.rcp.rz.f")] fn nvvm_rcp_rz_f(f32) -> f32;
#[import(cc = "device", name = "llvm.nvvm.rcp.rz.ftz.f")] fn nvvm_rcp_rz_ftz_f(f32) -> f32;
#[import(cc = "device", name = "llvm.nvvm.round.d")] fn nvvm_round_d(f64) -> f64;
#[import(cc = "device", name = "llvm.nvvm.round.f")] fn nvvm_round_f(f32) -> f32;
#[import(cc = "device", name = "llvm.nvvm.round.ftz.f")] fn nvvm_round_ftz_f(f32) -> f32;
#[import(cc = "device", name = "llvm.nvvm.rsqrt.approx.d")] fn nvvm_rsqrt_approx_d(f64) -> f64;
#[import(cc = "device", name = "llvm.nvvm.rsqrt.approx.f")] fn nvvm_rsqrt_approx_f(f32) -> f32;
#[import(cc = "device", name = "llvm.nvvm.rsqrt.approx.ftz.f")] fn nvvm_rsqrt_approx_ftz_f(f32) -> f32;
#[import(cc = "device", name = "llvm.nvvm.sad.i")] fn nvvm_sad_i(i32, i32, i32) -> i32;
#[import(cc = "device", name = "llvm.nvvm.sad.ui")] fn nvvm_sad_ui(i32, i32, i32) -> i32;
#[import(cc = "device", name = "llvm.nvvm.saturate.d")] fn nvvm_saturate_d(f64) -> f64;
#[import(cc = "device", name = "llvm.nvvm.saturate.f")] fn nvvm_saturate_f(f32) -> f32;
#[import(cc = "device", name = "llvm.nvvm.saturate.ftz.f")] fn nvvm_saturate_ftz_f(f32) -> f32;
#[import(cc = "device", name = "llvm.nvvm.sin.approx.f")] fn nvvm_sin_approx_f(f32) -> f32;
#[import(cc = "device", name = "llvm.nvvm.sin.approx.ftz.f")] fn nvvm_sin_approx_ftz_f(f32) -> f32;
#[import(cc = "device", name = "llvm.nvvm.sqrt.approx.f")] fn nvvm_sqrt_approx_f(f32) -> f32;
#[import(cc = "device", name = "llvm.nvvm.sqrt.approx.ftz.f")] fn nvvm_sqrt_approx_ftz_f(f32) -> f32;
#[import(cc = "device", name = "llvm.nvvm.sqrt.f")] fn nvvm_sqrt_f(f32) -> f32;
#[import(cc = "device", name = "llvm.nvvm.sqrt.rm.d")] fn nvvm_sqrt_rm_d(f64) -> f64;
#[import(cc = "device", name = "llvm.nvvm.sqrt.rm.f")] fn nvvm_sqrt_rm_f(f32) -> f32;
#[import(cc = "device", name = "llvm.nvvm.sqrt.rm.ftz.f")] fn nvvm_sqrt_rm_ftz_f(f32) -> f32;
#[import(cc = "device", name = "llvm.nvvm.sqrt.rn.d")] fn nvvm_sqrt_rn_d(f64) -> f64;
#[import(cc = "device", name = "llvm.nvvm.sqrt.rn.f")] fn nvvm_sqrt_rn_f(f32) -> f32;
#[import(cc = "device", name = "llvm.nvvm.sqrt.rn.ftz.f")] fn nvvm_sqrt_rn_ftz_f(f32) -> f32;
#[import(cc = "device", name = "llvm.nvvm.sqrt.rp.d")] fn nvvm_sqrt_rp_d(f64) -> f64;
#[import(cc = "device", name = "llvm.nvvm.sqrt.rp.f")] fn nvvm_sqrt_rp_f(f32) -> f32;
#[import(cc = "device", name = "llvm.nvvm.sqrt.rp.ftz.f")] fn nvvm_sqrt_rp_ftz_f(f32) -> f32;
#[import(cc = "device", name = "llvm.nvvm.sqrt.rz.d")] fn nvvm_sqrt_rz_d(f64) -> f64;
#[import(cc = "device", name = "llvm.nvvm.sqrt.rz.f")] fn nvvm_sqrt_rz_f(f32) -> f32;
#[import(cc = "device", name = "llvm.nvvm.sqrt.rz.ftz.f")] fn nvvm_sqrt_rz_ftz_f(f32) -> f32;
#[import(cc = "device", name = "llvm.nvvm.trunc.d")] fn nvvm_trunc_d(f64) -> f64;
#[import(cc = "device", name = "llvm.nvvm.trunc.f")] fn nvvm_trunc_f(f32) -> f32;
#[import(cc = "device", name = "llvm.nvvm.trunc.ftz.f")] fn nvvm_trunc_ftz_f(f32) -> f32;
#[import(cc = "device", name = "llvm.nvvm.ui2d.rm")] fn nvvm_ui2d_rm(i32) -> f64;
#[import(cc = "device", name = "llvm.nvvm.ui2d.rn")] fn nvvm_ui2d_rn(i32) -> f64;
#[import(cc = "device", name = "llvm.nvvm.ui2d.rp")] fn nvvm_ui2d_rp(i32) -> f64;
#[import(cc = "device", name = "llvm.nvvm.ui2d.rz")] fn nvvm_ui2d_rz(i32) -> f64;
#[import(cc = "device", name = "llvm.nvvm.ui2f.rm")] fn nvvm_ui2f_rm(i32) -> f32;
#[import(cc = "device", name = "llvm.nvvm.ui2f.rn")] fn nvvm_ui2f_rn(i32) -> f32;
#[import(cc = "device", name = "llvm.nvvm.ui2f.rp")] fn nvvm_ui2f_rp(i32) -> f32;
#[import(cc = "device", name = "llvm.nvvm.ui2f.rz")] fn nvvm_ui2f_rz(i32) -> f32;
#[import(cc = "device", name = "llvm.nvvm.ull2d.rm")] fn nvvm_ull2d_rm(i64) -> f64;
#[import(cc = "device", name = "llvm.nvvm.ull2d.rn")] fn nvvm_ull2d_rn(i64) -> f64;
#[import(cc = "device", name = "llvm.nvvm.ull2d.rp")] fn nvvm_ull2d_rp(i64) -> f64;
#[import(cc = "device", name = "llvm.nvvm.ull2d.rz")] fn nvvm_ull2d_rz(i64) -> f64;
#[import(cc = "device", name = "llvm.nvvm.ull2f.rm")] fn nvvm_ull2f_rm(i64) -> f32;
#[import(cc = "device", name = "llvm.nvvm.ull2f.rn")] fn nvvm_ull2f_rn(i64) -> f32;
#[import(cc = "device", name = "llvm.nvvm.ull2f.rp")] fn nvvm_ull2f_rp(i64) -> f32;
#[import(cc = "device", name = "llvm.nvvm.ull2f.rz")] fn nvvm_ull2f_rz(i64) -> f32;
#[import(cc = "device", name = "llvm.nvvm.ldg.global.i.i8.p1")] fn nvvm_ldg_u8_p1(&addrspace(1)u8, i32) -> u8;
#[import(cc = "device", name = "llvm.nvvm.ldg.global.i.i32.p1")] fn nvvm_ldg_i32_p1(&addrspace(1)i32, i32) -> i32;
#[import(cc = "device", name = "llvm.nvvm.ldg.global.f.f32.p1")] fn nvvm_ldg_f32_p1(&addrspace(1)f32, i32) -> f32;
#[import(cc = "device", name = "llvm.nvvm.ldg.global.i.v4i32.p1")] fn nvvm_ldg4_i32_p1(&addrspace(1)simd[i32 * 4], i32) -> simd[i32 * 4];
#[import(cc = "device", name = "llvm.nvvm.ldg.global.f.v4f32.p1")] fn nvvm_ldg4_f32_p1(&addrspace(1)simd[f32 * 4], i32) -> simd[f32 * 4];
#[import(cc = "device", name = "llvm.nvvm.atomic.load.inc.32.p1")] fn nvvm_atomic_inc_global_u32(&mut addrspace(1)u32, u32) -> u32;
#[import(cc = "device", name = "llvm.nvvm.atomic.load.dec.32.p1")] fn nvvm_atomic_dec_global_u32(&mut addrspace(1)u32, u32) -> u32;
#[import(cc = "device", name = "llvm.nvvm.membar.gl")] fn nvvm_threadfence() -> ();
#[import(cc = "device", name = "llvm.nvvm.membar.cta")] fn nvvm_threadfence_block() -> ();
#[import(cc = "device", name = "llvm.nvvm.membar.sys")] fn nvvm_threadfence_system() -> ();
#[import(cc = "device", name = "llvm.nvvm.barrier0")] fn nvvm_block_sync() -> ();
#[import(cc = "device", name = "llvm.nvvm.barrier0.popc")] fn nvvm_block_sync_count(i32) -> i32;
#[import(cc = "device", name = "llvm.nvvm.barrier0.and")] fn nvvm_block_sync_all(i32) -> i32;
#[import(cc = "device", name = "llvm.nvvm.barrier0.or")] fn nvvm_block_sync_any(i32) -> i32;
#[import(cc = "device", name = "llvm.nvvm.bar.warp.sync")] fn nvvm_warp_sync(u32) -> ();
// #[import(cc = "device", name = "llvm.nvvm.vote.sync")] fn nvvm_warp_vote_sync(u32, i32, bool) -> (u32, bool); // seems to not be supported by LLVM
#[import(cc = "device", name = "llvm.nvvm.vote.all.sync")] fn nvvm_warp_sync_all(u32, bool) -> bool;
#[import(cc = "device", name = "llvm.nvvm.vote.any.sync")] fn nvvm_warp_sync_any(u32, bool) -> bool;
#[import(cc = "device", name = "llvm.nvvm.vote.ballot.sync")] fn nvvm_warp_sync_ballot(u32, bool) -> u32;
// #[import(cc = "device", name = "llvm.nvvm.shfl.sync.i32")] fn nvvm_warp_shfl_sync(i32, i32, i32, i32, i32) -> (i32, bool); // seems to not be supported by LLVM
#[import(cc = "device", name = "llvm.nvvm.shfl.sync.idx.i32")] fn nvvm_warp_shfl_sync_idx_i32(u32, i32, i32, i32) -> i32;
#[import(cc = "device", name = "llvm.nvvm.shfl.sync.idx.f32")] fn nvvm_warp_shfl_sync_idx_f32(u32, f32, i32, i32) -> f32;
#[import(cc = "device", name = "llvm.nvvm.shfl.sync.up.i32")] fn nvvm_warp_shfl_sync_up_i32(u32, i32, i32, i32) -> i32;
#[import(cc = "device", name = "llvm.nvvm.shfl.sync.up.f32")] fn nvvm_warp_shfl_sync_up_f32(u32, f32, i32, i32) -> f32;
#[import(cc = "device", name = "llvm.nvvm.shfl.sync.down.i32")] fn nvvm_warp_shfl_sync_down_i32(u32, i32, i32, i32) -> i32;
#[import(cc = "device", name = "llvm.nvvm.shfl.sync.down.f32")] fn nvvm_warp_shfl_sync_down_f32(u32, f32, i32, i32) -> f32;
#[import(cc = "device", name = "llvm.nvvm.shfl.sync.bfly.i32")] fn nvvm_warp_shfl_sync_bfly_i32(u32, i32, i32, i32) -> i32;
#[import(cc = "device", name = "llvm.nvvm.shfl.sync.bfly.f32")] fn nvvm_warp_shfl_sync_bfly_f32(u32, f32, i32, i32) -> f32;
// libdevice intrinsics: https://docs.nvidia.com/cuda/libdevice-users-guide
#[import(cc = "C", name = "__nv_expf")] fn nvvm_expf(f32) -> f32;
#[import(cc = "C", name = "__nv_exp2f")] fn nvvm_exp2f(f32) -> f32;
#[import(cc = "C", name = "__nv_logf")] fn nvvm_logf(f32) -> f32;
#[import(cc = "C", name = "__nv_log2f")] fn nvvm_log2f(f32) -> f32;
#[import(cc = "C", name = "__nv_powf")] fn nvvm_powf(f32, f32) -> f32;
#[import(cc = "C", name = "__nv_rsqrtf")] fn nvvm_rsqrtf(f32) -> f32;
#[import(cc = "C", name = "__nv_sqrtf")] fn nvvm_sqrtf(f32) -> f32;
#[import(cc = "C", name = "__nv_fabsf")] fn nvvm_fabsf(f32) -> f32;
#[import(cc = "C", name = "__nv_sinf")] fn nvvm_sinf(f32) -> f32;
#[import(cc = "C", name = "__nv_cosf")] fn nvvm_cosf(f32) -> f32;
#[import(cc = "C", name = "__nv_tanf")] fn nvvm_tanf(f32) -> f32;
#[import(cc = "C", name = "__nv_asinf")] fn nvvm_asinf(f32) -> f32;
#[import(cc = "C", name = "__nv_acosf")] fn nvvm_acosf(f32) -> f32;
#[import(cc = "C", name = "__nv_atanf")] fn nvvm_atanf(f32) -> f32;
#[import(cc = "C", name = "__nv_erff")] fn nvvm_erff(f32) -> f32;
#[import(cc = "C", name = "__nv_atan2f")] fn nvvm_atan2f(f32, f32) -> f32;
#[import(cc = "C", name = "__nv_fmaxf")] fn nvvm_fmaxf(f32, f32) -> f32;
#[import(cc = "C", name = "__nv_fminf")] fn nvvm_fminf(f32, f32) -> f32;
#[import(cc = "C", name = "__nv_fmodf")] fn nvvm_fmodf(f32, f32) -> f32;
#[import(cc = "C", name = "__nv_floorf")] fn nvvm_floorf(f32) -> f32;
#[import(cc = "C", name = "__nv_fmaf")] fn nvvm_fmaf(f32, f32, f32) -> f32;
#[import(cc = "C", name = "__nv_fmaf")] fn nvvm_madf(f32, f32, f32) -> f32;
#[import(cc = "C", name = "__nv_isinff")] fn nvvm_isinff(f32) -> i32;
#[import(cc = "C", name = "__nv_isnanf")] fn nvvm_isnanf(f32) -> i32;
#[import(cc = "C", name = "__nv_finitef")] fn nvvm_isfinitef(f32) -> i32;
#[import(cc = "C", name = "__nv_copysignf")] fn nvvm_copysignf(f32, f32) -> f32;
#[import(cc = "C", name = "__nv_exp")] fn nvvm_exp(f64) -> f64;
#[import(cc = "C", name = "__nv_exp2")] fn nvvm_exp2(f64) -> f64;
#[import(cc = "C", name = "__nv_log")] fn nvvm_log(f64) -> f64;
#[import(cc = "C", name = "__nv_log2")] fn nvvm_log2(f64) -> f64;
#[import(cc = "C", name = "__nv_pow")] fn nvvm_pow(f64, f64) -> f64;
#[import(cc = "C", name = "__nv_rsqrt")] fn nvvm_rsqrt(f64) -> f64;
#[import(cc = "C", name = "__nv_sqrt")] fn nvvm_sqrt(f64) -> f64;
#[import(cc = "C", name = "__nv_fabs")] fn nvvm_fabs(f64) -> f64;
#[import(cc = "C", name = "__nv_sin")] fn nvvm_sin(f64) -> f64;
#[import(cc = "C", name = "__nv_cos")] fn nvvm_cos(f64) -> f64;
#[import(cc = "C", name = "__nv_tan")] fn nvvm_tan(f64) -> f64;
#[import(cc = "C", name = "__nv_asin")] fn nvvm_asin(f64) -> f64;
#[import(cc = "C", name = "__nv_acos")] fn nvvm_acos(f64) -> f64;
#[import(cc = "C", name = "__nv_atan")] fn nvvm_atan(f64) -> f64;
#[import(cc = "C", name = "__nv_erf")] fn nvvm_erf(f64) -> f64;
#[import(cc = "C", name = "__nv_atan2")] fn nvvm_atan2(f64, f64) -> f64;
#[import(cc = "C", name = "__nv_fmin")] fn nvvm_fmin(f64, f64) -> f64;
#[import(cc = "C", name = "__nv_fmax")] fn nvvm_fmax(f64, f64) -> f64;
#[import(cc = "C", name = "__nv_fmod")] fn nvvm_fmod(f64, f64) -> f64;
#[import(cc = "C", name = "__nv_floor")] fn nvvm_floor(f64) -> f64;
#[import(cc = "C", name = "__nv_fma")] fn nvvm_fma(f64, f64, f64) -> f64;
#[import(cc = "C", name = "__nv_fma")] fn nvvm_mad(f64, f64, f64) -> f64;
#[import(cc = "C", name = "__nv_isinfd")] fn nvvm_isinf(f64) -> i32;
#[import(cc = "C", name = "__nv_isnand")] fn nvvm_isnan(f64) -> i32;
#[import(cc = "C", name = "__nv_isfinited")] fn nvvm_isfinite(f64) -> i32;
#[import(cc = "C", name = "__nv_copysign")] fn nvvm_copysign(f64, f64) -> f64;
// https://github.com/nvidia-compiler-sdk/nvvmir-samples/tree/master/syscalls/vprintf.ll
// There is no direct printf() support. In order to use vprintf(), a local buffer is allocated.
// Integer types that are shorter than int need to be extended to int and float needs to be
// extended to double before being pushed into the local buffer.
#[import(cc = "device", name = "vprintf")] fn nvvm_vprintf(_fmt: &[u8], _args: &[u8]) -> i32;
//
// atomics
// 0 1 2 3 4 5 6 7 8 9 10 11 12
// operation: Xchg Add Sub And Nand Or Xor Max Min UMax UMin FAdd FSub
// 0 1 2 4 5 6 7
// ordering: NotAtomic Unordered Monotonic Acquire Release AcquireRelease SequentiallyConsistent
// syncscope: "" (system)
//
// Relaxed NVVM atomic read-modify-write wrappers.
// Per the encoding table above: ordering 2 = Monotonic (relaxed), and the empty
// syncscope "" means system scope. Global memory is addrspace(1) (atomic_p1),
// shared memory is addrspace(3) (atomic_p3).
// Untyped i32 convenience aliases:
fn @nvvm_atomic_xchg_global(addr: &mut addrspace(1)i32, val: i32) = atomic_p1[i32](0, addr, val, 2, "");
fn @nvvm_atomic_xchg_shared(addr: &mut addrspace(3)i32, val: i32) = atomic_p3[i32](0, addr, val, 2, "");
fn @nvvm_atomic_add_global(addr: &mut addrspace(1)i32, val: i32) = atomic_p1[i32](1, addr, val, 2, "");
fn @nvvm_atomic_add_shared(addr: &mut addrspace(3)i32, val: i32) = atomic_p3[i32](1, addr, val, 2, "");
fn @nvvm_atomic_sub_global(addr: &mut addrspace(1)i32, val: i32) = atomic_p1[i32](2, addr, val, 2, "");
fn @nvvm_atomic_sub_shared(addr: &mut addrspace(3)i32, val: i32) = atomic_p3[i32](2, addr, val, 2, "");
fn @nvvm_atomic_max_global(addr: &mut addrspace(1)i32, val: i32) = atomic_p1[i32](7, addr, val, 2, "");
fn @nvvm_atomic_max_shared(addr: &mut addrspace(3)i32, val: i32) = atomic_p3[i32](7, addr, val, 2, "");
fn @nvvm_atomic_min_global(addr: &mut addrspace(1)i32, val: i32) = atomic_p1[i32](8, addr, val, 2, "");
fn @nvvm_atomic_min_shared(addr: &mut addrspace(3)i32, val: i32) = atomic_p3[i32](8, addr, val, 2, "");
// Typed variants. Note the opcode selection per type: signed min/max use
// Min(8)/Max(7), unsigned use UMin(10)/UMax(9), and floating-point add/sub use
// FAdd(11)/FSub(12), all per the table above.
fn @nvvm_atomic_add_global_i32(addr: &mut addrspace(1)i32, val: i32) = atomic_p1[i32]( 1, addr, val, 2, "");
fn @nvvm_atomic_add_global_u32(addr: &mut addrspace(1)u32, val: u32) = atomic_p1[u32]( 1, addr, val, 2, "");
fn @nvvm_atomic_add_global_u64(addr: &mut addrspace(1)u64, val: u64) = atomic_p1[u64]( 1, addr, val, 2, "");
fn @nvvm_atomic_add_global_f32(addr: &mut addrspace(1)f32, val: f32) = atomic_p1[f32](11, addr, val, 2, "");
fn @nvvm_atomic_add_global_f64(addr: &mut addrspace(1)f64, val: f64) = atomic_p1[f64](11, addr, val, 2, "");
fn @nvvm_atomic_sub_global_i32(addr: &mut addrspace(1)i32, val: i32) = atomic_p1[i32]( 2, addr, val, 2, "");
fn @nvvm_atomic_sub_global_u32(addr: &mut addrspace(1)u32, val: u32) = atomic_p1[u32]( 2, addr, val, 2, "");
fn @nvvm_atomic_sub_global_u64(addr: &mut addrspace(1)u64, val: u64) = atomic_p1[u64]( 2, addr, val, 2, "");
fn @nvvm_atomic_sub_global_f32(addr: &mut addrspace(1)f32, val: f32) = atomic_p1[f32](12, addr, val, 2, "");
fn @nvvm_atomic_sub_global_f64(addr: &mut addrspace(1)f64, val: f64) = atomic_p1[f64](12, addr, val, 2, "");
fn @nvvm_atomic_and_global_i32(addr: &mut addrspace(1)i32, val: i32) = atomic_p1[i32](3, addr, val, 2, "");
fn @nvvm_atomic_and_global_u32(addr: &mut addrspace(1)u32, val: u32) = atomic_p1[u32](3, addr, val, 2, "");
fn @nvvm_atomic_and_global_u64(addr: &mut addrspace(1)u64, val: u64) = atomic_p1[u64](3, addr, val, 2, "");
fn @nvvm_atomic_or_global_i32(addr: &mut addrspace(1)i32, val: i32) = atomic_p1[i32](5, addr, val, 2, "");
fn @nvvm_atomic_or_global_u32(addr: &mut addrspace(1)u32, val: u32) = atomic_p1[u32](5, addr, val, 2, "");
fn @nvvm_atomic_or_global_u64(addr: &mut addrspace(1)u64, val: u64) = atomic_p1[u64](5, addr, val, 2, "");
fn @nvvm_atomic_xor_global_i32(addr: &mut addrspace(1)i32, val: i32) = atomic_p1[i32](6, addr, val, 2, "");
fn @nvvm_atomic_xor_global_u32(addr: &mut addrspace(1)u32, val: u32) = atomic_p1[u32](6, addr, val, 2, "");
fn @nvvm_atomic_xor_global_u64(addr: &mut addrspace(1)u64, val: u64) = atomic_p1[u64](6, addr, val, 2, "");
fn @nvvm_atomic_exch_global_i32(addr: &mut addrspace(1)i32, val: i32) = atomic_p1[i32](0, addr, val, 2, "");
fn @nvvm_atomic_exch_global_u32(addr: &mut addrspace(1)u32, val: u32) = atomic_p1[u32](0, addr, val, 2, "");
fn @nvvm_atomic_exch_global_u64(addr: &mut addrspace(1)u64, val: u64) = atomic_p1[u64](0, addr, val, 2, "");
fn @nvvm_atomic_exch_global_f32(addr: &mut addrspace(1)f32, val: f32) = atomic_p1[f32](0, addr, val, 2, "");
fn @nvvm_atomic_min_global_i32(addr: &mut addrspace(1)i32, val: i32) = atomic_p1[i32]( 8, addr, val, 2, "");
fn @nvvm_atomic_min_global_u32(addr: &mut addrspace(1)u32, val: u32) = atomic_p1[u32](10, addr, val, 2, "");
fn @nvvm_atomic_min_global_u64(addr: &mut addrspace(1)u64, val: u64) = atomic_p1[u64](10, addr, val, 2, "");
fn @nvvm_atomic_max_global_i32(addr: &mut addrspace(1)i32, val: i32) = atomic_p1[i32](7, addr, val, 2, "");
fn @nvvm_atomic_max_global_u32(addr: &mut addrspace(1)u32, val: u32) = atomic_p1[u32](9, addr, val, 2, "");
fn @nvvm_atomic_max_global_u64(addr: &mut addrspace(1)u64, val: u64) = atomic_p1[u64](9, addr, val, 2, "");
// Compare-and-swap: returns the value observed at `addr` before the operation
// (old == cmp implies the swap succeeded). Both the success and the failure
// ordering are Monotonic (the two `2` arguments).
fn @nvvm_atomic_cas_global_u16(addr: &mut addrspace(1)u16, cmp: u16, new: u16) = match cmpxchg_p1[u16](addr, cmp, new, 2, 2, "") { (old, _success) => old };
fn @nvvm_atomic_cas_global_i32(addr: &mut addrspace(1)i32, cmp: i32, new: i32) = match cmpxchg_p1[i32](addr, cmp, new, 2, 2, "") { (old, _success) => old };
fn @nvvm_atomic_cas_global_u32(addr: &mut addrspace(1)u32, cmp: u32, new: u32) = match cmpxchg_p1[u32](addr, cmp, new, 2, 2, "") { (old, _success) => old };
fn @nvvm_atomic_cas_global_u64(addr: &mut addrspace(1)u64, cmp: u64, new: u64) = match cmpxchg_p1[u64](addr, cmp, new, 2, 2, "") { (old, _success) => old };
// fn @nvvm_warp_sync_all(membermask: u32, predicate: bool) -> bool = match nvvm_warp_vote_sync(membermask, 0, predicate) { (_ballot, bit) => bit };
// fn @nvvm_warp_sync_any(membermask: u32, predicate: bool) -> bool = match nvvm_warp_vote_sync(membermask, 1, predicate) { (_ballot, bit) => bit };
// fn @nvvm_warp_sync_ballot(membermask: u32, predicate: bool) -> u32 = match nvvm_warp_vote_sync(membermask, 3, predicate) { (ballot, _bit) => ballot };
// Returns the active-thread mask of the calling warp via the PTX
// `activemask.b32` instruction: bit i is set iff lane i is currently active.
// Fix: the operand placeholder must use LLVM inline-asm syntax (`$0`), as in
// every other asm statement in this file — `%0` is not substituted by LLVM and
// would be emitted verbatim into the generated PTX.
fn @nvvm_warp_activemask() -> u32 {
	let mut mask: u32;
	asm("activemask.b32 $0;" : "=r"(mask));
	mask
}
// fn @nvvm_warp_shfl_i32(membermask: u32, x: i32, src_lane: i32, width: i32) -> i32 { nvvm_warp_shfl_sync(membermask as i32, 0, x, src_lane, width)(0) }
// fn @nvvm_warp_shfl_u32(membermask: u32, x: u32, src_lane: i32, width: i32) -> u32 { nvvm_warp_shfl_i32(membermask, x as i32, src_lane, width) as u32 }
// fn @nvvm_warp_shfl_i64(membermask: u32, x: i64, src_lane: i32, width: i32) -> i64;
// fn @nvvm_warp_shfl_u64(membermask: u32, x: u64, src_lane: i32, width: i32) -> u64;
// fn @nvvm_warp_shfl_f32(membermask: u32, x: f32, src_lane: i32, width: i32) -> f32 { nvvm_bitcast_i2f(nvvm_warp_shfl_i32(membermask, nvvm_bitcast_f2i(x), src_lane, width)) }
// fn @nvvm_warp_shfl_f64(membermask: u32, x: f64, src_lane: i32, width: i32) -> f64 {
// let lo = nvvm_warp_shfl_i32(membermask, nvvm_d2i_lo x, src_lane, width)
// }
// fn @nvvm_warp_shfl_up_i32(membermask: u32, x: i32, delta: u32, width: i32) -> i32 { nvvm_warp_shfl_sync(membermask as i32, 1, x, delta as i32, width)(0) }
// fn @nvvm_warp_shfl_up_u32(membermask: u32, x: u32, delta: u32, width: i32) -> u32 { nvvm_warp_shfl_up_i32(membermask, x as i32, delta, width) as u32 }
// fn @nvvm_warp_shfl_up_i64(membermask: u32, x: i64, delta: u32, width: i32) -> i64;
// fn @nvvm_warp_shfl_up_u64(membermask: u32, x: u64, delta: u32, width: i32) -> u64;
// fn @nvvm_warp_shfl_up_f32(membermask: u32, x: f32, delta: u32, width: i32) -> f32 { nvvm_bitcast_i2f(nvvm_warp_shfl_up_i32(membermask, nvvm_bitcast_f2i(x), delta, width)) }
// fn @nvvm_warp_shfl_up_f64(membermask: u32, x: f64, delta: u32, width: i32) -> f64;
// fn @nvvm_warp_shfl_down_i32(membermask: u32, x: i32, delta: u32, width: i32) -> i32 { nvvm_warp_shfl_sync(membermask as i32, 2, x, delta as i32, width)(0) }
// fn @nvvm_warp_shfl_down_u32(membermask: u32, x: u32, delta: u32, width: i32) -> u32 { nvvm_warp_shfl_down_i32(membermask, x as i32, delta, width) as u32 }
// fn @nvvm_warp_shfl_down_i64(membermask: u32, x: i64, delta: u32, width: i32) -> i64;
// fn @nvvm_warp_shfl_down_u64(membermask: u32, x: u64, delta: u32, width: i32) -> u64;
// fn @nvvm_warp_shfl_down_f32(membermask: u32, x: f32, delta: u32, width: i32) -> f32 { nvvm_bitcast_i2f(nvvm_warp_shfl_down_i32(membermask, nvvm_bitcast_f2i(x), delta, width)) }
// fn @nvvm_warp_shfl_down_f64(membermask: u32, x: f64, delta: u32, width: i32) -> f64;
// fn @nvvm_warp_shfl_xor_i32(membermask: u32, x: i32, lane_mask: i32, width: i32) -> i32 { nvvm_warp_shfl_sync(membermask as i32, 3, x, lane_mask, width)(0) }
// fn @nvvm_warp_shfl_xor_u32(membermask: u32, x: u32, lane_mask: i32, width: i32) -> u32 { nvvm_warp_shfl_xor_i32(membermask, x as i32, lane_mask, width) as u32 }
// fn @nvvm_warp_shfl_xor_i64(membermask: u32, x: i64, lane_mask: i32, width: i32) -> i64;
// fn @nvvm_warp_shfl_xor_u64(membermask: u32, x: u64, lane_mask: i32, width: i32) -> u64;
// fn @nvvm_warp_shfl_xor_f32(membermask: u32, x: f32, lane_mask: i32, width: i32) -> f32 { nvvm_bitcast_i2f(nvvm_warp_shfl_xor_i32(membermask, nvvm_bitcast_f2i(x), lane_mask, width)) }
// fn @nvvm_warp_shfl_xor_f64(membermask: u32, x: f64, lane_mask: i32, width: i32) -> f64;
// fn @nvvm_warp_match_any_i32(membermask: u32, x: i32) -> u32;
// fn @nvvm_warp_match_any_u32(membermask: u32, x: u32) -> u32;
// fn @nvvm_warp_match_any_i64(membermask: u32, x: i64) -> u32;
// fn @nvvm_warp_match_any_u64(membermask: u32, x: u64) -> u32;
// fn @nvvm_warp_match_any_f32(membermask: u32, x: f32) -> u32;
// fn @nvvm_warp_match_any_f64(membermask: u32, x: f64) -> u32;
// fn @nvvm_warp_match_all_i32(membermask: u32, x: i32, predicate: &mut i32) -> u32;
// fn @nvvm_warp_match_all_u32(membermask: u32, x: u32, predicate: &mut i32) -> u32;
// fn @nvvm_warp_match_all_i64(membermask: u32, x: i64, predicate: &mut i32) -> u32;
// fn @nvvm_warp_match_all_u64(membermask: u32, x: u64, predicate: &mut i32) -> u32;
// fn @nvvm_warp_match_all_f32(membermask: u32, x: f32, predicate: &mut i32) -> u32;
// fn @nvvm_warp_match_all_f64(membermask: u32, x: f64, predicate: &mut i32) -> u32;
// Aborts execution of the calling kernel (PTX `trap` instruction).
fn @nvvm_trap() -> () {
	asm("trap;");
}
// Suspends the calling thread for approximately `t` nanoseconds
// (PTX `nanosleep.u32`; marked volatile so the sleep is never optimized away).
fn @nvvm_nanosleep(t: u32) -> () {
	asm("nanosleep.u32 $0;" :: "r"(t) :: "volatile");
}
// Reads the low 32 bits of the per-SM cycle counter (%clock special register).
fn @nvvm_clock() -> u32 {
	let mut cycle_count:u32;
	asm("mov.u32 $0, %clock;" : "=r"(cycle_count) ::: "volatile");
	cycle_count
}
// Reads the upper 32 bits of the 64-bit cycle counter (%clock_hi).
fn @nvvm_clock_hi() -> u32 {
	let mut cycle_count:u32;
	asm("mov.u32 $0, %clock_hi;" : "=r"(cycle_count) ::: "volatile");
	cycle_count
}
// Reads the full 64-bit cycle counter (%clock64).
fn @nvvm_clock64() -> u64 {
	let mut cycle_count:u64;
	asm("mov.u64 $0, %clock64;" : "=l"(cycle_count) ::: "volatile");
	cycle_count
}
// Reads the 64-bit %globaltimer special register.
fn @nvvm_globaltimer() -> u64 {
	let mut timestamp:u64;
	asm("mov.u64 $0, %globaltimer;" : "=l"(timestamp) ::: "volatile");
	timestamp
}
// Lower 32 bits of %globaltimer.
fn @nvvm_globaltimer_lo() -> u32 {
	let mut timestamp:u32;
	asm("mov.u32 $0, %globaltimer_lo;" : "=r"(timestamp) ::: "volatile");
	timestamp
}
// Upper 32 bits of %globaltimer.
fn @nvvm_globaltimer_hi() -> u32 {
	let mut timestamp:u32;
	asm("mov.u32 $0, %globaltimer_hi;" : "=r"(timestamp) ::: "volatile");
	timestamp
}
// Three-operand min/max helpers built on the PTX scalar video instructions:
// `vmin`/`vmax` combine `a` and `b`, and the trailing `.min`/`.max` modifier
// merges the result with `c` in a single instruction
// (see the "Scalar Video Instructions" section of the PTX ISA).
// nvvm_minmin(a, b, c) == min(min(a, b), c)
fn @nvvm_minmin(a: i32, b: i32, c: i32) -> i32 {
	let mut res: i32;
	asm("vmin.s32.s32.s32.min $0, $1, $2, $3;"
		: "=r"(res)
		: "r"(a), "r"(b), "r"(c)
	);
	res
}
// nvvm_maxmax(a, b, c) == max(max(a, b), c)
fn @nvvm_maxmax(a: i32, b: i32, c: i32) -> i32 {
	let mut res: i32;
	asm("vmax.s32.s32.s32.max $0, $1, $2, $3;"
		: "=r"(res)
		: "r"(a), "r"(b), "r"(c)
	);
	res
}
// nvvm_minmax(a, b, c) == max(min(a, b), c)
fn @nvvm_minmax(a: i32, b: i32, c: i32) -> i32 {
	let mut res: i32;
	asm("vmin.s32.s32.s32.max $0, $1, $2, $3;"
		: "=r"(res)
		: "r"(a), "r"(b), "r"(c)
	);
	res
}
// nvvm_maxmin(a, b, c) == min(max(a, b), c)
fn @nvvm_maxmin(a: i32, b: i32, c: i32) -> i32 {
	let mut res: i32;
	asm("vmax.s32.s32.s32.min $0, $1, $2, $3;"
		: "=r"(res)
		: "r"(a), "r"(b), "r"(c)
	);
	res
}
// Read-only cached global loads (__ldg-style). The integer passed to the
// *_p1 intrinsics is presumably the alignment in bytes — it matches the size
// of the loaded type in every call here; TODO confirm against the backend.
fn @nvvm_ldg_u8(addr: &addrspace(1)u8) -> u8 = nvvm_ldg_u8_p1(addr, 1);
fn @nvvm_ldg_i32(addr: &addrspace(1)i32) -> i32 = nvvm_ldg_i32_p1(addr, 4);
fn @nvvm_ldg_f32(addr: &addrspace(1)f32) -> f32 = nvvm_ldg_f32_p1(addr, 4);
// Vectorized 4-wide variants (16-byte aligned).
fn @nvvm_ldg4_i32(addr: &addrspace(1)simd[i32 * 4]) -> simd[i32 * 4] = nvvm_ldg4_i32_p1(addr, 16);
fn @nvvm_ldg4_f32(addr: &addrspace(1)simd[f32 * 4]) -> simd[f32 * 4] = nvvm_ldg4_f32_p1(addr, 16);
// Population count of a 32-bit integer.
fn @nvvm_popcount(i: i32) -> i32 { nvvm_popc_i(i) }
// Constructs an Accelerator handle for CUDA/NVVM device `dev`.
fn @nvvm_accelerator(dev: i32) = Accelerator {
	// Wraps `body` into a kernel launcher: the returned function takes the
	// grid and block dimensions and runs `body` for every work item via nvvm().
	exec = @|body| |grid, block| {
		// Index/dimension accessors wired directly to the PTX special registers
		// (tid = thread index in block, ctaid = block index, ntid = block size,
		// nctaid = number of blocks).
		let work_item = WorkItem {
			tidx = nvvm_read_ptx_sreg_tid_x,
			tidy = nvvm_read_ptx_sreg_tid_y,
			tidz = nvvm_read_ptx_sreg_tid_z,
			bidx = nvvm_read_ptx_sreg_ctaid_x,
			bidy = nvvm_read_ptx_sreg_ctaid_y,
			bidz = nvvm_read_ptx_sreg_ctaid_z,
			// Global thread index = thread-in-block + block size * block index.
			gidx = @|| nvvm_read_ptx_sreg_tid_x() + nvvm_read_ptx_sreg_ntid_x() * nvvm_read_ptx_sreg_ctaid_x(),
			gidy = @|| nvvm_read_ptx_sreg_tid_y() + nvvm_read_ptx_sreg_ntid_y() * nvvm_read_ptx_sreg_ctaid_y(),
			gidz = @|| nvvm_read_ptx_sreg_tid_z() + nvvm_read_ptx_sreg_ntid_z() * nvvm_read_ptx_sreg_ctaid_z(),
			bdimx = nvvm_read_ptx_sreg_ntid_x,
			bdimy = nvvm_read_ptx_sreg_ntid_y,
			bdimz = nvvm_read_ptx_sreg_ntid_z,
			// Total grid extent in threads = blocks * block size.
			gdimx = @|| nvvm_read_ptx_sreg_nctaid_x() * nvvm_read_ptx_sreg_ntid_x(),
			gdimy = @|| nvvm_read_ptx_sreg_nctaid_y() * nvvm_read_ptx_sreg_ntid_y(),
			gdimz = @|| nvvm_read_ptx_sreg_nctaid_z() * nvvm_read_ptx_sreg_ntid_z(),
			nblkx = nvvm_read_ptx_sreg_nctaid_x,
			nblky = nvvm_read_ptx_sreg_nctaid_y,
			nblkz = nvvm_read_ptx_sreg_nctaid_z
		};
		nvvm(dev, grid, block, || @body(work_item))
	},
	// Host-side wait for completion of work queued on `dev`.
	sync = @|| synchronize_cuda(dev),
	// Device memory allocation; alloc_unified returns memory accessible from
	// both host and device.
	alloc = @|size| alloc_cuda(dev, size),
	alloc_unified = @|size| alloc_cuda_unified(dev, size),
	// Block-wide barrier usable inside kernels.
	barrier = nvvm_barrier
};
// Math-intrinsics table for the NVVM backend: maps the portable Intrinsics
// interface onto the corresponding libdevice (__nv_*) functions imported above.
static nvvm_intrinsics = Intrinsics {
	expf = nvvm_expf,
	exp2f = nvvm_exp2f,
	logf = nvvm_logf,
	log2f = nvvm_log2f,
	powf = nvvm_powf,
	rsqrtf = nvvm_rsqrtf,
	sqrtf = nvvm_sqrtf,
	fabsf = nvvm_fabsf,
	sinf = nvvm_sinf,
	cosf = nvvm_cosf,
	tanf = nvvm_tanf,
	asinf = nvvm_asinf,
	acosf = nvvm_acosf,
	atanf = nvvm_atanf,
	erff = nvvm_erff,
	atan2f = nvvm_atan2f,
	copysignf = nvvm_copysignf,
	fmaf = nvvm_fmaf,
	fmaxf = nvvm_fmaxf,
	fminf = nvvm_fminf,
	fmodf = nvvm_fmodf,
	floorf = nvvm_floorf,
	isinff = nvvm_isinff,
	isnanf = nvvm_isnanf,
	isfinitef = nvvm_isfinitef,
	exp = nvvm_exp,
	exp2 = nvvm_exp2,
	log = nvvm_log,
	log2 = nvvm_log2,
	pow = nvvm_pow,
	rsqrt = nvvm_rsqrt,
	sqrt = nvvm_sqrt,
	fabs = nvvm_fabs,
	sin = nvvm_sin,
	cos = nvvm_cos,
	tan = nvvm_tan,
	asin = nvvm_asin,
	acos = nvvm_acos,
	atan = nvvm_atan,
	erf = nvvm_erf,
	atan2 = nvvm_atan2,
	copysign = nvvm_copysign,
	fma = nvvm_fma,
	fmax = nvvm_fmax,
	fmin = nvvm_fmin,
	fmod = nvvm_fmod,
	floor = nvvm_floor,
	isinf = nvvm_isinf,
	isnan = nvvm_isnan,
	isfinite = nvvm_isfinite,
	min = nvvm_min,
	max = nvvm_max,
};
#[import(cc = "device", name = "llvm.amdgcn.dispatch.id")] fn amdgcn_dispatch_id() -> i64;
#[import(cc = "device", name = "llvm.amdgcn.dispatch.ptr")] fn amdgcn_dispatch_ptr() -> &addrspace(4)i8;
#[import(cc = "device", name = "llvm.amdgcn.implicitarg.ptr")] fn amdgcn_implicitarg_ptr() -> &addrspace(4)i8;
#[import(cc = "device", name = "llvm.amdgcn.ds.gws.barrier")] fn amdgcn_ds_gws_barrier(i32, i32) -> ();
#[import(cc = "device", name = "llvm.amdgcn.s.barrier")] fn amdgcn_s_barrier() -> ();
#[import(cc = "device", name = "llvm.amdgcn.wave.barrier")] fn amdgcn_wave_barrier() -> ();
#[import(cc = "device", name = "llvm.amdgcn.sched.barrier")] fn amdgcn_sched_barrier(i32) -> ();
#[import(cc = "device", name = "llvm.amdgcn.s.sethalt")] fn amdgcn_s_sethalt(i32) -> ();
#[import(cc = "device", name = "llvm.amdgcn.wavefrontsize")] fn amdgcn_wavefrontsize() -> i32;
#[import(cc = "device", name = "llvm.amdgcn.mbcnt.hi")] fn amdgcn_mbcnt_hi(i32, i32) -> i32;
#[import(cc = "device", name = "llvm.amdgcn.mbcnt.lo")] fn amdgcn_mbcnt_lo(i32, i32) -> i32;
#[import(cc = "device", name = "llvm.amdgcn.ds.bpermute")] fn amdgcn_ds_bpermute(i32, i32) -> i32;
#[import(cc = "device", name = "llvm.amdgcn.workgroup.id.x")] fn amdgcn_workgroup_id_x() -> i32;
#[import(cc = "device", name = "llvm.amdgcn.workgroup.id.y")] fn amdgcn_workgroup_id_y() -> i32;
#[import(cc = "device", name = "llvm.amdgcn.workgroup.id.z")] fn amdgcn_workgroup_id_z() -> i32;
#[import(cc = "device", name = "llvm.amdgcn.workitem.id.x")] fn amdgcn_workitem_id_x() -> i32;
#[import(cc = "device", name = "llvm.amdgcn.workitem.id.y")] fn amdgcn_workitem_id_y() -> i32;
#[import(cc = "device", name = "llvm.amdgcn.workitem.id.z")] fn amdgcn_workitem_id_z() -> i32;
#[import(cc = "device", name = "llvm.amdgcn.sin.f32")] fn amdgcn_sinf(f32) -> f32;
#[import(cc = "device", name = "llvm.amdgcn.cos.f32")] fn amdgcn_cosf(f32) -> f32;
#[import(cc = "device", name = "llvm.amdgcn.sin.f64")] fn amdgcn_sin(f64) -> f64;
#[import(cc = "device", name = "llvm.amdgcn.cos.f64")] fn amdgcn_cos(f64) -> f64;
#[import(cc = "device", name = "llvm.amdgcn.s.sleep")] fn amdgcn_s_sleep(i32) -> ();
#[import(cc = "device", name = "llvm.amdgcn.icmp.i32.i32")] fn amdgcn_icmp_i32(i32, i32, i32) -> i32;
#[import(cc = "device", name = "llvm.amdgcn.icmp.i64.i32")] fn amdgcn_icmp_i64(i32, i32, i32) -> i64;
#[import(cc = "device", name = "llvm.amdgcn.atomic.inc.i32.p1")] fn amdgcn_atomic_inc_global_u32(&mut addrspace(1)u32, u32, u32, u32, bool) -> u32;
#[import(cc = "device", name = "llvm.amdgcn.atomic.dec.i32.p1")] fn amdgcn_atomic_dec_global_u32(&mut addrspace(1)u32, u32, u32, u32, bool) -> u32;
#[import(cc = "device", name = "llvm.amdgcn.s.memrealtime")] fn amdgcn_s_memrealtime() -> u64;
// https://github.com/RadeonOpenCompute/ROCm-Device-Libs/blob/master/doc/OCML.md
#[import(cc = "C", name = "__ocml_exp_f32")] fn ocml_expf(f32) -> f32;
#[import(cc = "C", name = "__ocml_exp2_f32")] fn ocml_exp2f(f32) -> f32;
#[import(cc = "C", name = "__ocml_log_f32")] fn ocml_logf(f32) -> f32;
#[import(cc = "C", name = "__ocml_log2_f32")] fn ocml_log2f(f32) -> f32;
#[import(cc = "C", name = "__ocml_powr_f32")] fn ocml_powrf(f32, f32) -> f32;
#[import(cc = "C", name = "__ocml_pown_f32")] fn ocml_pownf(f32, i32) -> f32;
#[import(cc = "C", name = "__ocml_pow_f32")] fn ocml_powf(f32, f32) -> f32;
#[import(cc = "C", name = "__ocml_rsqrt_f32")] fn ocml_rsqrtf(f32) -> f32;
#[import(cc = "C", name = "__ocml_sqrt_f32")] fn ocml_sqrtf(f32) -> f32;
#[import(cc = "C", name = "__ocml_fabs_f32")] fn ocml_fabsf(f32) -> f32;
#[import(cc = "C", name = "__ocml_sin_f32")] fn ocml_sinf(f32) -> f32;
#[import(cc = "C", name = "__ocml_cos_f32")] fn ocml_cosf(f32) -> f32;
#[import(cc = "C", name = "__ocml_tan_f32")] fn ocml_tanf(f32) -> f32;
#[import(cc = "C", name = "__ocml_asin_f32")] fn ocml_asinf(f32) -> f32;
#[import(cc = "C", name = "__ocml_acos_f32")] fn ocml_acosf(f32) -> f32;
#[import(cc = "C", name = "__ocml_atan_f32")] fn ocml_atanf(f32) -> f32;
#[import(cc = "C", name = "__ocml_erf_f32")] fn ocml_erff(f32) -> f32;
#[import(cc = "C", name = "__ocml_atan2_f32")] fn ocml_atan2f(f32, f32) -> f32;
#[import(cc = "C", name = "__ocml_copysign_f32")] fn ocml_copysignf(f32, f32) -> f32;
#[import(cc = "C", name = "__ocml_fma_f32")] fn ocml_fmaf(f32, f32, f32) -> f32;
#[import(cc = "C", name = "__ocml_fmax_f32")] fn ocml_fmaxf(f32, f32) -> f32;
#[import(cc = "C", name = "__ocml_fmin_f32")] fn ocml_fminf(f32, f32) -> f32;
#[import(cc = "C", name = "__ocml_fmod_f32")] fn ocml_fmodf(f32, f32) -> f32;
#[import(cc = "C", name = "__ocml_floor_f32")] fn ocml_floorf(f32) -> f32;
#[import(cc = "C", name = "__ocml_isinf_f32")] fn ocml_isinff(f32) -> i32;
#[import(cc = "C", name = "__ocml_isnan_f32")] fn ocml_isnanf(f32) -> i32;
#[import(cc = "C", name = "__ocml_isfinite_f32")] fn ocml_isfinitef(f32) -> i32;
#[import(cc = "C", name = "__ocml_mad_f32")] fn ocml_madf(f32, f32, f32) -> f32;
#[import(cc = "C", name = "__ocml_exp_f64")] fn ocml_exp(f64) -> f64;
#[import(cc = "C", name = "__ocml_exp2_f64")] fn ocml_exp2(f64) -> f64;
#[import(cc = "C", name = "__ocml_log_f64")] fn ocml_log(f64) -> f64;
#[import(cc = "C", name = "__ocml_log2_f64")] fn ocml_log2(f64) -> f64;
#[import(cc = "C", name = "__ocml_powr_f64")] fn ocml_powr(f64, f64) -> f64;
#[import(cc = "C", name = "__ocml_pown_f64")] fn ocml_pown(f64, i32) -> f64;
#[import(cc = "C", name = "__ocml_pow_f64")] fn ocml_pow(f64, f64) -> f64;
#[import(cc = "C", name = "__ocml_rsqrt_f64")] fn ocml_rsqrt(f64) -> f64;
#[import(cc = "C", name = "__ocml_sqrt_f64")] fn ocml_sqrt(f64) -> f64;
#[import(cc = "C", name = "__ocml_fabs_f64")] fn ocml_fabs(f64) -> f64;
#[import(cc = "C", name = "__ocml_sin_f64")] fn ocml_sin(f64) -> f64;
#[import(cc = "C", name = "__ocml_cos_f64")] fn ocml_cos(f64) -> f64;
#[import(cc = "C", name = "__ocml_tan_f64")] fn ocml_tan(f64) -> f64;
#[import(cc = "C", name = "__ocml_asin_f64")] fn ocml_asin(f64) -> f64;
#[import(cc = "C", name = "__ocml_acos_f64")] fn ocml_acos(f64) -> f64;
#[import(cc = "C", name = "__ocml_atan_f64")] fn ocml_atan(f64) -> f64;
#[import(cc = "C", name = "__ocml_erf_f64")] fn ocml_erf(f64) -> f64;
#[import(cc = "C", name = "__ocml_atan2_f64")] fn ocml_atan2(f64, f64) -> f64;
#[import(cc = "C", name = "__ocml_copysign_f64")] fn ocml_copysign(f64, f64) -> f64;
#[import(cc = "C", name = "__ocml_fma_f64")] fn ocml_fma(f64, f64, f64) -> f64;
#[import(cc = "C", name = "__ocml_fmax_f64")] fn ocml_fmax(f64, f64) -> f64;
#[import(cc = "C", name = "__ocml_fmin_f64")] fn ocml_fmin(f64, f64) -> f64;
#[import(cc = "C", name = "__ocml_fmod_f64")] fn ocml_fmod(f64, f64) -> f64;
#[import(cc = "C", name = "__ocml_floor_f64")] fn ocml_floor(f64) -> f64;
#[import(cc = "C", name = "__ocml_isinf_f64")] fn ocml_isinf(f64) -> i32;
#[import(cc = "C", name = "__ocml_isnan_f64")] fn ocml_isnan(f64) -> i32;
#[import(cc = "C", name = "__ocml_isfinite_f64")] fn ocml_isfinite(f64) -> i32;
#[import(cc = "C", name = "__ocml_mad_f64")] fn ocml_mad(f64, f64, f64) -> f64;
//
// atomics
// 0 1 2 3 4 5 6 7 8 9 10 11 12
// operation: Xchg Add Sub And Nand Or Xor Max Min UMax UMin FAdd FSub
// 0 1 2 4 5 6 7
// ordering: NotAtomic Unordered Monotonic Acquire Release AcquireRelease SequentiallyConsistent
// syncscope: agent workgroup wavefront one-as agent-one-as workgroup-one-as wavefront-one-as singlethread-one-as
//
// AMDGCN atomics. Ordering 2 = Monotonic (relaxed) per the table above; the
// syncscope is "agent" (device-wide) for global memory and "workgroup" for
// shared (LDS, addrspace(3)) memory.
fn @amdgcn_atomic_load_global_i32(addr: &addrspace(1)i32) = atomic_load_p1[i32](addr, 2, "agent");
fn @amdgcn_atomic_load_global_u32(addr: &addrspace(1)u32) = atomic_load_p1[u32](addr, 2, "agent");
fn @amdgcn_atomic_load_global_u64(addr: &addrspace(1)u64) = atomic_load_p1[u64](addr, 2, "agent");
fn @amdgcn_atomic_store_global_i32(addr: &mut addrspace(1)i32, val: i32) = atomic_store_p1[i32](addr, val, 2, "agent");
fn @amdgcn_atomic_store_global_u32(addr: &mut addrspace(1)u32, val: u32) = atomic_store_p1[u32](addr, val, 2, "agent");
fn @amdgcn_atomic_store_global_u64(addr: &mut addrspace(1)u64, val: u64) = atomic_store_p1[u64](addr, val, 2, "agent");
// Untyped i32 convenience aliases:
fn @amdgcn_atomic_xchg_global(addr: &mut addrspace(1)i32, val: i32) = atomic_p1[i32](0, addr, val, 2, "agent");
fn @amdgcn_atomic_xchg_shared(addr: &mut addrspace(3)i32, val: i32) = atomic_p3[i32](0, addr, val, 2, "workgroup");
fn @amdgcn_atomic_add_global(addr: &mut addrspace(1)i32, val: i32) = atomic_p1[i32](1, addr, val, 2, "agent");
fn @amdgcn_atomic_add_shared(addr: &mut addrspace(3)i32, val: i32) = atomic_p3[i32](1, addr, val, 2, "workgroup");
fn @amdgcn_atomic_sub_global(addr: &mut addrspace(1)i32, val: i32) = atomic_p1[i32](2, addr, val, 2, "agent");
fn @amdgcn_atomic_sub_shared(addr: &mut addrspace(3)i32, val: i32) = atomic_p3[i32](2, addr, val, 2, "workgroup");
fn @amdgcn_atomic_max_global(addr: &mut addrspace(1)i32, val: i32) = atomic_p1[i32](7, addr, val, 2, "agent");
fn @amdgcn_atomic_max_shared(addr: &mut addrspace(3)i32, val: i32) = atomic_p3[i32](7, addr, val, 2, "workgroup");
fn @amdgcn_atomic_min_global(addr: &mut addrspace(1)i32, val: i32) = atomic_p1[i32](8, addr, val, 2, "agent");
fn @amdgcn_atomic_min_shared(addr: &mut addrspace(3)i32, val: i32) = atomic_p3[i32](8, addr, val, 2, "workgroup");
// Typed variants. Opcode selection per type: signed min/max use Min(8)/Max(7),
// unsigned use UMin(10)/UMax(9), floating-point add/sub use FAdd(11)/FSub(12).
fn @amdgcn_atomic_add_global_i32(addr: &mut addrspace(1)i32, val: i32) = atomic_p1[i32]( 1, addr, val, 2, "agent");
fn @amdgcn_atomic_add_global_u32(addr: &mut addrspace(1)u32, val: u32) = atomic_p1[u32]( 1, addr, val, 2, "agent");
fn @amdgcn_atomic_add_global_u64(addr: &mut addrspace(1)u64, val: u64) = atomic_p1[u64]( 1, addr, val, 2, "agent");
fn @amdgcn_atomic_add_global_f32(addr: &mut addrspace(1)f32, val: f32) = atomic_p1[f32](11, addr, val, 2, "agent");
fn @amdgcn_atomic_add_global_f64(addr: &mut addrspace(1)f64, val: f64) = atomic_p1[f64](11, addr, val, 2, "agent");
fn @amdgcn_atomic_sub_global_i32(addr: &mut addrspace(1)i32, val: i32) = atomic_p1[i32]( 2, addr, val, 2, "agent");
fn @amdgcn_atomic_sub_global_u32(addr: &mut addrspace(1)u32, val: u32) = atomic_p1[u32]( 2, addr, val, 2, "agent");
fn @amdgcn_atomic_sub_global_u64(addr: &mut addrspace(1)u64, val: u64) = atomic_p1[u64]( 2, addr, val, 2, "agent");
fn @amdgcn_atomic_sub_global_f32(addr: &mut addrspace(1)f32, val: f32) = atomic_p1[f32](12, addr, val, 2, "agent");
fn @amdgcn_atomic_sub_global_f64(addr: &mut addrspace(1)f64, val: f64) = atomic_p1[f64](12, addr, val, 2, "agent");
fn @amdgcn_atomic_and_global_i32(addr: &mut addrspace(1)i32, val: i32) = atomic_p1[i32](3, addr, val, 2, "agent");
fn @amdgcn_atomic_and_global_u32(addr: &mut addrspace(1)u32, val: u32) = atomic_p1[u32](3, addr, val, 2, "agent");
fn @amdgcn_atomic_and_global_u64(addr: &mut addrspace(1)u64, val: u64) = atomic_p1[u64](3, addr, val, 2, "agent");
fn @amdgcn_atomic_or_global_i32(addr: &mut addrspace(1)i32, val: i32) = atomic_p1[i32](5, addr, val, 2, "agent");
fn @amdgcn_atomic_or_global_u32(addr: &mut addrspace(1)u32, val: u32) = atomic_p1[u32](5, addr, val, 2, "agent");
fn @amdgcn_atomic_or_global_u64(addr: &mut addrspace(1)u64, val: u64) = atomic_p1[u64](5, addr, val, 2, "agent");
fn @amdgcn_atomic_xor_global_i32(addr: &mut addrspace(1)i32, val: i32) = atomic_p1[i32](6, addr, val, 2, "agent");
fn @amdgcn_atomic_xor_global_u32(addr: &mut addrspace(1)u32, val: u32) = atomic_p1[u32](6, addr, val, 2, "agent");
fn @amdgcn_atomic_xor_global_u64(addr: &mut addrspace(1)u64, val: u64) = atomic_p1[u64](6, addr, val, 2, "agent");
fn @amdgcn_atomic_exch_global_i32(addr: &mut addrspace(1)i32, val: i32) = atomic_p1[i32](0, addr, val, 2, "agent");
fn @amdgcn_atomic_exch_global_u32(addr: &mut addrspace(1)u32, val: u32) = atomic_p1[u32](0, addr, val, 2, "agent");
fn @amdgcn_atomic_exch_global_u64(addr: &mut addrspace(1)u64, val: u64) = atomic_p1[u64](0, addr, val, 2, "agent");
fn @amdgcn_atomic_exch_global_f32(addr: &mut addrspace(1)f32, val: f32) = atomic_p1[f32](0, addr, val, 2, "agent");
fn @amdgcn_atomic_min_global_i32(addr: &mut addrspace(1)i32, val: i32) = atomic_p1[i32]( 8, addr, val, 2, "agent");
fn @amdgcn_atomic_min_global_u32(addr: &mut addrspace(1)u32, val: u32) = atomic_p1[u32](10, addr, val, 2, "agent");
fn @amdgcn_atomic_min_global_u64(addr: &mut addrspace(1)u64, val: u64) = atomic_p1[u64](10, addr, val, 2, "agent");
fn @amdgcn_atomic_max_global_i32(addr: &mut addrspace(1)i32, val: i32) = atomic_p1[i32](7, addr, val, 2, "agent");
fn @amdgcn_atomic_max_global_u32(addr: &mut addrspace(1)u32, val: u32) = atomic_p1[u32](9, addr, val, 2, "agent");
fn @amdgcn_atomic_max_global_u64(addr: &mut addrspace(1)u64, val: u64) = atomic_p1[u64](9, addr, val, 2, "agent");
// Compare-and-swap: yields the previously stored value; success and failure
// orderings are both Monotonic.
fn @amdgcn_atomic_cas_global_u16(addr: &mut addrspace(1)u16, cmp: u16, new: u16) = match cmpxchg_p1[u16](addr, cmp, new, 2, 2, "agent") { (old, _success) => old };
fn @amdgcn_atomic_cas_global_i32(addr: &mut addrspace(1)i32, cmp: i32, new: i32) = match cmpxchg_p1[i32](addr, cmp, new, 2, 2, "agent") { (old, _success) => old };
fn @amdgcn_atomic_cas_global_u32(addr: &mut addrspace(1)u32, cmp: u32, new: u32) = match cmpxchg_p1[u32](addr, cmp, new, 2, 2, "agent") { (old, _success) => old };
fn @amdgcn_atomic_cas_global_u64(addr: &mut addrspace(1)u64, cmp: u64, new: u64) = match cmpxchg_p1[u64](addr, cmp, new, 2, 2, "agent") { (old, _success) => old };
// Device-wide memory fence; ordering 7 = SequentiallyConsistent per the table above.
fn @amdgcn_threadfence() = fence(7, "agent");
// Lane index of the calling thread within its wavefront, computed with the
// mbcnt intrinsics: mbcnt_lo covers lanes 0-31, and on wave64 hardware
// mbcnt_hi continues the count over lanes 32-63.
fn @amdgcn_lane_id() -> i32 {
	let low = amdgcn_mbcnt_lo(-1, 0);
	if amdgcn_wavefrontsize() != 64 { low } else { amdgcn_mbcnt_hi(-1, low) }
}
// Index of the calling lane among the currently active lanes, obtained by
// counting the set exec-mask bits below this lane (exec_lo first, exec_hi
// added on wave64 hardware).
fn @amdgcn_activelane() -> i32 {
	let low = amdgcn_mbcnt_lo(amdgcn_read_exec_lo(), 0);
	if amdgcn_wavefrontsize() != 64 { low } else { amdgcn_mbcnt_hi(amdgcn_read_exec_hi(), low) }
}
// LLVM integer comparison predicate CmpInst::ICMP_NE (value 33), passed to the
// llvm.amdgcn.icmp.* ballot intrinsics imported above.
static ICMP_NE = 33;
// Cross-lane ballot: bit i of the result is set iff the predicate `p` was
// non-zero on active lane i. On wave32 hardware the 32-bit result is
// zero-extended into the u64 return value.
fn @amdgcn_ballot(p: i32) -> u64 {
	if amdgcn_wavefrontsize() != 64 {
		amdgcn_icmp_i32(p, 0, ICMP_NE) as u32 as u64
	} else {
		amdgcn_icmp_i64(p, 0, ICMP_NE) as u64
	}
}
// Lane mask with exactly the calling lane's bit set.
fn @amdgcn_lanemask_eq() -> u64 {
	1:u64 << amdgcn_lane_id() as u64
}
// Mask of all active lanes whose index is strictly below the calling lane's:
// the all-ones-below-me mask intersected with the ballot of active lanes.
fn @amdgcn_lanemask_lt() -> u64 {
	let active = amdgcn_ballot(1);
	let below = (1:u64 << amdgcn_lane_id() as u64) - 1:u64;
	active & below
}
fn @amdgcn_lanemask_le() -> u64 {
amdgcn_lanemask_lt() | amdgcn_lanemask_eq()
}
fn @amdgcn_lanemask_gt() -> u64 {
let lane_id = amdgcn_lane_id();
if (amdgcn_wavefrontsize() == 64 && lane_id == 63) || (amdgcn_wavefrontsize() == 32 && lane_id == 31) {
return(0:u64)
}
let ballot = amdgcn_ballot(1);
let mask = (!0:u64) << (lane_id as u64 + 1:u64);
mask & ballot
}
fn @amdgcn_lanemask_ge() -> u64 {
amdgcn_lanemask_gt() | amdgcn_lanemask_eq()
}
// Wavefront vote: 1 iff p is non-zero on *every* active lane.
// NOTE(review): the ballot is compared against exec_lo only, i.e. the lower
// 32 lanes — verify this is intended on wave64 devices.
fn @amdgcn_sync_all(p: i32) -> i32 {
    amdgcn_wave_barrier();
    if amdgcn_icmp_i32(p, 0, ICMP_NE) == amdgcn_read_exec_lo() { 1 } else { 0 }
}
// Wavefront vote: 1 iff p is non-zero on *any* active lane.
fn @amdgcn_sync_any(p: i32) -> i32 {
    amdgcn_wave_barrier();
    if amdgcn_icmp_i32(p, 0, ICMP_NE) != 0 { 1 } else { 0 }
}
// Number of active lanes on which p is non-zero (popcount of the ballot).
fn @amdgcn_sync_count(p: i32) -> i32 {
    amdgcn_wave_barrier();
    cpu_popcount32(amdgcn_icmp_i32(p, 0, ICMP_NE))
}
// Ballot of p as a lane mask.
// NOTE(review): uses the 32-bit icmp — upper lanes are lost on wave64; confirm.
fn @amdgcn_sync_vote(p: i32) -> u64 {
    amdgcn_wave_barrier();
    (amdgcn_icmp_i32(p, 0, ICMP_NE) as u32) as u64
}
// Warp-shuffle emulation on top of ds_bpermute. `width` partitions the
// wavefront into equally sized sub-groups; shuffles never cross sub-group
// boundaries. ds_bpermute addresses source lanes by *byte* offset, hence
// the `idx << 2` on every call.
// Read `var` from lane `src_lane` within this lane's sub-group.
fn @amdgcn_shfl_i32(var: i32, src_lane: i32, width: i32) {
    let lane_id = amdgcn_lane_id();
    // lane_id & !(width - 1) is the first lane of this sub-group.
    let idx = src_lane + (lane_id & !(width - 1));
    amdgcn_ds_bpermute(idx << 2, var)
}
// Bit-pattern-preserving wrappers for other 32-bit payload types.
fn @amdgcn_shfl_u32(var: u32, src_lane: i32, width: i32) { bitcast[u32](amdgcn_shfl_i32(bitcast[i32](var), src_lane, width)) }
fn @amdgcn_shfl_f32(var: f32, src_lane: i32, width: i32) { bitcast[f32](amdgcn_shfl_i32(bitcast[i32](var), src_lane, width)) }
// Read `var` from lane_id - lane_delta; lanes whose source would fall before
// the start of the sub-group read their own value instead.
fn @amdgcn_shfl_up_i32(var: i32, lane_delta: u32, width: i32) {
    let lane_id = amdgcn_lane_id();
    let idx = if lane_id - lane_delta as i32 < lane_id & !(width - 1) { lane_id } else { lane_id - lane_delta as i32 };
    amdgcn_ds_bpermute(idx << 2, var)
}
fn @amdgcn_shfl_up_u32(var: u32, lane_delta: u32, width: i32) { bitcast[u32](amdgcn_shfl_up_i32(bitcast[i32](var), lane_delta, width)) }
fn @amdgcn_shfl_up_f32(var: f32, lane_delta: u32, width: i32) { bitcast[f32](amdgcn_shfl_up_i32(bitcast[i32](var), lane_delta, width)) }
// Read `var` from lane_id + lane_delta; lanes whose source would fall past
// the end of the sub-group read their own value instead.
fn @amdgcn_shfl_down_i32(var: i32, lane_delta: u32, width: i32) {
    let lane_id = amdgcn_lane_id();
    let idx = if (lane_id & (width - 1)) + lane_delta as i32 >= width { lane_id } else { lane_id + lane_delta as i32 };
    amdgcn_ds_bpermute(idx << 2, var)
}
fn @amdgcn_shfl_down_u32(var: u32, lane_delta: u32, width: i32) { bitcast[u32](amdgcn_shfl_down_i32(bitcast[i32](var), lane_delta, width)) }
fn @amdgcn_shfl_down_f32(var: f32, lane_delta: u32, width: i32) { bitcast[f32](amdgcn_shfl_down_i32(bitcast[i32](var), lane_delta, width)) }
// Butterfly shuffle: read from lane_id ^ lane_mask, clamped to the sub-group.
fn @amdgcn_shfl_xor_i32(var: i32, lane_mask: i32, width: i32) {
    let lane_id = amdgcn_lane_id();
    let idx = if lane_id ^ lane_mask >= (lane_id + width) & !(width - 1) { lane_id } else { lane_id ^ lane_mask };
    amdgcn_ds_bpermute(idx << 2, var)
}
fn @amdgcn_shfl_xor_u32(var: u32, lane_mask: i32, width: i32) { bitcast[u32](amdgcn_shfl_xor_i32(bitcast[i32](var), lane_mask, width)) }
fn @amdgcn_shfl_xor_f32(var: f32, lane_mask: i32, width: i32) { bitcast[f32](amdgcn_shfl_xor_i32(bitcast[i32](var), lane_mask, width)) }
// Read the full 64-bit EXEC (active-lane) mask via inline asm.
// Returns exec_hi:exec_lo packed into an i64.
fn @amdgcn_read_exec() -> i64 {
    let mut exec_lo: i32;
    let mut exec_hi: i32;
    asm("v_mov_b32_e32 $0, exec_lo\n"
        "v_mov_b32_e32 $1, exec_hi" : "=v"(exec_lo), "=v"(exec_hi) : :: "volatile");
    // Zero-extend the low half before combining: a direct `exec_lo as i64`
    // sign-extends and would smear 1s over the high word whenever lane 31 is
    // active (cf. the `as u32 as u64` pattern used by amdgcn_ballot and
    // amdgcn_sync_vote). exec_hi may sign-extend freely — the shift by 32
    // discards the extension bits.
    ((exec_hi as i64) << 32:i64) | (exec_lo as u32 as i64)
}
// Read the low 32 bits of the EXEC mask (lanes 0..31).
fn @amdgcn_read_exec_lo() -> i32 {
    let mut res: i32;
    asm("v_mov_b32_e32 $0, exec_lo" : "=v"(res) : :: "volatile");
    res
}
// Read the high 32 bits of the EXEC mask (lanes 32..63, wave64 only).
fn @amdgcn_read_exec_hi() -> i32 {
    let mut res: i32;
    asm("v_mov_b32_e32 $0, exec_hi" : "=v"(res) : :: "volatile");
    res
}
// Three-operand signed minimum, min(a, b, c), in a single v_min3_i32 instruction.
fn @amdgcn_minmin(a: i32, b: i32, c: i32) -> i32 {
    let mut res: i32;
    asm("v_min3_i32 $0, $1, $2, $3"
        : "=v"(res)
        : "v"(a), "v"(b), "v"(c)
    );
    res
}
// Three-operand signed maximum, max(a, b, c), in a single v_max3_i32 instruction.
fn @amdgcn_maxmax(a: i32, b: i32, c: i32) -> i32 {
    let mut res: i32;
    asm("v_max3_i32 $0, $1, $2, $3"
        : "=v"(res)
        : "v"(a), "v"(b), "v"(c)
    );
    res
}
// Software breakpoint: halt the wave via s_sethalt(1). The surrounding
// sched_barrier(0) calls prevent the scheduler from moving instructions
// across the halt point.
fn @amdpal_breakpoint() -> () {
    amdgcn_sched_barrier(0);
    amdgcn_s_sethalt(1);
    amdgcn_sched_barrier(0);
}
// Opaque HSA signal handle (mirrors the runtime's hsa_signal_t).
struct hsa_signal_t {
    handle : u64
}
// Mirror of the HSA AQL kernel-dispatch packet layout. amdgpu_hsa_accelerator
// reads the workgroup_size_* fields (u16 indices 2..4) and grid_size_* fields
// (u32 indices 3..5) from this structure through amdgcn_dispatch_ptr().
struct hsa_dispatch_packet_t {
    header : u16,
    setup : u16,
    workgroup_size_x : u16,
    workgroup_size_y : u16,
    workgroup_size_z : u16,
    reserved0 : u16,
    grid_size_x : u32,               // total work-items, not workgroups
    grid_size_y : u32,
    grid_size_z : u32,
    private_segment_size : u32,
    group_segment_size : u32,
    kernel_object : u64,
    kernarg_address : &[i8], // HSA_LARGE_MODEL (64-bit pointer)
    reserved2 : u64,
    completion_signal : hsa_signal_t
}
// Accelerator implementation for AMD GPUs driven through the HSA runtime.
fn @amdgpu_hsa_accelerator(dev: i32) = Accelerator {
    exec = @|body| |grid, block| {
        fn @div_round_up(num: i32, multiple: i32) -> i32 { (num + multiple - 1) / multiple }
        // Launch geometry is read from the in-flight AQL dispatch packet
        // (amdgcn_dispatch_ptr): viewed as [u16], indices 2..4 are
        // workgroup_size_x/y/z; viewed as [u32], indices 3..5 are
        // grid_size_x/y/z (see hsa_dispatch_packet_t above). HSA grid sizes
        // count work-items, so nblk* divides by the workgroup size.
        let work_item = WorkItem {
            tidx = amdgcn_workitem_id_x,
            tidy = amdgcn_workitem_id_y,
            tidz = amdgcn_workitem_id_z,
            bidx = amdgcn_workgroup_id_x,
            bidy = amdgcn_workgroup_id_y,
            bidz = amdgcn_workgroup_id_z,
            // global id = local id + workgroup size * workgroup id
            gidx = @|| amdgcn_workitem_id_x() + bitcast[&addrspace(4)[u16]](amdgcn_dispatch_ptr())(2) as i32 * amdgcn_workgroup_id_x(),
            gidy = @|| amdgcn_workitem_id_y() + bitcast[&addrspace(4)[u16]](amdgcn_dispatch_ptr())(3) as i32 * amdgcn_workgroup_id_y(),
            gidz = @|| amdgcn_workitem_id_z() + bitcast[&addrspace(4)[u16]](amdgcn_dispatch_ptr())(4) as i32 * amdgcn_workgroup_id_z(),
            bdimx = @|| bitcast[&addrspace(4)[u16]](amdgcn_dispatch_ptr())(2) as i32,
            bdimy = @|| bitcast[&addrspace(4)[u16]](amdgcn_dispatch_ptr())(3) as i32,
            bdimz = @|| bitcast[&addrspace(4)[u16]](amdgcn_dispatch_ptr())(4) as i32,
            gdimx = @|| bitcast[&addrspace(4)[u32]](amdgcn_dispatch_ptr())(3) as i32,
            gdimy = @|| bitcast[&addrspace(4)[u32]](amdgcn_dispatch_ptr())(4) as i32,
            gdimz = @|| bitcast[&addrspace(4)[u32]](amdgcn_dispatch_ptr())(5) as i32,
            nblkx = @|| div_round_up(bitcast[&addrspace(4)[u32]](amdgcn_dispatch_ptr())(3) as i32, bitcast[&addrspace(4)[u16]](amdgcn_dispatch_ptr())(2) as i32),
            nblky = @|| div_round_up(bitcast[&addrspace(4)[u32]](amdgcn_dispatch_ptr())(4) as i32, bitcast[&addrspace(4)[u16]](amdgcn_dispatch_ptr())(3) as i32),
            nblkz = @|| div_round_up(bitcast[&addrspace(4)[u32]](amdgcn_dispatch_ptr())(5) as i32, bitcast[&addrspace(4)[u16]](amdgcn_dispatch_ptr())(4) as i32)
        };
        amdgpu_hsa(dev, grid, block, || @body(work_item))
    },
    sync = @|| synchronize_hsa(dev),
    alloc = @|size| alloc_hsa(dev, size),
    alloc_unified = @|size| alloc_hsa_unified(dev, size),
    barrier = amdgcn_s_barrier,
};
#[import(cc = "device", name = "anydsl.amdpal.workitem.id.x")] fn amdpal_workitem_id_x() -> i32;
#[import(cc = "device", name = "anydsl.amdpal.workitem.id.y")] fn amdpal_workitem_id_y() -> i32;
#[import(cc = "device", name = "anydsl.amdpal.workitem.id.z")] fn amdpal_workitem_id_z() -> i32;
#[import(cc = "device", name = "anydsl.amdpal.workgroup.id.x")] fn amdpal_workgroup_id_x() -> i32;
#[import(cc = "device", name = "anydsl.amdpal.workgroup.id.y")] fn amdpal_workgroup_id_y() -> i32;
#[import(cc = "device", name = "anydsl.amdpal.workgroup.id.z")] fn amdpal_workgroup_id_z() -> i32;
#[import(cc = "device", name = "anydsl.amdpal.workgroup.size.x")] fn amdpal_workgroup_size_x() -> i32;
#[import(cc = "device", name = "anydsl.amdpal.workgroup.size.y")] fn amdpal_workgroup_size_y() -> i32;
#[import(cc = "device", name = "anydsl.amdpal.workgroup.size.z")] fn amdpal_workgroup_size_z() -> i32;
#[import(cc = "device", name = "anydsl.amdpal.nblk.x")] fn amdpal_nblk_x() -> i32;
#[import(cc = "device", name = "anydsl.amdpal.nblk.y")] fn amdpal_nblk_y() -> i32;
#[import(cc = "device", name = "anydsl.amdpal.nblk.z")] fn amdpal_nblk_z() -> i32;
// Accelerator implementation for AMD GPUs driven through the PAL backend;
// launch geometry comes from the anydsl.amdpal.* device intrinsics. Here
// nblk* is the native quantity and gdim* is derived as nblk * workgroup size
// (the inverse of the HSA accelerator above).
fn @amdgpu_pal_accelerator(dev: i32) = Accelerator {
    exec = @|body| |grid, block| {
        let work_item = WorkItem {
            tidx = amdpal_workitem_id_x,
            tidy = amdpal_workitem_id_y,
            tidz = amdpal_workitem_id_z,
            bidx = amdpal_workgroup_id_x,
            bidy = amdpal_workgroup_id_y,
            bidz = amdpal_workgroup_id_z,
            gidx = @|| amdpal_workgroup_id_x() * amdpal_workgroup_size_x() + amdpal_workitem_id_x(),
            gidy = @|| amdpal_workgroup_id_y() * amdpal_workgroup_size_y() + amdpal_workitem_id_y(),
            gidz = @|| amdpal_workgroup_id_z() * amdpal_workgroup_size_z() + amdpal_workitem_id_z(),
            bdimx = amdpal_workgroup_size_x,
            bdimy = amdpal_workgroup_size_y,
            bdimz = amdpal_workgroup_size_z,
            gdimx = @|| amdpal_nblk_x() * amdpal_workgroup_size_x(),
            gdimy = @|| amdpal_nblk_y() * amdpal_workgroup_size_y(),
            gdimz = @|| amdpal_nblk_z() * amdpal_workgroup_size_z(),
            nblkx = amdpal_nblk_x,
            nblky = amdpal_nblk_y,
            nblkz = amdpal_nblk_z
        };
        amdgpu_pal(dev, grid, block, || @body(work_item))
    },
    sync = @|| synchronize_pal(dev),
    alloc = @|size| alloc_pal(dev, size),
    alloc_unified = @|size| alloc_pal_unified(dev, size),
    barrier = amdgcn_s_barrier,
};
// Math intrinsics table for AMD GPUs, backed by the OCML device library.
static amdgpu_intrinsics = Intrinsics {
    expf = ocml_expf,
    exp2f = ocml_exp2f,
    logf = ocml_logf,
    log2f = ocml_log2f,
    // Specialize pow at compile time when the exponent is statically known
    // (?-tests): integral exponent -> pown, non-negative -> powr, else pow.
    powf = @ |x, p| {
        if ?(p == ((p as i32) as f32)) { ocml_pownf(x, p as i32) }
        else if ?(p >= 0) { ocml_powrf(x, p) }
        else { ocml_powf(x, p) }
    },
    rsqrtf = ocml_rsqrtf,
    sqrtf = ocml_sqrtf,
    fabsf = ocml_fabsf,
    sinf = ocml_sinf,
    cosf = ocml_cosf,
    tanf = ocml_tanf,
    asinf = ocml_asinf,
    acosf = ocml_acosf,
    atanf = ocml_atanf,
    erff = ocml_erff,
    atan2f = ocml_atan2f,
    copysignf = ocml_copysignf,
    fmaf = ocml_fmaf,
    fmaxf = ocml_fmaxf,
    fminf = ocml_fminf,
    fmodf = ocml_fmodf,
    floorf = ocml_floorf,
    isinff = ocml_isinff,
    isnanf = ocml_isnanf,
    isfinitef = ocml_isfinitef,
    exp = ocml_exp,
    exp2 = ocml_exp2,
    log = ocml_log,
    log2 = ocml_log2,
    // Same compile-time specialization as powf, for f64.
    pow = @ |x, p| {
        if ?(p == ((p as i32) as f64)) { ocml_pown(x, p as i32) }
        else if ?(p >= 0) { ocml_powr(x, p) }
        else { ocml_pow(x, p) }
    },
    rsqrt = ocml_rsqrt,
    sqrt = ocml_sqrt,
    fabs = ocml_fabs,
    sin = ocml_sin,
    cos = ocml_cos,
    tan = ocml_tan,
    asin = ocml_asin,
    acos = ocml_acos,
    atan = ocml_atan,
    erf = ocml_erf,
    atan2 = ocml_atan2,
    copysign = ocml_copysign,
    fma = ocml_fma,
    fmax = ocml_fmax,
    fmin = ocml_fmin,
    fmod = ocml_fmod,
    floor = ocml_floor,
    isinf = ocml_isinf,
    isnan = ocml_isnan,
    isfinite = ocml_isfinite,
    min = @|a, b| { if a < b { a } else { b } },
    max = @|a, b| { if a > b { a } else { b } },
};
// no declarations are emitted for "device" functions
#[import(cc = "device", name = "barrier")] fn opencl_barrier(u32) -> ();
#[import(cc = "device", name = "exp")] fn opencl_expf(f32) -> f32;
#[import(cc = "device", name = "exp2")] fn opencl_exp2f(f32) -> f32;
#[import(cc = "device", name = "log")] fn opencl_logf(f32) -> f32;
#[import(cc = "device", name = "log2")] fn opencl_log2f(f32) -> f32;
#[import(cc = "device", name = "pow")] fn opencl_powf(f32, f32) -> f32;
#[import(cc = "device", name = "rsqrt")] fn opencl_rsqrtf(f32) -> f32;
#[import(cc = "device", name = "sqrt")] fn opencl_sqrtf(f32) -> f32;
#[import(cc = "device", name = "fabs")] fn opencl_fabsf(f32) -> f32;
#[import(cc = "device", name = "sin")] fn opencl_sinf(f32) -> f32;
#[import(cc = "device", name = "cos")] fn opencl_cosf(f32) -> f32;
#[import(cc = "device", name = "tan")] fn opencl_tanf(f32) -> f32;
#[import(cc = "device", name = "asin")] fn opencl_asinf(f32) -> f32;
#[import(cc = "device", name = "acos")] fn opencl_acosf(f32) -> f32;
#[import(cc = "device", name = "atan")] fn opencl_atanf(f32) -> f32;
#[import(cc = "device", name = "erf")] fn opencl_erff(f32) -> f32;
#[import(cc = "device", name = "atan2")] fn opencl_atan2f(f32, f32) -> f32;
#[import(cc = "device", name = "fmod")] fn opencl_fmodf(f32, f32) -> f32;
#[import(cc = "device", name = "floor")] fn opencl_floorf(f32) -> f32;
#[import(cc = "device", name = "isinf")] fn opencl_isinff(f32) -> i32;
#[import(cc = "device", name = "isnan")] fn opencl_isnanf(f32) -> i32;
#[import(cc = "device", name = "isfinite")] fn opencl_isfinitef(f32) -> i32;
#[import(cc = "device", name = "fma")] fn opencl_fmaf(f32, f32, f32) -> f32;
#[import(cc = "device", name = "mad")] fn opencl_madf(f32, f32, f32) -> f32;
#[import(cc = "device", name = "copysign")] fn opencl_copysignf(f32, f32) -> f32;
#[import(cc = "device", name = "exp")] fn opencl_exp(f64) -> f64;
#[import(cc = "device", name = "exp2")] fn opencl_exp2(f64) -> f64;
#[import(cc = "device", name = "log")] fn opencl_log(f64) -> f64;
#[import(cc = "device", name = "log2")] fn opencl_log2(f64) -> f64;
#[import(cc = "device", name = "pow")] fn opencl_pow(f64, f64) -> f64;
#[import(cc = "device", name = "rsqrt")] fn opencl_rsqrt(f64) -> f64;
#[import(cc = "device", name = "sqrt")] fn opencl_sqrt(f64) -> f64;
#[import(cc = "device", name = "fabs")] fn opencl_fabs(f64) -> f64;
#[import(cc = "device", name = "sin")] fn opencl_sin(f64) -> f64;
#[import(cc = "device", name = "cos")] fn opencl_cos(f64) -> f64;
#[import(cc = "device", name = "tan")] fn opencl_tan(f64) -> f64;
#[import(cc = "device", name = "asin")] fn opencl_asin(f64) -> f64;
#[import(cc = "device", name = "acos")] fn opencl_acos(f64) -> f64;
#[import(cc = "device", name = "atan")] fn opencl_atan(f64) -> f64;
#[import(cc = "device", name = "erf")] fn opencl_erf(f64) -> f64;
#[import(cc = "device", name = "atan2")] fn opencl_atan2(f64, f64) -> f64;
#[import(cc = "device", name = "fmod")] fn opencl_fmod(f64, f64) -> f64;
#[import(cc = "device", name = "floor")] fn opencl_floor(f64) -> f64;
#[import(cc = "device", name = "isinf")] fn opencl_isinf(f64) -> i32;
#[import(cc = "device", name = "isnan")] fn opencl_isnan(f64) -> i32;
#[import(cc = "device", name = "isfinite")] fn opencl_isfinite(f64) -> i32;
#[import(cc = "device", name = "fma")] fn opencl_fma(f64, f64, f64) -> f64;
#[import(cc = "device", name = "mad")] fn opencl_mad(f64, f64, f64) -> f64;
#[import(cc = "device", name = "copysign")] fn opencl_copysign(f64, f64) -> f64;
#[import(cc = "device", name = "fmin")] fn opencl_fminf(f32, f32) -> f32;
#[import(cc = "device", name = "fmax")] fn opencl_fmaxf(f32, f32) -> f32;
#[import(cc = "device", name = "fmin")] fn opencl_fmin(f64, f64) -> f64;
#[import(cc = "device", name = "fmax")] fn opencl_fmax(f64, f64) -> f64;
#[import(cc = "device", name = "min")] fn opencl_min(i32, i32) -> i32;
#[import(cc = "device", name = "max")] fn opencl_max(i32, i32) -> i32;
#[import(cc = "device", name = "atomic_add")] fn opencl_atomic_add_global(&mut addrspace(1)i32, i32) -> i32;
#[import(cc = "device", name = "atomic_add")] fn opencl_atomic_add_shared(&mut addrspace(3)i32, i32) -> i32;
#[import(cc = "device", name = "atomic_min")] fn opencl_atomic_min_global(&mut addrspace(1)i32, i32) -> i32;
#[import(cc = "device", name = "atomic_min")] fn opencl_atomic_min_shared(&mut addrspace(3)i32, i32) -> i32;
#[import(cc = "device", name = "get_work_dim")] fn opencl_get_work_dim() -> u32;
#[import(cc = "device", name = "get_global_size")] fn opencl_get_global_size(u32) -> u64;
#[import(cc = "device", name = "get_global_id")] fn opencl_get_global_id(u32) -> u64;
#[import(cc = "device", name = "get_local_size")] fn opencl_get_local_size(u32) -> u64;
#[import(cc = "device", name = "get_local_id")] fn opencl_get_local_id(u32) -> u64;
#[import(cc = "device", name = "get_num_groups")] fn opencl_get_num_groups(u32) -> u64;
#[import(cc = "device", name = "get_group_id")] fn opencl_get_group_id(u32) -> u64;
#[import(cc = "device", name = "get_global_offset")] fn opencl_get_global_offset(u32) -> u64;
// Flag values for OpenCL barrier() (local / global memory fence).
static CLK_LOCAL_MEM_FENCE = 1:u32;
static CLK_GLOBAL_MEM_FENCE = 2:u32;
// Accelerator implementation for OpenCL devices; geometry comes from the
// standard work-item query functions (u64 results narrowed to i32).
fn @opencl_accelerator(dev: i32) = Accelerator {
    exec = @|body| |grid, block| {
        let work_item = WorkItem {
            tidx = @|| opencl_get_local_id(0) as i32,
            tidy = @|| opencl_get_local_id(1) as i32,
            tidz = @|| opencl_get_local_id(2) as i32,
            bidx = @|| opencl_get_group_id(0) as i32,
            bidy = @|| opencl_get_group_id(1) as i32,
            bidz = @|| opencl_get_group_id(2) as i32,
            gidx = @|| opencl_get_global_id(0) as i32,
            gidy = @|| opencl_get_global_id(1) as i32,
            gidz = @|| opencl_get_global_id(2) as i32,
            bdimx = @|| opencl_get_local_size(0) as i32,
            bdimy = @|| opencl_get_local_size(1) as i32,
            bdimz = @|| opencl_get_local_size(2) as i32,
            gdimx = @|| opencl_get_global_size(0) as i32,
            gdimy = @|| opencl_get_global_size(1) as i32,
            gdimz = @|| opencl_get_global_size(2) as i32,
            nblkx = @|| opencl_get_num_groups(0) as i32,
            nblky = @|| opencl_get_num_groups(1) as i32,
            nblkz = @|| opencl_get_num_groups(2) as i32
        };
        opencl(dev, grid, block, || @body(work_item))
    },
    sync = @|| synchronize_opencl(dev),
    alloc = @|size| alloc_opencl(dev, size),
    alloc_unified = @|size| alloc_opencl_unified(dev, size),
    // Work-group barrier with a local-memory fence.
    barrier = @|| opencl_barrier(CLK_LOCAL_MEM_FENCE),
};
// Math intrinsics table for OpenCL devices, backed by the OpenCL builtins.
static opencl_intrinsics = Intrinsics {
    expf = opencl_expf,
    exp2f = opencl_exp2f,
    logf = opencl_logf,
    log2f = opencl_log2f,
    powf = opencl_powf,
    rsqrtf = opencl_rsqrtf,
    sqrtf = opencl_sqrtf,
    fabsf = opencl_fabsf,
    sinf = opencl_sinf,
    cosf = opencl_cosf,
    tanf = opencl_tanf,
    asinf = opencl_asinf,
    acosf = opencl_acosf,
    atanf = opencl_atanf,
    erff = opencl_erff,
    atan2f = opencl_atan2f,
    copysignf = opencl_copysignf,
    fmaf = opencl_fmaf,
    fmaxf = opencl_fmaxf,
    fminf = opencl_fminf,
    fmodf = opencl_fmodf,
    floorf = opencl_floorf,
    isinff = opencl_isinff,
    isnanf = opencl_isnanf,
    isfinitef = opencl_isfinitef,
    exp = opencl_exp,
    exp2 = opencl_exp2,
    log = opencl_log,
    log2 = opencl_log2,
    pow = opencl_pow,
    rsqrt = opencl_rsqrt,
    sqrt = opencl_sqrt,
    fabs = opencl_fabs,
    sin = opencl_sin,
    cos = opencl_cos,
    tan = opencl_tan,
    asin = opencl_asin,
    acos = opencl_acos,
    atan = opencl_atan,
    erf = opencl_erf,
    atan2 = opencl_atan2,
    copysign = opencl_copysign,
    fma = opencl_fma,
    fmax = opencl_fmax,
    fmin = opencl_fmin,
    fmod = opencl_fmod,
    floor = opencl_floor,
    isinf = opencl_isinf,
    isnan = opencl_isnan,
    isfinite = opencl_isfinite,
    min = opencl_min,
    max = opencl_max,
};
#[import(cc = "builtin")] fn undef[T]() -> T;
#[import(cc = "builtin")] fn sizeof[_]() -> i64;
#[import(cc = "builtin")] fn alignof[_]() -> i64;
#[import(cc = "builtin")] fn bitcast[T, U](_src: U) -> T;
#[import(cc = "builtin")] fn select[T, U](_cond: T, _true: U, _false: U) -> U;
#[import(cc = "builtin")] fn insert[T, U](_tuple: T, _index: i32, _value: U) -> T;
#[import(cc = "thorin")] fn atomic[T](_binop: u32, _addr: &mut T, _val: T, _order: u32, _scope: &[u8]) -> T; // Xchg Add Sub And Nand Or Xor Max Min UMax UMin FAdd FSub
#[import(cc = "thorin")] fn atomic_load[T](_addr: &T, _order: u32, _scope: &[u8]) -> T;
#[import(cc = "thorin")] fn atomic_store[T](_addr: &mut T, _val: T, _order: u32, _scope: &[u8]) -> ();
#[import(cc = "thorin")] fn cmpxchg[T](_addr: &mut T, _cmp: T, _new: T, _success_order: u32, _failure_order: u32, _scope: &[u8]) -> (T, bool); // only for integer data types
#[import(cc = "thorin")] fn cmpxchg_weak[T](_addr: &mut T, _cmp: T, _new: T, _success_order: u32, _failure_order: u32, _scope: &[u8]) -> (T, bool); // only for integer data types
#[import(cc = "thorin")] fn fence(_order: u32, _scope: &[u8]) -> ();
#[import(cc = "thorin")] fn pe_info[T](_src: &[u8], _val: T) -> ();
#[import(cc = "thorin")] fn cuda(_dev: i32, _grid: (i32, i32, i32), _block: (i32, i32, i32), _body: fn() -> ()) -> ();
#[import(cc = "thorin")] fn nvvm(_dev: i32, _grid: (i32, i32, i32), _block: (i32, i32, i32), _body: fn() -> ()) -> ();
#[import(cc = "thorin")] fn opencl(_dev: i32, _grid: (i32, i32, i32), _block: (i32, i32, i32), _body: fn() -> ()) -> ();
#[import(cc = "thorin")] fn amdgpu_hsa(_dev: i32, _grid: (i32, i32, i32), _block: (i32, i32, i32), _body: fn() -> ()) -> ();
#[import(cc = "thorin")] fn amdgpu_pal(_dev: i32, _grid: (i32, i32, i32), _block: (i32, i32, i32), _body: fn() -> ()) -> ();
#[import(cc = "thorin")] fn reserve_shared[T](_size: i32) -> &mut addrspace(3)[T];
#[import(cc = "thorin")] fn hls(_dev: i32, _body: fn() -> ()) -> ();
#[import(cc = "thorin", name = "pipeline")] fn thorin_pipeline(_initiation_interval: i32, _lower: i32, _upper: i32, _body: fn(i32) -> ()) -> (); // only for HLS/OpenCL backend
#[import(cc = "thorin", name = "parallel")] fn thorin_parallel(_num_threads: i32, _lower: i32, _upper: i32, _body: fn(i32) -> ()) -> ();
#[import(cc = "thorin", name = "spawn")] fn thorin_spawn(_body: fn() -> ()) -> i32;
#[import(cc = "thorin")] fn sync(_id: i32) -> ();
#[import(cc = "thorin")] fn vectorize(_vector_length: i32, _body: fn(i32) -> ()) -> ();
#[import(cc = "thorin", name = "atomic")] fn atomic_p1[T](_binop: u32, _addr: &mut addrspace(1)T, _val: T, _order: u32, _scope: &[u8]) -> T;
#[import(cc = "thorin", name = "atomic")] fn atomic_p3[T](_binop: u32, _addr: &mut addrspace(3)T, _val: T, _order: u32, _scope: &[u8]) -> T;
#[import(cc = "thorin", name = "atomic_load")] fn atomic_load_p1[T](_addr: &addrspace(1)T, _order: u32, _scope: &[u8]) -> T;
#[import(cc = "thorin", name = "atomic_load")] fn atomic_load_p3[T](_addr: &addrspace(3)T, _order: u32, _scope: &[u8]) -> T;
#[import(cc = "thorin", name = "atomic_store")] fn atomic_store_p1[T](_addr: &mut addrspace(1)T, _val: T, _order: u32, _scope: &[u8]) -> ();
#[import(cc = "thorin", name = "atomic_store")] fn atomic_store_p3[T](_addr: &mut addrspace(3)T, _val: T, _order: u32, _scope: &[u8]) -> ();
#[import(cc = "thorin", name = "cmpxchg")] fn cmpxchg_p1[T](_addr: &mut addrspace(1)T, _cmp: T, _new: T, _success_order: u32, _failure_order: u32, _scope: &[u8]) -> (T, bool);
#[import(cc = "thorin", name = "cmpxchg")] fn cmpxchg_p3[T](_addr: &mut addrspace(3)T, _cmp: T, _new: T, _success_order: u32, _failure_order: u32, _scope: &[u8]) -> (T, bool);
#[import(cc = "thorin", name = "cmpxchg_weak")] fn cmpxchg_weak_p1[T](_addr: &mut addrspace(1)T, _cmp: T, _new: T, _success_order: u32, _failure_order: u32, _scope: &[u8]) -> (T, bool);
#[import(cc = "thorin", name = "cmpxchg_weak")] fn cmpxchg_weak_p3[T](_addr: &mut addrspace(3)T, _cmp: T, _new: T, _success_order: u32, _failure_order: u32, _scope: &[u8]) -> (T, bool);
// Curried convenience wrappers around the thorin control intrinsics: each
// takes the loop body first and returns a function of the loop parameters,
// enabling `for`-style generator usage.
fn @pipeline(body: fn(i32) -> ()) = @|initiation_interval: i32, lower: i32, upper: i32| thorin_pipeline(initiation_interval, lower, upper, body);
fn @parallel(body: fn(i32) -> ()) = @|num_threads: i32, lower: i32, upper: i32| thorin_parallel(num_threads, lower, upper, body);
fn @spawn(body: fn() -> ()) = @|| thorin_spawn(body);
#[import(cc = "C", name = "anydsl_info")] fn runtime_info() -> ();
#[import(cc = "C", name = "anydsl_device_name")] fn runtime_device_name(_device: i32) -> &[u8];
#[import(cc = "C", name = "anydsl_device_check_feature_support")] fn runtime_device_check_feature_support(_device: i32, _feature: &[u8]) -> bool;
#[import(cc = "C", name = "anydsl_alloc")] fn runtime_alloc(_device: i32, _size: i64) -> &mut [i8];
#[import(cc = "C", name = "anydsl_alloc_host")] fn runtime_alloc_host(_device: i32, _size: i64) -> &mut [i8];
#[import(cc = "C", name = "anydsl_alloc_unified")] fn runtime_alloc_unified(_device: i32, _size: i64) -> &mut [i8];
#[import(cc = "C", name = "anydsl_copy")] fn runtime_copy(_src_device: i32, _src_ptr: &[i8], _src_offset: i64, _dst_device: i32, _dst_ptr: &mut [i8], _dst_offset: i64, _size: i64) -> ();
#[import(cc = "C", name = "anydsl_get_device_ptr")] fn runtime_get_device_ptr(_device: i32, _ptr: &[i8]) -> &[i8];
#[import(cc = "C", name = "anydsl_synchronize")] fn runtime_synchronize(_device: i32) -> ();
#[import(cc = "C", name = "anydsl_release")] fn runtime_release(_device: i32, _ptr: &[i8]) -> ();
#[import(cc = "C", name = "anydsl_release_host")] fn runtime_release_host(_device: i32, _ptr: &[i8]) -> ();
#[import(cc = "C", name = "anydsl_random_seed")] fn random_seed(_: u32) -> ();
#[import(cc = "C", name = "anydsl_random_val_f32")] fn random_val_f32() -> f32;
#[import(cc = "C", name = "anydsl_random_val_u64")] fn random_val_u64() -> u64;
#[import(cc = "C", name = "anydsl_get_micro_time")] fn get_micro_time() -> i64;
#[import(cc = "C", name = "anydsl_get_nano_time")] fn get_nano_time() -> i64;
#[import(cc = "C", name = "anydsl_get_kernel_time")] fn get_kernel_time() -> i64;
#[import(cc = "C", name = "anydsl_print_i16")] fn print_i16(_: i16) -> ();
#[import(cc = "C", name = "anydsl_print_i32")] fn print_i32(_: i32) -> ();
#[import(cc = "C", name = "anydsl_print_i64")] fn print_i64(_: i64) -> ();
#[import(cc = "C", name = "anydsl_print_u16")] fn print_u16(_: u16) -> ();
#[import(cc = "C", name = "anydsl_print_u32")] fn print_u32(_: u32) -> ();
#[import(cc = "C", name = "anydsl_print_u64")] fn print_u64(_: u64) -> ();
#[import(cc = "C", name = "anydsl_print_f32")] fn print_f32(_: f32) -> ();
#[import(cc = "C", name = "anydsl_print_f64")] fn print_f64(_: f64) -> ();
#[import(cc = "C", name = "anydsl_print_char")] fn print_char(_: u8) -> ();
#[import(cc = "C", name = "anydsl_print_string")] fn print_string(_: &[u8]) -> ();
#[import(cc = "C", name = "anydsl_print_flush")] fn print_flush() -> ();
// TODO
//struct Buffer[T] {
// data : &mut [T],
// size : i64,
// device : i32
//}
//
//fn @alloc[T](device: i32, size: i64) = Buffer[T] {
// data = runtime_alloc(device, size * sizeof[T]()) as &mut [T],
// size = size,
// device = device
//};
//fn @alloc_host[T](device: i32, size: i64) = Buffer[T] {
// data = runtime_alloc_host(device, size * sizeof[T]()) as &mut [T],
// size = size,
// device = device
//};
//fn @alloc_unified[T](device: i32, size: i64) = Buffer[T] {
// data = runtime_alloc_unified(device, size * sizeof[T]()) as &mut [T],
// size = size,
// device = device
//};
//
//fn @release[T](buf: Buffer[T]) = runtime_release(buf.device, buf.data as &[i8]);
//fn @alloc_cpu[T](size: i64) = alloc[T](0, size);
//fn @alloc_cuda[T](dev: i32, size: i64) = alloc[T](runtime_device(1, dev), size);
//fn @alloc_cuda_host[T](dev: i32, size: i64) = alloc_host[T](runtime_device(1, dev), size);
//fn @alloc_cuda_unified[T](dev: i32, size: i64) = alloc_unified[T](runtime_device(1, dev), size);
//fn @synchronize_cuda(dev: i32) = runtime_synchronize(runtime_device(1, dev));
//fn @alloc_opencl[T](dev: i32, size: i64) = alloc[T](runtime_device(2, dev), size);
//fn @alloc_opencl_unified[T](dev: i32, size: i64) = alloc_unified[T](runtime_device(2, dev), size);
//fn @synchronize_opencl(dev: i32) = runtime_synchronize(runtime_device(2, dev));
//fn @alloc_hsa[T](dev: i32, size: i64) = alloc[T](runtime_device(3, dev), size);
//fn @alloc_hsa_host[T](dev: i32, size: i64) = alloc_host[T](runtime_device(3, dev), size);
//fn @alloc_hsa_unified[T](dev: i32, size: i64) = alloc_unified[T](runtime_device(3, dev), size);
//fn @synchronize_hsa(dev: i32) = runtime_synchronize(runtime_device(3, dev));
//fn @alloc_pal[T](dev: i32, size: i64) = alloc[T](runtime_device(4, dev), size);
//fn @alloc_pal_host[T](dev: i32, size: i64) = alloc_host[T](runtime_device(4, dev), size);
//fn @alloc_pal_unified[T](dev: i32, size: i64) = alloc_unified[T](runtime_device(4, dev), size);
//fn @synchronize_pal(dev: i32) = runtime_synchronize(runtime_device(4, dev));
//
//fn @copy[T](src: Buffer[T], dst: Buffer[T]) = runtime_copy(src.device, src.data as &[i8], 0, dst.device, dst.data as &mut [i8], 0, src.size);
//fn @copy_offset[T](src: Buffer[T], off_src: i64, dst: Buffer[T], off_dst: i64, size: i64) = runtime_copy(src.device, src.data as &[i8], off_src, dst.device, dst.data as &mut [i8], off_dst, size);
// Raw byte buffer managed by the AnyDSL runtime.
struct Buffer {
    data : &mut [i8],   // pointer to the allocation
    size : i64,         // allocation size, as passed to runtime_alloc*
    device : i32        // runtime device id (see runtime_device)
}
// Allocate `size` bytes of device memory and wrap the raw pointer together
// with its size and owning device.
fn @alloc(device: i32, size: i64) -> Buffer {
    let data = runtime_alloc(device, size);
    Buffer { data = data, size = size, device = device }
}
// Allocate `size` bytes of page-locked host memory accessible to `device`.
fn @alloc_host(device: i32, size: i64) -> Buffer {
    let data = runtime_alloc_host(device, size);
    Buffer { data = data, size = size, device = device }
}
// Allocate `size` bytes of unified (host+device addressable) memory.
fn @alloc_unified(device: i32, size: i64) -> Buffer {
    let data = runtime_alloc_unified(device, size);
    Buffer { data = data, size = size, device = device }
}
// Return the buffer's memory to the runtime that allocated it.
fn @release(buf: Buffer) = runtime_release(buf.device, buf.data);
// Encode (platform, device) into the single id the runtime API expects:
// low 4 bits select the platform, the remaining bits the device number.
fn @runtime_device(platform: i32, device: i32) -> i32 { platform | (device << 4) }
// Per-platform allocation/synchronization helpers. Platform ids used with
// runtime_device: 0 = host CPU, 1 = CUDA, 2 = OpenCL (also HLS), 3 = HSA, 4 = PAL.
fn @alloc_cpu(size: i64) = alloc(0, size);
fn @alloc_cuda(dev: i32, size: i64) = alloc(runtime_device(1, dev), size);
fn @alloc_cuda_host(dev: i32, size: i64) = alloc_host(runtime_device(1, dev), size);
fn @alloc_cuda_unified(dev: i32, size: i64) = alloc_unified(runtime_device(1, dev), size);
fn @synchronize_cuda(dev: i32) = runtime_synchronize(runtime_device(1, dev));
fn @alloc_opencl(dev: i32, size: i64) = alloc(runtime_device(2, dev), size);
fn @alloc_opencl_unified(dev: i32, size: i64) = alloc_unified(runtime_device(2, dev), size);
fn @synchronize_opencl(dev: i32) = runtime_synchronize(runtime_device(2, dev));
// HLS devices are addressed through the OpenCL platform (id 2).
fn @alloc_hls(dev: i32, size: i64) -> Buffer { alloc(runtime_device(2, dev), size) }
fn @alloc_hls_unified(dev: i32, size: i64) -> Buffer { alloc_unified(runtime_device(2, dev), size) }
fn @synchronize_hls(dev: i32) -> () { runtime_synchronize(runtime_device(2, dev)) }
fn @alloc_hsa(dev: i32, size: i64) = alloc(runtime_device(3, dev), size);
fn @alloc_hsa_host(dev: i32, size: i64) = alloc_host(runtime_device(3, dev), size);
fn @alloc_hsa_unified(dev: i32, size: i64) = alloc_unified(runtime_device(3, dev), size);
fn @synchronize_hsa(dev: i32) = runtime_synchronize(runtime_device(3, dev));
fn @alloc_pal(dev: i32, size: i64) = alloc(runtime_device(4, dev), size);
fn @alloc_pal_host(dev: i32, size: i64) = alloc_host(runtime_device(4, dev), size);
fn @alloc_pal_unified(dev: i32, size: i64) = alloc_unified(runtime_device(4, dev), size);
fn @synchronize_pal(dev: i32) = runtime_synchronize(runtime_device(4, dev));
// Whole-buffer copy: transfers src.size bytes from src to dst at offset 0.
fn @copy(src: Buffer, dst: Buffer) = runtime_copy(src.device, src.data, 0, dst.device, dst.data, 0, src.size);
// Sub-range copy with explicit byte offsets into src and dst.
fn @copy_offset(src: Buffer, off_src: i64, dst: Buffer, off_dst: i64, size: i64) = runtime_copy(src.device, src.data, off_src, dst.device, dst.data, off_dst, size);
// range, range_step, unroll, unroll_step, etc.
// Counted loop over [beg, end) with stride `step`, as tail recursion.
// The @(?beg & ?end & ?step) filter asks the partial evaluator to specialize
// (fully unroll) the recursion only when all three parameters are
// compile-time constants; otherwise a regular runtime loop remains.
fn @unroll_step(body: fn(i32) -> ()) {
    fn @(?beg & ?end & ?step) loop(beg: i32, end: i32, step: i32) -> () {
        if beg < end {
            @body(beg);
            loop(beg + step, end, step)
        }
    }
    loop
}
// Reverse counted loop: starts at `end` and counts down by `step` while
// end > beg — i.e. *inclusive* of `end`, *exclusive* of `beg`, which is
// asymmetric to unroll_step (inclusive of beg, exclusive of end).
// Same partial-evaluation filter as unroll_step.
fn @unroll_step_rev(body: fn(i32) -> ()) {
    fn @(?beg & ?end & ?step) loop(end: i32, beg: i32, step: i32) -> () {
        if end > beg {
            @body(end);
            loop(end - step, beg, step)
        }
    }
    loop
}
// `range*` pin their bounds with $ to block specialization (stay runtime
// loops), while `unroll*` pass bounds through so constant bounds fully unroll.
// NOTE(review): range_rev does not pin its bounds with $ — possibly an
// oversight; confirm against range/range_step.
fn @range(body: fn(i32) -> ()) = @|lower: i32, upper: i32| unroll_step(body)($lower, $upper, 1);
fn @range_step(body: fn(i32) -> ()) = @|lower: i32, upper: i32, step: i32| unroll_step(body)($lower, $upper, step);
fn @range_rev(body: fn(i32) -> ()) = @|upper: i32, lower: i32| unroll_step_rev(body)(upper, lower, 1);
fn @unroll(body: fn(i32) -> ()) = @|lower: i32, upper: i32| unroll_step(body)(lower, upper, 1);
fn @unroll_rev(body: fn(i32) -> ()) = @|upper: i32, lower: i32| unroll_step_rev(body)(upper, lower, 1);
// Type-generic math builtins resolved by the compiler backend for the
// current target (usable as math_builtins::sqrt[f32](x), etc.).
mod math_builtins {
    #[import(cc = "builtin")] fn fabs[T](T) -> T;
    #[import(cc = "builtin")] fn copysign[T](T, T) -> T;
    #[import(cc = "builtin")] fn signbit[T](T) -> bool;
    #[import(cc = "builtin")] fn round[T](T) -> T;
    #[import(cc = "builtin")] fn ceil[T](T) -> T;
    #[import(cc = "builtin")] fn floor[T](T) -> T;
    #[import(cc = "builtin")] fn fmin[T](T, T) -> T;
    #[import(cc = "builtin")] fn fmax[T](T, T) -> T;
    #[import(cc = "builtin")] fn cos[T](T) -> T;
    #[import(cc = "builtin")] fn sin[T](T) -> T;
    #[import(cc = "builtin")] fn tan[T](T) -> T;
    #[import(cc = "builtin")] fn acos[T](T) -> T;
    #[import(cc = "builtin")] fn asin[T](T) -> T;
    #[import(cc = "builtin")] fn atan[T](T) -> T;
    #[import(cc = "builtin")] fn atan2[T](T, T) -> T;
    #[import(cc = "builtin")] fn sqrt[T](T) -> T;
    #[import(cc = "builtin")] fn cbrt[T](T) -> T;
    #[import(cc = "builtin")] fn pow[T](T, T) -> T;
    #[import(cc = "builtin")] fn exp[T](T) -> T;
    #[import(cc = "builtin")] fn exp2[T](T) -> T;
    #[import(cc = "builtin")] fn log[T](T) -> T;
    #[import(cc = "builtin")] fn log2[T](T) -> T;
    #[import(cc = "builtin")] fn log10[T](T) -> T;
    #[import(cc = "builtin")] fn isnan[T](T) -> bool;
    #[import(cc = "builtin")] fn isfinite[T](T) -> bool;
}
// C++11-style memory orderings for the atomic/fence intrinsics;
// mapped to numeric values by builtin_memory_order below.
enum memory_order {
    relaxed,
    acquire,
    release,
    acq_rel,
    seq_cst
}
// True iff ordering `a` is strictly weaker than `b` (the numeric encodings
// are monotonic from relaxed to seq_cst).
fn @is_weaker_memory_order(a: memory_order, b: memory_order) =
    builtin_memory_order(a) < builtin_memory_order(b);
// The stronger of the two orderings; on a tie, `a` is returned.
fn @stronger_memory_order(a: memory_order, b: memory_order) -> memory_order {
    if builtin_memory_order(b) > builtin_memory_order(a) { b } else { a }
}
// Numeric encoding of a memory_order as consumed by the atomic/fence
// intrinsics; the values match LLVM's AtomicOrdering enumeration
// (Monotonic=2, Acquire=4, Release=5, AcquireRelease=6, SequentiallyConsistent=7).
fn @builtin_memory_order(order: memory_order) -> u32 {
    match order {
        memory_order::relaxed => 2,
        memory_order::acquire => 4,
        memory_order::release => 5,
        memory_order::acq_rel => 6,
        memory_order::seq_cst => 7,
    }
}
// Top level of the hierarchical execution model: a grid decomposes into
// groups, waves, and threads, each addressed through nested contexts.
struct grid_context {
    device: i32,                                       // runtime device id
    max_concurrency: fn() -> i32,
    groups: fn(fn(group_context) -> ()) -> fn() -> (), // iterate over groups
    waves: fn(fn(wave_context) -> ()) -> fn() -> (),   // iterate over waves
    threads: fn(fn(thread_context) -> ()) -> fn() -> (),
    num_groups: fn(i32) -> u32,                        // per dimension
    num_waves: fn() -> u32,
    num_threads: fn(i32) -> u32,                       // per dimension
}
// Execution context of one work-group: group index, iteration over its
// waves/threads, and group-wide barrier primitives (with all/any/count votes).
struct group_context {
    idx: fn(i32) -> u32,                               // group index per dimension
    waves: fn(fn(wave_context) -> ()) -> fn() -> (),
    threads: fn(fn(thread_context) -> ()) -> fn() -> (),
    num_waves: fn() -> u32,
    num_threads: fn(i32) -> u32,
    barrier: fn() -> (),
    barrier_all: fn(bool) -> bool,                     // true iff all threads pass true
    barrier_any: fn(bool) -> bool,                     // true iff any thread passes true
    barrier_count: fn(bool) -> i32,                    // number of threads passing true
}
// Kernel-side view of one wave (CUDA warp): vote/barrier and shuffle
// primitives. The commented-out members document the intended full
// interface that is not wired up yet.
struct wave_context {
	idx: fn() -> u32,
	membermask: fn() -> u64,         // mask of lanes belonging to this wave
	threads: fn(fn(thread_context) -> ()) -> fn() -> (),
	num_threads: fn() -> u32,
	// wave-wide execution barriers with predicate reductions
	barrier: fn() -> (),
	barrier_all: fn(bool) -> bool,
	barrier_any: fn(bool) -> bool,
	barrier_count: fn(bool) -> i32,
	barrier_vote: fn(bool) -> u64,   // per-lane predicate ballot
	// activemask: fn() -> u32,
	// shuffles: (value, source-lane / delta / xor-mask, width)
	shfl_i32: fn(i32, i32, u32) -> i32,
	shfl_u32: fn(u32, i32, u32) -> u32,
	// shfl_i64: fn(i64, i32, u32) -> i64,
	// shfl_u64: fn(u64, i32, u32) -> u64,
	// shfl_f32: fn(f32, i32, u32) -> f32,
	// shfl_f64: fn(f64, i32, u32) -> f64,
	shfl_up_i32: fn(i32, u32, u32) -> i32,
	shfl_up_u32: fn(u32, u32, u32) -> u32,
	// shfl_up_i64: fn(i64, u32, u32) -> i64,
	// shfl_up_u64: fn(u64, u32, u32) -> u64,
	// shfl_up_f32: fn(f32, u32, u32) -> f32,
	// shfl_up_f64: fn(f64, u32, u32) -> f64,
	shfl_down_i32: fn(i32, u32, u32) -> i32,
	shfl_down_u32: fn(u32, u32, u32) -> u32,
	// shfl_down_i64: fn(i64, u32, u32) -> i64,
	// shfl_down_u64: fn(u64, u32, u32) -> u64,
	// shfl_down_f32: fn(f32, u32, u32) -> f32,
	// shfl_down_f64: fn(f64, u32, u32) -> f64,
	shfl_bfly_i32: fn(i32, i32, u32) -> i32,
	shfl_bfly_u32: fn(u32, i32, u32) -> u32,
	// shfl_bfly_i64: fn(i64, i32, u32) -> i64,
	// shfl_bfly_u64: fn(u64, i32, u32) -> u64,
	// shfl_bfly_f32: fn(f32, i32, u32) -> f32,
	// shfl_bfly_f64: fn(f64, i32, u32) -> f64,
	// match_any_i32: fn(i32) -> u32,
	// match_any_u32: fn(u32) -> u32,
	// match_any_i64: fn(i64) -> u32,
	// match_any_u64: fn(u64) -> u32,
	// match_any_f32: fn(f32) -> u32,
	// match_any_f64: fn(f64) -> u32,
	// match_all_i32: fn(i32, &mut i32) -> u32,
	// match_all_u32: fn(u32, &mut i32) -> u32,
	// match_all_i64: fn(i64, &mut i32) -> u32,
	// match_all_u64: fn(u64, &mut i32) -> u32,
	// match_all_f32: fn(f32, &mut i32) -> u32,
	// match_all_f64: fn(f64, &mut i32) -> u32,
	// lane masks relative to the calling lane (le/lt/ge/gt)
	lanemask: fn() -> u64,
	lanemask_le: fn() -> u64,
	lanemask_lt: fn() -> u64,
	lanemask_ge: fn() -> u64,
	lanemask_gt: fn() -> u64
}
// Kernel-side view of a single thread, bundling all atomic operations on
// global memory. Every atomic takes an explicit memory_order. The
// `_coalesced` load/store variants appear intended for accesses the caller
// knows to be warp-coalesced — TODO confirm intended contract.
// Note: sub/and/or/xor/exch/min/max have no *_global_i64 variants, unlike
// add and cas.
struct thread_context {
	idx: fn(i32) -> u32,   // thread index in dimension i
	gid: fn() -> u32,      // linearized global thread id
	uid: fn() -> i32,      // unique id (gid reinterpreted as signed)
	// atomic loads
	atomic_load_global_i32: fn(&addrspace(1) i32, memory_order) -> i32,
	atomic_load_global_u32: fn(&addrspace(1) u32, memory_order) -> u32,
	atomic_load_global_i64: fn(&addrspace(1) i64, memory_order) -> i64,
	atomic_load_global_u64: fn(&addrspace(1) u64, memory_order) -> u64,
	atomic_load_global_i32_coalesced: fn(&addrspace(1) i32, memory_order) -> i32,
	atomic_load_global_u32_coalesced: fn(&addrspace(1) u32, memory_order) -> u32,
	atomic_load_global_i64_coalesced: fn(&addrspace(1) i64, memory_order) -> i64,
	atomic_load_global_u64_coalesced: fn(&addrspace(1) u64, memory_order) -> u64,
	// atomic stores
	atomic_store_global_i32: fn(&mut addrspace(1) i32, i32, memory_order) -> (),
	atomic_store_global_u32: fn(&mut addrspace(1) u32, u32, memory_order) -> (),
	atomic_store_global_i64: fn(&mut addrspace(1) i64, i64, memory_order) -> (),
	atomic_store_global_u64: fn(&mut addrspace(1) u64, u64, memory_order) -> (),
	atomic_store_global_i32_coalesced: fn(&mut addrspace(1) i32, i32, memory_order) -> (),
	atomic_store_global_u32_coalesced: fn(&mut addrspace(1) u32, u32, memory_order) -> (),
	atomic_store_global_i64_coalesced: fn(&mut addrspace(1) i64, i64, memory_order) -> (),
	atomic_store_global_u64_coalesced: fn(&mut addrspace(1) u64, u64, memory_order) -> (),
	// read-modify-write atomics; all return the previous value
	atomic_add_global_i32: fn(&mut addrspace(1) i32, i32, memory_order) -> i32,
	atomic_add_global_u32: fn(&mut addrspace(1) u32, u32, memory_order) -> u32,
	atomic_add_global_i64: fn(&mut addrspace(1) i64, i64, memory_order) -> i64,
	atomic_add_global_u64: fn(&mut addrspace(1) u64, u64, memory_order) -> u64,
	atomic_sub_global_i32: fn(&mut addrspace(1) i32, i32, memory_order) -> i32,
	atomic_sub_global_u32: fn(&mut addrspace(1) u32, u32, memory_order) -> u32,
	atomic_sub_global_u64: fn(&mut addrspace(1) u64, u64, memory_order) -> u64,
	atomic_and_global_i32: fn(&mut addrspace(1) i32, i32, memory_order) -> i32,
	atomic_and_global_u32: fn(&mut addrspace(1) u32, u32, memory_order) -> u32,
	atomic_and_global_u64: fn(&mut addrspace(1) u64, u64, memory_order) -> u64,
	atomic_or_global_i32: fn(&mut addrspace(1) i32, i32, memory_order) -> i32,
	atomic_or_global_u32: fn(&mut addrspace(1) u32, u32, memory_order) -> u32,
	atomic_or_global_u64: fn(&mut addrspace(1) u64, u64, memory_order) -> u64,
	atomic_xor_global_i32: fn(&mut addrspace(1) i32, i32, memory_order) -> i32,
	atomic_xor_global_u32: fn(&mut addrspace(1) u32, u32, memory_order) -> u32,
	atomic_xor_global_u64: fn(&mut addrspace(1) u64, u64, memory_order) -> u64,
	atomic_exch_global_i32: fn(&mut addrspace(1) i32, i32, memory_order) -> i32,
	atomic_exch_global_u32: fn(&mut addrspace(1) u32, u32, memory_order) -> u32,
	atomic_exch_global_u64: fn(&mut addrspace(1) u64, u64, memory_order) -> u64,
	atomic_min_global_i32: fn(&mut addrspace(1) i32, i32, memory_order) -> i32,
	atomic_min_global_u32: fn(&mut addrspace(1) u32, u32, memory_order) -> u32,
	atomic_min_global_u64: fn(&mut addrspace(1) u64, u64, memory_order) -> u64,
	atomic_max_global_i32: fn(&mut addrspace(1) i32, i32, memory_order) -> i32,
	atomic_max_global_u32: fn(&mut addrspace(1) u32, u32, memory_order) -> u32,
	atomic_max_global_u64: fn(&mut addrspace(1) u64, u64, memory_order) -> u64,
	// compare-and-swap: (location, expected, desired, succ order, fail order)
	// -> (previous value, success flag)
	atomic_cas_global_i32: fn(&mut addrspace(1) i32, i32, i32, memory_order, memory_order) -> (i32, bool),
	atomic_cas_global_u32: fn(&mut addrspace(1) u32, u32, u32, memory_order, memory_order) -> (u32, bool),
	atomic_cas_global_i64: fn(&mut addrspace(1) i64, i64, i64, memory_order, memory_order) -> (i64, bool),
	atomic_cas_global_u64: fn(&mut addrspace(1) u64, u64, u64, memory_order, memory_order) -> (u64, bool),
	atomic_cas_global_i32_weak: fn(&mut addrspace(1) i32, i32, i32, memory_order, memory_order) -> (i32, bool),
	atomic_cas_global_u32_weak: fn(&mut addrspace(1) u32, u32, u32, memory_order, memory_order) -> (u32, bool),
	atomic_cas_global_i64_weak: fn(&mut addrspace(1) i64, i64, i64, memory_order, memory_order) -> (i64, bool),
	atomic_cas_global_u64_weak: fn(&mut addrspace(1) u64, u64, u64, memory_order, memory_order) -> (u64, bool),
	atomic_inc_global_u32: fn(&mut addrspace(1) u32, u32) -> u32,
	memory_barrier: fn(memory_order) -> (),
	timestamp: fn() -> i64,     // globaltimer-based timestamps
	timestamp32: fn() -> i32,
	// atomic_wait_and_transition_global_u32: fn(&mut addrspace(1) u32, u32, u32, memory_order, &[u8]) -> (),
	// spin until the predicate holds; second argument is a debug message
	wait: fn(fn() -> bool, &[u8]) -> (),
}
// Concurrent FIFO interface produced by a queue_constructor. push/pop take
// a producer/consumer callback and return a per-thread entry point yielding
// the number of elements transferred.
struct ProducerConsumerQueue[T] {
	// [THREAD-SAFETY]: only functions that receive a thread_context (i.e. push/pop/size) are thread-safe
	push: fn(fn() -> T) -> fn(thread_context) -> i32,
	pop: fn(fn(T) -> ()) -> fn(thread_context) -> i32,
	// pop_wave: fn(wave_context, i32, fn(T) -> ()) -> i32,
	size: fn(thread_context) -> i32,
	reset: fn(grid_context) -> (),                         // reinitialize the queue (not thread-safe)
	validate: fn(&mut addrspace(1) u32, grid_context) -> (),  // consistency check, reporting into the given flag
	release: fn() -> ()                                    // free backing storage
}
// Result of constructing a queue: the queue on success, otherwise an error
// message string.
enum create_queue_result[T] {
	Ok(ProducerConsumerQueue[T]),
	Err(&[u8])
}
// Factory signature for queues: (device, capacity) -> queue or error.
type queue_constructor[T] = fn(AccDevice, i32) -> create_queue_result[T];
// Fixed-size-element allocator interface (see createPool for the pool-based
// implementation).
struct Allocator[T] {
	alloc: fn() -> u64,          // allocate one element; returns its address
	free: fn(u64) -> (),         // return an element by address
	alloc_ptr: fn() -> &mut T,   // typed variants of alloc/free
	free_ptr: fn(&mut T) -> (),
	clear: fn()->(),             // reset; invalidates all outstanding elements
	release: fn()->(),           // free the backing memory
	alignment: i64,              // element stride in bytes (createPool stores the rounded-up slot width here)
}
// Index type used by the pool allocator's slot numbering and free list.
type pool_idx = i32;
// Creates a lock-free pool allocator for `capacity` elements of type T
// backed by a single host allocation. Layout: two pool_idx counters
// (next_alloc bump pointer, next_free list head) followed by `capacity`
// aligned slots. Freed slots form an intrusive LIFO free list threaded
// through the first bytes of each slot; allocation first tries to pop that
// list and falls back to bump-allocating a fresh slot.
// NOTE(review): the free-list pop/push use a bare CAS on the list head,
// which is susceptible to ABA in principle — confirm usage makes this benign.
fn createPool[T](capacity: pool_idx, alignment: pool_idx) -> Allocator[T] {
	// slot stride: max of element/index size, rounded up to the alignment
	let size = max_i64(sizeof[T](), sizeof[pool_idx]());
	let align = max_i64(max_i64(alignment as i64, alignof[T]()), alignof[pool_idx]());
	let width = round_up_i64(size, align);
	let offset = 2*sizeof[pool_idx]();
	//print_string("size: "); print_i64(size); print_char('\n');
	//print_string("align: "); print_i64(align); print_char('\n');
	//print_string("width: "); print_i64(width); print_char('\n');
	//print_string("offset: "); print_i64(offset); print_char('\n');
	let buffer = alloc_cpu(offset + capacity as i64 * width + align);
	let begin = buffer.data as u64;
	// address of slot 0, aligned up past the two header counters
	let data:u64 = (begin + offset as u64 + align as u64) & !(align as u64 - 1);
	let next_alloc:&mut pool_idx = begin as &mut pool_idx;
	let next_free:&mut pool_idx = (begin + sizeof[pool_idx]() as u64) as &mut pool_idx;
	fn clear() -> () {
		// ordering constant 5 = release (LLVM encoding); -1 marks an empty free list
		atomic_store[pool_idx](next_alloc, 0, 5 /* rel */, "");
		atomic_store[pool_idx](next_free, -1, 5 /* rel */, "");
	}
	clear();
	fn pool_alloc() -> u64 {
		// fast path: pop the head of the free list
		while true {
			let idx = atomic_load[pool_idx](next_free, 4 /* acq */, "");
			if (idx == -1) {
				break()
			}
			let ptr = data + (idx as u64) * (width as u64);
			let next:pool_idx = *(ptr as &pool_idx);
			if cmpxchg[pool_idx](next_free, idx, next, 5 /* rel */, 2 /* rlx */, "").1 {
				return(ptr)
			}
		}
		// slow path: bump-allocate a fresh slot (binop 1 presumably = add in
		// the LLVM AtomicRMW encoding — TODO confirm against the builtin)
		let i = atomic[pool_idx](1, next_alloc, 1, 2 /* rlx */, "");
		assert(0 <= i && i < capacity, "pool is running out of memory");
		data + (i as u64) * (width as u64)
	}
	fn @pool_free(t: u64) -> () {
		let j = t - data;
		let i = (j / width as u64) as pool_idx;
		assert(j % width as u64 == 0, "invalid pointer to element");
		assert(0 <= i && i < capacity, "invalid pool element index");
		// push the slot onto the free list, storing the old head inside it
		while true {
			let next = atomic_load[pool_idx](next_free, 2 /* rlx */, "");
			*(t as &mut pool_idx) = next;
			if cmpxchg[pool_idx](next_free, next, i, 5 /* rel */, 2 /* rlx */, "").1 {
				break()
			}
		}
	}
	Allocator[T] {
		alloc = pool_alloc,
		alloc_ptr = @|| { pool_alloc() as &mut T },
		free = pool_free,
		free_ptr = @|t: &mut T| { pool_free(t as u64); },
		clear = @|| { clear(); },
		release = @|| { release(buffer); },
		alignment = width  // slot stride, not just the requested alignment
	}
}
// Verbose-logging hook, compiled out in this configuration: the body is
// ignored and callers receive a closure that does nothing.
fn @anyq_verbose(_body: fn()->()) -> fn()->() {
	|| ()
}
// No-op assertion stub: checks are compiled out in release builds, so this
// must remain free of side effects.
fn @assert(_condition: bool, _msg: &[u8]) -> () {
	// skip assertions in release
}
// Queries __CUDA_ARCH__ on the device by importing an immediately-invoked
// C++ lambda, so code can branch on the target architecture.
#[import(cc = "device", name = "[]{return __CUDA_ARCH__;}")] fn cuda_device_arch() -> i32; // HACK
// Creates an AccDevice wrapper for CUDA device `device`. The first argument
// to runtime_device presumably selects the CUDA platform — TODO confirm
// against the runtime's platform numbering.
fn @createAccDevice(device: i32) {
	let platform_device = runtime_device(1, device);
	// "ITS" presumably = independent thread scheduling support — TODO confirm
	let its = runtime_device_check_feature_support(platform_device, "ITS");
	AccDevice {
		supports_its = its,
		supports_npot_atomic_inc = true,
		launch_1d = @|body|@|num_groups, group_size| cuda_launch(device, (num_groups, 1, 1), (group_size, 1, 1), wrap_index_1d, wrap_dim_1d, body),
		synchronize = || synchronize_cuda(device),
		alloc = |size| alloc_cuda(device, size),
		platform_device = platform_device,
		platform_name = "cuda",
		// device-side printf: arguments are passed as a raw byte view
		print_i32 = @|format: &[u8], arg: i32| {
			cuda_vprintf(format, &arg as &[u8]);
		},
		print_2xi32 = @|format: &[u8], arg1: i32, arg2: i32| {
			let args:&[i32] = [arg1, arg2];
			cuda_vprintf(format, args as &[u8]);
		},
		print_3xi32 = @|format: &[u8], arg1: i32, arg2: i32, arg3: i32| {
			let args:&[i32] = [arg1, arg2, arg3];
			cuda_vprintf(format, args as &[u8]);
		}
	}
}
// Convenience constructor: AccDevice for CUDA device 0.
fn @createDefaultAccDevice() = createAccDevice(0);
// Converts a bool into the 0/1 integer form expected by the CUDA warp and
// block vote primitives.
fn @cuda_pred(b: bool) -> i32 {
	if !b { 0 } else { 1 }
}
// Device(GPU)-scope memory fence honoring `order`. On sm_70+ this emits
// PTX fence instructions (acquire/release/acq_rel all map to
// fence.acq_rel.gpu; relaxed needs no fence). Older architectures only have
// the plain threadfence.
fn @cuda_memory_barrier(order: memory_order) {
	if cuda_device_arch() >= 700 {
		(@|| {
			match order {
				memory_order::acquire => asm("fence.acq_rel.gpu;" :::: "volatile"),
				memory_order::release => asm("fence.acq_rel.gpu;" :::: "volatile"),
				memory_order::acq_rel => asm("fence.acq_rel.gpu;" :::: "volatile"),
				memory_order::seq_cst => asm("fence.sc.gpu;" :::: "volatile"),
				_ => ()
			}
		})()
	}
	else {
		cuda_threadfence() // TODO: seq_cst not supported?
	}
}
// Fence emitted *before* a legacy (orderless) atomic access: release-like
// orderings need their fence ahead of the operation.
fn @cuda_legacy_atomic_memory_barrier_pre(order:memory_order) {
	match order {
		memory_order::release => cuda_memory_barrier(memory_order::release),
		memory_order::acq_rel => cuda_memory_barrier(memory_order::acq_rel),
		memory_order::seq_cst => cuda_memory_barrier(memory_order::seq_cst),
		_ => ()
	}
}
// Fence emitted *after* a legacy (orderless) atomic access: acquire-like
// orderings need their fence behind the operation.
fn @cuda_legacy_atomic_memory_barrier_post(order:memory_order) {
	match order {
		memory_order::acquire => cuda_memory_barrier(memory_order::acquire),
		memory_order::acq_rel => cuda_memory_barrier(memory_order::acq_rel),
		memory_order::seq_cst => cuda_memory_barrier(memory_order::seq_cst),
		_ => ()
	}
}
// Wraps a plain (orderless) global load `f` so callers can pass an explicit
// memory_order; the ordering is realized with legacy fences around the access.
fn @cuda_legacy_atomic_memory_order_wrap_load[T](f: fn(&addrspace(1) T) -> T) {
	@|location:&addrspace(1) T, order:memory_order| -> T {
		cuda_legacy_atomic_memory_barrier_pre(order);
		let res = f(location);
		cuda_legacy_atomic_memory_barrier_post(order);
		res
	}
}
// Store counterpart of cuda_legacy_atomic_memory_order_wrap_load: brackets
// an orderless global store with legacy fences.
fn @cuda_legacy_atomic_memory_order_wrap_store[T](f: fn(&mut addrspace(1) T, T) -> ()) {
	@|location:&mut addrspace(1) T, value:T, order:memory_order| -> () {
		cuda_legacy_atomic_memory_barrier_pre(order);
		f(location, value);
		cuda_legacy_atomic_memory_barrier_post(order);
	}
}
// Wraps an orderless read-modify-write atomic `f` with legacy fences;
// returns the previous value like the underlying primitive.
fn @cuda_legacy_atomic_memory_order_wrap_rmw[T](f: fn(&mut addrspace(1) T, T) -> T) {
	@|location:&mut addrspace(1) T, value:T, order:memory_order| -> T {
		cuda_legacy_atomic_memory_barrier_pre(order);
		let old = f(location, value);
		cuda_legacy_atomic_memory_barrier_post(order);
		old
	}
}
// Wraps an orderless compare-and-swap `f` into the (old, success) interface
// with success/failure memory orders. The pre-fence uses the stronger of
// the two orders; the post-fence is chosen by the actual outcome, decided
// via `cmp` on the returned previous value.
fn @cuda_legacy_atomic_memory_order_wrap_cas[T](f: fn(&mut addrspace(1) T, T, T) -> T, cmp: fn(T, T) -> bool) {
	@|location:&mut addrspace(1) T, expected:T, desired:T, memory_order_succ:memory_order, memory_order_fail:memory_order| {
		cuda_legacy_atomic_memory_barrier_pre(stronger_memory_order(memory_order_succ, memory_order_fail));
		let old = f(location, expected, desired);
		if cmp(old, expected) {
			cuda_legacy_atomic_memory_barrier_post(memory_order_succ);
			(old, true)
		}
		else {
			cuda_legacy_atomic_memory_barrier_post(memory_order_fail);
			(old, false)
		}
	}
}
// Signed 32-bit load forwards to the unsigned implementation (same bits).
fn @atomic_load_global_i32(location: &addrspace(1) i32, order: memory_order) -> i32 = atomic_load_global_u32(location as &addrspace(1) u32, order) as i32;
// Atomic 32-bit load from global memory honoring `order`. On sm_70+ this
// uses native ld.relaxed/ld.acquire; seq_cst is fence.sc followed by a
// relaxed load. release/acq_rel are invalid for a load and trap. Older
// devices fall back to a volatile load bracketed by legacy fences.
fn @atomic_load_global_u32(location: &addrspace(1) u32, order: memory_order) -> u32 {
	let mut value: u32;
	if cuda_device_arch() >= 700 {
		(@|| {
			match order {
				memory_order::relaxed => asm("ld.relaxed.gpu.b32 %0, [%1];" : "=r"(value) : "l"(location) : "memory"),
				memory_order::acquire => asm("ld.acquire.gpu.b32 %0, [%1];" : "=r"(value) : "l"(location) : "memory"),
				memory_order::seq_cst => asm("fence.sc.gpu; ld.relaxed.gpu.b32 %0, [%1];" : "=r"(value) : "l"(location) : "memory"),
				_ => cuda_trap()
			}
		})();
	}
	else {
		cuda_legacy_atomic_memory_barrier_pre(order);
		asm("ld.volatile.global.b32 %0, [%1];" : "=r"(value) : "l"(location) : "memory");
		cuda_legacy_atomic_memory_barrier_post(order);
	}
	value
}
// Signed 64-bit load forwards to the unsigned implementation (same bits).
fn @atomic_load_global_i64(location: &addrspace(1) i64, order: memory_order) = atomic_load_global_u64(location as &addrspace(1) u64, order) as i64;
// 64-bit counterpart of atomic_load_global_u32; see there for the ordering
// scheme (release/acq_rel trap, pre-sm_70 uses volatile + legacy fences).
fn @atomic_load_global_u64(location: &addrspace(1) u64, order: memory_order) -> u64 {
	let mut value: u64;
	if cuda_device_arch() >= 700 {
		(@|| {
			match order {
				memory_order::relaxed => asm("ld.relaxed.gpu.b64 %0, [%1];" : "=l"(value) : "l"(location) : "memory"),
				memory_order::acquire => asm("ld.acquire.gpu.b64 %0, [%1];" : "=l"(value) : "l"(location) : "memory"),
				memory_order::seq_cst => asm("fence.sc.gpu; ld.relaxed.gpu.b64 %0, [%1];" : "=l"(value) : "l"(location) : "memory"),
				_ => cuda_trap()
			}
		})();
	}
	else {
		cuda_legacy_atomic_memory_barrier_pre(order);
		asm("ld.volatile.global.b64 %0, [%1];" : "=l"(value) : "l"(location) : "memory");
		cuda_legacy_atomic_memory_barrier_post(order);
	}
	value
}
// Signed 32-bit store forwards to the unsigned implementation (same bits).
fn @atomic_store_global_i32(location: &addrspace(1) i32, value: i32, order: memory_order) = atomic_store_global_u32(location as &addrspace(1) u32, value as u32, order);
// Atomic 32-bit store to global memory honoring `order`. On sm_70+ this
// uses native st.relaxed/st.release; acquire/acq_rel are invalid for a
// store and trap. NOTE(review): the seq_cst path emits fence.sc *after*
// the relaxed store, whereas the canonical PTX mapping fences before the
// store — confirm this ordering is intended.
fn @atomic_store_global_u32(location: &addrspace(1) u32, value: u32, order: memory_order) -> () {
	if cuda_device_arch() >= 700 {
		(@|| {
			match order {
				memory_order::relaxed => asm("st.relaxed.gpu.b32 [%0], %1;" : : "l"(location), "r"(value) : "memory"),
				memory_order::release => asm("st.release.gpu.b32 [%0], %1;" : : "l"(location), "r"(value) : "memory"),
				memory_order::seq_cst => asm("st.relaxed.gpu.b32 [%0], %1; fence.sc.gpu;" : : "l"(location), "r"(value) : "memory"),
				_ => cuda_trap()
			}
		})()
	}
	else {
		cuda_legacy_atomic_memory_barrier_pre(order);
		asm("st.volatile.global.b32 [%0], %1;" : : "l"(location), "r"(value) : "memory");
		cuda_legacy_atomic_memory_barrier_post(order);
	}
}
// Signed 64-bit store forwards to the unsigned implementation (same bits).
fn @atomic_store_global_i64(location: &addrspace(1) i64, value: i64, order: memory_order) = atomic_store_global_u64(location as &addrspace(1) u64, value as u64, order);
// 64-bit counterpart of atomic_store_global_u32; see there for the ordering
// scheme and the review note on the seq_cst fence placement.
fn @atomic_store_global_u64(location: &addrspace(1) u64, value: u64, order: memory_order) -> () {
	if cuda_device_arch() >= 700 {
		(@|| {
			match order {
				memory_order::relaxed => asm("st.relaxed.gpu.b64 [%0], %1;" : : "l"(location), "l"(value) : "memory"),
				memory_order::release => asm("st.release.gpu.b64 [%0], %1;" : : "l"(location), "l"(value) : "memory"),
				memory_order::seq_cst => asm("st.relaxed.gpu.b64 [%0], %1; fence.sc.gpu;" : : "l"(location), "l"(value) : "memory"),
				_ => cuda_trap()
			}
		})()
	}
	else {
		cuda_legacy_atomic_memory_barrier_pre(order);
		asm("st.volatile.global.b64 [%0], %1;" : : "l"(location), "l"(value) : "memory");
		cuda_legacy_atomic_memory_barrier_post(order);
	}
}
// fn @cuda_atomic_wait_and_transition[T](f: fn(&mut addrspace(1) T, T, T) -> T, cmp: fn(T, T) -> bool) {
// @|location:&mut addrspace(1) T, expected:T, desired:T, order:memory_order, _debug_msg:&[u8]| -> () {
// cuda_legacy_atomic_memory_barrier_pre(order);
// while !cmp(f(location, expected, desired), expected) {
// cuda_threadfence();
// }
// cuda_legacy_atomic_memory_barrier_post(order);
// }
// }
// Constructs a thread_context for the current CUDA thread and runs `body`.
// `idx` yields the per-dimension thread index and `gid` a linearized global
// id. Memory-order-aware atomics use the dedicated implementations above;
// everything orderless is adapted through the cuda_legacy_* wrappers.
fn @cuda_thread(idx: fn(i32) -> u32, gid: fn() -> u32, body: fn(thread_context) -> ()) -> () {
	let sleep = @|t: u32| {
		if cuda_device_arch() >= 700 {
			asm("nanosleep.u32 %0;\n" :: "r"(t) :: "volatile"); // use asm to avoid compilation error on devices < 700 due to __nanosleep not being defined
		}
		else {
			// no nanosleep before sm_70: wait() degrades to a busy spin
		}
	};
	@body(thread_context {
		idx = idx,
		gid = gid,
		uid = @|| gid() as i32,
		atomic_load_global_i32 = atomic_load_global_i32,
		atomic_load_global_u32 = atomic_load_global_u32,
		atomic_load_global_i64 = atomic_load_global_i64,
		atomic_load_global_u64 = atomic_load_global_u64,
		// coalesced variants use plain volatile accesses plus legacy fences
		atomic_load_global_i32_coalesced = cuda_legacy_atomic_memory_order_wrap_load[i32](@|location| { let mut res:i32; asm("ld.volatile.global.b32 %0, [%1];" : "=r"(res) : "l"(location) : "memory"); res }),
		atomic_load_global_u32_coalesced = cuda_legacy_atomic_memory_order_wrap_load[u32](@|location| { let mut res:u32; asm("ld.volatile.global.b32 %0, [%1];" : "=r"(res) : "l"(location) : "memory"); res }),
		atomic_load_global_i64_coalesced = cuda_legacy_atomic_memory_order_wrap_load[i64](@|location| { let mut res:i64; asm("ld.volatile.global.b64 %0, [%1];" : "=l"(res) : "l"(location) : "memory"); res }),
		atomic_load_global_u64_coalesced = cuda_legacy_atomic_memory_order_wrap_load[u64](@|location| { let mut res:u64; asm("ld.volatile.global.b64 %0, [%1];" : "=l"(res) : "l"(location) : "memory"); res }),
		atomic_store_global_i32 = atomic_store_global_i32,
		atomic_store_global_u32 = atomic_store_global_u32,
		atomic_store_global_i64 = atomic_store_global_i64,
		atomic_store_global_u64 = atomic_store_global_u64,
		atomic_store_global_i32_coalesced = cuda_legacy_atomic_memory_order_wrap_store[i32](@|location, value| asm("st.volatile.global.b32 [%0], %1;" : : "l"(location), "r"(value) : "memory")),
		atomic_store_global_u32_coalesced = cuda_legacy_atomic_memory_order_wrap_store[u32](@|location, value| asm("st.volatile.global.b32 [%0], %1;" : : "l"(location), "r"(value) : "memory")),
		atomic_store_global_i64_coalesced = cuda_legacy_atomic_memory_order_wrap_store[i64](@|location, value| asm("st.volatile.global.b64 [%0], %1;" : : "l"(location), "l"(value) : "memory")),
		atomic_store_global_u64_coalesced = cuda_legacy_atomic_memory_order_wrap_store[u64](@|location, value| asm("st.volatile.global.b64 [%0], %1;" : : "l"(location), "l"(value) : "memory")),
		// RMW atomics: hardware primitives wrapped with legacy fences;
		// i64 add/cas are emulated by bit-casting through the u64 primitive
		atomic_add_global_i32 = cuda_legacy_atomic_memory_order_wrap_rmw[i32](cuda_atomic_add_global_i32),
		atomic_add_global_u32 = cuda_legacy_atomic_memory_order_wrap_rmw[u32](cuda_atomic_add_global_u32),
		atomic_add_global_i64 = cuda_legacy_atomic_memory_order_wrap_rmw[i64](@|location, value| cuda_atomic_add_global_u64(location as &mut addrspace(1) u64, value as u64) as i64),
		atomic_add_global_u64 = cuda_legacy_atomic_memory_order_wrap_rmw[u64](cuda_atomic_add_global_u64),
		atomic_sub_global_i32 = cuda_legacy_atomic_memory_order_wrap_rmw[i32](cuda_atomic_sub_global_i32),
		atomic_sub_global_u32 = cuda_legacy_atomic_memory_order_wrap_rmw[u32](cuda_atomic_sub_global_u32),
		atomic_sub_global_u64 = cuda_legacy_atomic_memory_order_wrap_rmw[u64](cuda_atomic_sub_global_u64),
		atomic_and_global_i32 = cuda_legacy_atomic_memory_order_wrap_rmw[i32](cuda_atomic_and_global_i32),
		atomic_and_global_u32 = cuda_legacy_atomic_memory_order_wrap_rmw[u32](cuda_atomic_and_global_u32),
		atomic_and_global_u64 = cuda_legacy_atomic_memory_order_wrap_rmw[u64](cuda_atomic_and_global_u64),
		atomic_or_global_i32 = cuda_legacy_atomic_memory_order_wrap_rmw[i32](cuda_atomic_or_global_i32),
		atomic_or_global_u32 = cuda_legacy_atomic_memory_order_wrap_rmw[u32](cuda_atomic_or_global_u32),
		atomic_or_global_u64 = cuda_legacy_atomic_memory_order_wrap_rmw[u64](cuda_atomic_or_global_u64),
		atomic_xor_global_i32 = cuda_legacy_atomic_memory_order_wrap_rmw[i32](cuda_atomic_xor_global_i32),
		atomic_xor_global_u32 = cuda_legacy_atomic_memory_order_wrap_rmw[u32](cuda_atomic_xor_global_u32),
		atomic_xor_global_u64 = cuda_legacy_atomic_memory_order_wrap_rmw[u64](cuda_atomic_xor_global_u64),
		atomic_exch_global_i32 = cuda_legacy_atomic_memory_order_wrap_rmw[i32](cuda_atomic_exch_global_i32),
		atomic_exch_global_u32 = cuda_legacy_atomic_memory_order_wrap_rmw[u32](cuda_atomic_exch_global_u32),
		atomic_exch_global_u64 = cuda_legacy_atomic_memory_order_wrap_rmw[u64](cuda_atomic_exch_global_u64),
		atomic_min_global_i32 = cuda_legacy_atomic_memory_order_wrap_rmw[i32](cuda_atomic_min_global_i32),
		atomic_min_global_u32 = cuda_legacy_atomic_memory_order_wrap_rmw[u32](cuda_atomic_min_global_u32),
		atomic_min_global_u64 = cuda_legacy_atomic_memory_order_wrap_rmw[u64](cuda_atomic_min_global_u64),
		atomic_max_global_i32 = cuda_legacy_atomic_memory_order_wrap_rmw[i32](cuda_atomic_max_global_i32),
		atomic_max_global_u32 = cuda_legacy_atomic_memory_order_wrap_rmw[u32](cuda_atomic_max_global_u32),
		atomic_max_global_u64 = cuda_legacy_atomic_memory_order_wrap_rmw[u64](cuda_atomic_max_global_u64),
		atomic_cas_global_i32 = cuda_legacy_atomic_memory_order_wrap_cas[i32](cuda_atomic_cas_global_i32, @|a, b| a == b),
		atomic_cas_global_u32 = cuda_legacy_atomic_memory_order_wrap_cas[u32](cuda_atomic_cas_global_u32, @|a, b| a == b),
		atomic_cas_global_i64 = cuda_legacy_atomic_memory_order_wrap_cas[i64](@|location, expected, desired| cuda_atomic_cas_global_u64(location as &mut addrspace(1) u64, expected as u64, desired as u64) as i64, @|a, b| a == b),
		atomic_cas_global_u64 = cuda_legacy_atomic_memory_order_wrap_cas[u64](cuda_atomic_cas_global_u64, @|a, b| a == b),
		// the "weak" variants alias the strong implementations here
		atomic_cas_global_i32_weak = cuda_legacy_atomic_memory_order_wrap_cas[i32](cuda_atomic_cas_global_i32, @|a, b| a == b),
		atomic_cas_global_u32_weak = cuda_legacy_atomic_memory_order_wrap_cas[u32](cuda_atomic_cas_global_u32, @|a, b| a == b),
		atomic_cas_global_i64_weak = cuda_legacy_atomic_memory_order_wrap_cas[i64](@|location, expected, desired| cuda_atomic_cas_global_u64(location as &mut addrspace(1) u64, expected as u64, desired as u64) as i64, @|a, b| a == b),
		atomic_cas_global_u64_weak = cuda_legacy_atomic_memory_order_wrap_cas[u64](cuda_atomic_cas_global_u64, @|a, b| a == b),
		atomic_inc_global_u32 = cuda_atomic_inc_global_u32,
		memory_barrier = cuda_memory_barrier,
		timestamp = @|| cuda_globaltimer() as i64,
		timestamp32 = @|| cuda_globaltimer_lo() as i32,
		wait = @|f, _debug_msg| {
			while !f() { sleep(1); }
			// for t in exponential_backoff(2, 128) {
			// 	if f() { false } else { sleep(t as u32); true }
			// }
		}
	})
}
// Constructs a wave_context for the current (sub)warp and runs `body`.
// `membermask` selects the lanes participating in all *_sync operations;
// shuffles use `num_threads` as the shuffle width.
fn @cuda_subwarp(idx: fn() -> u32, gid: fn() -> u32, membermask: u32, num_threads: u32, body: fn(wave_context) -> ()) -> () {
	// within a wave, only dimension 0 carries an index (the lane id)
	let thread_idx = @|i: i32| match i { 0 => cuda_laneid(), _ => 0 };
	@body(wave_context {
		idx = idx,
		membermask = @|| membermask as u64,
		threads = @|body|@|| cuda_thread(thread_idx, @|| gid() * num_threads + thread_idx(0), body),
		num_threads = @|| num_threads,
		barrier = @|| cuda_warp_sync(membermask),
		barrier_all = @|predicate| cuda_warp_sync_all(membermask, cuda_pred(predicate)) != 0,
		barrier_any = @|predicate| cuda_warp_sync_any(membermask, cuda_pred(predicate)) != 0,
		barrier_count = @|predicate| cuda_popc_u32(cuda_warp_sync_vote(membermask, cuda_pred(predicate))),
		barrier_vote = @|predicate| cuda_warp_sync_vote(membermask, cuda_pred(predicate)) as u64,
		// activemask = cuda_warp_activemask,
		shfl_i32 = @|x:i32, src_lane:i32, width:u32| cuda_warp_shfl_i32(membermask, x, src_lane, width as i32),
		shfl_u32 = @|x:u32, src_lane:i32, width:u32| cuda_warp_shfl_u32(membermask, x, src_lane, width as i32),
		// shfl_i64 = @|x:i64, src_lane:i32, width:u32| cuda_warp_shfl_i64(membermask, x, src_lane, width as i32),
		// shfl_u64 = @|x:u64, src_lane:i32, width:u32| cuda_warp_shfl_u64(membermask, x, src_lane, width as i32),
		// shfl_f32 = @|x:f32, src_lane:i32, width:u32| cuda_warp_shfl_f32(membermask, x, src_lane, width as i32),
		// shfl_f64 = @|x:f64, src_lane:i32, width:u32| cuda_warp_shfl_f64(membermask, x, src_lane, width as i32),
		shfl_up_i32 = @|x:i32, delta:u32, width:u32| cuda_warp_shfl_up_i32(membermask, x, delta, width as i32),
		shfl_up_u32 = @|x:u32, delta:u32, width:u32| cuda_warp_shfl_up_u32(membermask, x, delta, width as i32),
		// shfl_up_i64 = @|x:i64, delta:u32, width:u32| cuda_warp_shfl_up_i64(membermask, x, delta, width as i32),
		// shfl_up_u64 = @|x:u64, delta:u32, width:u32| cuda_warp_shfl_up_u64(membermask, x, delta, width as i32),
		// shfl_up_f32 = @|x:f32, delta:u32, width:u32| cuda_warp_shfl_up_f32(membermask, x, delta, width as i32),
		// shfl_up_f64 = @|x:f64, delta:u32, width:u32| cuda_warp_shfl_up_f64(membermask, x, delta, width as i32),
		shfl_down_i32 = @|x:i32, delta:u32, width:u32| cuda_warp_shfl_down_i32(membermask, x, delta, width as i32),
		shfl_down_u32 = @|x:u32, delta:u32, width:u32| cuda_warp_shfl_down_u32(membermask, x, delta, width as i32),
		// shfl_down_i64 = @|x:i64, delta:u32, width:u32| cuda_warp_shfl_down_i64(membermask, x, delta, width as i32),
		// shfl_down_u64 = @|x:u64, delta:u32, width:u32| cuda_warp_shfl_down_u64(membermask, x, delta, width as i32),
		// shfl_down_f32 = @|x:f32, delta:u32, width:u32| cuda_warp_shfl_down_f32(membermask, x, delta, width as i32),
		// shfl_down_f64 = @|x:f64, delta:u32, width:u32| cuda_warp_shfl_down_f64(membermask, x, delta, width as i32),
		shfl_bfly_i32 = @|x:i32, lane_mask:i32, width:u32| cuda_warp_shfl_xor_i32(membermask, x, lane_mask, width as i32),
		shfl_bfly_u32 = @|x:u32, lane_mask:i32, width:u32| cuda_warp_shfl_xor_u32(membermask, x, lane_mask, width as i32),
		// shfl_bfly_i64 = @|x:i64, lane_mask:i32, width:u32| cuda_warp_shfl_xor_i64(membermask, x, lane_mask, width as i32),
		// shfl_bfly_u64 = @|x:u64, lane_mask:i32, width:u32| cuda_warp_shfl_xor_u64(membermask, x, lane_mask, width as i32),
		// shfl_bfly_f32 = @|x:f32, lane_mask:i32, width:u32| cuda_warp_shfl_xor_f32(membermask, x, lane_mask, width as i32),
		// shfl_bfly_f64 = @|x:f64, lane_mask:i32, width:u32| cuda_warp_shfl_xor_f64(membermask, x, lane_mask, width as i32),
		// match_any_i32 = @|x:i32| cuda_warp_match_any_i32(membermask, x),
		// match_any_u32 = @|x:u32| cuda_warp_match_any_u32(membermask, x),
		// match_any_i64 = @|x:i64| cuda_warp_match_any_i64(membermask, x),
		// match_any_u64 = @|x:u64| cuda_warp_match_any_u64(membermask, x),
		// match_any_f32 = @|x:f32| cuda_warp_match_any_f32(membermask, x),
		// match_any_f64 = @|x:f64| cuda_warp_match_any_f64(membermask, x),
		// match_all_i32 = @|x:i32, predicate:&mut i32| cuda_warp_match_all_i32(membermask, x, predicate),
		// match_all_u32 = @|x:u32, predicate:&mut i32| cuda_warp_match_all_u32(membermask, x, predicate),
		// match_all_i64 = @|x:i64, predicate:&mut i32| cuda_warp_match_all_i64(membermask, x, predicate),
		// match_all_u64 = @|x:u64, predicate:&mut i32| cuda_warp_match_all_u64(membermask, x, predicate),
		// match_all_f32 = @|x:f32, predicate:&mut i32| cuda_warp_match_all_f32(membermask, x, predicate),
		// match_all_f64 = @|x:f64, predicate:&mut i32| cuda_warp_match_all_f64(membermask, x, predicate),
		lanemask = @|| cuda_lanemask() as u64,
		lanemask_le = @|| cuda_lanemask_le() as u64,
		lanemask_lt = @|| cuda_lanemask_lt() as u64,
		lanemask_ge = @|| cuda_lanemask_ge() as u64,
		lanemask_gt = @|| cuda_lanemask_gt() as u64
	})
}
// Constructs a group_context for the current CUDA block and runs `body`.
// `gid` is the linearized block id, used to derive globally unique ids for
// the nested wave and thread contexts.
fn @cuda_block(idx: fn(i32) -> u32, gid: fn() -> u32, thread_idx: fn(i32) -> u32, block_size: fn(i32) -> u32, warp_size: u32, body: fn(group_context) -> ()) -> () {
	let linear_thread_idx = @|| {
		(thread_idx(2) * block_size(1) + thread_idx(1)) * block_size(0) + thread_idx(0)
	};
	let warp_idx = @|| {
		linear_thread_idx() / warp_size
	};
	let num_threads = @|| block_size(0) * block_size(1) * block_size(2);
	// round up so that a partially filled trailing warp still counts
	let num_warps = @|| (num_threads() + warp_size - 1) / warp_size;
	@body(group_context {
		idx = idx,
		waves = @|body|@|| cuda_subwarp(warp_idx, @|| gid() * num_warps() + warp_idx(), get_member_mask_u32(warp_size), warp_size, body),
		threads = @|body|@|| cuda_thread(thread_idx, @|| gid() * num_threads() + linear_thread_idx(), body),
		num_waves = num_warps,
		num_threads = block_size,
		barrier = cuda_block_sync,
		barrier_all = @|predicate| cuda_block_sync_all(cuda_pred(predicate)) != 0,
		barrier_any = @|predicate| cuda_block_sync_any(cuda_pred(predicate)) != 0,
		barrier_count = @|predicate| cuda_block_sync_count(cuda_pred(predicate))
	})
}
// Launches `body` as a CUDA kernel on `device` with the given grid/block
// configuration and hands it a grid_context describing the launch geometry.
// `wrap_index`/`wrap_dim` adapt index/extent queries to the launch
// dimensionality (see wrap_index_*d / wrap_dim_*d), so unused dimensions
// fold away at compile time.
fn @cuda_launch(device: i32, (grid_dim_x: i32, grid_dim_y: i32, grid_dim_z: i32), (block_dim_x: i32, block_dim_y: i32, block_dim_z: i32), wrap_index: index_wrapper, wrap_dim: dim_wrapper, body: fn(grid_context) -> ()) -> () {
	// TODO: assert(warp_size == 32)
	let warp_size: u32 = 32;
	// Use the statically-known dimension when available (?dim is true for
	// compile-time constants), otherwise read the launch configuration from
	// the special registers at runtime.
	let block_size = wrap_dim(@|i: i32| {
		match i {
			0 => if ?block_dim_x { block_dim_x as u32 } else { cuda_blockDim_x() as u32 },
			1 => if ?block_dim_y { block_dim_y as u32 } else { cuda_blockDim_y() as u32 },
			2 => if ?block_dim_z { block_dim_z as u32 } else { cuda_blockDim_z() as u32 },
			_ => 1
		}
	});
	let num_blocks = wrap_dim(@|i: i32| {
		match i {
			0 => if ?grid_dim_x { grid_dim_x as u32 } else { cuda_gridDim_x() as u32 },
			1 => if ?grid_dim_y { grid_dim_y as u32 } else { cuda_gridDim_y() as u32 },
			// BUGFIX: this arm previously tested ?grid_dim_y, so the z extent
			// was constant-folded or queried based on whether the *y*
			// dimension was statically known.
			2 => if ?grid_dim_z { grid_dim_z as u32 } else { cuda_gridDim_z() as u32 },
			_ => 1
		}
	});
	let num_threads_per_block = @|| block_size(0) * block_size(1) * block_size(2);
	// round up: a partially filled trailing warp still counts
	let num_warps_per_block = @|| (num_threads_per_block() + warp_size - 1) / warp_size;
	let num_warps = @|| (num_blocks(0) * num_blocks(1) * num_blocks(2)) * num_warps_per_block();
	let num_threads = @|i: i32| num_blocks(i) * block_size(i);
	let block_idx = wrap_index(@|i: i32| {
		match i { 0 => cuda_blockIdx_x() as u32, 1 => cuda_blockIdx_y() as u32, 2 => cuda_blockIdx_z() as u32, _ => 0 }
	});
	let linear_block_idx = @|| (block_idx(2) * num_blocks(1) + block_idx(1)) * num_blocks(0) + block_idx(0);
	let thread_idx = wrap_index(@|i: i32| {
		match i { 0 => cuda_threadIdx_x() as u32, 1 => cuda_threadIdx_y() as u32, 2 => cuda_threadIdx_z() as u32, _ => 0 }
	});
	let global_thread_idx = @|i: i32| block_idx(i) * block_size(i) + thread_idx(i);
	let linear_thread_idx = @|| (thread_idx(2) * block_size(1) + thread_idx(1)) * block_size(0) + thread_idx(0);
	let global_linear_thread_idx = @|| linear_block_idx() * num_threads_per_block() + linear_thread_idx();
	let global_warp_idx = @|| linear_block_idx() * num_warps_per_block() + linear_thread_idx() / warp_size;
	// the runtime `cuda` intrinsic presumably expects the grid extent in
	// threads (grid_dim * block_dim), matching the other AnyDSL backends
	cuda(device, (grid_dim_x * block_dim_x, grid_dim_y * block_dim_y, grid_dim_z * block_dim_z), (block_dim_x, block_dim_y, block_dim_z), @|| @body(grid_context {
		device = device,
		max_concurrency = @|| (num_blocks(0) * num_blocks(1) * num_blocks(2)) as i32 * num_threads_per_block() as i32,
		groups = @|body|@|| cuda_block(block_idx, linear_block_idx, thread_idx, block_size, warp_size, body),
		waves = @|body|@|| cuda_subwarp(global_warp_idx, global_warp_idx, get_member_mask_u32(warp_size), warp_size, body),
		threads = @|body|@|| cuda_thread(global_thread_idx, global_linear_thread_idx, body),
		num_groups = num_blocks,
		num_waves = num_warps,
		num_threads = num_threads
	}))
}
// Builds a 32-bit lane-membership mask with the low `size` bits set.
// size == 32 is special-cased because shifting by the full width would be
// out of range; -1 wraps to 0xFFFFFFFF. Assumes 0 < size <= 32.
fn @get_member_mask_u32(size: u32) -> u32 {
	if size == 32 { -1 } else { (1 << size) - 1 }
}
// 64-bit counterpart of get_member_mask_u32. Assumes 0 < size <= 64.
fn @get_member_mask_u64(size: u32) -> u64 {
	if size == 64 { -1 } else { (1 << size as u64) - 1 }
}
// Adapters that clamp per-dimension index/extent queries to the launch
// dimensionality (unused dimensions fold to 0 for indices, 1 for extents).
type index_wrapper = fn(fn(i32) -> u32) -> fn(i32) -> u32;
type dim_wrapper = fn(fn(i32) -> u32) -> fn(i32) -> u32;
// Index adapter for a 1D launch: only dimension 0 carries a real index;
// statically-known dimension queries (?i) fold to a constant 0 elsewhere,
// while runtime queries fall through to `idx`.
fn @wrap_index_1d(idx: fn(i32) -> u32) -> fn(i32) -> u32 {
	@|i: i32| -> u32 {
		if ?i {
			if i == 0 { idx(0) } else { 0 }
		}
		else {
			idx(i)
		}
	}
}
// Index adapter for a 2D launch: dimensions 0 and 1 are real; any other
// statically-known dimension folds to 0, runtime queries go to `idx`.
fn @wrap_index_2d(idx: fn(i32) -> u32) -> fn(i32) -> u32 {
	@|i: i32| -> u32 {
		if ?i {
			if i == 0 { idx(0) } else if i == 1 { idx(1) } else { 0 }
		}
		else {
			idx(i)
		}
	}
}
// Index adapter for a 3D launch: dimensions 0..2 are real; any other
// statically-known dimension folds to 0, runtime queries go to `idx`.
fn @wrap_index_3d(idx: fn(i32) -> u32) -> fn(i32) -> u32 {
	@|i: i32| -> u32 {
		if ?i {
			if i == 0 { idx(0) } else if i == 1 { idx(1) } else if i == 2 { idx(2) } else { 0 }
		}
		else {
			idx(i)
		}
	}
}
// Wrap a 1D extent lookup: unlike wrap_index_*, missing dimensions fold
// to extent 1 (a degenerate axis), so products over dimensions stay valid.
fn @wrap_dim_1d(idx: fn(i32) -> u32) -> fn(i32) -> u32 {
	@|i: i32| -> u32 {
		if ?i {
			match i { 0 => idx(0), _ => 1 }
		}
		else {
			idx(i)
		}
	}
}
// Wrap a 2D extent lookup: dimensions beyond 1 fold to extent 1.
fn @wrap_dim_2d(idx: fn(i32) -> u32) -> fn(i32) -> u32 {
	@|i: i32| -> u32 {
		if ?i {
			match i { 0 => idx(0), 1 => idx(1), _ => 1 }
		}
		else {
			idx(i)
		}
	}
}
// Wrap a 3D extent lookup: dimensions beyond 2 fold to extent 1.
fn @wrap_dim_3d(idx: fn(i32) -> u32) -> fn(i32) -> u32 {
	@|i: i32| -> u32 {
		if ?i {
			match i { 0 => idx(0), 1 => idx(1), 2 => idx(2), _ => 1 }
		}
		else {
			idx(i)
		}
	}
}
// Print the raw bytes of `val` as "0x…" hexadecimal, most significant
// nibble first. With `skip_leading_zeros`, leading '0' digits are dropped
// but the final digit always prints, so zero renders as "0x0".
// NOTE(review): nibble order assumes the most significant byte sits at the
// highest address (little-endian host) — confirm for other targets.
fn @print_hex[T](val: T, skip_leading_zeros: bool) -> () {
	print_char('0'); print_char('x');
	let mut leading = skip_leading_zeros;
	let l = 2 * sizeof[T]() as i32;
	let p = &val as &[u8];
	for i in range(0, l) {
		let j = l - i - 1;
		let v:u8 = p(j / 2);
		let curr = (v >> (4*(j%2) as u8) & 0xf);
		// stay in "leading" mode while digits are zero, except for the last digit
		leading = leading && (curr == 0) && i < l-1;
		if leading {
			continue()
		}
		// digits 0-9 offset from '0', 10-15 from 'A'
		let off = if curr < 10 { '0' } else { 'A' - 10 };
		print_char(off + curr);
	}
}
// Print the low 16 bits of `value`, most significant bit first, with a
// space after every group of four digits (including a trailing space).
fn @print_binary(value: u32) -> () {
	let num_bits = 16;
	let group_size = 4;
	for g in range(0, num_bits / group_size) {
		for k in range(0, group_size) {
			let bit = (num_bits - 1 - (g * group_size + k)) as u32;
			if ((value >> bit) & 1) == 1 {
				print_char('1');
			} else {
				print_char('0');
			}
		}
		print_char(' ');
	}
}
// Abstraction over an accelerator backend (CUDA/HIP/…): capability flags,
// kernel launch, synchronization, allocation, and device-side printing.
struct AccDevice {
	// supports independent thread scheduling (e.g. CUDA cc >= 7.0)
	supports_its: bool,
	// supports atomic-inc wrap-around at non-power-of-two bounds
	supports_npot_atomic_inc: bool,
	// launch_1d(body)(grid, block) launches a 1D kernel
	launch_1d: fn(fn(grid_context) -> ()) -> fn(i32, i32) -> (),
	synchronize: fn() -> (),
	alloc: fn(i64) -> Buffer,
	platform_device: i32,
	platform_name: &[u8],
	// device-side printf-style helpers with 1/2/3 i32 arguments
	print_i32: fn(&[u8], i32) -> (),
	print_2xi32: fn(&[u8], i32, i32) -> (),
	print_3xi32: fn(&[u8], i32, i32, i32) -> ()
}
// Minimal test protocol: run_test hands device scratch memory plus a
// device-side failure flag to the body; finish reports the overall result.
struct BaseTest {
	run_test: fn(fn(&mut addrspace(1) [i8], &mut addrspace(1) u32) -> bool) -> fn() -> (),
	finish: fn() -> i32
}
// Build a BaseTest: allocates `additional_mem` bytes of device scratch plus
// one trailing u32 used as the device-side failure flag (zeroed up front).
// finish() returns 0 on success, -1 if either the host body returned false
// or the device flag became non-zero.
fn createBaseTest(device: AccDevice, additional_mem: i64) -> BaseTest {
	// round up so the trailing u32 flag is properly aligned
	let memory_size = round_up_i64(additional_mem + sizeof[u32](), sizeof[u32]());
	let device_test_state_alloc = device.alloc(memory_size);
	let device_memory = device_test_state_alloc.data as &mut addrspace(1) [i8];
	// the failure flag lives in the last 4 bytes of the allocation
	let device_failed_flag = &mut device_memory((memory_size - sizeof[u32]()) as i32) as &mut addrspace(1) u32;
	let gpu_result = runtime_alloc(0, sizeof[u32]()) as &mut u32;
	*gpu_result = 0;
	// clear the device flag by copying the zeroed host word over
	runtime_copy(0, gpu_result as &[i8], 0, device.platform_device, device_failed_flag as &mut[i8], 0, sizeof[u32]());
	let mut cpu_result: bool = true;
	BaseTest {
		run_test = @|body|@|| {
			// non-short-circuiting '&' so the body always runs, even after a failure
			cpu_result = cpu_result & body(device_memory, device_failed_flag);
		},
		finish = @|| {
			device.synchronize();
			// read back the device failure flag
			runtime_copy(device.platform_device, device_failed_flag as &[i8], 0, 0, gpu_result as &mut[i8], 0, sizeof[u32]());
			print_string("gpu_result ");
			print_binary(*gpu_result);
			print_char('\n');
			print_string("cpu_result ");
			print_i32((if cpu_result { 0 } else { 1 }));
			print_char('\n');
			let result = if cpu_result && *gpu_result == 0 { 0 } else { -1 };
			runtime_release(0, gpu_result as &[i8]);
			release(device_test_state_alloc);
			result
		}
	}
}
// Accessor aliases used by IntrinsicsTest: read/write operate on the
// "locals" region, GlobalFn yields a pointer into the "globals" region.
type ReadFn = fn(i32) -> i32;
type WriteFn = fn(i32, i32) -> ();
type GlobalFn = fn(i32) -> &mut addrspace(1) i32;
type ValidateFn = fn(fn(i32,i32)->bool, fn(i32,i32)->bool, bool) -> bool;
// Intrinsics test harness: runs a device body with typed accessors, then
// lets the host inspect (globals/locals) and validate the staged values.
struct IntrinsicsTest {
	run_test: fn(fn(ReadFn, WriteFn, GlobalFn, &mut addrspace(1) u32) -> bool) -> fn() -> (),
	globals: fn(i32) -> i32,
	locals: fn(i32) -> i32,
	validate: ValidateFn,
	finish: fn() -> i32
}
// Build an IntrinsicsTest on top of a BaseTest: stages num_globals +
// num_locals i32 values in one host buffer (globals first, locals after),
// mirrors them to device scratch before the body runs, and copies results
// back afterwards. finish() returns -2 on host-side validation failure,
// otherwise the BaseTest result.
fn createIntrinsicsTest(device: AccDevice, num_globals: i32, init_global: fn(i32)->i32, num_locals: i32, init_local: fn(i32)->i32) -> IntrinsicsTest {
	let additional_mem = (num_globals + num_locals) as i64 * sizeof[i32]();
	let test = createBaseTest(device, additional_mem);
	let buffer = alloc_cpu(additional_mem);
	let data = buffer.data as &mut[i32];
	let mut validation_result = true;
	IntrinsicsTest {
		run_test = @|body|@|| {
			// (re-)initialize the host staging buffer
			for idx in range(0, num_globals) {
				data(idx) = init_global(idx);
			}
			for idx in range(0, num_locals) {
				data(num_globals + idx) = init_local(idx);
			}
			for device_memory, device_failed_flag in test.run_test() {
				// upload initial values to device scratch
				runtime_copy(0, buffer.data, 0, device.platform_device, device_memory as &mut[i8], 0, additional_mem);
				// read/write address the locals region; global points into the globals region
				let read = @|idx:i32| -> i32 { let ptr = device_memory as & addrspace(1) [i32]; ptr(idx + num_globals) };
				let write = @|idx:i32, value:i32| -> () { let ptr = device_memory as &mut addrspace(1) [i32]; ptr(idx + num_globals) = value; };
				let global = @|idx:i32| -> &mut addrspace(1) i32 { let ptr = device_memory as &mut addrspace(1) [i32]; &mut(ptr(idx)) };
				let result = body(read, write, global, device_failed_flag);
				device.synchronize();
				// copy results back for host-side inspection/validation
				runtime_copy(device.platform_device, device_memory as &mut[i8], 0, 0, buffer.data, 0, additional_mem);
				result
			}
		},
		globals = @|idx| { data(idx) },
		locals = @|idx| { data(num_globals + idx) },
		validate = @|validate_global: fn(i32,i32)->bool, validate_local: fn(i32,i32)->bool, stop_on_first_fail:bool| -> bool {
			// inner closure so return() can abort early on the first failure
			let result = (@|| -> bool {
				let mut res = true;
				for idx in range(0, num_globals) {
					let check = validate_global(idx, data(idx));
					res &= check;
					if !check {
						print_string("validation failed! global("); print_hex(idx, false); print_string(") = "); print_i32(data(idx)); print_char('\n');
						if stop_on_first_fail { return(false) }
					}
				}
				for idx in range(0, num_locals) {
					let check = validate_local(idx, data(num_globals + idx));
					res &= check;
					if !check {
						print_string("validation failed! local("); print_hex(idx, false); print_string(") = "); print_i32(data(num_globals + idx)); print_char('\n');
						if stop_on_first_fail { return(false) }
					}
				}
				res
			})();
			validation_result &= result;
			result
		},
		finish = @|| {
			release(buffer);
			let test_result = test.finish();
			// -2 marks a host-side validation failure, distinct from device failures
			if !validation_result { -2 } else { test_result }
		}
	}
}
// Error continuation: each throw_* optionally prints a device-side message
// and then invokes the captured continuation.
struct error_cont
{
	throw: fn() -> (),
	throw_print_i32: fn(&[u8], i32) -> (),
	throw_print_2xi32: fn(&[u8], i32, i32) -> (),
	throw_print_3xi32: fn(&[u8], i32, i32, i32) -> ()
}
// Default error handler: print via the device's printf helpers, then
// continue with `cont` (the printing variants always call cont afterwards).
static error_handler = @|device: AccDevice, cont: fn() -> ()| error_cont {
	throw = @|| cont(),
	throw_print_i32 = @|format, arg| -> () { device.print_i32(format, arg); cont(); },
	throw_print_2xi32 = @|format, arg_1, arg_2| -> () { device.print_2xi32(format, arg_1, arg_2); cont(); },
	throw_print_3xi32 = @|format, arg_1, arg_2, arg_3| -> () { device.print_3xi32(format, arg_1, arg_2, arg_3); cont(); }
};
// Small scalar helpers: min/max/abs plus ceiling division and rounding
// to a multiple, each in i32/u32/i64 flavors (div_up expects b > 0 and
// non-negative a; round_down truncates toward zero).
fn @min(a: i32, b: i32) -> i32 { if a < b { a } else { b } }
fn @min_i64(a: i64, b: i64) -> i64 { if a < b { a } else { b } }
fn @max(a: i32, b: i32) -> i32 { if a > b { a } else { b } }
fn @max_i64(a: i64, b: i64) -> i64 { if a > b { a } else { b } }
fn @abs(a: i32) -> i32 { if a < 0 { -a } else { a } }
fn @div_up(a: i32, b: i32) -> i32 { (a + b - 1) / b }
fn @div_up_u32(a: u32, b: u32) -> u32 { (a + b - 1) / b }
fn @div_up_i64(a: i64, b: i64) -> i64 { (a + b - 1) / b }
// nearest multiple of num
fn @round_up(num: i32, multiple: i32) -> i32 { div_up(num, multiple) * multiple }
fn @round_up_u32(num: u32, multiple: u32) -> u32 { div_up_u32(num, multiple) * multiple }
fn @round_up_i64(num: i64, multiple: i64) -> i64 { div_up_i64(num, multiple) * multiple }
fn @round_down(num: i32, multiple: i32) -> i32 { (num / multiple) * multiple }
fn @round_down_u32(num: u32, multiple: u32) -> u32 { (num / multiple) * multiple }
fn @round_down_i64(num: i64, multiple: i64) -> i64 { (num / multiple) * multiple }
// power-of-two test; zero is not considered a power of two
fn @is_pot(value: u32) = (value != 0) && ((value & (value - 1)) == 0);
// Exponential backoff driver: repeatedly invoke `body` with a value that
// grows by a factor of 3/2 from `min` up to `max`; once `max` is reached,
// keep spinning at that value while `body` returns true.
// Fix: for min <= 1, `min * 3 / 2 == min` due to integer division, so the
// recursion previously never advanced and looped forever without ever
// reaching `max`. Now the argument always advances by at least 1.
fn @exponential_backoff(body: fn(i32) -> bool) {
	fn @loop(min: i32, max: i32) -> () {
		if @body(min) {
			if min < max {
				// grow by 3/2, but guarantee forward progress for small values
				let grown = min * 3 / 2;
				loop(if grown > min { grown } else { min + 1 }, max);
			}
			else {
				while @body(min) {}
			}
		}
	}
	loop
}
// Accumulated benchmark timings (in the unit returned by the time sources,
// presumably microseconds — displayed as ms by print_total_timing).
static mut total_kernel_timing: i64 = 0;
static mut total_cpu_timing: i64 = 0;
// Run `body` num_iter times (with `sync` after each run to flush async
// work), print median/min/max timings, and return the median duration in
// the unit of `get_time` (divided by 1000 for display, presumably us -> ms).
fn @benchmark(get_time: fn() -> i64, num_iter: i32, body: fn() -> (), sync: fn() -> ()) -> i64 {
	let times_buf = alloc_cpu(num_iter as i64 * sizeof[i64]());
	let times = times_buf.data as &mut[i64];
	for i in range(0, num_iter) {
		let start = get_time();
		body();
		sync();
		times(i) = get_time() - start;
	}
	// sort so median/min/max are positional
	sort_i64(num_iter, times);
	print_string("Timing: ");
	print_f64(times(num_iter/2) as f64 / 1000.0);
	print_string(" | ");
	print_f64(times(0) as f64 / 1000.0);
	print_string(" | ");
	print_f64(times(num_iter-1) as f64 / 1000.0);
	print_string(" (median(");
	print_i32(num_iter);
	print_string(") | minimum | maximum) ms\n");
	// copy the median out before releasing the backing buffer
	let median = times(num_iter/2);
	release(times_buf);
	median
}
// Default iteration counts and convenience wrappers that accumulate the
// measured median into the global kernel/CPU timing totals.
static iter_acc = 30;
static iter_cpu = 270;
fn @benchmark_acc( body: fn() -> ()) = @|acc: Accelerator| total_kernel_timing += benchmark(get_kernel_time, iter_acc, body, acc.sync);
fn @benchmark_cpu( body: fn() -> ()) = @|| total_cpu_timing += benchmark(get_micro_time, iter_cpu, body, @||{});
fn @benchmark_cuda(body: fn() -> ()) = @|dev: i32, N: i32| total_kernel_timing += benchmark(get_kernel_time, N, body, @|| synchronize_cuda(dev));
// Print the accumulated CPU and kernel timing totals in milliseconds.
fn print_total_timing() -> () {
	print_string("Total timing for cpu / kernel: ");
	print_f64(total_cpu_timing as f64 / 1000.0);
	print_string(" / ");
	print_f64(total_kernel_timing as f64 / 1000.0);
	print_string(" ms\n")
}
// In-place ascending insertion sort over the first `num` entries of `arr`
// (stable; the (?num) filter allows specialization for constant sizes).
fn @(?num) sort_i64(num: i32, arr: &mut[i64]) -> () {
	// grow a sorted prefix one element at a time
	for right in range(1, num) {
		let key = arr(right);
		let mut pos = right;
		// shift larger elements one slot up to make room for key
		while pos > 0 && arr(pos - 1) > key {
			arr(pos) = arr(pos - 1);
			pos = pos - 1;
		}
		arr(pos) = key;
	}
}
// In-place ascending insertion sort over the first `num` entries of `arr`
// (f32 variant; stable, specializable via the (?num) filter).
fn @(?num) sort_f32(num: i32, arr: &mut[f32]) -> () {
	// grow a sorted prefix one element at a time
	for right in range(1, num) {
		let key = arr(right);
		let mut pos = right;
		// shift larger elements one slot up to make room for key
		while pos > 0 && arr(pos - 1) > key {
			arr(pos) = arr(pos - 1);
			pos = pos - 1;
		}
		arr(pos) = key;
	}
}
// Print a single newline.
fn print_endl() -> () { print_string("\n"); }
// Small deterministic PRNG helpers.
mod rng {
	// One step of Marsaglia's xorshift32 (shift constants 13/17/5);
	// full period 2^32 - 1 for any non-zero state, but 0 maps to 0.
	fn @xorshift32(state: u32) -> u32 {
		let mut x = state;
		x ^= x << 13;
		x ^= x >> 17;
		x ^= x << 5;
		x
	}
	// Derive a seed from an index: offset and scale so nearby indices
	// (including 0) produce well-scrambled, distinct seeds.
	fn @xorseed32(i: u32) -> u32 {
		xorshift32(xorshift32((i + 23) * 42))
	}
}
// Row-oriented CSV output: `row` takes a body that visits each value in
// order; `flush` forces the output stream out.
struct CSVPrinter {
	row: fn(fn(ValueVisitor) -> ()) -> fn() -> (),
	flush: fn() -> ()
}
// Build a CSVPrinter that separates values with ';' and ends each row
// with a newline; the separator is emitted before every value but the first.
fn @make_csv_printer() = CSVPrinter {
	row = @|body|@|| {
		let mut first = true;
		// print the ';' separator before any value after the first one
		let sep = @|| { if !first { print_char(';'); } first = false; };
		@body(ValueVisitor {
			enum_string = @|v| { sep(); print_string(v); },
			enum_i32 = @|v| { sep(); print_i32(v); },
			enum_i64 = @|v| { sep(); print_i64(v); },
			enum_f32 = @|v| { sep(); print_f32(v); }
		});
		print_char('\n');
	},
	flush = print_flush
};
#[import(cc = "C")] fn FINGERPRINT() -> &[u8];
#[import(cc = "C")] fn parse_int_arg(&mut u8, i32) -> i32;
#[import(cc = "C")] fn parse_float_arg(&mut u8, i32) -> f32;
#[import(cc = "C")] fn enum_int_arg(&mut u8, &[u8]) -> ();
#[import(cc = "C")] fn enum_float_arg(&mut u8, &[u8]) -> ();
#[import(cc = "C")] fn throw_usage_error(&[u8]) -> ();
// Sequential command-line argument reader: each parse_* consumes the next
// positional argument.
struct CmdArgs {
	parse_int: fn() -> i32,
	parse_float: fn() -> f32
}
// Build a CmdArgs over (argc, argv): maintains a cursor over the argument
// list; running past the end raises a usage error via the C helper.
fn @make_cmd_args(argc: i32, argv: &mut u8) {
	let mut i = 0;
	let next_arg = @|| {
		if i >= argc {
			throw_usage_error("missing arguments");
		}
		// post-increment: return the current index, then advance
		let tmp = i;
		i = i + 1;
		tmp
	};
	CmdArgs {
		parse_int = @|| parse_int_arg(argv, next_arg()),
		parse_float = @|| parse_float_arg(argv, next_arg())
	}
}
// Visitor over declared benchmark parameters; each callback returns
// whether visitation should continue.
struct BenchmarkParamsVisitor {
	visit_int_param: fn(&[u8]) -> bool,
	visit_float_param: fn(&[u8]) -> bool
}
// Chain an additional parameter visit onto an existing visit function;
// short-circuits if the upstream visitation already returned false.
fn wrap_visit_func_int(visit_func: fn(BenchmarkParamsVisitor) -> bool, name: &[u8]) -> fn(BenchmarkParamsVisitor) -> bool {
	@|visitor| if visit_func(visitor) { visitor.visit_int_param(name) } else { false }
}
fn wrap_visit_func_float(visit_func: fn(BenchmarkParamsVisitor) -> bool, name: &[u8]) -> fn(BenchmarkParamsVisitor) -> bool {
	@|visitor| if visit_func(visitor) { visitor.visit_float_param(name) } else { false }
}
// Stage 7 of the typed parameter-builder chain: the chain currently tops
// out here (the add_int/add_float extensions to 8 are disabled). `bind`
// adapts a 7-ary benchmark entry point to the (device, args) interface.
struct BenchmarkParams7[A1, A2, A3, A4, A5, A6, A7] {
	// add_int: fn(&[u8]) -> BenchmarkParams8[A1, A2, A3, A4, A5, A6, A7, i32],
	// add_float: fn(&[u8]) -> BenchmarkParams8[A1, A2, A3, A4, A5, A6, A7, f32],
	visit: fn(BenchmarkParamsVisitor) -> BenchmarkParamsVisitor,
	bind: fn(fn(AccDevice, A1, A2, A3, A4, A5, A6, A7) -> i32) -> fn(AccDevice, CmdArgs) -> (i32, CmdArgs)
}
fn @make_cmd_args_7[A1, A2, A3, A4, A5, A6, A7](visit: fn(BenchmarkParamsVisitor) -> bool, bind: fn(fn(AccDevice, A1, A2, A3, A4, A5, A6, A7) -> i32) -> fn(AccDevice, CmdArgs) -> (i32, CmdArgs)) = BenchmarkParams7[A1, A2, A3, A4, A5, A6, A7] {
	// add_int = @|name| make_cmd_args_8[A1, A2, A3, A4, A5, A6, A7, i32](wrap_visit_func_int(visit, name), @|f| @|device, args| bind(@|device, a1, a2, a3, a4, a5, a6, a7| f(device, a1, a2, a3, a4, a5, a6, a7, args.parse_int()))(device, args)),
	// add_float = @|name| make_cmd_args_8[A1, A2, A3, A4, A5, A6, A7, f32](wrap_visit_func_float(visit, name), @|f| @|device, args| bind(@|device, a1, a2, a3, a4, a5, a6, a7| f(device, a1, a2, a3, a4, a5, a6, a7, args.parse_float()))(device, args)),
	visit = @|visitor| { visit(visitor); visitor },
	bind = bind
};
// Stage 6 of the parameter-builder chain: add_int/add_float append one more
// typed parameter (parsed from the next CLI argument) and move to stage 7.
struct BenchmarkParams6[A1, A2, A3, A4, A5, A6] {
	add_int: fn(&[u8]) -> BenchmarkParams7[A1, A2, A3, A4, A5, A6, i32],
	add_float: fn(&[u8]) -> BenchmarkParams7[A1, A2, A3, A4, A5, A6, f32],
	visit: fn(BenchmarkParamsVisitor) -> BenchmarkParamsVisitor,
	bind: fn(fn(AccDevice, A1, A2, A3, A4, A5, A6) -> i32) -> fn(AccDevice, CmdArgs) -> (i32, CmdArgs)
}
fn @make_cmd_args_6[A1, A2, A3, A4, A5, A6](visit: fn(BenchmarkParamsVisitor) -> bool, bind: fn(fn(AccDevice, A1, A2, A3, A4, A5, A6) -> i32) -> fn(AccDevice, CmdArgs) -> (i32, CmdArgs)) = BenchmarkParams6[A1, A2, A3, A4, A5, A6] {
	add_int = @|name| make_cmd_args_7[A1, A2, A3, A4, A5, A6, i32](wrap_visit_func_int(visit, name), @|f| @|device, args| bind(@|device, a1, a2, a3, a4, a5, a6| f(device, a1, a2, a3, a4, a5, a6, args.parse_int()))(device, args)),
	add_float = @|name| make_cmd_args_7[A1, A2, A3, A4, A5, A6, f32](wrap_visit_func_float(visit, name), @|f| @|device, args| bind(@|device, a1, a2, a3, a4, a5, a6| f(device, a1, a2, a3, a4, a5, a6, args.parse_float()))(device, args)),
	visit = @|visitor| { visit(visitor); visitor },
	bind = bind
};
// Stage 5 of the parameter-builder chain (see BenchmarkParams6 for the pattern).
struct BenchmarkParams5[A1, A2, A3, A4, A5] {
	add_int: fn(&[u8]) -> BenchmarkParams6[A1, A2, A3, A4, A5, i32],
	add_float: fn(&[u8]) -> BenchmarkParams6[A1, A2, A3, A4, A5, f32],
	visit: fn(BenchmarkParamsVisitor) -> BenchmarkParamsVisitor,
	bind: fn(fn(AccDevice, A1, A2, A3, A4, A5) -> i32) -> fn(AccDevice, CmdArgs) -> (i32, CmdArgs)
}
fn @make_cmd_args_5[A1, A2, A3, A4, A5](visit: fn(BenchmarkParamsVisitor) -> bool, bind: fn(fn(AccDevice, A1, A2, A3, A4, A5) -> i32) -> fn(AccDevice, CmdArgs) -> (i32, CmdArgs)) = BenchmarkParams5[A1, A2, A3, A4, A5] {
	add_int = @|name| make_cmd_args_6[A1, A2, A3, A4, A5, i32](wrap_visit_func_int(visit, name), @|f| @|device, args| bind(@|device, a1, a2, a3, a4, a5| f(device, a1, a2, a3, a4, a5, args.parse_int()))(device, args)),
	add_float = @|name| make_cmd_args_6[A1, A2, A3, A4, A5, f32](wrap_visit_func_float(visit, name), @|f| @|device, args| bind(@|device, a1, a2, a3, a4, a5| f(device, a1, a2, a3, a4, a5, args.parse_float()))(device, args)),
	visit = @|visitor| { visit(visitor); visitor },
	bind = bind
};
// Stage 4 of the parameter-builder chain (see BenchmarkParams6 for the pattern).
struct BenchmarkParams4[A1, A2, A3, A4] {
	add_int: fn(&[u8]) -> BenchmarkParams5[A1, A2, A3, A4, i32],
	add_float: fn(&[u8]) -> BenchmarkParams5[A1, A2, A3, A4, f32],
	visit: fn(BenchmarkParamsVisitor) -> BenchmarkParamsVisitor,
	bind: fn(fn(AccDevice, A1, A2, A3, A4) -> i32) -> fn(AccDevice, CmdArgs) -> (i32, CmdArgs)
}
fn @make_cmd_args_4[A1, A2, A3, A4](visit: fn(BenchmarkParamsVisitor) -> bool, bind: fn(fn(AccDevice, A1, A2, A3, A4) -> i32) -> fn(AccDevice, CmdArgs) -> (i32, CmdArgs)) = BenchmarkParams4[A1, A2, A3, A4] {
	add_int = @|name| make_cmd_args_5[A1, A2, A3, A4, i32](wrap_visit_func_int(visit, name), @|f| @|device, args| bind(@|device, a1, a2, a3, a4| f(device, a1, a2, a3, a4, args.parse_int()))(device, args)),
	add_float = @|name| make_cmd_args_5[A1, A2, A3, A4, f32](wrap_visit_func_float(visit, name), @|f| @|device, args| bind(@|device, a1, a2, a3, a4| f(device, a1, a2, a3, a4, args.parse_float()))(device, args)),
	visit = @|visitor| { visit(visitor); visitor },
	bind = bind
};
// Stage 3 of the parameter-builder chain (see BenchmarkParams6 for the pattern).
struct BenchmarkParams3[A1, A2, A3] {
	add_int: fn(&[u8]) -> BenchmarkParams4[A1, A2, A3, i32],
	add_float: fn(&[u8]) -> BenchmarkParams4[A1, A2, A3, f32],
	visit: fn(BenchmarkParamsVisitor) -> BenchmarkParamsVisitor,
	bind: fn(fn(AccDevice, A1, A2, A3) -> i32) -> fn(AccDevice, CmdArgs) -> (i32, CmdArgs)
}
fn @make_cmd_args_3[A1, A2, A3](visit: fn(BenchmarkParamsVisitor) -> bool, bind: fn(fn(AccDevice, A1, A2, A3) -> i32) -> fn(AccDevice, CmdArgs) -> (i32, CmdArgs)) = BenchmarkParams3[A1, A2, A3] {
	add_int = @|name| make_cmd_args_4[A1, A2, A3, i32](wrap_visit_func_int(visit, name), @|f| @|device, args| bind(@|device, a1, a2, a3| f(device, a1, a2, a3, args.parse_int()))(device, args)),
	add_float = @|name| make_cmd_args_4[A1, A2, A3, f32](wrap_visit_func_float(visit, name), @|f| @|device, args| bind(@|device, a1, a2, a3| f(device, a1, a2, a3, args.parse_float()))(device, args)),
	visit = @|visitor| { visit(visitor); visitor },
	bind = bind
};
// Stage 2 of the parameter-builder chain (see BenchmarkParams6 for the pattern).
struct BenchmarkParams2[A1, A2] {
	add_int: fn(&[u8]) -> BenchmarkParams3[A1, A2, i32],
	add_float: fn(&[u8]) -> BenchmarkParams3[A1, A2, f32],
	visit: fn(BenchmarkParamsVisitor) -> BenchmarkParamsVisitor,
	bind: fn(fn(AccDevice, A1, A2) -> i32) -> fn(AccDevice, CmdArgs) -> (i32, CmdArgs)
}
fn @make_cmd_args_2[A1, A2](visit: fn(BenchmarkParamsVisitor) -> bool, bind: fn(fn(AccDevice, A1, A2) -> i32) -> fn(AccDevice, CmdArgs) -> (i32, CmdArgs)) = BenchmarkParams2[A1, A2] {
	add_int = @|name| make_cmd_args_3[A1, A2, i32](wrap_visit_func_int(visit, name), @|f| @|device, args| bind(@|device, a1, a2| f(device, a1, a2, args.parse_int()))(device, args)),
	add_float = @|name| make_cmd_args_3[A1, A2, f32](wrap_visit_func_float(visit, name), @|f| @|device, args| bind(@|device, a1, a2| f(device, a1, a2, args.parse_float()))(device, args)),
	visit = @|visitor| { visit(visitor); visitor },
	bind = bind
};
// Stage 1 of the parameter-builder chain (see BenchmarkParams6 for the pattern).
struct BenchmarkParams1[A1] {
	add_int: fn(&[u8]) -> BenchmarkParams2[A1, i32],
	add_float: fn(&[u8]) -> BenchmarkParams2[A1, f32],
	visit: fn(BenchmarkParamsVisitor) -> BenchmarkParamsVisitor,
	bind: fn(fn(AccDevice, A1) -> i32) -> fn(AccDevice, CmdArgs) -> (i32, CmdArgs)
}
fn @make_cmd_args_1[A1](visit: fn(BenchmarkParamsVisitor) -> bool, bind: fn(fn(AccDevice, A1) -> i32) -> fn(AccDevice, CmdArgs) -> (i32, CmdArgs)) = BenchmarkParams1[A1] {
	add_int = @|name| make_cmd_args_2[A1, i32](wrap_visit_func_int(visit, name), @|f| @|device, args| bind(@|device, a1| f(device, a1, args.parse_int()))(device, args)),
	add_float = @|name| make_cmd_args_2[A1, f32](wrap_visit_func_float(visit, name), @|f| @|device, args| bind(@|device, a1| f(device, a1, args.parse_float()))(device, args)),
	visit = @|visitor| { visit(visitor); visitor },
	bind = bind
};
// Root of the parameter-builder chain: a benchmark taking only the device.
// Each add_* call declares one more CLI-parsed parameter and yields the
// next typed stage; `bind` adapts the final entry point to (device, args).
struct BenchmarkParams {
	add_int: fn(&[u8]) -> BenchmarkParams1[i32],
	add_float: fn(&[u8]) -> BenchmarkParams1[f32],
	visit: fn(BenchmarkParamsVisitor) -> BenchmarkParamsVisitor,
	bind: fn(fn(AccDevice) -> i32) -> fn(AccDevice, CmdArgs) -> (i32, CmdArgs)
}
fn @make_benchmark_params() = BenchmarkParams {
	add_int = @|name| make_cmd_args_1[i32](@|visitor| visitor.visit_int_param(name), @|f| @|device, args| (f(device, args.parse_int()), args)),
	add_float = @|name| make_cmd_args_1[f32](@|visitor| visitor.visit_float_param(name), @|f| @|device, args| (f(device, args.parse_float()), args)),
	visit = @|visitor| visitor,
	bind = @|f| @|device, args| (f(device), args)
};
// Exported entry point: report the benchmark's declared parameters to the
// host driver by invoking the C-side enum_*_arg callbacks for each name.
#[export]
fn benchmark_enum_args(ctx: &mut u8) {
	let params = describe_benchmark_params(make_benchmark_params());
	params.visit(BenchmarkParamsVisitor {
		visit_int_param = @|name| { enum_int_arg(ctx, name); true },
		visit_float_param = @|name| { enum_float_arg(ctx, name); true },
	});
}
// Exported entry point: print the device name and build fingerprint;
// always returns 0.
#[export]
fn benchmark_print_info(device: i32) {
	print_string(runtime_device_name(createAccDevice(device).platform_device)); print_char('\n');
	print_string(FINGERPRINT()); print_char('\n');
	0
}
// Exported entry point: validate the argument count against the declared
// parameter list, then bind and run the benchmark, returning its result.
#[export]
fn benchmark_run(device: i32, argc: i32, argv: &mut u8) {
	let params = describe_benchmark_params(make_benchmark_params());
	// count declared parameters by visiting them
	let num_params = {
		let mut n = 0;
		params.visit(BenchmarkParamsVisitor {
			visit_int_param = @|_name| { n = n + 1; true },
			visit_float_param = @|_name| { n = n + 1; true },
		});
		n
	};
	if argc != num_params {
		throw_usage_error("incorrect number of arguments");
	}
	let run = params.bind(run_benchmark);
	// .0 extracts the benchmark's i32 result from the (result, args) pair
	run(createAccDevice(device), make_cmd_args(argc, argv)).0
}
#[import(cc = "C")] fn instrumentation_create(i32) -> &mut u8;
#[import(cc = "C")] fn instrumentation_begin(&mut u8) -> ();
#[import(cc = "C")] fn instrumentation_end(&mut u8) -> f32;
#[import(cc = "C")] fn instrumentation_destroy(&mut u8) -> ();
// Timing scope backed by the C instrumentation API: time(body)() runs the
// body between begin/end markers and yields the elapsed time as f32.
struct Instrumentation {
	time: fn(fn() -> ()) -> fn() -> f32,
	destroy: fn() -> ()
}
// Create an Instrumentation context for the given device.
// The >> 4 shift presumably strips platform bits packed into the low
// nibble of platform_device — acknowledged as a hack below; verify
// against the runtime's device-id encoding before changing.
fn @create_instrumentation(device: AccDevice) {
	let ctx = instrumentation_create(device.platform_device >> 4);
	// ^ HACK!
	Instrumentation {
		time = @|body|@|| {
			instrumentation_begin(ctx);
			@body();
			instrumentation_end(ctx)
		},
		destroy = @|| instrumentation_destroy(ctx)
	}
}
// Invoke `body` for min, min*exp, min*exp^2, … while the value is <= max.
// The (?min & ?max & ?exp) filter lets the recursion unroll completely
// when all three bounds are compile-time constants.
fn @upsweep_exponential(body: fn(i32) -> ()) {
	fn @(?min & ?max & ?exp) loop(min: i32, max: i32, exp: i32) -> () {
		if min <= max {
			@body(min);
			loop(min * exp, max, exp);
		}
	}
	loop
}
// Visitor for emitting heterogeneous values (used by the CSV printer and
// result enumeration).
struct ValueVisitor {
	enum_string: fn(&[u8]) -> (),
	enum_i32: fn(i32) -> (),
	enum_i64: fn(i64) -> (),
	enum_f32: fn(f32) -> (),
}
// A queue benchmark: exposes its parameter names/values for reporting,
// a per-run reset, and the timed run itself (instrumentation scope,
// thread count, iteration index).
struct QueueBenchmark {
	enum_param_names: fn(fn(&[u8]) -> ()) -> (),
	enum_param_values: fn(ValueVisitor) -> (),
	reset: fn(grid_context) -> (),
	run: fn(QueueInstrumentationScope, i32, i32) -> ()
}
// Drive one queue benchmark end to end: construct the queue, emit CSV
// metadata rows, then sweep thread counts from num_threads_min to
// num_threads_max (doubling each step) with N=10 timed repetitions per
// count. Returns 0 on success, -1 if queue construction failed.
fn @run_queue_benchmark(device: AccDevice, block_size: i32, benchmark: QueueBenchmark, queue_name: &[u8], create_queue: queue_constructor[u32], queue_size: i32, num_threads_min: i32, num_threads_max: i32, create_queue_instrumentation: fn(AccDevice) -> QueueInstrumentation) -> i32 {
	match create_queue(device, queue_size) {
		create_queue_result[u32]::Ok(queue) => {
			let instrumentation = create_instrumentation(device);
			let queue_instrumentation = create_queue_instrumentation(device);
			let csv = make_csv_printer();
			// doing this during printing messes with the PE, probably due to the external calls
			let device_name = runtime_device_name(device.platform_device);
			let fingerprint = FINGERPRINT();
			// metadata header rows, blank-line separated from the data table
			for r in csv.row() { r.enum_string("queue_type"); r.enum_string("queue_size"); benchmark.enum_param_names(r.enum_string); }
			for r in csv.row() { r.enum_string(queue_name); r.enum_i32(queue_size); benchmark.enum_param_values(r); }
			for _ in csv.row() { }
			for r in csv.row() { r.enum_string("platform"); r.enum_string("device_name"); r.enum_string("fingerprint"); }
			for r in csv.row() { r.enum_string(device.platform_name); r.enum_string(device_name); r.enum_string(fingerprint); }
			for _ in csv.row() { }
			for r in csv.row() { r.enum_string("num_threads"); r.enum_string("t/ms"); queue_instrumentation.enum_result_names(r.enum_string); }
			let N = 10;
			for num_threads in upsweep_exponential(num_threads_min, num_threads_max, 2) {
				for i in range(0, N) {
					for r in csv.row() {
						r.enum_i32(num_threads);
						// TODO: better launch config (launch based on occupancy?)
						// TODO: the queue reset may require launch config of benchmark
						for grid in device.launch_1d(1, block_size) {
							queue.reset(grid);
							queue_instrumentation.reset(grid);
							benchmark.reset(grid);
						}
						device.synchronize();
						// the timed region covers only the instrumented benchmark run
						let t = for instrumentation.time() {
							for qis in queue_instrumentation.record(queue) {
								benchmark.run(qis, num_threads, i);
							}
						};
						device.synchronize();
						r.enum_f32(t);
						queue_instrumentation.results().enum_values(r);
					}
					csv.flush();
				}
			}
			0
		},
		create_queue_result[u32]::Err(msg) => {
			print_string(msg);
			print_endl();
			-1
		}
	}
}
// Aggregated timing for one class of queue operation (count, total time,
// and per-operation min/max), plus the four-way breakdown used by the
// benchmark: enqueue/dequeue x success/failure.
struct QueueOperationStatistics {
	num_operations: i64,
	t_total: i64,
	t_min: i32,
	t_max: i32,
}
struct QueueBenchmarkStatistics {
	enqueue_stats_succ: QueueOperationStatistics,
	enqueue_stats_fail: QueueOperationStatistics,
	dequeue_stats_succ: QueueOperationStatistics,
	dequeue_stats_fail: QueueOperationStatistics
}
// Identity element for stats accumulation: zero counts/totals, t_min
// starts at i32 max ((1 << 31) - 1) so any real sample replaces it.
fn @init_queue_operation_stats() = QueueOperationStatistics {
	num_operations = 0,
	t_total = 0,
	t_min = ((1 as u32 << 31) - 1) as i32,
	t_max = 0
};
// Fold one operation's elapsed time `t` into the running statistics.
fn @record_queue_operation(stats: &mut QueueOperationStatistics, t: i32) {
	++stats.num_operations;
	stats.t_total += t as i64;
	stats.t_min = min(stats.t_min, t);
	stats.t_max = max(stats.t_max, t);
}
// Fresh (empty) four-way benchmark statistics.
fn @init_benchmark_stats() = QueueBenchmarkStatistics {
	enqueue_stats_succ = init_queue_operation_stats(),
	enqueue_stats_fail = init_queue_operation_stats(),
	dequeue_stats_succ = init_queue_operation_stats(),
	dequeue_stats_fail = init_queue_operation_stats(),
};
// Merge thread-local statistics into the global accumulator using relaxed
// atomics (only the final aggregate matters, no ordering is required).
fn @accumulate_queue_operation_stats(dest: &mut addrspace(1) QueueOperationStatistics, stats: QueueOperationStatistics, thread: thread_context) {
	thread.atomic_add_global_i64(dest.num_operations, stats.num_operations, memory_order::relaxed);
	thread.atomic_add_global_i64(dest.t_total, stats.t_total, memory_order::relaxed);
	thread.atomic_min_global_i32(dest.t_min, stats.t_min, memory_order::relaxed);
	thread.atomic_max_global_i32(dest.t_max, stats.t_max, memory_order::relaxed);
}
// Merge all four operation-class accumulators at once.
fn @accumulate_benchmark_stats(dest: &mut addrspace(1) QueueBenchmarkStatistics, stats: QueueBenchmarkStatistics, thread: thread_context) {
	accumulate_queue_operation_stats(&mut dest.enqueue_stats_succ, stats.enqueue_stats_succ, thread);
	accumulate_queue_operation_stats(&mut dest.enqueue_stats_fail, stats.enqueue_stats_fail, thread);
	accumulate_queue_operation_stats(&mut dest.dequeue_stats_succ, stats.dequeue_stats_succ, thread);
	accumulate_queue_operation_stats(&mut dest.dequeue_stats_fail, stats.dequeue_stats_fail, thread);
}
// Wrap `queue` so every push/pop is timed with the thread's 32-bit
// timestamp and recorded into `stats`, split by outcome: success when at
// least one element was transferred, failure otherwise. All other queue
// operations pass through untouched.
fn @wrap_queue_instrumentation[T](queue: ProducerConsumerQueue[T], stats: &mut QueueBenchmarkStatistics) -> ProducerConsumerQueue[T] {
	ProducerConsumerQueue[T] {
		push = @|source:fn()->T| @|thread:thread_context| -> i32 {
			let start = thread.timestamp32();
			let num_pushed = for queue.push(thread) {
				@source()
			};
			let elapsed = thread.timestamp32() - start;
			if num_pushed > 0 {
				record_queue_operation(stats.enqueue_stats_succ, elapsed);
			}
			else {
				record_queue_operation(stats.enqueue_stats_fail, elapsed);
			}
			num_pushed
		},
		pop = @|sink:fn(T)->()| @|thread:thread_context| -> i32 {
			let start = thread.timestamp32();
			let num_popped = for el in queue.pop(thread) {
				@sink(el);
			};
			let elapsed = thread.timestamp32() - start;
			if num_popped > 0 {
				record_queue_operation(stats.dequeue_stats_succ, elapsed);
			}
			else {
				record_queue_operation(stats.dequeue_stats_fail, elapsed);
			}
			num_popped
		},
		size = queue.size,
		reset = queue.reset,
		validate = queue.validate,
		release = queue.release
	}
}
// Queue instrumentation interfaces: Results enumerates collected values,
// Scope wraps a queue for one recorded thread context, and
// QueueInstrumentation ties reset/record/results together for a run.
struct QueueInstrumentationResults {
	enum_values: fn(ValueVisitor) -> ()
}
struct QueueInstrumentationScope {
	record: fn(fn(ProducerConsumerQueue[u32]) -> ()) -> fn(thread_context) -> ()
}
struct QueueInstrumentation {
	enum_result_names: fn(fn(&[u8]) -> ()) -> (),
	reset: fn(grid_context) -> (),
	record: fn(fn(QueueInstrumentationScope) -> ()) -> fn(ProducerConsumerQueue[u32]) -> (),
	results: fn() -> QueueInstrumentationResults,
}
// Full instrumentation: each recorded thread accumulates into thread-local
// statistics, which are atomically merged into a single device-side
// QueueBenchmarkStatistics block; results() copies that block to the host.
fn @create_queue_instrumentation_full(device: AccDevice) {
	let stats_buffer_alloc = device.alloc(sizeof[QueueBenchmarkStatistics]())	;
	let stats_buffer = stats_buffer_alloc.data as &mut addrspace(1) QueueBenchmarkStatistics;
	// TODO: release resources
	QueueInstrumentation {
		enum_result_names = enum_queue_instrumentation_result_names,
		reset = @|grid| {
			// a single thread re-initializes the shared statistics block
			for thread in grid.threads() {
				if thread.idx(0) == 0 {
					*stats_buffer = init_benchmark_stats();
				}
			}
		},
		record = @|body|@|queue| {
			@body(QueueInstrumentationScope {
				record = @|body|@|thread| {
					// per-thread local stats, merged once at the end
					let mut stats = init_benchmark_stats();
					let q = wrap_queue_instrumentation(queue, &mut stats);
					@body(q);
					accumulate_benchmark_stats(stats_buffer, stats, thread);
				}
			});
		},
		results = @|| {
			let mut stats: QueueBenchmarkStatistics;
			runtime_copy(device.platform_device, stats_buffer_alloc.data, 0, 0, &mut stats as &mut[i8], 0, sizeof[QueueBenchmarkStatistics]());
			QueueInstrumentationResults {
				enum_values = enum_queue_instrumentation_result_values(stats)
			}
		}
	}
}
// Column names for the instrumentation results; must stay in the same
// order as enum_queue_instrumentation_result_values below.
fn @enum_queue_instrumentation_result_names(enum_name: fn(&[u8]) -> ()) {
	enum_name("num_enqueues"); enum_name("t_enqueues"); enum_name("t_enqueue_min"); enum_name("t_enqueue_max");
	enum_name("num_enqueues_failed"); enum_name("t_enqueues_failed"); enum_name("t_enqueue_failed_min"); enum_name("t_enqueue_failed_max");
	enum_name("num_dequeues"); enum_name("t_dequeues"); enum_name("t_dequeue_min"); enum_name("t_dequeue_max");
	enum_name("num_dequeues_failed"); enum_name("t_dequeues_failed"); enum_name("t_dequeue_failed_min"); enum_name("t_dequeue_failed_max");
}
// Emit result values in the exact column order declared by
// enum_queue_instrumentation_result_names (returns a ValueVisitor consumer).
fn @enum_queue_instrumentation_result_values(stats: QueueBenchmarkStatistics) {
	@|v: ValueVisitor| {
		v.enum_i64(stats.enqueue_stats_succ.num_operations); v.enum_i64(stats.enqueue_stats_succ.t_total); v.enum_i32(stats.enqueue_stats_succ.t_min); v.enum_i32(stats.enqueue_stats_succ.t_max);
		v.enum_i64(stats.enqueue_stats_fail.num_operations); v.enum_i64(stats.enqueue_stats_fail.t_total); v.enum_i32(stats.enqueue_stats_fail.t_min); v.enum_i32(stats.enqueue_stats_fail.t_max);
		v.enum_i64(stats.dequeue_stats_succ.num_operations); v.enum_i64(stats.dequeue_stats_succ.t_total); v.enum_i32(stats.dequeue_stats_succ.t_min); v.enum_i32(stats.dequeue_stats_succ.t_max);
		v.enum_i64(stats.dequeue_stats_fail.num_operations); v.enum_i64(stats.dequeue_stats_fail.t_total); v.enum_i32(stats.dequeue_stats_fail.t_min); v.enum_i32(stats.dequeue_stats_fail.t_max);
	}
}
// No-op instrumentation: the queue is passed through unwrapped, nothing is
// recorded, and the result set is empty — useful as a zero-overhead baseline.
fn @create_queue_instrumentation_none(_device: AccDevice) {
	QueueInstrumentation {
		enum_result_names = @|_| {},
		reset = @|_| {},
		record = @|body|@|queue| {
			@body(QueueInstrumentationScope {
				record = @|body|@|_| {
					@body(queue);
				}
			});
		},
		results = @|| {
			QueueInstrumentationResults {
				enum_values = @|_| {}
			}
		}
	}
}
// Declare this benchmark's CLI parameters (order defines the positional
// argument order expected by benchmark_run).
fn @describe_benchmark_params(params: BenchmarkParams) {
	params.add_int("num-threads-min")
	      .add_int("num-threads-max")
	      .add_int("block-size")
	      .add_int("input-elements")
	      .add_int("workload-size-producer")
	      .add_int("workload-size-consumer")
}
// Benchmark entry point bound by benchmark_run: runs the simple pipeline
// benchmark against the CPCQ queue with full instrumentation.
// NOTE(review): the queue size 16384 is hard-coded here rather than
// derived from the parameters.
fn @run_benchmark(device: AccDevice, num_threads_min: i32, num_threads_max: i32, block_size: i32, num_input_elements: i32, workload_size_producer: i32, workload_size_consumer: i32) {
	run_queue_benchmark(
		device,
		block_size,
		simple_pipeline_benchmark(device, block_size, num_input_elements, workload_size_producer, workload_size_consumer),
		"CPCQ",
		createConcurrentProducerConsumerIndexQueue,
		16384,
		num_threads_min,
		num_threads_max,
		create_queue_instrumentation_full)
}
// Concurrent producer/consumer queue internals: a control block holding
// the element count and the head/tail ring indices, plus a per-slot
// wrapper that pairs each payload with a lock word.
mod CPCQ {
	struct Queue {
		// number of elements currently in the queue (may transiently
		// over/undershoot during concurrent reservation)
		size: i32,
		// ring indices: head = next slot to pop, tail = next slot to push
		head: u32,
		tail: u32
	}
	struct Element[T] {
		lock: u32,
		data: T
	}
}
// Per-slot element interface (clear/availability/store/load plus a debug
// dump) and its factory: the type describes the backing buffer layout
// (size for n elements, alignment) and materializes the i-th slot view.
struct QueueElement[T] {
	clear: fn() -> (),
	is_free: fn() -> bool,
	store: fn(fn() -> T, thread_context) -> (),
	load: fn(fn(T) -> (), thread_context) -> (),
	debug_print: fn(AccDevice) -> ()
}
struct QueueElementType[T] {
	buffer_size: fn(i32) -> i64,
	buffer_alignment: fn() -> i64,
	buffer_element: fn(&mut addrspace(1) [u8], u32) -> QueueElement[T]
}
// note (cc < 7.0): threads within the same warp must only ever either enqueue or dequeue stuff concurrently
// this is fine as long as a warp only ever acts as either a producer or consumer at a time
//
// Creates a bounded multi-producer/multi-consumer queue in device global
// memory. Layout: a CPCQ::Queue header (size/head/tail counters) followed, at
// an offset rounded up to the element buffer's alignment, by num_elements
// slots managed by element_type.
//
// push/pop first reserve capacity via the `size` counter, then claim a slot
// index by atomically incrementing tail/head with wrap-around at num_elements,
// and finally hand the slot to the element protocol, which synchronizes the
// actual per-slot data transfer.
fn @createConcurrentProducerConsumerQueue[T](device: AccDevice, element_type: QueueElementType[T], num_elements: i32, opt: bool) -> create_queue_result[T] {
    // NOTE(review): num_elements == 0 passes this check, but the atomic_inc
    // bound below becomes (0 - 1) as u32 = 0xffffffff — presumably callers
    // never request zero-size queues; confirm
    if num_elements < 0 {
        return(create_queue_result[T]::Err("invalid queue size"))
    }
    // the wrap-around of the head/tail increment relies on either hardware
    // support for non-power-of-two atomic_inc bounds or a power-of-two size
    if !device.supports_npot_atomic_inc && !is_pot(num_elements as u32) {
        return(create_queue_result[T]::Err("queue size must be power of two"))
    }
    let buffer_size = element_type.buffer_size(num_elements);
    let buffer_alignment = element_type.buffer_alignment();
    // element storage starts after the queue header, aligned for the slots
    let buffer_data_offset = round_up_i64(sizeof[CPCQ::Queue](), buffer_alignment);
    let queue_device_state_alloc = device.alloc(buffer_data_offset + buffer_size);
    let queue_device_memory = queue_device_state_alloc.data as &mut addrspace(1) [u8];
    let queue = &mut queue_device_memory(0) as &mut addrspace(1) CPCQ::Queue;
    let buffer = &mut queue_device_memory(buffer_data_offset) as &mut addrspace(1) [u8];
    let buffer_element = @|i:u32| -> QueueElement[T] { element_type.buffer_element(buffer, i) };
    // `opt` toggles full seq_cst fences around each atomic. Note that every
    // call site below passes memory_order::relaxed to `order`, so the per-op
    // orderings are relaxed on both paths — the fences are the only difference.
    let seq_cst = @|thread: thread_context| if opt { thread.memory_barrier(memory_order::seq_cst); } else {};
    let order = @|x: memory_order| if opt { x } else { memory_order::relaxed };
    create_queue_result[T]::Ok(ProducerConsumerQueue[T] {
        // returns 1 if an element was enqueued, 0 if the queue was full
        push = @|source| @|thread| {
            seq_cst(thread);
            // optimistic early-out: don't touch `size` if the queue looks full
            let current_size = thread.atomic_load_global_i32(queue.size, order(memory_order::relaxed));
            if current_size >= num_elements {
                0
            }
            else {
                // reserve capacity; `size` may transiently exceed num_elements
                // and is rolled back below if this reservation overshot
                let new_size = thread.atomic_add_global_i32(queue.size, 1, order(memory_order::relaxed));
                seq_cst(thread);
                for anyq_verbose() {
                    device.print_3xi32("%d | QUEUE: reserve for push %d -> %d\n", thread.gid() as i32, current_size, new_size);
                }
                if new_size >= num_elements {
                    // lost the race for the remaining capacity — undo the reservation
                    for anyq_verbose() {
                        device.print_2xi32("%d | QUEUE: reject push %d\n", thread.gid() as i32, new_size);
                    }
                    seq_cst(thread);
                    thread.atomic_sub_global_i32(queue.size, 1, order(memory_order::relaxed));
                    seq_cst(thread);
                    0
                }
                else {
                    seq_cst(thread);
                    // claim a slot: atomic increment wrapping within [0, num_elements - 1]
                    let i = thread.atomic_inc_global_u32(queue.tail, (num_elements - 1) as u32);
                    seq_cst(thread);
                    for anyq_verbose() {
                        device.print_2xi32("%d | QUEUE: move tail %d\n", thread.gid() as i32, i as i32);
                    }
                    // the element protocol spins until the slot is writable
                    buffer_element(i).store(source, thread);
                    1
                }
            }
        },
        // returns 1 if an element was dequeued into the sink, 0 if the queue was empty
        pop = @|sink| @|thread| {
            seq_cst(thread);
            // optimistic early-out: don't touch `size` if the queue looks empty
            let current_size = thread.atomic_load_global_i32(queue.size, order(memory_order::relaxed));
            if current_size <= 0 {
                0
            }
            else {
                // reserve one element; `size` may transiently go negative and
                // is rolled back below if this reservation overshot
                let available = thread.atomic_sub_global_i32(queue.size, 1, order(memory_order::relaxed));
                seq_cst(thread);
                for anyq_verbose() {
                    device.print_3xi32("%d | QUEUE: allocate for pop %d -> %d\n", thread.gid() as i32, current_size, available);
                }
                if available <= 0 {
                    // lost the race for the remaining elements — undo the reservation
                    for anyq_verbose() {
                        device.print_2xi32("%d | QUEUE: reject pop %d\n", thread.gid() as i32, available);
                    }
                    seq_cst(thread);
                    thread.atomic_add_global_i32(queue.size, 1, order(memory_order::relaxed));
                    seq_cst(thread);
                    0
                }
                else {
                    seq_cst(thread);
                    // claim a slot: atomic increment wrapping within [0, num_elements - 1]
                    let i = thread.atomic_inc_global_u32(queue.head, (num_elements - 1) as u32);
                    seq_cst(thread);
                    for anyq_verbose() {
                        device.print_2xi32("%d | QUEUE: move head %d\n", thread.gid() as i32, i as i32);
                    }
                    // the element protocol spins until the slot's data is published
                    buffer_element(i).load(sink, thread);
                    1
                }
            }
        },
        // snapshot of the element counter (may be transiently out of [0, num_elements])
        size = @|thread| {
            thread.atomic_load_global_i32(queue.size, memory_order::relaxed)
        },
        // re-initialize counters and all slots; must not run concurrently with push/pop
        reset = @|grid| {
            for thread in grid.threads() {
                if thread.idx(0) == 0 {
                    queue.size = 0;
                    queue.head = 0;
                    queue.tail = 0;
                }
                // grid-wide strided loop over all slots
                for i in range_step(thread.idx(0) as i32, num_elements, grid.num_threads(0) as i32) {
                    buffer_element(i as u32).clear();
                }
            }
        },
        // check that the queue is empty and every slot is back in its free
        // state; sets *corrupted on failure
        // NOTE(review): only slots with index < grid thread count are checked
        // (no strided loop here) — presumably the grid is launched with at
        // least num_elements threads; confirm against caller
        validate = @|corrupted, grid| {
            for thread in grid.threads() {
                let idx = thread.idx(0);
                if idx == 0 {
                    if (queue.size != 0) {
                        device.print_i32("VALIDATION ERROR: queue size (%d) is not zero!\n", queue.size);
                        thread.atomic_store_global_u32(corrupted, 1, memory_order::relaxed);
                    }
                }
                if idx < num_elements as u32 {
                    let element = buffer_element(idx);
                    if !element.is_free() {
                        thread.atomic_store_global_u32(corrupted, 1, memory_order::relaxed);
                        element.debug_print(device);
                    }
                }
            }
        },
        // free the device allocation backing both header and element storage
        release = @|| {
            release(queue_device_state_alloc);
        }
    })
}
// Element protocol for arbitrary payload types T. Each slot pairs the payload
// with a lock/state word cycling through:
//   0 (free) -> 1 (being written) -> 2 (full) -> 3 (being read) -> 0
fn @genericQueueElementType[T]() {
    let element_alignment = alignof[CPCQ::Element[T]]();
    // pad each element to its alignment so slot i lives at i * element_size
    let element_size = round_up_i64(sizeof[CPCQ::Element[T]](), element_alignment);
    QueueElementType[T] {
        buffer_size = @|num_elements| num_elements as i64 * element_size,
        buffer_alignment = @|| element_alignment,
        buffer_element = @|buffer, i| {
            let element = &mut (buffer as &mut addrspace(1) [CPCQ::Element[T]])(i);
            QueueElement[T] {
                clear = @|| element.lock = 0,     // non-atomic reset to "free"
                is_free = @|| element.lock == 0,
                load = @|sink, thread| {
                    // spin until this thread transitions the slot "full" (2) -> "being read" (3)
                    thread.wait(@|| thread.atomic_cas_global_u32_weak(element.lock, 2, 3, memory_order::acquire, memory_order::relaxed).1, "wait for element lock to read");
                    sink(element.data);
                    // release the slot back to "free" so a producer can reuse it
                    thread.atomic_store_global_u32(element.lock, 0, memory_order::release);
                },
                store = @|source, thread| {
                    // spin until this thread transitions the slot "free" (0) -> "being written" (1)
                    thread.wait(@|| thread.atomic_cas_global_u32_weak(element.lock, 0, 1, memory_order::acquire, memory_order::relaxed).1, "wait for element lock to write");
                    element.data = source();
                    // publish the payload by marking the slot "full" (2)
                    thread.atomic_store_global_u32(element.lock, 2, memory_order::release);
                },
                debug_print = @|device| {
                    device.print_2xi32("VALIDATION ERROR: inconsistent queue state: buffer[%d] = %d\n", i as i32, element.lock as i32);
                }
            }
        }
    }
}
// Element protocol specialized for u32 payloads: the value is stored directly
// in the slot, with the all-ones sentinel FREE marking an empty slot — so the
// value 0xffffffff itself cannot be enqueued.
fn @indexQueueElementType(opt: bool) = QueueElementType[u32] {
    buffer_size = @|num_elements| num_elements as i64 * sizeof[u32](),
    buffer_alignment = @|| alignof[u32](),
    buffer_element = @|buffer, i| {
        let FREE = -1 as u32;  // sentinel: slot holds no value
        let element = &mut (buffer as &mut addrspace(1) [u32])(i);
        // note: unlike the queue's `order`, this one *upgrades* to seq_cst when opt is set
        let order = @|x: memory_order| if opt { memory_order::seq_cst } else { x };
        QueueElement[u32] {
            clear = @|| *element = FREE,
            is_free = @|| *element == FREE,
            load = @|sink, thread| {
                // spin: atomically take the slot's value, leaving FREE behind;
                // done once something other than FREE was observed
                thread.wait(@|| {
                    let el = thread.atomic_exch_global_u32(element, FREE, order(memory_order::relaxed));
                    if el != FREE {
                        sink(el);
                        true
                    }
                    else {
                        false
                    }
                }, "wait for element");
            },
            store = @|source, thread| {
                let value = source();
                // spin: publish the value once the slot is observed FREE
                thread.wait(@|| thread.atomic_cas_global_u32_weak(element, FREE, value, order(memory_order::relaxed), order(memory_order::relaxed)).1, "wait for a successful indexQueueElementType.store()");
            },
            debug_print = @|device| {
                device.print_2xi32("VALIDATION ERROR: inconsistent queue state: buffer[%d] = %d\n", i as i32, *element as i32);
            }
        }
    }
};
// Convenience constructors for the concrete queue flavors:
// relaxed index queue, fully fenced (seq_cst) index queue, generic-payload queue.
fn @createConcurrentProducerConsumerIndexQueue(device: AccDevice, num_elements: i32) = createConcurrentProducerConsumerQueue[u32](device, indexQueueElementType(false), num_elements, false);
fn @createConcurrentProducerConsumerIndexQueueOpt(device: AccDevice, num_elements: i32) = createConcurrentProducerConsumerQueue[u32](device, indexQueueElementType(true), num_elements, true);
fn @createConcurrentProducerConsumerQueueGeneric[T](device: AccDevice, num_elements: i32) = createConcurrentProducerConsumerQueue[T](device, genericQueueElementType[T](), num_elements, false);
// Producer/consumer pipeline benchmark: waves of threads either produce work
// items into the queue (simulating workload_size_producer RNG steps per item)
// or drain and consume them (workload_size_consumer steps each), until all
// num_input_elements have been consumed.
fn @simple_pipeline_benchmark(device: AccDevice, block_size: i32, num_input_elements: i32, workload_size_producer: i32, workload_size_consumer: i32) {
    // Device-side state: TWO i32 counters —
    //   [0] input_elements:     elements still to be produced
    //   [1] completed_elements: elements fully consumed
    // BUGFIX: was `1 * sizeof[i32]()`, but pipeline_state(1) is written below,
    // so the allocation must cover both counters.
    let pipeline_state_alloc = device.alloc(2 * sizeof[i32]());
    let pipeline_state = pipeline_state_alloc.data as &mut addrspace(1) [i32];
    let input_elements = &mut pipeline_state(0);
    let completed_elements = &mut pipeline_state(1);
    // TODO: release resources
    QueueBenchmark {
        enum_param_names = @|enum_param| {
            enum_param("block_size"); enum_param("num_input_elements"); enum_param("workload_size_producer"); enum_param("workload_size_consumer");
        },
        enum_param_values = @|v| {
            v.enum_i32(block_size); v.enum_i32(num_input_elements); v.enum_i32(workload_size_producer); v.enum_i32(workload_size_consumer);
        },
        // re-arm the counters before each run
        reset = @|grid| {
            for thread in grid.threads() {
                if thread.idx(0) == 0 {
                    *input_elements = num_input_elements;
                    *completed_elements = 0;
                }
            }
        },
        run = @|queue_instrumentation, num_threads, i| {
            for grid in device.launch_1d(div_up(num_threads, block_size), block_size) {
                for wave in grid.waves() {
                    for thread in wave.threads() {
                        let thread_id = wave.idx() * wave.num_threads() + thread.idx(0);
                        if thread_id < num_threads as u32 {
                            for q in queue_instrumentation.record(thread) {
                                // per-thread RNG state, reseeded per run iteration i
                                let mut next_value = rng::xorseed32(thread_id + i as u32 * num_threads as u32);
                                let simulate_workload = @|workload_size: i32| {
                                    for _ in range(0, workload_size) {
                                        next_value = rng::xorshift32(next_value);
                                    }
                                    next_value
                                };
                                while (thread.atomic_load_global_i32(completed_elements, memory_order::relaxed) < num_input_elements) {
                                    // drain when the queue holds at least a wave's worth of
                                    // items, or when no input remains to be produced
                                    let should_drain = @|thread: thread_context| q.size(thread) >= wave.num_threads() as i32 || thread.atomic_load_global_i32(input_elements, memory_order::relaxed) <= 0;
                                    // lane 0 decides for the entire wave: the queue requires that a
                                    // warp never acts as producer and consumer at the same time
                                    if wave.barrier_any(if thread.idx(0) == 0 { should_drain(thread) } else { false }) {
                                        if for value in q.pop(thread) {
                                            next_value = value;
                                        } > 0 {
                                            simulate_workload(workload_size_consumer);
                                            thread.atomic_add_global_i32(completed_elements, 1, memory_order::relaxed);
                                        }
                                    }
                                    else {
                                        // claim one input element; the counter may go negative,
                                        // which just signals that others already claimed the rest
                                        if thread.atomic_sub_global_i32(input_elements, 1, memory_order::relaxed) > 0 {
                                            next_value = simulate_workload(workload_size_producer);
                                            // retry until the push succeeds (queue may be full)
                                            while for q.push(thread) {
                                                next_value
                                            } < 1 {};
                                        }
                                        else {
                                            // NOTE(review): adding 0 under relaxed ordering is a no-op —
                                            // possibly meant to restore the over-decremented counter
                                            // (i.e. add 1)? harmless as-is since all checks use <= 0 / > 0;
                                            // confirm intent before changing
                                            thread.atomic_add_global_i32(input_elements, 0, memory_order::relaxed);
                                        }
                                    }
                                }
                            }
                        }
                    }
                }
            }
        }
    }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment