// AnyDSL runtime builtin declarations
// (source gist: michael-kenzel/7ebbb6f1fc1dda7835cd055a3014f723)
// Identity of one work item inside a launched kernel grid.
// Each accessor is a closure so that every backend can map it lazily onto
// its native mechanism (e.g. threadIdx.x on CUDA, constant 0/1 on HLS —
// see hls_accelerator below).
struct WorkItem {
    tidx  : fn() -> i32,  // thread index within its block (x)
    tidy  : fn() -> i32,  //   "      "       "       "    (y)
    tidz  : fn() -> i32,  //   "      "       "       "    (z)
    bidx  : fn() -> i32,  // block index within the grid (x)
    bidy  : fn() -> i32,  //   "     "      "       "    (y)
    bidz  : fn() -> i32,  //   "     "      "       "    (z)
    gidx  : fn() -> i32,  // presumably the global thread index (x) — TODO confirm
    gidy  : fn() -> i32,  //   "        "    "      "       "    (y)
    gidz  : fn() -> i32,  //   "        "    "      "       "    (z)
    bdimx : fn() -> i32,  // block dimension (x)
    bdimy : fn() -> i32,  //   "       "     (y)
    bdimz : fn() -> i32,  //   "       "     (z)
    gdimx : fn() -> i32,  // grid dimension (x)
    gdimy : fn() -> i32,  //   "      "     (y)
    gdimz : fn() -> i32,  //   "      "     (z)
    nblkx : fn() -> i32,  // number of blocks in the grid (x)
    nblky : fn() -> i32,  //   "     "    "    "  "   "   (y)
    nblkz : fn() -> i32   //   "     "    "    "  "   "   (z)
}
// Uniform interface over a compute device (CUDA, OpenCL, HLS, ...).
// `Buffer` is declared elsewhere in the runtime.
struct Accelerator {
    // Wraps a kernel body into a launcher taking (grid, block) extents.
    exec          : fn(fn(WorkItem) -> ()) -> fn((i32, i32, i32), (i32, i32, i32)) -> (), // fn(grid, block)->()
    sync          : fn() -> (),        // wait for outstanding device work to finish
    alloc         : fn(i64) -> Buffer, // allocate device memory (size in bytes)
    alloc_unified : fn(i64) -> Buffer, // allocate unified (host+device visible) memory
    barrier       : fn() -> ()         // in-kernel work-group/block barrier
}
// Backend-independent math function table. Each backend provides an instance
// wired to its native implementations (cpu_intrinsics, hls_intrinsics, ...).
struct Intrinsics {
    // ---- single precision (f32) ----
    expf      : fn(f32) -> f32,
    exp2f     : fn(f32) -> f32,
    logf      : fn(f32) -> f32,
    log2f     : fn(f32) -> f32,
    powf      : fn(f32, f32) -> f32,
    rsqrtf    : fn(f32) -> f32,
    sqrtf     : fn(f32) -> f32,
    fabsf     : fn(f32) -> f32,
    sinf      : fn(f32) -> f32,
    cosf      : fn(f32) -> f32,
    tanf      : fn(f32) -> f32,
    asinf     : fn(f32) -> f32,
    acosf     : fn(f32) -> f32,
    atanf     : fn(f32) -> f32,
    erff      : fn(f32) -> f32,
    atan2f    : fn(f32, f32) -> f32,
    copysignf : fn(f32, f32) -> f32,
    fmaf      : fn(f32, f32, f32) -> f32,
    fmaxf     : fn(f32, f32) -> f32,
    fminf     : fn(f32, f32) -> f32,
    fmodf     : fn(f32, f32) -> f32,
    floorf    : fn(f32) -> f32,
    isinff    : fn(f32) -> i32, // classification queries return i32, C-style
    isnanf    : fn(f32) -> i32,
    isfinitef : fn(f32) -> i32,
    // ---- double precision (f64) ----
    exp       : fn(f64) -> f64,
    exp2      : fn(f64) -> f64,
    log       : fn(f64) -> f64,
    log2      : fn(f64) -> f64,
    pow       : fn(f64, f64) -> f64,
    rsqrt     : fn(f64) -> f64,
    sqrt      : fn(f64) -> f64,
    fabs      : fn(f64) -> f64,
    sin       : fn(f64) -> f64,
    cos       : fn(f64) -> f64,
    tan       : fn(f64) -> f64,
    asin      : fn(f64) -> f64,
    acos      : fn(f64) -> f64,
    atan      : fn(f64) -> f64,
    erf       : fn(f64) -> f64,
    atan2     : fn(f64, f64) -> f64,
    copysign  : fn(f64, f64) -> f64,
    fma       : fn(f64, f64, f64) -> f64,
    fmax      : fn(f64, f64) -> f64,
    fmin      : fn(f64, f64) -> f64,
    fmod      : fn(f64, f64) -> f64,
    floor     : fn(f64) -> f64,
    isinf     : fn(f64) -> i32,
    isnan     : fn(f64) -> i32,
    isfinite  : fn(f64) -> i32,
    // ---- integer helpers ----
    min       : fn(i32, i32) -> i32,
    max       : fn(i32, i32) -> i32,
}
// Region Vectorizer (RV) intrinsics for the vectorized CPU backend:
// per-lane predicates, cross-lane data movement, and lane identification.
#[import(cc = "C")] fn rv_mask() -> bool;                            // current lane's active mask bit
#[import(cc = "C")] fn rv_any(_: bool) -> bool;                      // true if any active lane is true
#[import(cc = "C")] fn rv_all(_: bool) -> bool;                      // true if all active lanes are true
#[import(cc = "C")] fn rv_ballot(_: bool) -> i32;                    // bitmask of lanes whose arg is true
#[import(cc = "C")] fn rv_extract(_: f32, _: i32) -> f32;            // read a value from a given lane
#[import(cc = "C")] fn rv_insert(_: f32, _: i32, _: f32) -> f32;     // write a value into a given lane
#[import(cc = "C")] fn rv_load(_: &f32, _: i32) -> f32;
#[import(cc = "C")] fn rv_store(_: &mut f32, _: i32, _: f32) -> ();
#[import(cc = "C")] fn rv_shuffle(_: f32, _: i32) -> f32;            // relative cross-lane shuffle
#[import(cc = "C")] fn rv_align(_: &i8, _: i32)-> &i8;               // alignment hint on a pointer
#[import(cc = "C")] fn rv_compact(_: f32, _: bool) -> f32;
#[import(cc = "C")] fn rv_lane_id() -> i32;
#[import(cc = "C")] fn rv_num_lanes() -> i32;
// C library math functions for the CPU backend. Functions with no LLVM
// intrinsic equivalent are imported directly from libm; the anydsl_*
// classification helpers come from the AnyDSL runtime.
//#[import(cc = "C", name = "sinf")] fn cpu_sinf(f32) -> f32;
//#[import(cc = "C", name = "cosf")] fn cpu_cosf(f32) -> f32;
#[import(cc = "C", name = "tanf")]   fn cpu_tanf(_: f32) -> f32;
#[import(cc = "C", name = "asinf")]  fn cpu_asinf(_: f32) -> f32;
#[import(cc = "C", name = "acosf")]  fn cpu_acosf(_: f32) -> f32;
#[import(cc = "C", name = "atanf")]  fn cpu_atanf(_: f32) -> f32;
#[import(cc = "C", name = "erff")]   fn cpu_erff(_: f32) -> f32;
#[import(cc = "C", name = "fmodf")]  fn cpu_fmodf(_: f32, _: f32) -> f32;
#[import(cc = "C", name = "atan2f")] fn cpu_atan2f(_: f32, _: f32) -> f32;
#[import(cc = "C", name = "anydsl_isinff")]    fn cpu_isinff(_: f32) -> i32;
#[import(cc = "C", name = "anydsl_isnanf")]    fn cpu_isnanf(_: f32) -> i32;
#[import(cc = "C", name = "anydsl_isfinitef")] fn cpu_isfinitef(_: f32) -> i32;
//#[import(cc = "C", name = "sin")] fn cpu_sin(f64) -> f64;
//#[import(cc = "C", name = "cos")] fn cpu_cos(f64) -> f64;
#[import(cc = "C", name = "tan")]   fn cpu_tan(_: f64) -> f64;
#[import(cc = "C", name = "asin")]  fn cpu_asin(_: f64) -> f64;
#[import(cc = "C", name = "acos")]  fn cpu_acos(_: f64) -> f64;
#[import(cc = "C", name = "atan")]  fn cpu_atan(_: f64) -> f64;
#[import(cc = "C", name = "erf")]   fn cpu_erf(_: f64) -> f64;
#[import(cc = "C", name = "fmod")]  fn cpu_fmod(_: f64, _: f64) -> f64;
#[import(cc = "C", name = "atan2")] fn cpu_atan2(_: f64, _: f64) -> f64;
#[import(cc = "C", name = "anydsl_isinf")]    fn cpu_isinf(_: f64) -> i32;
#[import(cc = "C", name = "anydsl_isnan")]    fn cpu_isnan(_: f64) -> i32;
#[import(cc = "C", name = "anydsl_isfinite")] fn cpu_isfinite(_: f64) -> i32;
// LLVM intrinsics for the CPU backend (cc = "device": referenced by name,
// no C declaration is emitted). These lower to optimal target instructions.
#[import(cc = "device", name = "llvm.exp.f32")]      fn cpu_expf(_: f32) -> f32;
#[import(cc = "device", name = "llvm.exp2.f32")]     fn cpu_exp2f(_: f32) -> f32;
#[import(cc = "device", name = "llvm.log.f32")]      fn cpu_logf(_: f32) -> f32;
#[import(cc = "device", name = "llvm.log2.f32")]     fn cpu_log2f(_: f32) -> f32;
#[import(cc = "device", name = "llvm.pow.f32")]      fn cpu_powf(_: f32, _: f32) -> f32;
#[import(cc = "device", name = "llvm.sqrt.f32")]     fn cpu_sqrtf(_: f32) -> f32;
#[import(cc = "device", name = "llvm.fabs.f32")]     fn cpu_fabsf(_: f32) -> f32;
#[import(cc = "device", name = "llvm.sin.f32")]      fn cpu_sinf(_: f32) -> f32;
#[import(cc = "device", name = "llvm.cos.f32")]      fn cpu_cosf(_: f32) -> f32;
#[import(cc = "device", name = "llvm.floor.f32")]    fn cpu_floorf(_: f32) -> f32;
#[import(cc = "device", name = "llvm.fma.f32")]      fn cpu_fmaf(_: f32, _: f32, _: f32) -> f32;
#[import(cc = "device", name = "llvm.fmuladd.f32")]  fn cpu_madf(_: f32, _: f32, _: f32) -> f32;
#[import(cc = "device", name = "llvm.copysign.f32")] fn cpu_copysignf(_: f32, _: f32) -> f32;
#[import(cc = "device", name = "llvm.minnum.f32")]   fn cpu_fminf(_: f32, _: f32) -> f32;
#[import(cc = "device", name = "llvm.maxnum.f32")]   fn cpu_fmaxf(_: f32, _: f32) -> f32;
#[import(cc = "device", name = "llvm.exp.f64")]      fn cpu_exp(_: f64) -> f64;
#[import(cc = "device", name = "llvm.exp2.f64")]     fn cpu_exp2(_: f64) -> f64;
#[import(cc = "device", name = "llvm.log.f64")]      fn cpu_log(_: f64) -> f64;
#[import(cc = "device", name = "llvm.log2.f64")]     fn cpu_log2(_: f64) -> f64;
#[import(cc = "device", name = "llvm.pow.f64")]      fn cpu_pow(_: f64, _: f64) -> f64;
#[import(cc = "device", name = "llvm.sqrt.f64")]     fn cpu_sqrt(_: f64) -> f64;
#[import(cc = "device", name = "llvm.fabs.f64")]     fn cpu_fabs(_: f64) -> f64;
#[import(cc = "device", name = "llvm.sin.f64")]      fn cpu_sin(_: f64) -> f64;
#[import(cc = "device", name = "llvm.cos.f64")]      fn cpu_cos(_: f64) -> f64;
#[import(cc = "device", name = "llvm.floor.f64")]    fn cpu_floor(_: f64) -> f64;
#[import(cc = "device", name = "llvm.fma.f64")]      fn cpu_fma(_: f64, _: f64, _: f64) -> f64;
#[import(cc = "device", name = "llvm.fmuladd.f64")]  fn cpu_mad(_: f64, _: f64, _: f64) -> f64;
#[import(cc = "device", name = "llvm.copysign.f64")] fn cpu_copysign(_: f64, _: f64) -> f64;
#[import(cc = "device", name = "llvm.minnum.f64")]   fn cpu_fmin(_: f64, _: f64) -> f64;
#[import(cc = "device", name = "llvm.maxnum.f64")]   fn cpu_fmax(_: f64, _: f64) -> f64;
// Bit manipulation: population count, leading/trailing zeros (bool arg:
// whether a zero input is poison), and x86 BMI2 parallel extract/deposit.
#[import(cc = "device", name = "llvm.ctpop.i32")]        fn cpu_popcount32(_: i32) -> i32;
#[import(cc = "device", name = "llvm.ctpop.i64")]        fn cpu_popcount64(_: i64) -> i64;
#[import(cc = "device", name = "llvm.ctlz.i32")]         fn cpu_clz32(_: i32, _: bool) -> i32;
#[import(cc = "device", name = "llvm.ctlz.i64")]         fn cpu_clz64(_: i64, _: bool) -> i64;
#[import(cc = "device", name = "llvm.cttz.i32")]         fn cpu_ctz32(_: i32, _: bool) -> i32;
#[import(cc = "device", name = "llvm.cttz.i64")]         fn cpu_ctz64(_: i64, _: bool) -> i64;
#[import(cc = "device", name = "llvm.x86.bmi.pext.32")]  fn cpu_pext32(_: i32, _: i32) -> i32;
#[import(cc = "device", name = "llvm.x86.bmi.pext.64")]  fn cpu_pext64(_: i64, _: i64) -> i64;
#[import(cc = "device", name = "llvm.x86.bmi.pdep.32")]  fn cpu_pdep32(_: i32, _: i32) -> i32;
#[import(cc = "device", name = "llvm.x86.bmi.pdep.64")]  fn cpu_pdep64(_: i64, _: i64) -> i64;
#[import(cc = "device", name = "llvm.prefetch.p0")]      fn cpu_prefetch(&u8, i32, i32, i32) -> ();
//
// atomics — encoding of the arguments to the builtin `atomic`:
//             0    1   2   3   4    5  6   7   8   9    10   11   12
// operation:  Xchg Add Sub And Nand Or Xor Max Min UMax UMin FAdd FSub
//             0         1         2         4       5       6              7
// ordering:   NotAtomic Unordered Monotonic Acquire Release AcquireRelease SequentiallyConsistent
// syncscope:  singlethread "" (system)
//
// CPU atomics: sequentially consistent (ordering 7), system syncscope ("").
// Operation codes follow the table above.
fn @cpu_atomic_xchg(a: &mut i32, b: i32) -> i32 = atomic[i32](0, a, b, 7, ""); // Xchg
fn @cpu_atomic_add (a: &mut i32, b: i32) -> i32 = atomic[i32](1, a, b, 7, ""); // Add
fn @cpu_atomic_sub (a: &mut i32, b: i32) -> i32 = atomic[i32](2, a, b, 7, ""); // Sub
fn @cpu_atomic_max (a: &mut i32, b: i32) -> i32 = atomic[i32](7, a, b, 7, ""); // Max (signed)
fn @cpu_atomic_min (a: &mut i32, b: i32) -> i32 = atomic[i32](8, a, b, 7, ""); // Min (signed)
// Intrinsics instance for the CPU backend: LLVM intrinsics where available,
// libm / AnyDSL runtime calls otherwise, and hand-written fallbacks for
// rsqrt(f) and integer min/max.
static cpu_intrinsics = Intrinsics {
    expf      = cpu_expf,
    exp2f     = cpu_exp2f,
    logf      = cpu_logf,
    log2f     = cpu_log2f,
    powf      = cpu_powf,
    rsqrtf    = @|a| { 1:f32 / cpu_sqrtf(a) }, // no LLVM rsqrt intrinsic: computed as 1/sqrt
    sqrtf     = cpu_sqrtf,
    fabsf     = cpu_fabsf,
    sinf      = cpu_sinf,
    cosf      = cpu_cosf,
    tanf      = cpu_tanf,
    asinf     = cpu_asinf,
    acosf     = cpu_acosf,
    atanf     = cpu_atanf,
    erff      = cpu_erff,
    atan2f    = cpu_atan2f,
    copysignf = cpu_copysignf,
    fmaf      = cpu_fmaf,
    fmaxf     = cpu_fmaxf,
    fminf     = cpu_fminf,
    fmodf     = cpu_fmodf,
    floorf    = cpu_floorf,
    isinff    = cpu_isinff,
    isnanf    = cpu_isnanf,
    isfinitef = cpu_isfinitef,
    exp       = cpu_exp,
    exp2      = cpu_exp2,
    log       = cpu_log,
    log2      = cpu_log2,
    pow       = cpu_pow,
    rsqrt     = @|a| { 1.0 / cpu_sqrt(a) },
    sqrt      = cpu_sqrt,
    fabs      = cpu_fabs,
    sin       = cpu_sin,
    cos       = cpu_cos,
    tan       = cpu_tan,
    asin      = cpu_asin,
    acos      = cpu_acos,
    atan      = cpu_atan,
    erf       = cpu_erf,
    atan2     = cpu_atan2,
    copysign  = cpu_copysign,
    fma       = cpu_fma,
    fmax      = cpu_fmax,
    fmin      = cpu_fmin,
    fmod      = cpu_fmod,
    floor     = cpu_floor,
    isinf     = cpu_isinf,
    isnan     = cpu_isnan,
    isfinite  = cpu_isfinite,
    min       = @|a, b| { if a < b { a } else { b } },
    max       = @|a, b| { if a > b { a } else { b } },
};
// HLS backend math functions. The C names are the double-precision libm
// names; the HLS toolchain resolves overloads for both f32 and f64.
// no declarations are emitted for "device" functions
#[import(cc = "C", name = "exp")]      fn hls_expf(f32) -> f32;
#[import(cc = "C", name = "exp2")]     fn hls_exp2f(f32) -> f32;
#[import(cc = "C", name = "log")]      fn hls_logf(f32) -> f32;
#[import(cc = "C", name = "log2")]     fn hls_log2f(f32) -> f32;
#[import(cc = "C", name = "pow")]      fn hls_powf(f32, f32) -> f32;
#[import(cc = "C", name = "rsqrt")]    fn hls_rsqrtf(f32) -> f32;
#[import(cc = "C", name = "sqrt")]     fn hls_sqrtf(f32) -> f32;
#[import(cc = "C", name = "fabs")]     fn hls_fabsf(f32) -> f32;
#[import(cc = "C", name = "sin")]      fn hls_sinf(f32) -> f32;
#[import(cc = "C", name = "cos")]      fn hls_cosf(f32) -> f32;
#[import(cc = "C", name = "tan")]      fn hls_tanf(f32) -> f32;
#[import(cc = "C", name = "asin")]     fn hls_asinf(f32) -> f32;
#[import(cc = "C", name = "acos")]     fn hls_acosf(f32) -> f32;
#[import(cc = "C", name = "atan")]     fn hls_atanf(f32) -> f32;
#[import(cc = "C", name = "erf")]      fn hls_erff(f32) -> f32;
#[import(cc = "C", name = "atan2")]    fn hls_atan2f(f32, f32) -> f32;
#[import(cc = "C", name = "fmod")]     fn hls_fmodf(f32, f32) -> f32;
#[import(cc = "C", name = "floor")]    fn hls_floorf(f32) -> f32;
#[import(cc = "C", name = "isinf")]    fn hls_isinff(f32) -> i32;
#[import(cc = "C", name = "isnan")]    fn hls_isnanf(f32) -> i32;
#[import(cc = "C", name = "isfinite")] fn hls_isfinitef(f32) -> i32;
#[import(cc = "C", name = "fma")]      fn hls_fmaf(f32, f32, f32) -> f32;
#[import(cc = "C", name = "mad")]      fn hls_madf(f32, f32, f32) -> f32;
#[import(cc = "C", name = "copysign")] fn hls_copysignf(f32, f32) -> f32;
#[import(cc = "C", name = "exp")]      fn hls_exp(f64) -> f64;
#[import(cc = "C", name = "exp2")]     fn hls_exp2(f64) -> f64;
#[import(cc = "C", name = "log")]      fn hls_log(f64) -> f64;
#[import(cc = "C", name = "log2")]     fn hls_log2(f64) -> f64;
#[import(cc = "C", name = "pow")]      fn hls_pow(f64, f64) -> f64;
#[import(cc = "C", name = "rsqrt")]    fn hls_rsqrt(f64) -> f64;
#[import(cc = "C", name = "sqrt")]     fn hls_sqrt(f64) -> f64;
#[import(cc = "C", name = "fabs")]     fn hls_fabs(f64) -> f64;
#[import(cc = "C", name = "sin")]      fn hls_sin(f64) -> f64;
#[import(cc = "C", name = "cos")]      fn hls_cos(f64) -> f64;
#[import(cc = "C", name = "tan")]      fn hls_tan(f64) -> f64;
#[import(cc = "C", name = "asin")]     fn hls_asin(f64) -> f64;
#[import(cc = "C", name = "acos")]     fn hls_acos(f64) -> f64;
#[import(cc = "C", name = "atan")]     fn hls_atan(f64) -> f64;
#[import(cc = "C", name = "erf")]      fn hls_erf(f64) -> f64;
#[import(cc = "C", name = "atan2")]    fn hls_atan2(f64, f64) -> f64;
#[import(cc = "C", name = "fmod")]     fn hls_fmod(f64, f64) -> f64;
#[import(cc = "C", name = "floor")]    fn hls_floor(f64) -> f64;
#[import(cc = "C", name = "isinf")]    fn hls_isinf(f64) -> i32;
#[import(cc = "C", name = "isnan")]    fn hls_isnan(f64) -> i32;
#[import(cc = "C", name = "isfinite")] fn hls_isfinite(f64) -> i32;
#[import(cc = "C", name = "fma")]      fn hls_fma(f64, f64, f64) -> f64;
#[import(cc = "C", name = "mad")]      fn hls_mad(f64, f64, f64) -> f64;
#[import(cc = "C", name = "copysign")] fn hls_copysign(f64, f64) -> f64;
#[import(cc = "C", name = "fmin")]     fn hls_fminf(f32, f32) -> f32;
#[import(cc = "C", name = "fmax")]     fn hls_fmaxf(f32, f32) -> f32;
#[import(cc = "C", name = "fmin")]     fn hls_fmin(f64, f64) -> f64;
#[import(cc = "C", name = "fmax")]     fn hls_fmax(f64, f64) -> f64;
#[import(cc = "C", name = "min")]      fn hls_min(i32, i32) -> i32;
#[import(cc = "C", name = "max")]      fn hls_max(i32, i32) -> i32;
#[import(cc = "device")]               fn print_pragma(&[u8]) -> (); // emit a raw #pragma into HLS output
// FIFO channel types for streaming between HLS kernels.
// channel scalar types
struct channel[T] { data : T }
// channel array types (fixed-width bundles transferred per transaction)
struct channel1[T]   { data : [T * 1  ] }
struct channel2[T]   { data : [T * 2  ] }
struct channel4[T]   { data : [T * 4  ] }
struct channel8[T]   { data : [T * 8  ] }
struct channel16[T]  { data : [T * 16 ] }
struct channel32[T]  { data : [T * 32 ] }
struct channel64[T]  { data : [T * 64 ] }
struct channel128[T] { data : [T * 128] }
// read and write on scalar channels
#[import(cc = "device", name = "read_channel")]  fn read_channel[T] (&mut channel[T]) -> T;
#[import(cc = "device", name = "write_channel")] fn write_channel[T](&mut channel[T], T ) -> ();
// read and write on array channels
// NOTE(review): only widths up to 32 have read/write imports although
// channel64/channel128 exist above — confirm whether that is intentional.
#[import(cc = "device", name = "read_channel")]  fn read_channel1[T]  ( &mut channel1[T] ) -> [T * 1 ];
#[import(cc = "device", name = "read_channel")]  fn read_channel2[T]  ( &mut channel2[T] ) -> [T * 2 ];
#[import(cc = "device", name = "read_channel")]  fn read_channel4[T]  ( &mut channel4[T] ) -> [T * 4 ];
#[import(cc = "device", name = "read_channel")]  fn read_channel8[T]  ( &mut channel8[T] ) -> [T * 8 ];
#[import(cc = "device", name = "read_channel")]  fn read_channel16[T] ( &mut channel16[T]) -> [T * 16];
#[import(cc = "device", name = "read_channel")]  fn read_channel32[T] ( &mut channel32[T]) -> [T * 32];
#[import(cc = "device", name = "write_channel")] fn write_channel1[T] ( &mut channel1[T],  [T * 1 ]) -> ();
#[import(cc = "device", name = "write_channel")] fn write_channel2[T] ( &mut channel2[T],  [T * 2 ]) -> ();
#[import(cc = "device", name = "write_channel")] fn write_channel4[T] ( &mut channel4[T],  [T * 4 ]) -> ();
#[import(cc = "device", name = "write_channel")] fn write_channel8[T] ( &mut channel8[T],  [T * 8 ]) -> ();
#[import(cc = "device", name = "write_channel")] fn write_channel16[T]( &mut channel16[T], [T * 16]) -> ();
#[import(cc = "device", name = "write_channel")] fn write_channel32[T]( &mut channel32[T], [T * 32]) -> ();
// NOTE(review): imported name is a single space and channel1 carries one
// element while the return type is [T * 2] — looks odd; verify upstream.
#[import(cc = "device", name = " ")] fn bitcast_channel[T]( &mut channel1[T]) -> [T * 2];
// Accelerator instance for an HLS device. HLS kernels have no thread
// hierarchy: the grid/block extents are ignored and a single work item
// runs with all indices 0 and all dimensions 1.
fn @hls_accelerator(dev: i32) = Accelerator {
    exec = @|body| |_grid, _block| {
        let work_item = WorkItem {
            tidx  = @|| 0, tidy  = @|| 0, tidz  = @|| 0,
            bidx  = @|| 0, bidy  = @|| 0, bidz  = @|| 0,
            gidx  = @|| 0, gidy  = @|| 0, gidz  = @|| 0,
            bdimx = @|| 1, bdimy = @|| 1, bdimz = @|| 1,
            gdimx = @|| 1, gdimy = @|| 1, gdimz = @|| 1,
            nblkx = @|| 1, nblky = @|| 1, nblkz = @|| 1
        };
        hls(dev, || @body(work_item));
    },
    sync          = @|| synchronize_hls(dev),
    alloc         = @|size| alloc_hls(dev, size),
    alloc_unified = @|size| alloc_hls_unified(dev, size),
    barrier       = @|| () // single work item: nothing to synchronize with
};
// Intrinsics instance for the HLS backend: a direct 1:1 mapping onto the
// hls_* imports declared above.
static hls_intrinsics = Intrinsics {
    expf      = hls_expf,
    exp2f     = hls_exp2f,
    logf      = hls_logf,
    log2f     = hls_log2f,
    powf      = hls_powf,
    rsqrtf    = hls_rsqrtf,
    sqrtf     = hls_sqrtf,
    fabsf     = hls_fabsf,
    sinf      = hls_sinf,
    cosf      = hls_cosf,
    tanf      = hls_tanf,
    asinf     = hls_asinf,
    acosf     = hls_acosf,
    atanf     = hls_atanf,
    erff      = hls_erff,
    atan2f    = hls_atan2f,
    copysignf = hls_copysignf,
    fmaf      = hls_fmaf,
    fmaxf     = hls_fmaxf,
    fminf     = hls_fminf,
    fmodf     = hls_fmodf,
    floorf    = hls_floorf,
    isinff    = hls_isinff,
    isnanf    = hls_isnanf,
    isfinitef = hls_isfinitef,
    exp       = hls_exp,
    exp2      = hls_exp2,
    log       = hls_log,
    log2      = hls_log2,
    pow       = hls_pow,
    rsqrt     = hls_rsqrt,
    sqrt      = hls_sqrt,
    fabs      = hls_fabs,
    sin       = hls_sin,
    cos       = hls_cos,
    tan       = hls_tan,
    asin      = hls_asin,
    acos      = hls_acos,
    atan      = hls_atan,
    erf       = hls_erf,
    atan2     = hls_atan2,
    copysign  = hls_copysign,
    fma       = hls_fma,
    fmax      = hls_fmax,
    fmin      = hls_fmin,
    fmod      = hls_fmod,
    floor     = hls_floor,
    isinf     = hls_isinf,
    isnan     = hls_isnan,
    isfinite  = hls_isfinite,
    min       = hls_min,
    max       = hls_max,
};
// CUDA backend device functions (cc = "device": referenced by name,
// no declarations are emitted for "device" functions).
#[import(cc = "device", name = "__syncthreads")] fn cuda_syncthreads() -> ();
#[import(cc = "device", name = "expf")]      fn cuda_expf(f32) -> f32;
#[import(cc = "device", name = "exp2f")]     fn cuda_exp2f(f32) -> f32;
#[import(cc = "device", name = "logf")]      fn cuda_logf(f32) -> f32;
#[import(cc = "device", name = "log2f")]     fn cuda_log2f(f32) -> f32;
#[import(cc = "device", name = "powf")]      fn cuda_powf(f32, f32) -> f32;
#[import(cc = "device", name = "rsqrtf")]    fn cuda_rsqrtf(f32) -> f32;
#[import(cc = "device", name = "sqrtf")]     fn cuda_sqrtf(f32) -> f32;
#[import(cc = "device", name = "fabsf")]     fn cuda_fabsf(f32) -> f32;
#[import(cc = "device", name = "sinf")]      fn cuda_sinf(f32) -> f32;
#[import(cc = "device", name = "cosf")]      fn cuda_cosf(f32) -> f32;
#[import(cc = "device", name = "tanf")]      fn cuda_tanf(f32) -> f32;
#[import(cc = "device", name = "asinf")]     fn cuda_asinf(f32) -> f32;
#[import(cc = "device", name = "acosf")]     fn cuda_acosf(f32) -> f32;
#[import(cc = "device", name = "atanf")]     fn cuda_atanf(f32) -> f32;
#[import(cc = "device", name = "erff")]      fn cuda_erff(f32) -> f32;
#[import(cc = "device", name = "atan2f")]    fn cuda_atan2f(f32, f32) -> f32;
#[import(cc = "device", name = "copysignf")] fn cuda_copysignf(f32, f32) -> f32;
#[import(cc = "device", name = "fmaf")]      fn cuda_fmaf(f32, f32, f32) -> f32;
#[import(cc = "device", name = "fmaxf")]     fn cuda_fmaxf(f32, f32) -> f32;
#[import(cc = "device", name = "fminf")]     fn cuda_fminf(f32, f32) -> f32;
#[import(cc = "device", name = "fmodf")]     fn cuda_fmodf(f32, f32) -> f32;
#[import(cc = "device", name = "floorf")]    fn cuda_floorf(f32) -> f32;
#[import(cc = "device", name = "isinf")]     fn cuda_isinff(f32) -> i32;
#[import(cc = "device", name = "isnan")]     fn cuda_isnanf(f32) -> i32;
#[import(cc = "device", name = "isfinite")]  fn cuda_isfinitef(f32) -> i32;
#[import(cc = "device", name = "exp")]       fn cuda_exp(f64) -> f64;
#[import(cc = "device", name = "exp2")]      fn cuda_exp2(f64) -> f64;
#[import(cc = "device", name = "log")]       fn cuda_log(f64) -> f64;
#[import(cc = "device", name = "log2")]      fn cuda_log2(f64) -> f64;
#[import(cc = "device", name = "pow")]       fn cuda_pow(f64, f64) -> f64;
#[import(cc = "device", name = "rsqrt")]     fn cuda_rsqrt(f64) -> f64;
#[import(cc = "device", name = "sqrt")]      fn cuda_sqrt(f64) -> f64;
#[import(cc = "device", name = "fabs")]      fn cuda_fabs(f64) -> f64;
#[import(cc = "device", name = "sin")]       fn cuda_sin(f64) -> f64;
#[import(cc = "device", name = "cos")]       fn cuda_cos(f64) -> f64;
#[import(cc = "device", name = "tan")]       fn cuda_tan(f64) -> f64;
#[import(cc = "device", name = "asin")]      fn cuda_asin(f64) -> f64;
#[import(cc = "device", name = "acos")]      fn cuda_acos(f64) -> f64;
#[import(cc = "device", name = "atan")]      fn cuda_atan(f64) -> f64;
#[import(cc = "device", name = "erf")]       fn cuda_erf(f64) -> f64;
#[import(cc = "device", name = "atan2")]     fn cuda_atan2(f64, f64) -> f64;
#[import(cc = "device", name = "copysign")]  fn cuda_copysign(f64, f64) -> f64;
#[import(cc = "device", name = "fma")]       fn cuda_fma(f64, f64, f64) -> f64;
#[import(cc = "device", name = "fmax")]      fn cuda_fmax(f64, f64) -> f64;
#[import(cc = "device", name = "fmin")]      fn cuda_fmin(f64, f64) -> f64;
#[import(cc = "device", name = "fmod")]      fn cuda_fmod(f64, f64) -> f64;
#[import(cc = "device", name = "floor")]     fn cuda_floor(f64) -> f64;
#[import(cc = "device", name = "isinf")]     fn cuda_isinf(f64) -> i32;
#[import(cc = "device", name = "isnan")]     fn cuda_isnan(f64) -> i32;
#[import(cc = "device", name = "isfinite")]  fn cuda_isfinite(f64) -> i32;
// Atomics, split by address space: addrspace(1) = global, addrspace(3) = shared.
#[import(cc = "device", name = "atomicAdd")] fn cuda_atomic_add_global(&mut addrspace(1)i32, i32) -> i32;
#[import(cc = "device", name = "atomicAdd")] fn cuda_atomic_add_shared(&mut addrspace(3)i32, i32) -> i32;
#[import(cc = "device", name = "atomicMin")] fn cuda_atomic_min_global(&mut addrspace(1)i32, i32) -> i32;
#[import(cc = "device", name = "atomicMin")] fn cuda_atomic_min_shared(&mut addrspace(3)i32, i32) -> i32;
// __ldg: read-only data cache loads from global memory.
#[import(cc = "device", name = "__ldg")]     fn cuda_ldg_f32(&addrspace(1)f32) -> f32;
#[import(cc = "device", name = "__ldg")]     fn cuda_ldg_i32(&addrspace(1)i32) -> i32;
#[import(cc = "device", name = "__ldg")]     fn cuda_ldg_u8(&addrspace(1)u8) -> u8;
#[import(cc = "device", name = "__ldg")]     fn cuda_ldg4_f32(&addrspace(1)simd[f32 * 4]) -> simd[f32 * 4];
#[import(cc = "device", name = "__ldg")]     fn cuda_ldg4_i32(&addrspace(1)simd[i32 * 4]) -> simd[i32 * 4];
#[import(cc = "device", name = "min")]       fn cuda_min(i32, i32) -> i32;
#[import(cc = "device", name = "max")]       fn cuda_max(i32, i32) -> i32;
// wrappers for threadIdx.x etc. are provided by the CUDA backend
#[import(cc = "device", name = "threadIdx_x")] fn cuda_threadIdx_x() -> i32;
#[import(cc = "device", name = "threadIdx_y")] fn cuda_threadIdx_y() -> i32;
#[import(cc = "device", name = "threadIdx_z")] fn cuda_threadIdx_z() -> i32;
#[import(cc = "device", name = "blockIdx_x")]  fn cuda_blockIdx_x() -> i32;
#[import(cc = "device", name = "blockIdx_y")]  fn cuda_blockIdx_y() -> i32;
#[import(cc = "device", name = "blockIdx_z")]  fn cuda_blockIdx_z() -> i32;
#[import(cc = "device", name = "blockDim_x")]  fn cuda_blockDim_x() -> i32;
#[import(cc = "device", name = "blockDim_y")]  fn cuda_blockDim_y() -> i32;
#[import(cc = "device", name = "blockDim_z")]  fn cuda_blockDim_z() -> i32;
#[import(cc = "device", name = "gridDim_x")]   fn cuda_gridDim_x() -> i32;
#[import(cc = "device", name = "gridDim_y")]   fn cuda_gridDim_y() -> i32;
#[import(cc = "device", name = "gridDim_z")]   fn cuda_gridDim_z() -> i32;
// CUDA bit-manipulation builtins and type-specialized atomics.
#[import(cc = "device", name = "__trap")]    fn cuda_trap() -> ();
#[import(cc = "device", name = "__clz")]     fn cuda_clz_u32(u32) -> i32; // count leading zeros
#[import(cc = "device", name = "__clzll")]   fn cuda_clz_u64(u64) -> i32;
#[import(cc = "device", name = "__ffs")]     fn cuda_ffs_u32(u32) -> i32; // find first (least significant) set bit
#[import(cc = "device", name = "__ffsll")]   fn cuda_ffs_u64(u64) -> i32;
#[import(cc = "device", name = "__popc")]    fn cuda_popc_u32(u32) -> i32; // population count
#[import(cc = "device", name = "__popcll")]  fn cuda_popc_u64(u64) -> i32;
#[import(cc = "device", name = "__brev")]    fn cuda_brev_u32(u32) -> u32; // bit reverse
#[import(cc = "device", name = "__brevll")]  fn cuda_brev_u64(u64) -> u64;
// Global-memory (addrspace(1)) atomics; each returns the previous value.
#[import(cc = "device", name = "atomicAdd")]  fn cuda_atomic_add_global_i32(&mut addrspace(1)i32, i32) -> i32;
#[import(cc = "device", name = "atomicAdd")]  fn cuda_atomic_add_global_u32(&mut addrspace(1)u32, u32) -> u32;
#[import(cc = "device", name = "atomicAdd")]  fn cuda_atomic_add_global_u64(&mut addrspace(1)u64, u64) -> u64;
#[import(cc = "device", name = "atomicAdd")]  fn cuda_atomic_add_global_f32(&mut addrspace(1)f32, f32) -> f32;
#[import(cc = "device", name = "atomicAdd")]  fn cuda_atomic_add_global_f64(&mut addrspace(1)f64, f64) -> f64;
#[import(cc = "device", name = "atomicSub")]  fn cuda_atomic_sub_global_i32(&mut addrspace(1)i32, i32) -> i32;
#[import(cc = "device", name = "atomicSub")]  fn cuda_atomic_sub_global_u32(&mut addrspace(1)u32, u32) -> u32;
#[import(cc = "device", name = "atomicSub")]  fn cuda_atomic_sub_global_u64(&mut addrspace(1)u64, u64) -> u64;
#[import(cc = "device", name = "atomicSub")]  fn cuda_atomic_sub_global_f32(&mut addrspace(1)f32, f32) -> f32;
#[import(cc = "device", name = "atomicSub")]  fn cuda_atomic_sub_global_f64(&mut addrspace(1)f64, f64) -> f64;
#[import(cc = "device", name = "atomicAnd")]  fn cuda_atomic_and_global_i32(&mut addrspace(1)i32, i32) -> i32;
#[import(cc = "device", name = "atomicAnd")]  fn cuda_atomic_and_global_u32(&mut addrspace(1)u32, u32) -> u32;
#[import(cc = "device", name = "atomicAnd")]  fn cuda_atomic_and_global_u64(&mut addrspace(1)u64, u64) -> u64;
#[import(cc = "device", name = "atomicOr")]   fn cuda_atomic_or_global_i32(&mut addrspace(1)i32, i32) -> i32;
#[import(cc = "device", name = "atomicOr")]   fn cuda_atomic_or_global_u32(&mut addrspace(1)u32, u32) -> u32;
#[import(cc = "device", name = "atomicOr")]   fn cuda_atomic_or_global_u64(&mut addrspace(1)u64, u64) -> u64;
#[import(cc = "device", name = "atomicXor")]  fn cuda_atomic_xor_global_i32(&mut addrspace(1)i32, i32) -> i32;
#[import(cc = "device", name = "atomicXor")]  fn cuda_atomic_xor_global_u32(&mut addrspace(1)u32, u32) -> u32;
#[import(cc = "device", name = "atomicXor")]  fn cuda_atomic_xor_global_u64(&mut addrspace(1)u64, u64) -> u64;
#[import(cc = "device", name = "atomicExch")] fn cuda_atomic_exch_global_i32(&mut addrspace(1)i32, i32) -> i32;
#[import(cc = "device", name = "atomicExch")] fn cuda_atomic_exch_global_u32(&mut addrspace(1)u32, u32) -> u32;
#[import(cc = "device", name = "atomicExch")] fn cuda_atomic_exch_global_u64(&mut addrspace(1)u64, u64) -> u64;
#[import(cc = "device", name = "atomicExch")] fn cuda_atomic_exch_global_f32(&mut addrspace(1)f32, f32) -> f32;
#[import(cc = "device", name = "atomicMin")]  fn cuda_atomic_min_global_i32(&mut addrspace(1)i32, i32) -> i32;
#[import(cc = "device", name = "atomicMin")]  fn cuda_atomic_min_global_u32(&mut addrspace(1)u32, u32) -> u32;
#[import(cc = "device", name = "atomicMin")]  fn cuda_atomic_min_global_u64(&mut addrspace(1)u64, u64) -> u64;
#[import(cc = "device", name = "atomicMax")]  fn cuda_atomic_max_global_i32(&mut addrspace(1)i32, i32) -> i32;
#[import(cc = "device", name = "atomicMax")]  fn cuda_atomic_max_global_u32(&mut addrspace(1)u32, u32) -> u32;
#[import(cc = "device", name = "atomicMax")]  fn cuda_atomic_max_global_u64(&mut addrspace(1)u64, u64) -> u64;
#[import(cc = "device", name = "atomicCAS")]  fn cuda_atomic_cas_global_u16(&mut addrspace(1)u16, u16, u16) -> u16;
#[import(cc = "device", name = "atomicCAS")]  fn cuda_atomic_cas_global_i32(&mut addrspace(1)i32, i32, i32) -> i32;
#[import(cc = "device", name = "atomicCAS")]  fn cuda_atomic_cas_global_u32(&mut addrspace(1)u32, u32, u32) -> u32;
#[import(cc = "device", name = "atomicCAS")]  fn cuda_atomic_cas_global_u64(&mut addrspace(1)u64, u64, u64) -> u64;
#[import(cc = "device", name = "atomicInc")]  fn cuda_atomic_inc_global_u32(&mut addrspace(1)u32, u32) -> u32;
#[import(cc = "device", name = "atomicDec")]  fn cuda_atomic_dec_global_u32(&mut addrspace(1)u32, u32) -> u32;
// Memory fences, block-level synchronization, and warp-level primitives.
#[import(cc = "device", name = "__threadfence")]        fn cuda_threadfence() -> ();
#[import(cc = "device", name = "__threadfence_block")]  fn cuda_threadfence_block() -> ();
#[import(cc = "device", name = "__threadfence_system")] fn cuda_threadfence_system() -> ();
#[import(cc = "device", name = "__syncthreads")]        fn cuda_block_sync() -> ();
#[import(cc = "device", name = "__syncthreads_count")]  fn cuda_block_sync_count(i32) -> i32;
#[import(cc = "device", name = "__syncthreads_and")]    fn cuda_block_sync_all(i32) -> i32;
#[import(cc = "device", name = "__syncthreads_or")]     fn cuda_block_sync_any(i32) -> i32;
// Warp primitives: the first u32 argument is always the participating-lane mask.
#[import(cc = "device", name = "__syncwarp")]           fn cuda_warp_sync(u32) -> ();
#[import(cc = "device", name = "__all_sync")]           fn cuda_warp_sync_all(u32, i32) -> i32;
#[import(cc = "device", name = "__any_sync")]           fn cuda_warp_sync_any(u32, i32) -> i32;
#[import(cc = "device", name = "__ballot_sync")]        fn cuda_warp_sync_vote(u32, i32) -> u32;
#[import(cc = "device", name = "__activemask")]         fn cuda_warp_activemask() -> u32;
#[import(cc = "device", name = "__shfl_sync")]          fn cuda_warp_shfl_i32(u32, i32, i32, i32) -> i32;
#[import(cc = "device", name = "__shfl_sync")]          fn cuda_warp_shfl_u32(u32, u32, i32, i32) -> u32;
#[import(cc = "device", name = "__shfl_sync")]          fn cuda_warp_shfl_i64(u32, i64, i32, i32) -> i64;
#[import(cc = "device", name = "__shfl_sync")]          fn cuda_warp_shfl_u64(u32, u64, i32, i32) -> u64;
#[import(cc = "device", name = "__shfl_sync")]          fn cuda_warp_shfl_f32(u32, f32, i32, i32) -> f32;
#[import(cc = "device", name = "__shfl_sync")]          fn cuda_warp_shfl_f64(u32, f64, i32, i32) -> f64;
#[import(cc = "device", name = "__shfl_up_sync")]       fn cuda_warp_shfl_up_i32(u32, i32, u32, i32) -> i32;
#[import(cc = "device", name = "__shfl_up_sync")]       fn cuda_warp_shfl_up_u32(u32, u32, u32, i32) -> u32;
#[import(cc = "device", name = "__shfl_up_sync")]       fn cuda_warp_shfl_up_i64(u32, i64, u32, i32) -> i64;
#[import(cc = "device", name = "__shfl_up_sync")]       fn cuda_warp_shfl_up_u64(u32, u64, u32, i32) -> u64;
#[import(cc = "device", name = "__shfl_up_sync")]       fn cuda_warp_shfl_up_f32(u32, f32, u32, i32) -> f32;
#[import(cc = "device", name = "__shfl_up_sync")]       fn cuda_warp_shfl_up_f64(u32, f64, u32, i32) -> f64;
#[import(cc = "device", name = "__shfl_down_sync")]     fn cuda_warp_shfl_down_i32(u32, i32, u32, i32) -> i32;
#[import(cc = "device", name = "__shfl_down_sync")]     fn cuda_warp_shfl_down_u32(u32, u32, u32, i32) -> u32;
#[import(cc = "device", name = "__shfl_down_sync")]     fn cuda_warp_shfl_down_i64(u32, i64, u32, i32) -> i64;
#[import(cc = "device", name = "__shfl_down_sync")]     fn cuda_warp_shfl_down_u64(u32, u64, u32, i32) -> u64;
#[import(cc = "device", name = "__shfl_down_sync")]     fn cuda_warp_shfl_down_f32(u32, f32, u32, i32) -> f32;
#[import(cc = "device", name = "__shfl_down_sync")]     fn cuda_warp_shfl_down_f64(u32, f64, u32, i32) -> f64;
#[import(cc = "device", name = "__shfl_xor_sync")]      fn cuda_warp_shfl_xor_i32(u32, i32, i32, i32) -> i32;
#[import(cc = "device", name = "__shfl_xor_sync")]      fn cuda_warp_shfl_xor_u32(u32, u32, i32, i32) -> u32;
#[import(cc = "device", name = "__shfl_xor_sync")]      fn cuda_warp_shfl_xor_i64(u32, i64, i32, i32) -> i64;
#[import(cc = "device", name = "__shfl_xor_sync")]      fn cuda_warp_shfl_xor_u64(u32, u64, i32, i32) -> u64;
#[import(cc = "device", name = "__shfl_xor_sync")]      fn cuda_warp_shfl_xor_f32(u32, f32, i32, i32) -> f32;
#[import(cc = "device", name = "__shfl_xor_sync")]      fn cuda_warp_shfl_xor_f64(u32, f64, i32, i32) -> f64;
#[import(cc = "device", name = "__match_any_sync")]     fn cuda_warp_match_any_i32(u32, i32) -> u32;
#[import(cc = "device", name = "__match_any_sync")]     fn cuda_warp_match_any_u32(u32, u32) -> u32;
#[import(cc = "device", name = "__match_any_sync")]     fn cuda_warp_match_any_i64(u32, i64) -> u32;
#[import(cc = "device", name = "__match_any_sync")]     fn cuda_warp_match_any_u64(u32, u64) -> u32;
#[import(cc = "device", name = "__match_any_sync")]     fn cuda_warp_match_any_f32(u32, f32) -> u32;
#[import(cc = "device", name = "__match_any_sync")]     fn cuda_warp_match_any_f64(u32, f64) -> u32;
#[import(cc = "device", name = "__match_all_sync")]     fn cuda_warp_match_all_i32(u32, i32, &mut i32) -> u32;
#[import(cc = "device", name = "__match_all_sync")]     fn cuda_warp_match_all_u32(u32, u32, &mut i32) -> u32;
#[import(cc = "device", name = "__match_all_sync")]     fn cuda_warp_match_all_i64(u32, i64, &mut i32) -> u32;
#[import(cc = "device", name = "__match_all_sync")]     fn cuda_warp_match_all_u64(u32, u64, &mut i32) -> u32;
#[import(cc = "device", name = "__match_all_sync")]     fn cuda_warp_match_all_f32(u32, f32, &mut i32) -> u32;
#[import(cc = "device", name = "__match_all_sync")]     fn cuda_warp_match_all_f64(u32, f64, &mut i32) -> u32;
#[import(cc = "device", name = "__nanosleep")]          fn cuda_nanosleep(u32) -> ();
// Device-side printf: invokes the CUDA runtime's `vprintf` entry point via
// inline PTX, since it cannot be imported as an ordinary intrinsic.
// `fmt`  — NUL-terminated printf-style format string.
// `args` — pointer to a buffer holding the packed argument values; the layout
//          must match the format specifiers (caller packs it — assumes the
//          usual vprintf 4/8-byte slot alignment, TODO confirm against callers).
// Returns vprintf's i32 result. Marked "memory"/"volatile" so the call is
// neither reordered nor eliminated.
fn @cuda_vprintf(fmt: &[u8], args: &[u8]) -> i32 {
    let mut res: i32;
    // PTX calling sequence: declare two .b64 parameter slots and a .b32 return
    // slot in a local scope, store the two pointers, call vprintf uniformly
    // across the warp, then load the return value.
    asm(
        "{\n"
        ".param.b64 param0;\n"
        ".param.b64 param1;\n"
        ".param.b32 retval;\n"
        "st.param.b64 [param0], %1;\n"
        "st.param.b64 [param1], %2;\n"
        "call.uni (retval), vprintf, (param0, param1);\n"
        "ld.param.b32 %0, [retval];\n"
        "}" : "=r"(res) : "l"(fmt), "l"(args) : "memory" : "volatile"
    );
    res
}
// Index of the calling thread's lane within its warp, read from the
// %laneid special register.
fn @cuda_laneid() -> u32 {
    let mut lane_idx: u32;
    asm("mov.u32 %0, %laneid;" : "=r" (lane_idx));
    lane_idx
}
// Identifier of the calling thread's warp, read from the %warpid special
// register.
fn @cuda_warpid() -> u32 {
    let mut warp_idx: u32;
    asm("mov.u32 %0, %warpid;" : "=r" (warp_idx));
    warp_idx
}
// Maximum number of warp identifiers, read from the %nwarpid special
// register.
fn @cuda_nwarpid() -> u32 {
    let mut num_warp_ids: u32;
    asm("mov.u32 %0, %nwarpid;" : "=r" (num_warp_ids));
    num_warp_ids
}
// Identifier of the streaming multiprocessor executing the calling thread,
// read from the %smid special register.
fn @cuda_smid() -> u32 {
    let mut sm_idx: u32;
    asm("mov.u32 %0, %smid;" : "=r" (sm_idx));
    sm_idx
}
// Maximum number of SM identifiers, read from the %nsmid special register.
fn @cuda_nsmid() -> u32 {
    let mut num_sm_ids: u32;
    asm("mov.u32 %0, %nsmid;" : "=r" (num_sm_ids));
    num_sm_ids
}
// Warp mask with only the calling lane's bit set — note this reads the
// %lanemask_eq special register despite the unsuffixed function name.
fn @cuda_lanemask() -> u32 {
    let mut lane_bits: u32;
    asm("mov.u32 %0, %lanemask_eq;" : "=r" (lane_bits));
    lane_bits
}
// Warp mask with bits set for all lanes whose index is <= the calling
// lane's, read from %lanemask_le.
fn @cuda_lanemask_le() -> u32 {
    let mut lane_bits: u32;
    asm("mov.u32 %0, %lanemask_le;" : "=r" (lane_bits));
    lane_bits
}
// Warp mask with bits set for all lanes whose index is < the calling
// lane's, read from %lanemask_lt.
fn @cuda_lanemask_lt() -> u32 {
    let mut lane_bits: u32;
    asm("mov.u32 %0, %lanemask_lt;" : "=r" (lane_bits));
    lane_bits
}
// Warp mask with bits set for all lanes whose index is >= the calling
// lane's, read from %lanemask_ge.
fn @cuda_lanemask_ge() -> u32 {
    let mut lane_bits: u32;
    asm("mov.u32 %0, %lanemask_ge;" : "=r" (lane_bits));
    lane_bits
}
// Warp mask with bits set for all lanes whose index is > the calling
// lane's, read from %lanemask_gt.
fn @cuda_lanemask_gt() -> u32 {
    let mut lane_bits: u32;
    asm("mov.u32 %0, %lanemask_gt;" : "=r" (lane_bits));
    lane_bits
}
// Low 32 bits of the per-SM cycle counter (%clock). The asm is volatile so
// repeated reads are not collapsed into one.
fn @cuda_clock() -> u32 {
    let mut cycles_lo: u32;
    asm("mov.u32 %0, %clock;" : "=r"(cycles_lo) ::: "volatile");
    cycles_lo
}
// High 32 bits of the per-SM cycle counter (%clock_hi). Volatile so repeated
// reads are not collapsed into one.
fn @cuda_clock_hi() -> u32 {
    let mut cycles_hi: u32;
    asm("mov.u32 %0, %clock_hi;" : "=r"(cycles_hi) ::: "volatile");
    cycles_hi
}
// Full 64-bit per-SM cycle counter (%clock64). Volatile so repeated reads
// are not collapsed into one.
fn @cuda_clock64() -> u64 {
    let mut cycles: u64;
    asm("mov.u64 %0, %clock64;" : "=l"(cycles) ::: "volatile");
    cycles
}
// 64-bit global timer value (%globaltimer special register). Volatile so
// repeated reads are not collapsed into one.
fn @cuda_globaltimer() -> u64 {
    let mut timer_val: u64;
    asm("mov.u64 %0, %globaltimer;" : "=l"(timer_val) ::: "volatile");
    timer_val
}
// Low 32 bits of the global timer (%globaltimer_lo). Volatile so repeated
// reads are not collapsed into one.
fn @cuda_globaltimer_lo() -> u32 {
    let mut timer_lo: u32;
    asm("mov.u32 %0, %globaltimer_lo;" : "=r"(timer_lo) ::: "volatile");
    timer_lo
}
// High 32 bits of the global timer (%globaltimer_hi). Volatile so repeated
// reads are not collapsed into one.
fn @cuda_globaltimer_hi() -> u32 {
    let mut timer_hi: u32;
    asm("mov.u32 %0, %globaltimer_hi;" : "=r"(timer_hi) ::: "volatile");
    timer_hi
}
// Minimum of three i32 values (min is associative, so the grouping differs
// from the sibling helpers but the result is identical).
fn @cuda_minmin(a: i32, b: i32, c: i32) -> i32 { cuda_min(a, cuda_min(b, c)) }
// Maximum of three i32 values (max is associative, so the grouping differs
// from the sibling helpers but the result is identical).
fn @cuda_maxmax(a: i32, b: i32, c: i32) -> i32 { cuda_max(a, cuda_max(b, c)) }
// max(min(a, b), c): takes the smaller of a and b, then clamps it from
// below by c.
fn @cuda_minmax(a: i32, b: i32, c: i32) -> i32 {
    let smaller = cuda_min(a, b);
    cuda_max(smaller, c)
}
// min(max(a, b), c): takes the larger of a and b, then clamps it from
// above by c.
fn @cuda_maxmin(a: i32, b: i32, c: i32) -> i32 {
    let larger = cuda_max(a, b);
    cuda_min(larger, c)
}
// Single-precision multiply-add a*b+c, forwarded to the fused-multiply-add
// intrinsic cuda_fmaf.
fn @cuda_madf(a: f32, b: f32, c: f32) -> f32 { cuda_fmaf(a, b, c) }
// Double-precision multiply-add a*b+c, forwarded to the fused-multiply-add
// intrinsic cuda_fma.
fn @cuda_mad(a: f64, b: f64, c: f64) -> f64 { cuda_fma(a, b, c) }
// Builds the Accelerator interface for CUDA device `dev`, mapping the generic
// WorkItem accessors onto the CUDA thread/block/grid index intrinsics.
fn @cuda_accelerator(dev: i32) = Accelerator {
    // Wraps `body` into a launchable: given (grid, block) dimensions it runs
    // `body` on device `dev` via the `cuda(...)` launch primitive.
    exec = @|body| |grid, block| {
        let work_item = WorkItem {
            tidx = cuda_threadIdx_x,
            tidy = cuda_threadIdx_y,
            tidz = cuda_threadIdx_z,
            bidx = cuda_blockIdx_x,
            bidy = cuda_blockIdx_y,
            bidz = cuda_blockIdx_z,
            // global thread index per dimension: thread index within the
            // block plus the block's offset in threads
            gidx = @|| cuda_threadIdx_x() + cuda_blockDim_x() * cuda_blockIdx_x(),
            gidy = @|| cuda_threadIdx_y() + cuda_blockDim_y() * cuda_blockIdx_y(),
            gidz = @|| cuda_threadIdx_z() + cuda_blockDim_z() * cuda_blockIdx_z(),
            bdimx = cuda_blockDim_x,
            bdimy = cuda_blockDim_y,
            bdimz = cuda_blockDim_z,
            // total number of threads per dimension = blocks * threads/block
            gdimx = @|| cuda_gridDim_x() * cuda_blockDim_x(),
            gdimy = @|| cuda_gridDim_y() * cuda_blockDim_y(),
            gdimz = @|| cuda_gridDim_z() * cuda_blockDim_z(),
            nblkx = cuda_gridDim_x,
            nblky = cuda_gridDim_y,
            nblkz = cuda_gridDim_z
        };
        cuda(dev, grid, block, || @body(work_item))
    },
    // host-side synchronization with device `dev`
    sync = @|| synchronize_cuda(dev),
    // device memory allocation (size in bytes)
    alloc = @|size| alloc_cuda(dev, size),
    // unified (host/device-visible) memory allocation
    alloc_unified = @|size| alloc_cuda_unified(dev, size),
    // block-wide barrier inside a kernel
    barrier = cuda_syncthreads
};
// Math-intrinsics table for the CUDA backend: each Intrinsics slot is filled
// with the corresponding cuda_* device function declared earlier in this file.
static cuda_intrinsics = Intrinsics {
    // single-precision (f32) functions
    expf = cuda_expf,
    exp2f = cuda_exp2f,
    logf = cuda_logf,
    log2f = cuda_log2f,
    powf = cuda_powf,
    rsqrtf = cuda_rsqrtf,
    sqrtf = cuda_sqrtf,
    fabsf = cuda_fabsf,
    sinf = cuda_sinf,
    cosf = cuda_cosf,
    tanf = cuda_tanf,
    asinf = cuda_asinf,
    acosf = cuda_acosf,
    atanf = cuda_atanf,
    erff = cuda_erff,
    atan2f = cuda_atan2f,
    copysignf = cuda_copysignf,
    fmaf = cuda_fmaf,
    fmaxf = cuda_fmaxf,
    fminf = cuda_fminf,
    fmodf = cuda_fmodf,
    floorf = cuda_floorf,
    isinff = cuda_isinff,
    isnanf = cuda_isnanf,
    isfinitef = cuda_isfinitef,
    // double-precision (f64) functions
    exp = cuda_exp,
    exp2 = cuda_exp2,
    log = cuda_log,
    log2 = cuda_log2,
    pow = cuda_pow,
    rsqrt = cuda_rsqrt,
    sqrt = cuda_sqrt,
    fabs = cuda_fabs,
    sin = cuda_sin,
    cos = cuda_cos,
    tan = cuda_tan,
    asin = cuda_asin,
    acos = cuda_acos,
    atan = cuda_atan,
    erf = cuda_erf,
    atan2 = cuda_atan2,
    copysign = cuda_copysign,
    fma = cuda_fma,
    fmax = cuda_fmax,
    fmin = cuda_fmin,
    fmod = cuda_fmod,
    floor = cuda_floor,
    isinf = cuda_isinf,
    isnan = cuda_isnan,
    isfinite = cuda_isfinite,
    // integer min/max
    min = cuda_min,
    max = cuda_max,
};
#[import(cc = "device", name = "llvm.nvvm.read.ptx.sreg.ctaid.x")] fn nvvm_read_ptx_sreg_ctaid_x() -> i32; | |
#[import(cc = "device", name = "llvm.nvvm.read.ptx.sreg.ctaid.y")] fn nvvm_read_ptx_sreg_ctaid_y() -> i32; | |
#[import(cc = "device", name = "llvm.nvvm.read.ptx.sreg.ctaid.z")] fn nvvm_read_ptx_sreg_ctaid_z() -> i32; | |
#[import(cc = "device", name = "llvm.nvvm.read.ptx.sreg.nctaid.x")] fn nvvm_read_ptx_sreg_nctaid_x() -> i32; | |
#[import(cc = "device", name = "llvm.nvvm.read.ptx.sreg.nctaid.y")] fn nvvm_read_ptx_sreg_nctaid_y() -> i32; | |
#[import(cc = "device", name = "llvm.nvvm.read.ptx.sreg.nctaid.z")] fn nvvm_read_ptx_sreg_nctaid_z() -> i32; | |
#[import(cc = "device", name = "llvm.nvvm.read.ptx.sreg.ntid.x")] fn nvvm_read_ptx_sreg_ntid_x() -> i32; | |
#[import(cc = "device", name = "llvm.nvvm.read.ptx.sreg.ntid.y")] fn nvvm_read_ptx_sreg_ntid_y() -> i32; | |
#[import(cc = "device", name = "llvm.nvvm.read.ptx.sreg.ntid.z")] fn nvvm_read_ptx_sreg_ntid_z() -> i32; | |
#[import(cc = "device", name = "llvm.nvvm.read.ptx.sreg.tid.x")] fn nvvm_read_ptx_sreg_tid_x() -> i32; | |
#[import(cc = "device", name = "llvm.nvvm.read.ptx.sreg.tid.y")] fn nvvm_read_ptx_sreg_tid_y() -> i32; | |
#[import(cc = "device", name = "llvm.nvvm.read.ptx.sreg.tid.z")] fn nvvm_read_ptx_sreg_tid_z() -> i32; | |
#[import(cc = "device", name = "llvm.nvvm.read.ptx.sreg.warpsize")] fn nvvm_read_ptx_sreg_warpsize() -> i32; | |
#[import(cc = "device", name = "llvm.nvvm.read.ptx.sreg.laneid")] fn nvvm_laneid() -> u32; | |
#[import(cc = "device", name = "llvm.nvvm.read.ptx.sreg.warpid")] fn nvvm_warpid() -> u32; | |
#[import(cc = "device", name = "llvm.nvvm.read.ptx.sreg.nwarpid")] fn nvvm_nwarpid() -> u32; | |
#[import(cc = "device", name = "llvm.nvvm.read.ptx.sreg.smid")] fn nvvm_smid() -> u32; | |
#[import(cc = "device", name = "llvm.nvvm.read.ptx.sreg.nsmid")] fn nvvm_nsmid() -> u32; | |
#[import(cc = "device", name = "llvm.nvvm.read.ptx.sreg.lanemask_eq")] fn nvvm_lanemask() -> u32; | |
#[import(cc = "device", name = "llvm.nvvm.read.ptx.sreg.lanemask_le")] fn nvvm_lanemask_le() -> u32; | |
#[import(cc = "device", name = "llvm.nvvm.read.ptx.sreg.lanemask_lt")] fn nvvm_lanemask_lt() -> u32; | |
#[import(cc = "device", name = "llvm.nvvm.read.ptx.sreg.lanemask_ge")] fn nvvm_lanemask_ge() -> u32; | |
#[import(cc = "device", name = "llvm.nvvm.read.ptx.sreg.lanemask_gt")] fn nvvm_lanemask_gt() -> u32; | |
#[import(cc = "device", name = "llvm.nvvm.barrier0")] fn nvvm_barrier() -> (); | |
#[import(cc = "device", name = "llvm.nvvm.abs.i")] fn nvvm_abs_i(i32) -> i32; | |
#[import(cc = "device", name = "llvm.nvvm.abs.ll")] fn nvvm_abs_ll(i64) -> i64; | |
#[import(cc = "device", name = "llvm.nvvm.add.rm.d")] fn nvvm_add_rm_d(f64, f64) -> f64; | |
#[import(cc = "device", name = "llvm.nvvm.add.rm.f")] fn nvvm_add_rm_f(f32, f32) -> f32; | |
#[import(cc = "device", name = "llvm.nvvm.add.rm.ftz.f")] fn nvvm_add_rm_ftz_f(f32, f32) -> f32; | |
#[import(cc = "device", name = "llvm.nvvm.add.rn.d")] fn nvvm_add_rn_d(f64, f64) -> f64; | |
#[import(cc = "device", name = "llvm.nvvm.add.rn.f")] fn nvvm_add_rn_f(f32, f32) -> f32; | |
#[import(cc = "device", name = "llvm.nvvm.add.rn.ftz.f")] fn nvvm_add_rn_ftz_f(f32, f32) -> f32; | |
#[import(cc = "device", name = "llvm.nvvm.add.rp.d")] fn nvvm_add_rp_d(f64, f64) -> f64; | |
#[import(cc = "device", name = "llvm.nvvm.add.rp.f")] fn nvvm_add_rp_f(f32, f32) -> f32; | |
#[import(cc = "device", name = "llvm.nvvm.add.rp.ftz.f")] fn nvvm_add_rp_ftz_f(f32, f32) -> f32; | |
#[import(cc = "device", name = "llvm.nvvm.add.rz.d")] fn nvvm_add_rz_d(f64, f64) -> f64; | |
#[import(cc = "device", name = "llvm.nvvm.add.rz.f")] fn nvvm_add_rz_f(f32, f32) -> f32; | |
#[import(cc = "device", name = "llvm.nvvm.add.rz.ftz.f")] fn nvvm_add_rz_ftz_f(f32, f32) -> f32; | |
#[import(cc = "device", name = "llvm.nvvm.barrier0.and")] fn nvvm_barrier0_and(i32) -> i32; | |
#[import(cc = "device", name = "llvm.nvvm.barrier0.or")] fn nvvm_barrier0_or(i32) -> i32; | |
#[import(cc = "device", name = "llvm.nvvm.barrier0.popc")] fn nvvm_barrier0_popc(i32) -> i32; | |
#[import(cc = "device", name = "llvm.nvvm.bitcast.d2ll")] fn nvvm_bitcast_d2ll(f64) -> i64; | |
#[import(cc = "device", name = "llvm.nvvm.bitcast.f2i")] fn nvvm_bitcast_f2i(f32) -> i32; | |
#[import(cc = "device", name = "llvm.nvvm.bitcast.i2f")] fn nvvm_bitcast_i2f(i32) -> f32; | |
#[import(cc = "device", name = "llvm.nvvm.bitcast.ll2d")] fn nvvm_bitcast_ll2d(i64) -> f64; | |
#[import(cc = "device", name = "llvm.nvvm.brev32")] fn nvvm_brev32(i32) -> i32; | |
#[import(cc = "device", name = "llvm.nvvm.brev64")] fn nvvm_brev64(i64) -> i64; | |
#[import(cc = "device", name = "llvm.nvvm.ceil.d")] fn nvvm_ceil_d(f64) -> f64; | |
#[import(cc = "device", name = "llvm.nvvm.ceil.f")] fn nvvm_ceil_f(f32) -> f32; | |
#[import(cc = "device", name = "llvm.nvvm.ceil.ftz.f")] fn nvvm_ceil_ftz_f(f32) -> f32; | |
#[import(cc = "device", name = "llvm.nvvm.clz.i")] fn nvvm_clz_i(i32) -> i32; | |
#[import(cc = "device", name = "llvm.nvvm.clz.ll")] fn nvvm_clz_ll(i64) -> i32; | |
#[import(cc = "device", name = "llvm.nvvm.cos.approx.f")] fn nvvm_cos_approx_f(f32) -> f32; | |
#[import(cc = "device", name = "llvm.nvvm.cos.approx.ftz.f")] fn nvvm_cos_approx_ftz_f(f32) -> f32; | |
#[import(cc = "device", name = "llvm.nvvm.d2f.rm")] fn nvvm_d2f_rm(f64) -> f32; | |
#[import(cc = "device", name = "llvm.nvvm.d2f.rm.ftz")] fn nvvm_d2f_rm_ftz(f64) -> f32; | |
#[import(cc = "device", name = "llvm.nvvm.d2f.rn")] fn nvvm_d2f_rn(f64) -> f32; | |
#[import(cc = "device", name = "llvm.nvvm.d2f.rn.ftz")] fn nvvm_d2f_rn_ftz(f64) -> f32; | |
#[import(cc = "device", name = "llvm.nvvm.d2f.rp")] fn nvvm_d2f_rp(f64) -> f32; | |
#[import(cc = "device", name = "llvm.nvvm.d2f.rp.ftz")] fn nvvm_d2f_rp_ftz(f64) -> f32; | |
#[import(cc = "device", name = "llvm.nvvm.d2f.rz")] fn nvvm_d2f_rz(f64) -> f32; | |
#[import(cc = "device", name = "llvm.nvvm.d2f.rz.ftz")] fn nvvm_d2f_rz_ftz(f64) -> f32; | |
#[import(cc = "device", name = "llvm.nvvm.d2i.hi")] fn nvvm_d2i_hi(f64) -> i32; | |
#[import(cc = "device", name = "llvm.nvvm.d2i.lo")] fn nvvm_d2i_lo(f64) -> i32; | |
#[import(cc = "device", name = "llvm.nvvm.d2i.rm")] fn nvvm_d2i_rm(f64) -> i32; | |
#[import(cc = "device", name = "llvm.nvvm.d2i.rn")] fn nvvm_d2i_rn(f64) -> i32; | |
#[import(cc = "device", name = "llvm.nvvm.d2i.rp")] fn nvvm_d2i_rp(f64) -> i32; | |
#[import(cc = "device", name = "llvm.nvvm.d2i.rz")] fn nvvm_d2i_rz(f64) -> i32; | |
#[import(cc = "device", name = "llvm.nvvm.d2ll.rm")] fn nvvm_d2ll_rm(f64) -> i64; | |
#[import(cc = "device", name = "llvm.nvvm.d2ll.rn")] fn nvvm_d2ll_rn(f64) -> i64; | |
#[import(cc = "device", name = "llvm.nvvm.d2ll.rp")] fn nvvm_d2ll_rp(f64) -> i64; | |
#[import(cc = "device", name = "llvm.nvvm.d2ll.rz")] fn nvvm_d2ll_rz(f64) -> i64; | |
#[import(cc = "device", name = "llvm.nvvm.d2ui.rm")] fn nvvm_d2ui_rm(f64) -> i32; | |
#[import(cc = "device", name = "llvm.nvvm.d2ui.rn")] fn nvvm_d2ui_rn(f64) -> i32; | |
#[import(cc = "device", name = "llvm.nvvm.d2ui.rp")] fn nvvm_d2ui_rp(f64) -> i32; | |
#[import(cc = "device", name = "llvm.nvvm.d2ui.rz")] fn nvvm_d2ui_rz(f64) -> i32; | |
#[import(cc = "device", name = "llvm.nvvm.d2ull.rm")] fn nvvm_d2ull_rm(f64) -> i64; | |
#[import(cc = "device", name = "llvm.nvvm.d2ull.rn")] fn nvvm_d2ull_rn(f64) -> i64; | |
#[import(cc = "device", name = "llvm.nvvm.d2ull.rp")] fn nvvm_d2ull_rp(f64) -> i64; | |
#[import(cc = "device", name = "llvm.nvvm.d2ull.rz")] fn nvvm_d2ull_rz(f64) -> i64; | |
#[import(cc = "device", name = "llvm.nvvm.div.approx.f")] fn nvvm_div_approx_f(f32, f32) -> f32; | |
#[import(cc = "device", name = "llvm.nvvm.div.approx.ftz.f")] fn nvvm_div_approx_ftz_f(f32, f32) -> f32; | |
#[import(cc = "device", name = "llvm.nvvm.div.rm.d")] fn nvvm_div_rm_d(f64, f64) -> f64; | |
#[import(cc = "device", name = "llvm.nvvm.div.rm.f")] fn nvvm_div_rm_f(f32, f32) -> f32; | |
#[import(cc = "device", name = "llvm.nvvm.div.rm.ftz.f")] fn nvvm_div_rm_ftz_f(f32, f32) -> f32; | |
#[import(cc = "device", name = "llvm.nvvm.div.rn.d")] fn nvvm_div_rn_d(f64, f64) -> f64; | |
#[import(cc = "device", name = "llvm.nvvm.div.rn.f")] fn nvvm_div_rn_f(f32, f32) -> f32; | |
#[import(cc = "device", name = "llvm.nvvm.div.rn.ftz.f")] fn nvvm_div_rn_ftz_f(f32, f32) -> f32; | |
#[import(cc = "device", name = "llvm.nvvm.div.rp.d")] fn nvvm_div_rp_d(f64, f64) -> f64; | |
#[import(cc = "device", name = "llvm.nvvm.div.rp.f")] fn nvvm_div_rp_f(f32, f32) -> f32; | |
#[import(cc = "device", name = "llvm.nvvm.div.rp.ftz.f")] fn nvvm_div_rp_ftz_f(f32, f32) -> f32; | |
#[import(cc = "device", name = "llvm.nvvm.div.rz.d")] fn nvvm_div_rz_d(f64, f64) -> f64; | |
#[import(cc = "device", name = "llvm.nvvm.div.rz.f")] fn nvvm_div_rz_f(f32, f32) -> f32; | |
#[import(cc = "device", name = "llvm.nvvm.div.rz.ftz.f")] fn nvvm_div_rz_ftz_f(f32, f32) -> f32; | |
#[import(cc = "device", name = "llvm.nvvm.ex2.approx.d")] fn nvvm_ex2_approx_d(f64) -> f64; | |
#[import(cc = "device", name = "llvm.nvvm.ex2.approx.f")] fn nvvm_ex2_approx_f(f32) -> f32; | |
#[import(cc = "device", name = "llvm.nvvm.ex2.approx.ftz.f")] fn nvvm_ex2_approx_ftz_f(f32) -> f32; | |
#[import(cc = "device", name = "llvm.nvvm.f2h.rn")] fn nvvm_f2h_rn(f32) -> i16; | |
#[import(cc = "device", name = "llvm.nvvm.f2h.rn.ftz")] fn nvvm_f2h_rn_ftz(f32) -> i16; | |
#[import(cc = "device", name = "llvm.nvvm.f2i.rm")] fn nvvm_f2i_rm(f32) -> i32; | |
#[import(cc = "device", name = "llvm.nvvm.f2i.rm.ftz")] fn nvvm_f2i_rm_ftz(f32) -> i32; | |
#[import(cc = "device", name = "llvm.nvvm.f2i.rn")] fn nvvm_f2i_rn(f32) -> i32; | |
#[import(cc = "device", name = "llvm.nvvm.f2i.rn.ftz")] fn nvvm_f2i_rn_ftz(f32) -> i32; | |
#[import(cc = "device", name = "llvm.nvvm.f2i.rp")] fn nvvm_f2i_rp(f32) -> i32; | |
#[import(cc = "device", name = "llvm.nvvm.f2i.rp.ftz")] fn nvvm_f2i_rp_ftz(f32) -> i32; | |
#[import(cc = "device", name = "llvm.nvvm.f2i.rz")] fn nvvm_f2i_rz(f32) -> i32; | |
#[import(cc = "device", name = "llvm.nvvm.f2i.rz.ftz")] fn nvvm_f2i_rz_ftz(f32) -> i32; | |
#[import(cc = "device", name = "llvm.nvvm.f2ll.rm")] fn nvvm_f2ll_rm(f32) -> i64; | |
#[import(cc = "device", name = "llvm.nvvm.f2ll.rm.ftz")] fn nvvm_f2ll_rm_ftz(f32) -> i64; | |
#[import(cc = "device", name = "llvm.nvvm.f2ll.rn")] fn nvvm_f2ll_rn(f32) -> i64; | |
#[import(cc = "device", name = "llvm.nvvm.f2ll.rn.ftz")] fn nvvm_f2ll_rn_ftz(f32) -> i64; | |
#[import(cc = "device", name = "llvm.nvvm.f2ll.rp")] fn nvvm_f2ll_rp(f32) -> i64; | |
#[import(cc = "device", name = "llvm.nvvm.f2ll.rp.ftz")] fn nvvm_f2ll_rp_ftz(f32) -> i64; | |
#[import(cc = "device", name = "llvm.nvvm.f2ll.rz")] fn nvvm_f2ll_rz(f32) -> i64; | |
#[import(cc = "device", name = "llvm.nvvm.f2ll.rz.ftz")] fn nvvm_f2ll_rz_ftz(f32) -> i64; | |
#[import(cc = "device", name = "llvm.nvvm.f2ui.rm")] fn nvvm_f2ui_rm(f32) -> i32; | |
#[import(cc = "device", name = "llvm.nvvm.f2ui.rm.ftz")] fn nvvm_f2ui_rm_ftz(f32) -> i32; | |
#[import(cc = "device", name = "llvm.nvvm.f2ui.rn")] fn nvvm_f2ui_rn(f32) -> i32; | |
#[import(cc = "device", name = "llvm.nvvm.f2ui.rn.ftz")] fn nvvm_f2ui_rn_ftz(f32) -> i32; | |
#[import(cc = "device", name = "llvm.nvvm.f2ui.rp")] fn nvvm_f2ui_rp(f32) -> i32; | |
#[import(cc = "device", name = "llvm.nvvm.f2ui.rp.ftz")] fn nvvm_f2ui_rp_ftz(f32) -> i32; | |
#[import(cc = "device", name = "llvm.nvvm.f2ui.rz")] fn nvvm_f2ui_rz(f32) -> i32; | |
#[import(cc = "device", name = "llvm.nvvm.f2ui.rz.ftz")] fn nvvm_f2ui_rz_ftz(f32) -> i32; | |
#[import(cc = "device", name = "llvm.nvvm.f2ull.rm")] fn nvvm_f2ull_rm(f32) -> i64; | |
#[import(cc = "device", name = "llvm.nvvm.f2ull.rm.ftz")] fn nvvm_f2ull_rm_ftz(f32) -> i64; | |
#[import(cc = "device", name = "llvm.nvvm.f2ull.rn")] fn nvvm_f2ull_rn(f32) -> i64; | |
#[import(cc = "device", name = "llvm.nvvm.f2ull.rn.ftz")] fn nvvm_f2ull_rn_ftz(f32) -> i64; | |
#[import(cc = "device", name = "llvm.nvvm.f2ull.rp")] fn nvvm_f2ull_rp(f32) -> i64; | |
#[import(cc = "device", name = "llvm.nvvm.f2ull.rp.ftz")] fn nvvm_f2ull_rp_ftz(f32) -> i64; | |
#[import(cc = "device", name = "llvm.nvvm.f2ull.rz")] fn nvvm_f2ull_rz(f32) -> i64; | |
#[import(cc = "device", name = "llvm.nvvm.f2ull.rz.ftz")] fn nvvm_f2ull_rz_ftz(f32) -> i64; | |
#[import(cc = "device", name = "llvm.nvvm.fabs.d")] fn nvvm_fabs_d(f64) -> f64; | |
#[import(cc = "device", name = "llvm.nvvm.fabs.f")] fn nvvm_fabs_f(f32) -> f32; | |
#[import(cc = "device", name = "llvm.nvvm.fabs.ftz.f")] fn nvvm_fabs_ftz_f(f32) -> f32; | |
#[import(cc = "device", name = "llvm.nvvm.floor.d")] fn nvvm_floor_d(f64) -> f64; | |
#[import(cc = "device", name = "llvm.nvvm.floor.f")] fn nvvm_floor_f(f32) -> f32; | |
#[import(cc = "device", name = "llvm.nvvm.floor.ftz.f")] fn nvvm_floor_ftz_f(f32) -> f32; | |
#[import(cc = "device", name = "llvm.nvvm.fma.rm.d")] fn nvvm_fma_rm_d(f64, f64, f64) -> f64; | |
#[import(cc = "device", name = "llvm.nvvm.fma.rm.f")] fn nvvm_fma_rm_f(f32, f32, f32) -> f32; | |
#[import(cc = "device", name = "llvm.nvvm.fma.rm.ftz.f")] fn nvvm_fma_rm_ftz_f(f32, f32, f32) -> f32; | |
#[import(cc = "device", name = "llvm.nvvm.fma.rn.d")] fn nvvm_fma_rn_d(f64, f64, f64) -> f64; | |
#[import(cc = "device", name = "llvm.nvvm.fma.rn.f")] fn nvvm_fma_rn_f(f32, f32, f32) -> f32; | |
#[import(cc = "device", name = "llvm.nvvm.fma.rn.ftz.f")] fn nvvm_fma_rn_ftz_f(f32, f32, f32) -> f32; | |
#[import(cc = "device", name = "llvm.nvvm.fma.rp.d")] fn nvvm_fma_rp_d(f64, f64, f64) -> f64; | |
#[import(cc = "device", name = "llvm.nvvm.fma.rp.f")] fn nvvm_fma_rp_f(f32, f32, f32) -> f32; | |
#[import(cc = "device", name = "llvm.nvvm.fma.rp.ftz.f")] fn nvvm_fma_rp_ftz_f(f32, f32, f32) -> f32; | |
#[import(cc = "device", name = "llvm.nvvm.fma.rz.d")] fn nvvm_fma_rz_d(f64, f64, f64) -> f64; | |
#[import(cc = "device", name = "llvm.nvvm.fma.rz.f")] fn nvvm_fma_rz_f(f32, f32, f32) -> f32; | |
#[import(cc = "device", name = "llvm.nvvm.fma.rz.ftz.f")] fn nvvm_fma_rz_ftz_f(f32, f32, f32) -> f32; | |
//#[import(cc = "device", name = "llvm.nvvm.fmax.d")] fn nvvm_fmax(f64, f64) -> f64; | |
//#[import(cc = "device", name = "llvm.nvvm.fmax.f")] fn nvvm_fmaxf(f32, f32) -> f32; | |
#[import(cc = "device", name = "llvm.nvvm.fmax.ftz.f")] fn nvvm_fmax_ftz_f(f32, f32) -> f32; | |
//#[import(cc = "device", name = "llvm.nvvm.fmin.d")] fn nvvm_fmin(f64, f64) -> f64; | |
//#[import(cc = "device", name = "llvm.nvvm.fmin.f")] fn nvvm_fminf(f32, f32) -> f32; | |
#[import(cc = "device", name = "llvm.nvvm.fmin.ftz.f")] fn nvvm_fmin_ftz_f(f32, f32) -> f32; | |
#[import(cc = "device", name = "llvm.nvvm.h2f")] fn nvvm_h2f(i16) -> f32; | |
#[import(cc = "device", name = "llvm.nvvm.i2d.rm")] fn nvvm_i2d_rm(i32) -> f64; | |
#[import(cc = "device", name = "llvm.nvvm.i2d.rn")] fn nvvm_i2d_rn(i32) -> f64; | |
#[import(cc = "device", name = "llvm.nvvm.i2d.rp")] fn nvvm_i2d_rp(i32) -> f64; | |
#[import(cc = "device", name = "llvm.nvvm.i2d.rz")] fn nvvm_i2d_rz(i32) -> f64; | |
#[import(cc = "device", name = "llvm.nvvm.i2f.rm")] fn nvvm_i2f_rm(i32) -> f32; | |
#[import(cc = "device", name = "llvm.nvvm.i2f.rn")] fn nvvm_i2f_rn(i32) -> f32; | |
#[import(cc = "device", name = "llvm.nvvm.i2f.rp")] fn nvvm_i2f_rp(i32) -> f32; | |
#[import(cc = "device", name = "llvm.nvvm.i2f.rz")] fn nvvm_i2f_rz(i32) -> f32; | |
#[import(cc = "device", name = "llvm.nvvm.lg2.approx.d")] fn nvvm_lg2_approx_d(f64) -> f64; | |
#[import(cc = "device", name = "llvm.nvvm.lg2.approx.f")] fn nvvm_lg2_approx_f(f32) -> f32; | |
#[import(cc = "device", name = "llvm.nvvm.lg2.approx.ftz.f")] fn nvvm_lg2_approx_ftz_f(f32) -> f32; | |
#[import(cc = "device", name = "llvm.nvvm.ll2d.rm")] fn nvvm_ll2d_rm(i64) -> f64; | |
#[import(cc = "device", name = "llvm.nvvm.ll2d.rn")] fn nvvm_ll2d_rn(i64) -> f64; | |
#[import(cc = "device", name = "llvm.nvvm.ll2d.rp")] fn nvvm_ll2d_rp(i64) -> f64; | |
#[import(cc = "device", name = "llvm.nvvm.ll2d.rz")] fn nvvm_ll2d_rz(i64) -> f64; | |
#[import(cc = "device", name = "llvm.nvvm.ll2f.rm")] fn nvvm_ll2f_rm(i64) -> f32; | |
#[import(cc = "device", name = "llvm.nvvm.ll2f.rn")] fn nvvm_ll2f_rn(i64) -> f32; | |
#[import(cc = "device", name = "llvm.nvvm.ll2f.rp")] fn nvvm_ll2f_rp(i64) -> f32; | |
#[import(cc = "device", name = "llvm.nvvm.ll2f.rz")] fn nvvm_ll2f_rz(i64) -> f32; | |
#[import(cc = "device", name = "llvm.nvvm.lohi.i2d")] fn nvvm_lohi_i2d(i32, i32) -> f64; | |
#[import(cc = "device", name = "llvm.nvvm.max.i")] fn nvvm_max(i32, i32) -> i32; | |
#[import(cc = "device", name = "llvm.nvvm.max.ll")] fn nvvm_max_ll(i64, i64) -> i64; | |
#[import(cc = "device", name = "llvm.nvvm.max.ui")] fn nvvm_max_ui(i32, i32) -> i32; | |
#[import(cc = "device", name = "llvm.nvvm.max.ull")] fn nvvm_max_ull(i64, i64) -> i64; | |
#[import(cc = "device", name = "llvm.nvvm.membar.cta")] fn nvvm_membar_cta() -> (); | |
#[import(cc = "device", name = "llvm.nvvm.membar.gl")] fn nvvm_membar_gl() -> (); | |
#[import(cc = "device", name = "llvm.nvvm.membar.sys")] fn nvvm_membar_sys() -> (); | |
#[import(cc = "device", name = "llvm.nvvm.min.i")] fn nvvm_min(i32, i32) -> i32; | |
#[import(cc = "device", name = "llvm.nvvm.min.ll")] fn nvvm_min_ll(i64, i64) -> i64; | |
#[import(cc = "device", name = "llvm.nvvm.min.ui")] fn nvvm_min_ui(i32, i32) -> i32; | |
#[import(cc = "device", name = "llvm.nvvm.min.ull")] fn nvvm_min_ull(i64, i64) -> i64; | |
#[import(cc = "device", name = "llvm.nvvm.move.f64")] fn nvvm_move_f64(f64) -> f64; | |
#[import(cc = "device", name = "llvm.nvvm.move.f32")] fn nvvm_move_f32(f32) -> f32; | |
#[import(cc = "device", name = "llvm.nvvm.move.i16")] fn nvvm_move_i16(i16) -> i16; | |
#[import(cc = "device", name = "llvm.nvvm.move.i32")] fn nvvm_move_i32(i32) -> i32; | |
#[import(cc = "device", name = "llvm.nvvm.move.i64")] fn nvvm_move_i64(i64) -> i64; | |
#[import(cc = "device", name = "llvm.nvvm.mul24.i")] fn nvvm_mul24_i(i32, i32) -> i32; | |
#[import(cc = "device", name = "llvm.nvvm.mul24.ui")] fn nvvm_mul24_ui(i32, i32) -> i32; | |
#[import(cc = "device", name = "llvm.nvvm.mul.rm.d")] fn nvvm_mul_rm_d(f64, f64) -> f64; | |
#[import(cc = "device", name = "llvm.nvvm.mul.rm.f")] fn nvvm_mul_rm_f(f32, f32) -> f32; | |
#[import(cc = "device", name = "llvm.nvvm.mul.rm.ftz.f")] fn nvvm_mul_rm_ftz_f(f32, f32) -> f32; | |
#[import(cc = "device", name = "llvm.nvvm.mul.rn.d")] fn nvvm_mul_rn_d(f64, f64) -> f64; | |
#[import(cc = "device", name = "llvm.nvvm.mul.rn.f")] fn nvvm_mul_rn_f(f32, f32) -> f32; | |
#[import(cc = "device", name = "llvm.nvvm.mul.rn.ftz.f")] fn nvvm_mul_rn_ftz_f(f32, f32) -> f32; | |
#[import(cc = "device", name = "llvm.nvvm.mul.rp.d")] fn nvvm_mul_rp_d(f64, f64) -> f64; | |
#[import(cc = "device", name = "llvm.nvvm.mul.rp.f")] fn nvvm_mul_rp_f(f32, f32) -> f32; | |
#[import(cc = "device", name = "llvm.nvvm.mul.rp.ftz.f")] fn nvvm_mul_rp_ftz_f(f32, f32) -> f32; | |
#[import(cc = "device", name = "llvm.nvvm.mul.rz.d")] fn nvvm_mul_rz_d(f64, f64) -> f64; | |
#[import(cc = "device", name = "llvm.nvvm.mul.rz.f")] fn nvvm_mul_rz_f(f32, f32) -> f32; | |
#[import(cc = "device", name = "llvm.nvvm.mul.rz.ftz.f")] fn nvvm_mul_rz_ftz_f(f32, f32) -> f32; | |
#[import(cc = "device", name = "llvm.nvvm.mulhi.i")] fn nvvm_mulhi_i(i32, i32) -> i32; | |
#[import(cc = "device", name = "llvm.nvvm.mulhi.ll")] fn nvvm_mulhi_ll(i64, i64) -> i64; | |
#[import(cc = "device", name = "llvm.nvvm.mulhi.ui")] fn nvvm_mulhi_ui(i32, i32) -> i32; | |
#[import(cc = "device", name = "llvm.nvvm.mulhi.ull")] fn nvvm_mulhi_ull(i64, i64) -> i64; | |
#[import(cc = "device", name = "llvm.nvvm.popc.i")] fn nvvm_popc_i(i32) -> i32; | |
#[import(cc = "device", name = "llvm.nvvm.popc.ll")] fn nvvm_popc_ll(i64) -> i32; | |
#[import(cc = "device", name = "llvm.nvvm.prmt")] fn nvvm_prmt(i32, i32, i32) -> i32; | |
#[import(cc = "device", name = "llvm.nvvm.rcp.approx.ftz.d")] fn nvvm_rcp_approx_ftz_d(f64) -> f64; | |
#[import(cc = "device", name = "llvm.nvvm.rcp.rm.d")] fn nvvm_rcp_rm_d(f64) -> f64; | |
#[import(cc = "device", name = "llvm.nvvm.rcp.rm.f")] fn nvvm_rcp_rm_f(f32) -> f32; | |
#[import(cc = "device", name = "llvm.nvvm.rcp.rm.ftz.f")] fn nvvm_rcp_rm_ftz_f(f32) -> f32; | |
#[import(cc = "device", name = "llvm.nvvm.rcp.rn.d")] fn nvvm_rcp_rn_d(f64) -> f64; | |
#[import(cc = "device", name = "llvm.nvvm.rcp.rn.f")] fn nvvm_rcp_rn_f(f32) -> f32; | |
#[import(cc = "device", name = "llvm.nvvm.rcp.rn.ftz.f")] fn nvvm_rcp_rn_ftz_f(f32) -> f32; | |
#[import(cc = "device", name = "llvm.nvvm.rcp.rp.d")] fn nvvm_rcp_rp_d(f64) -> f64; | |
#[import(cc = "device", name = "llvm.nvvm.rcp.rp.f")] fn nvvm_rcp_rp_f(f32) -> f32; | |
#[import(cc = "device", name = "llvm.nvvm.rcp.rp.ftz.f")] fn nvvm_rcp_rp_ftz_f(f32) -> f32; | |
#[import(cc = "device", name = "llvm.nvvm.rcp.rz.d")] fn nvvm_rcp_rz_d(f64) -> f64; | |
#[import(cc = "device", name = "llvm.nvvm.rcp.rz.f")] fn nvvm_rcp_rz_f(f32) -> f32; | |
#[import(cc = "device", name = "llvm.nvvm.rcp.rz.ftz.f")] fn nvvm_rcp_rz_ftz_f(f32) -> f32; | |
#[import(cc = "device", name = "llvm.nvvm.round.d")] fn nvvm_round_d(f64) -> f64; | |
#[import(cc = "device", name = "llvm.nvvm.round.f")] fn nvvm_round_f(f32) -> f32; | |
#[import(cc = "device", name = "llvm.nvvm.round.ftz.f")] fn nvvm_round_ftz_f(f32) -> f32; | |
#[import(cc = "device", name = "llvm.nvvm.rsqrt.approx.d")] fn nvvm_rsqrt_approx_d(f64) -> f64; | |
#[import(cc = "device", name = "llvm.nvvm.rsqrt.approx.f")] fn nvvm_rsqrt_approx_f(f32) -> f32; | |
#[import(cc = "device", name = "llvm.nvvm.rsqrt.approx.ftz.f")] fn nvvm_rsqrt_approx_ftz_f(f32) -> f32; | |
#[import(cc = "device", name = "llvm.nvvm.sad.i")] fn nvvm_sad_i(i32, i32, i32) -> i32; | |
#[import(cc = "device", name = "llvm.nvvm.sad.ui")] fn nvvm_sad_ui(i32, i32, i32) -> i32; | |
#[import(cc = "device", name = "llvm.nvvm.saturate.d")] fn nvvm_saturate_d(f64) -> f64; | |
#[import(cc = "device", name = "llvm.nvvm.saturate.f")] fn nvvm_saturate_f(f32) -> f32; | |
#[import(cc = "device", name = "llvm.nvvm.saturate.ftz.f")] fn nvvm_saturate_ftz_f(f32) -> f32; | |
#[import(cc = "device", name = "llvm.nvvm.sin.approx.f")] fn nvvm_sin_approx_f(f32) -> f32; | |
#[import(cc = "device", name = "llvm.nvvm.sin.approx.ftz.f")] fn nvvm_sin_approx_ftz_f(f32) -> f32; | |
#[import(cc = "device", name = "llvm.nvvm.sqrt.approx.f")] fn nvvm_sqrt_approx_f(f32) -> f32; | |
#[import(cc = "device", name = "llvm.nvvm.sqrt.approx.ftz.f")] fn nvvm_sqrt_approx_ftz_f(f32) -> f32; | |
#[import(cc = "device", name = "llvm.nvvm.sqrt.f")] fn nvvm_sqrt_f(f32) -> f32; | |
#[import(cc = "device", name = "llvm.nvvm.sqrt.rm.d")] fn nvvm_sqrt_rm_d(f64) -> f64; | |
#[import(cc = "device", name = "llvm.nvvm.sqrt.rm.f")] fn nvvm_sqrt_rm_f(f32) -> f32; | |
#[import(cc = "device", name = "llvm.nvvm.sqrt.rm.ftz.f")] fn nvvm_sqrt_rm_ftz_f(f32) -> f32; | |
#[import(cc = "device", name = "llvm.nvvm.sqrt.rn.d")] fn nvvm_sqrt_rn_d(f64) -> f64; | |
#[import(cc = "device", name = "llvm.nvvm.sqrt.rn.f")] fn nvvm_sqrt_rn_f(f32) -> f32; | |
#[import(cc = "device", name = "llvm.nvvm.sqrt.rn.ftz.f")] fn nvvm_sqrt_rn_ftz_f(f32) -> f32; | |
#[import(cc = "device", name = "llvm.nvvm.sqrt.rp.d")] fn nvvm_sqrt_rp_d(f64) -> f64; | |
#[import(cc = "device", name = "llvm.nvvm.sqrt.rp.f")] fn nvvm_sqrt_rp_f(f32) -> f32; | |
#[import(cc = "device", name = "llvm.nvvm.sqrt.rp.ftz.f")] fn nvvm_sqrt_rp_ftz_f(f32) -> f32; | |
#[import(cc = "device", name = "llvm.nvvm.sqrt.rz.d")] fn nvvm_sqrt_rz_d(f64) -> f64; | |
#[import(cc = "device", name = "llvm.nvvm.sqrt.rz.f")] fn nvvm_sqrt_rz_f(f32) -> f32; | |
#[import(cc = "device", name = "llvm.nvvm.sqrt.rz.ftz.f")] fn nvvm_sqrt_rz_ftz_f(f32) -> f32; | |
#[import(cc = "device", name = "llvm.nvvm.trunc.d")] fn nvvm_trunc_d(f64) -> f64; | |
#[import(cc = "device", name = "llvm.nvvm.trunc.f")] fn nvvm_trunc_f(f32) -> f32; | |
#[import(cc = "device", name = "llvm.nvvm.trunc.ftz.f")] fn nvvm_trunc_ftz_f(f32) -> f32; | |
#[import(cc = "device", name = "llvm.nvvm.ui2d.rm")] fn nvvm_ui2d_rm(i32) -> f64; | |
#[import(cc = "device", name = "llvm.nvvm.ui2d.rn")] fn nvvm_ui2d_rn(i32) -> f64; | |
#[import(cc = "device", name = "llvm.nvvm.ui2d.rp")] fn nvvm_ui2d_rp(i32) -> f64; | |
#[import(cc = "device", name = "llvm.nvvm.ui2d.rz")] fn nvvm_ui2d_rz(i32) -> f64; | |
#[import(cc = "device", name = "llvm.nvvm.ui2f.rm")] fn nvvm_ui2f_rm(i32) -> f32; | |
#[import(cc = "device", name = "llvm.nvvm.ui2f.rn")] fn nvvm_ui2f_rn(i32) -> f32; | |
#[import(cc = "device", name = "llvm.nvvm.ui2f.rp")] fn nvvm_ui2f_rp(i32) -> f32; | |
#[import(cc = "device", name = "llvm.nvvm.ui2f.rz")] fn nvvm_ui2f_rz(i32) -> f32; | |
#[import(cc = "device", name = "llvm.nvvm.ull2d.rm")] fn nvvm_ull2d_rm(i64) -> f64; | |
#[import(cc = "device", name = "llvm.nvvm.ull2d.rn")] fn nvvm_ull2d_rn(i64) -> f64; | |
#[import(cc = "device", name = "llvm.nvvm.ull2d.rp")] fn nvvm_ull2d_rp(i64) -> f64; | |
#[import(cc = "device", name = "llvm.nvvm.ull2d.rz")] fn nvvm_ull2d_rz(i64) -> f64; | |
// NVVM device intrinsics (lowered to the matching llvm.nvvm.* calls).
// ull2f.*: u64 -> f32 conversions with explicit rounding mode (rm/rn/rp/rz).
#[import(cc = "device", name = "llvm.nvvm.ull2f.rm")] fn nvvm_ull2f_rm(i64) -> f32;
#[import(cc = "device", name = "llvm.nvvm.ull2f.rn")] fn nvvm_ull2f_rn(i64) -> f32;
#[import(cc = "device", name = "llvm.nvvm.ull2f.rp")] fn nvvm_ull2f_rp(i64) -> f32;
#[import(cc = "device", name = "llvm.nvvm.ull2f.rz")] fn nvvm_ull2f_rz(i64) -> f32;
// Read-only global loads through the non-coherent (texture) cache; the trailing
// i32 argument is the alignment in bytes expected by the intrinsic.
#[import(cc = "device", name = "llvm.nvvm.ldg.global.i.i8.p1")] fn nvvm_ldg_u8_p1(&addrspace(1)u8, i32) -> u8;
#[import(cc = "device", name = "llvm.nvvm.ldg.global.i.i32.p1")] fn nvvm_ldg_i32_p1(&addrspace(1)i32, i32) -> i32;
#[import(cc = "device", name = "llvm.nvvm.ldg.global.f.f32.p1")] fn nvvm_ldg_f32_p1(&addrspace(1)f32, i32) -> f32;
#[import(cc = "device", name = "llvm.nvvm.ldg.global.i.v4i32.p1")] fn nvvm_ldg4_i32_p1(&addrspace(1)simd[i32 * 4], i32) -> simd[i32 * 4];
#[import(cc = "device", name = "llvm.nvvm.ldg.global.f.v4f32.p1")] fn nvvm_ldg4_f32_p1(&addrspace(1)simd[f32 * 4], i32) -> simd[f32 * 4];
// atomicInc/atomicDec on global memory (wrap-around at the given limit).
// NOTE(review): llvm.nvvm.atomic.load.inc/dec were removed in recent LLVM
// releases in favor of atomicrmw uinc_wrap/udec_wrap — verify against the
// LLVM version this targets.
#[import(cc = "device", name = "llvm.nvvm.atomic.load.inc.32.p1")] fn nvvm_atomic_inc_global_u32(&mut addrspace(1)u32, u32) -> u32;
#[import(cc = "device", name = "llvm.nvvm.atomic.load.dec.32.p1")] fn nvvm_atomic_dec_global_u32(&mut addrspace(1)u32, u32) -> u32;
// Memory fences at device (gl), block (cta) and system scope.
#[import(cc = "device", name = "llvm.nvvm.membar.gl")] fn nvvm_threadfence() -> ();
#[import(cc = "device", name = "llvm.nvvm.membar.cta")] fn nvvm_threadfence_block() -> ();
#[import(cc = "device", name = "llvm.nvvm.membar.sys")] fn nvvm_threadfence_system() -> ();
// Block-wide barriers; the popc/and/or variants additionally reduce a
// per-thread predicate across the block (count / all / any).
#[import(cc = "device", name = "llvm.nvvm.barrier0")] fn nvvm_block_sync() -> ();
#[import(cc = "device", name = "llvm.nvvm.barrier0.popc")] fn nvvm_block_sync_count(i32) -> i32;
#[import(cc = "device", name = "llvm.nvvm.barrier0.and")] fn nvvm_block_sync_all(i32) -> i32;
#[import(cc = "device", name = "llvm.nvvm.barrier0.or")] fn nvvm_block_sync_any(i32) -> i32;
// Warp-level sync/vote/shuffle; the first u32 is always the member mask.
#[import(cc = "device", name = "llvm.nvvm.bar.warp.sync")] fn nvvm_warp_sync(u32) -> ();
// #[import(cc = "device", name = "llvm.nvvm.vote.sync")] fn nvvm_warp_vote_sync(u32, i32, bool) -> (u32, bool); // seems to not be supported by LLVM
#[import(cc = "device", name = "llvm.nvvm.vote.all.sync")] fn nvvm_warp_sync_all(u32, bool) -> bool;
#[import(cc = "device", name = "llvm.nvvm.vote.any.sync")] fn nvvm_warp_sync_any(u32, bool) -> bool;
#[import(cc = "device", name = "llvm.nvvm.vote.ballot.sync")] fn nvvm_warp_sync_ballot(u32, bool) -> u32;
// #[import(cc = "device", name = "llvm.nvvm.shfl.sync.i32")] fn nvvm_warp_shfl_sync(i32, i32, i32, i32, i32) -> (i32, bool); // seems to not be supported by LLVM
#[import(cc = "device", name = "llvm.nvvm.shfl.sync.idx.i32")] fn nvvm_warp_shfl_sync_idx_i32(u32, i32, i32, i32) -> i32;
#[import(cc = "device", name = "llvm.nvvm.shfl.sync.idx.f32")] fn nvvm_warp_shfl_sync_idx_f32(u32, f32, i32, i32) -> f32;
#[import(cc = "device", name = "llvm.nvvm.shfl.sync.up.i32")] fn nvvm_warp_shfl_sync_up_i32(u32, i32, i32, i32) -> i32;
#[import(cc = "device", name = "llvm.nvvm.shfl.sync.up.f32")] fn nvvm_warp_shfl_sync_up_f32(u32, f32, i32, i32) -> f32;
#[import(cc = "device", name = "llvm.nvvm.shfl.sync.down.i32")] fn nvvm_warp_shfl_sync_down_i32(u32, i32, i32, i32) -> i32;
#[import(cc = "device", name = "llvm.nvvm.shfl.sync.down.f32")] fn nvvm_warp_shfl_sync_down_f32(u32, f32, i32, i32) -> f32;
#[import(cc = "device", name = "llvm.nvvm.shfl.sync.bfly.i32")] fn nvvm_warp_shfl_sync_bfly_i32(u32, i32, i32, i32) -> i32;
#[import(cc = "device", name = "llvm.nvvm.shfl.sync.bfly.f32")] fn nvvm_warp_shfl_sync_bfly_f32(u32, f32, i32, i32) -> f32;
// libdevice intrinsics: https://docs.nvidia.com/cuda/libdevice-users-guide
// Math functions resolved against NVIDIA's libdevice bitcode library
// (single precision first, then double precision).
#[import(cc = "C", name = "__nv_expf")] fn nvvm_expf(f32) -> f32;
#[import(cc = "C", name = "__nv_exp2f")] fn nvvm_exp2f(f32) -> f32;
#[import(cc = "C", name = "__nv_logf")] fn nvvm_logf(f32) -> f32;
#[import(cc = "C", name = "__nv_log2f")] fn nvvm_log2f(f32) -> f32;
#[import(cc = "C", name = "__nv_powf")] fn nvvm_powf(f32, f32) -> f32;
#[import(cc = "C", name = "__nv_rsqrtf")] fn nvvm_rsqrtf(f32) -> f32;
#[import(cc = "C", name = "__nv_sqrtf")] fn nvvm_sqrtf(f32) -> f32;
#[import(cc = "C", name = "__nv_fabsf")] fn nvvm_fabsf(f32) -> f32;
#[import(cc = "C", name = "__nv_sinf")] fn nvvm_sinf(f32) -> f32;
#[import(cc = "C", name = "__nv_cosf")] fn nvvm_cosf(f32) -> f32;
#[import(cc = "C", name = "__nv_tanf")] fn nvvm_tanf(f32) -> f32;
#[import(cc = "C", name = "__nv_asinf")] fn nvvm_asinf(f32) -> f32;
#[import(cc = "C", name = "__nv_acosf")] fn nvvm_acosf(f32) -> f32;
#[import(cc = "C", name = "__nv_atanf")] fn nvvm_atanf(f32) -> f32;
#[import(cc = "C", name = "__nv_erff")] fn nvvm_erff(f32) -> f32;
#[import(cc = "C", name = "__nv_atan2f")] fn nvvm_atan2f(f32, f32) -> f32;
#[import(cc = "C", name = "__nv_fmaxf")] fn nvvm_fmaxf(f32, f32) -> f32;
#[import(cc = "C", name = "__nv_fminf")] fn nvvm_fminf(f32, f32) -> f32;
#[import(cc = "C", name = "__nv_fmodf")] fn nvvm_fmodf(f32, f32) -> f32;
#[import(cc = "C", name = "__nv_floorf")] fn nvvm_floorf(f32) -> f32;
#[import(cc = "C", name = "__nv_fmaf")] fn nvvm_fmaf(f32, f32, f32) -> f32;
// libdevice has no dedicated "mad" entry point, so mad deliberately aliases
// the fused multiply-add (__nv_fmaf / __nv_fma below).
#[import(cc = "C", name = "__nv_fmaf")] fn nvvm_madf(f32, f32, f32) -> f32;
#[import(cc = "C", name = "__nv_isinff")] fn nvvm_isinff(f32) -> i32;
#[import(cc = "C", name = "__nv_isnanf")] fn nvvm_isnanf(f32) -> i32;
// note: the f32 finiteness test is named __nv_finitef (not __nv_isfinitef)
#[import(cc = "C", name = "__nv_finitef")] fn nvvm_isfinitef(f32) -> i32;
#[import(cc = "C", name = "__nv_copysignf")] fn nvvm_copysignf(f32, f32) -> f32;
#[import(cc = "C", name = "__nv_exp")] fn nvvm_exp(f64) -> f64;
#[import(cc = "C", name = "__nv_exp2")] fn nvvm_exp2(f64) -> f64;
#[import(cc = "C", name = "__nv_log")] fn nvvm_log(f64) -> f64;
#[import(cc = "C", name = "__nv_log2")] fn nvvm_log2(f64) -> f64;
#[import(cc = "C", name = "__nv_pow")] fn nvvm_pow(f64, f64) -> f64;
#[import(cc = "C", name = "__nv_rsqrt")] fn nvvm_rsqrt(f64) -> f64;
#[import(cc = "C", name = "__nv_sqrt")] fn nvvm_sqrt(f64) -> f64;
#[import(cc = "C", name = "__nv_fabs")] fn nvvm_fabs(f64) -> f64;
#[import(cc = "C", name = "__nv_sin")] fn nvvm_sin(f64) -> f64;
#[import(cc = "C", name = "__nv_cos")] fn nvvm_cos(f64) -> f64;
#[import(cc = "C", name = "__nv_tan")] fn nvvm_tan(f64) -> f64;
#[import(cc = "C", name = "__nv_asin")] fn nvvm_asin(f64) -> f64;
#[import(cc = "C", name = "__nv_acos")] fn nvvm_acos(f64) -> f64;
#[import(cc = "C", name = "__nv_atan")] fn nvvm_atan(f64) -> f64;
#[import(cc = "C", name = "__nv_erf")] fn nvvm_erf(f64) -> f64;
#[import(cc = "C", name = "__nv_atan2")] fn nvvm_atan2(f64, f64) -> f64;
#[import(cc = "C", name = "__nv_fmin")] fn nvvm_fmin(f64, f64) -> f64;
#[import(cc = "C", name = "__nv_fmax")] fn nvvm_fmax(f64, f64) -> f64;
#[import(cc = "C", name = "__nv_fmod")] fn nvvm_fmod(f64, f64) -> f64;
#[import(cc = "C", name = "__nv_floor")] fn nvvm_floor(f64) -> f64;
#[import(cc = "C", name = "__nv_fma")] fn nvvm_fma(f64, f64, f64) -> f64;
#[import(cc = "C", name = "__nv_fma")] fn nvvm_mad(f64, f64, f64) -> f64;
// the f64 classification functions carry a 'd' suffix in libdevice
#[import(cc = "C", name = "__nv_isinfd")] fn nvvm_isinf(f64) -> i32;
#[import(cc = "C", name = "__nv_isnand")] fn nvvm_isnan(f64) -> i32;
#[import(cc = "C", name = "__nv_isfinited")] fn nvvm_isfinite(f64) -> i32;
#[import(cc = "C", name = "__nv_copysign")] fn nvvm_copysign(f64, f64) -> f64;
// https://github.com/nvidia-compiler-sdk/nvvmir-samples/tree/master/syscalls/vprintf.ll
// There is no direct printf() support. In order to use vprintf(), a local buffer is allocated.
// Integer types that are shorter than int need to be extended to int and float needs to be
// extended to double before being pushed into the local buffer.
// _fmt: printf-style format string; _args: packed argument buffer (see above).
#[import(cc = "device", name = "vprintf")] fn nvvm_vprintf(_fmt: &[u8], _args: &[u8]) -> i32;
//
// atomics
// 0 1 2 3 4 5 6 7 8 9 10 11 12
// operation: Xchg Add Sub And Nand Or Xor Max Min UMax UMin FAdd FSub
// 0 1 2 4 5 6 7
// ordering: NotAtomic Unordered Monotonic Acquire Release AcquireRelease SequentiallyConsistent
// syncscope: "" (system)
//
// Thin wrappers over the compiler's atomic_p1/atomic_p3 builtins
// (p1 = global memory, p3 = shared memory). The first numeric argument
// selects the operation and the trailing 2 the Monotonic ordering,
// per the encoding tables above. All return the previous value.
fn @nvvm_atomic_xchg_global(addr: &mut addrspace(1)i32, val: i32) = atomic_p1[i32](0, addr, val, 2, "");
fn @nvvm_atomic_xchg_shared(addr: &mut addrspace(3)i32, val: i32) = atomic_p3[i32](0, addr, val, 2, "");
fn @nvvm_atomic_add_global(addr: &mut addrspace(1)i32, val: i32) = atomic_p1[i32](1, addr, val, 2, "");
fn @nvvm_atomic_add_shared(addr: &mut addrspace(3)i32, val: i32) = atomic_p3[i32](1, addr, val, 2, "");
fn @nvvm_atomic_sub_global(addr: &mut addrspace(1)i32, val: i32) = atomic_p1[i32](2, addr, val, 2, "");
fn @nvvm_atomic_sub_shared(addr: &mut addrspace(3)i32, val: i32) = atomic_p3[i32](2, addr, val, 2, "");
fn @nvvm_atomic_max_global(addr: &mut addrspace(1)i32, val: i32) = atomic_p1[i32](7, addr, val, 2, "");
fn @nvvm_atomic_max_shared(addr: &mut addrspace(3)i32, val: i32) = atomic_p3[i32](7, addr, val, 2, "");
fn @nvvm_atomic_min_global(addr: &mut addrspace(1)i32, val: i32) = atomic_p1[i32](8, addr, val, 2, "");
fn @nvvm_atomic_min_shared(addr: &mut addrspace(3)i32, val: i32) = atomic_p3[i32](8, addr, val, 2, "");
// Typed variants (i32/u32/u64/f32/f64). Note the signed/unsigned ops differ:
// Max=7/Min=8 for signed, UMax=9/UMin=10 for unsigned, FAdd=11/FSub=12 for floats.
fn @nvvm_atomic_add_global_i32(addr: &mut addrspace(1)i32, val: i32) = atomic_p1[i32]( 1, addr, val, 2, "");
fn @nvvm_atomic_add_global_u32(addr: &mut addrspace(1)u32, val: u32) = atomic_p1[u32]( 1, addr, val, 2, "");
fn @nvvm_atomic_add_global_u64(addr: &mut addrspace(1)u64, val: u64) = atomic_p1[u64]( 1, addr, val, 2, "");
fn @nvvm_atomic_add_global_f32(addr: &mut addrspace(1)f32, val: f32) = atomic_p1[f32](11, addr, val, 2, "");
fn @nvvm_atomic_add_global_f64(addr: &mut addrspace(1)f64, val: f64) = atomic_p1[f64](11, addr, val, 2, "");
fn @nvvm_atomic_sub_global_i32(addr: &mut addrspace(1)i32, val: i32) = atomic_p1[i32]( 2, addr, val, 2, "");
fn @nvvm_atomic_sub_global_u32(addr: &mut addrspace(1)u32, val: u32) = atomic_p1[u32]( 2, addr, val, 2, "");
fn @nvvm_atomic_sub_global_u64(addr: &mut addrspace(1)u64, val: u64) = atomic_p1[u64]( 2, addr, val, 2, "");
fn @nvvm_atomic_sub_global_f32(addr: &mut addrspace(1)f32, val: f32) = atomic_p1[f32](12, addr, val, 2, "");
fn @nvvm_atomic_sub_global_f64(addr: &mut addrspace(1)f64, val: f64) = atomic_p1[f64](12, addr, val, 2, "");
fn @nvvm_atomic_and_global_i32(addr: &mut addrspace(1)i32, val: i32) = atomic_p1[i32](3, addr, val, 2, "");
fn @nvvm_atomic_and_global_u32(addr: &mut addrspace(1)u32, val: u32) = atomic_p1[u32](3, addr, val, 2, "");
fn @nvvm_atomic_and_global_u64(addr: &mut addrspace(1)u64, val: u64) = atomic_p1[u64](3, addr, val, 2, "");
fn @nvvm_atomic_or_global_i32(addr: &mut addrspace(1)i32, val: i32) = atomic_p1[i32](5, addr, val, 2, "");
fn @nvvm_atomic_or_global_u32(addr: &mut addrspace(1)u32, val: u32) = atomic_p1[u32](5, addr, val, 2, "");
fn @nvvm_atomic_or_global_u64(addr: &mut addrspace(1)u64, val: u64) = atomic_p1[u64](5, addr, val, 2, "");
fn @nvvm_atomic_xor_global_i32(addr: &mut addrspace(1)i32, val: i32) = atomic_p1[i32](6, addr, val, 2, "");
fn @nvvm_atomic_xor_global_u32(addr: &mut addrspace(1)u32, val: u32) = atomic_p1[u32](6, addr, val, 2, "");
fn @nvvm_atomic_xor_global_u64(addr: &mut addrspace(1)u64, val: u64) = atomic_p1[u64](6, addr, val, 2, "");
fn @nvvm_atomic_exch_global_i32(addr: &mut addrspace(1)i32, val: i32) = atomic_p1[i32](0, addr, val, 2, "");
fn @nvvm_atomic_exch_global_u32(addr: &mut addrspace(1)u32, val: u32) = atomic_p1[u32](0, addr, val, 2, "");
fn @nvvm_atomic_exch_global_u64(addr: &mut addrspace(1)u64, val: u64) = atomic_p1[u64](0, addr, val, 2, "");
fn @nvvm_atomic_exch_global_f32(addr: &mut addrspace(1)f32, val: f32) = atomic_p1[f32](0, addr, val, 2, "");
fn @nvvm_atomic_min_global_i32(addr: &mut addrspace(1)i32, val: i32) = atomic_p1[i32]( 8, addr, val, 2, "");
fn @nvvm_atomic_min_global_u32(addr: &mut addrspace(1)u32, val: u32) = atomic_p1[u32](10, addr, val, 2, "");
fn @nvvm_atomic_min_global_u64(addr: &mut addrspace(1)u64, val: u64) = atomic_p1[u64](10, addr, val, 2, "");
fn @nvvm_atomic_max_global_i32(addr: &mut addrspace(1)i32, val: i32) = atomic_p1[i32](7, addr, val, 2, "");
fn @nvvm_atomic_max_global_u32(addr: &mut addrspace(1)u32, val: u32) = atomic_p1[u32](9, addr, val, 2, "");
fn @nvvm_atomic_max_global_u64(addr: &mut addrspace(1)u64, val: u64) = atomic_p1[u64](9, addr, val, 2, "");
// Compare-and-swap: cmpxchg yields (old, success); only the old value is
// returned to mimic CUDA's atomicCAS.
fn @nvvm_atomic_cas_global_u16(addr: &mut addrspace(1)u16, cmp: u16, new: u16) = match cmpxchg_p1[u16](addr, cmp, new, 2, 2, "") { (old, _success) => old };
fn @nvvm_atomic_cas_global_i32(addr: &mut addrspace(1)i32, cmp: i32, new: i32) = match cmpxchg_p1[i32](addr, cmp, new, 2, 2, "") { (old, _success) => old };
fn @nvvm_atomic_cas_global_u32(addr: &mut addrspace(1)u32, cmp: u32, new: u32) = match cmpxchg_p1[u32](addr, cmp, new, 2, 2, "") { (old, _success) => old };
fn @nvvm_atomic_cas_global_u64(addr: &mut addrspace(1)u64, cmp: u64, new: u64) = match cmpxchg_p1[u64](addr, cmp, new, 2, 2, "") { (old, _success) => old };
// fn @nvvm_warp_sync_all(membermask: u32, predicate: bool) -> bool = match nvvm_warp_vote_sync(membermask, 0, predicate) { (_ballot, bit) => bit }; | |
// fn @nvvm_warp_sync_any(membermask: u32, predicate: bool) -> bool = match nvvm_warp_vote_sync(membermask, 1, predicate) { (_ballot, bit) => bit }; | |
// fn @nvvm_warp_sync_ballot(membermask: u32, predicate: bool) -> u32 = match nvvm_warp_vote_sync(membermask, 3, predicate) { (ballot, _bit) => ballot }; | |
// Returns the 32-bit mask of currently active lanes in the calling warp
// (PTX activemask.b32).
// Fixes vs. original: the asm template used "%0", but every other inline-asm
// statement in this file (nanosleep, clock, globaltimer, vmin/vmax) uses the
// LLVM-IR "$0" operand placeholder — "%0" would be emitted literally into the
// PTX instead of being replaced by the output register. Also marked
// "volatile" like the other special-register reads, since the active mask
// can change between calls and the read must not be CSE'd.
fn @nvvm_warp_activemask() -> u32 {
    let mut mask: u32;
    asm("activemask.b32 $0;" : "=r" (mask) ::: "volatile");
    mask
}
// fn @nvvm_warp_shfl_i32(membermask: u32, x: i32, src_lane: i32, width: i32) -> i32 { nvvm_warp_shfl_sync(membermask as i32, 0, x, src_lane, width)(0) } | |
// fn @nvvm_warp_shfl_u32(membermask: u32, x: u32, src_lane: i32, width: i32) -> u32 { nvvm_warp_shfl_i32(membermask, x as i32, src_lane, width) as u32 } | |
// fn @nvvm_warp_shfl_i64(membermask: u32, x: i64, src_lane: i32, width: i32) -> i64; | |
// fn @nvvm_warp_shfl_u64(membermask: u32, x: u64, src_lane: i32, width: i32) -> u64; | |
// fn @nvvm_warp_shfl_f32(membermask: u32, x: f32, src_lane: i32, width: i32) -> f32 { nvvm_bitcast_i2f(nvvm_warp_shfl_i32(membermask, nvvm_bitcast_f2i(x), src_lane, width)) } | |
// fn @nvvm_warp_shfl_f64(membermask: u32, x: f64, src_lane: i32, width: i32) -> f64 { | |
// let lo = nvvm_warp_shfl_i32(membermask, nvvm_d2i_lo x, src_lane, width) | |
// } | |
// fn @nvvm_warp_shfl_up_i32(membermask: u32, x: i32, delta: u32, width: i32) -> i32 { nvvm_warp_shfl_sync(membermask as i32, 1, x, delta as i32, width)(0) } | |
// fn @nvvm_warp_shfl_up_u32(membermask: u32, x: u32, delta: u32, width: i32) -> u32 { nvvm_warp_shfl_up_i32(membermask, x as i32, delta, width) as u32 } | |
// fn @nvvm_warp_shfl_up_i64(membermask: u32, x: i64, delta: u32, width: i32) -> i64; | |
// fn @nvvm_warp_shfl_up_u64(membermask: u32, x: u64, delta: u32, width: i32) -> u64; | |
// fn @nvvm_warp_shfl_up_f32(membermask: u32, x: f32, delta: u32, width: i32) -> f32 { nvvm_bitcast_i2f(nvvm_warp_shfl_up_i32(membermask, nvvm_bitcast_f2i(x), delta, width)) } | |
// fn @nvvm_warp_shfl_up_f64(membermask: u32, x: f64, delta: u32, width: i32) -> f64; | |
// fn @nvvm_warp_shfl_down_i32(membermask: u32, x: i32, delta: u32, width: i32) -> i32 { nvvm_warp_shfl_sync(membermask as i32, 2, x, delta as i32, width)(0) } | |
// fn @nvvm_warp_shfl_down_u32(membermask: u32, x: u32, delta: u32, width: i32) -> u32 { nvvm_warp_shfl_down_i32(membermask, x as i32, delta, width) as u32 } | |
// fn @nvvm_warp_shfl_down_i64(membermask: u32, x: i64, delta: u32, width: i32) -> i64; | |
// fn @nvvm_warp_shfl_down_u64(membermask: u32, x: u64, delta: u32, width: i32) -> u64; | |
// fn @nvvm_warp_shfl_down_f32(membermask: u32, x: f32, delta: u32, width: i32) -> f32 { nvvm_bitcast_i2f(nvvm_warp_shfl_down_i32(membermask, nvvm_bitcast_f2i(x), delta, width)) } | |
// fn @nvvm_warp_shfl_down_f64(membermask: u32, x: f64, delta: u32, width: i32) -> f64; | |
// fn @nvvm_warp_shfl_xor_i32(membermask: u32, x: i32, lane_mask: i32, width: i32) -> i32 { nvvm_warp_shfl_sync(membermask as i32, 3, x, lane_mask, width)(0) } | |
// fn @nvvm_warp_shfl_xor_u32(membermask: u32, x: u32, lane_mask: i32, width: i32) -> u32 { nvvm_warp_shfl_xor_i32(membermask, x as i32, lane_mask, width) as u32 } | |
// fn @nvvm_warp_shfl_xor_i64(membermask: u32, x: i64, lane_mask: i32, width: i32) -> i64; | |
// fn @nvvm_warp_shfl_xor_u64(membermask: u32, x: u64, lane_mask: i32, width: i32) -> u64; | |
// fn @nvvm_warp_shfl_xor_f32(membermask: u32, x: f32, lane_mask: i32, width: i32) -> f32 { nvvm_bitcast_i2f(nvvm_warp_shfl_xor_i32(membermask, nvvm_bitcast_f2i(x), lane_mask, width)) } | |
// fn @nvvm_warp_shfl_xor_f64(membermask: u32, x: f64, lane_mask: i32, width: i32) -> f64; | |
// fn @nvvm_warp_match_any_i32(membermask: u32, x: i32) -> u32; | |
// fn @nvvm_warp_match_any_u32(membermask: u32, x: u32) -> u32; | |
// fn @nvvm_warp_match_any_i64(membermask: u32, x: i64) -> u32; | |
// fn @nvvm_warp_match_any_u64(membermask: u32, x: u64) -> u32; | |
// fn @nvvm_warp_match_any_f32(membermask: u32, x: f32) -> u32; | |
// fn @nvvm_warp_match_any_f64(membermask: u32, x: f64) -> u32; | |
// fn @nvvm_warp_match_all_i32(membermask: u32, x: i32, predicate: &mut i32) -> u32; | |
// fn @nvvm_warp_match_all_u32(membermask: u32, x: u32, predicate: &mut i32) -> u32; | |
// fn @nvvm_warp_match_all_i64(membermask: u32, x: i64, predicate: &mut i32) -> u32; | |
// fn @nvvm_warp_match_all_u64(membermask: u32, x: u64, predicate: &mut i32) -> u32; | |
// fn @nvvm_warp_match_all_f32(membermask: u32, x: f32, predicate: &mut i32) -> u32; | |
// fn @nvvm_warp_match_all_f64(membermask: u32, x: f64, predicate: &mut i32) -> u32; | |
// Aborts kernel execution (PTX trap). Asm with no outputs is implicitly
// treated as having side effects, so no explicit "volatile" is needed.
fn @nvvm_trap() -> () {
    asm("trap;");
}
// Suspends the calling thread for approximately t nanoseconds
// (PTX nanosleep.u32; requires sm_70+).
fn @nvvm_nanosleep(t: u32) -> () {
    asm("nanosleep.u32 $0;" :: "r"(t) :: "volatile");
}
// Per-SM cycle counter, low 32 bits (%clock). "volatile" prevents the read
// from being hoisted or merged — each call must re-read the register.
fn @nvvm_clock() -> u32 {
    let mut cycle_count:u32;
    asm("mov.u32 $0, %clock;" : "=r"(cycle_count) ::: "volatile");
    cycle_count
}
// High 32 bits of the cycle counter (%clock_hi).
fn @nvvm_clock_hi() -> u32 {
    let mut cycle_count:u32;
    asm("mov.u32 $0, %clock_hi;" : "=r"(cycle_count) ::: "volatile");
    cycle_count
}
// Full 64-bit cycle counter (%clock64).
fn @nvvm_clock64() -> u64 {
    let mut cycle_count:u64;
    asm("mov.u64 $0, %clock64;" : "=l"(cycle_count) ::: "volatile");
    cycle_count
}
// Global nanosecond timer, full 64 bits (%globaltimer).
fn @nvvm_globaltimer() -> u64 {
    let mut timestamp:u64;
    asm("mov.u64 $0, %globaltimer;" : "=l"(timestamp) ::: "volatile");
    timestamp
}
// Low 32 bits of the global timer (%globaltimer_lo).
fn @nvvm_globaltimer_lo() -> u32 {
    let mut timestamp:u32;
    asm("mov.u32 $0, %globaltimer_lo;" : "=r"(timestamp) ::: "volatile");
    timestamp
}
// High 32 bits of the global timer (%globaltimer_hi).
fn @nvvm_globaltimer_hi() -> u32 {
    let mut timestamp:u32;
    asm("mov.u32 $0, %globaltimer_hi;" : "=r"(timestamp) ::: "volatile");
    timestamp
}
// Fused signed min/max via the PTX scalar video instructions
// (vmin/vmax with a secondary merge op). No "volatile": these are pure
// computations, so the compiler may freely CSE them.
// min(min(a, b), c)
fn @nvvm_minmin(a: i32, b: i32, c: i32) -> i32 {
    let mut res: i32;
    asm("vmin.s32.s32.s32.min $0, $1, $2, $3;"
        : "=r"(res)
        : "r"(a), "r"(b), "r"(c)
    );
    res
}
// max(max(a, b), c)
fn @nvvm_maxmax(a: i32, b: i32, c: i32) -> i32 {
    let mut res: i32;
    asm("vmax.s32.s32.s32.max $0, $1, $2, $3;"
        : "=r"(res)
        : "r"(a), "r"(b), "r"(c)
    );
    res
}
// max(min(a, b), c)
fn @nvvm_minmax(a: i32, b: i32, c: i32) -> i32 {
    let mut res: i32;
    asm("vmin.s32.s32.s32.max $0, $1, $2, $3;"
        : "=r"(res)
        : "r"(a), "r"(b), "r"(c)
    );
    res
}
// min(max(a, b), c)
fn @nvvm_maxmin(a: i32, b: i32, c: i32) -> i32 {
    let mut res: i32;
    asm("vmax.s32.s32.s32.min $0, $1, $2, $3;"
        : "=r"(res)
        : "r"(a), "r"(b), "r"(c)
    );
    res
}
// Convenience wrappers over the raw ldg intrinsics; the second argument is
// the natural alignment of the loaded type in bytes (1/4/16).
fn @nvvm_ldg_u8(addr: &addrspace(1)u8) -> u8 = nvvm_ldg_u8_p1(addr, 1);
fn @nvvm_ldg_i32(addr: &addrspace(1)i32) -> i32 = nvvm_ldg_i32_p1(addr, 4);
fn @nvvm_ldg_f32(addr: &addrspace(1)f32) -> f32 = nvvm_ldg_f32_p1(addr, 4);
fn @nvvm_ldg4_i32(addr: &addrspace(1)simd[i32 * 4]) -> simd[i32 * 4] = nvvm_ldg4_i32_p1(addr, 16);
fn @nvvm_ldg4_f32(addr: &addrspace(1)simd[f32 * 4]) -> simd[f32 * 4] = nvvm_ldg4_f32_p1(addr, 16);
// Population count (number of set bits); forwards to nvvm_popc_i,
// which is declared elsewhere in this file.
fn @nvvm_popcount(i: i32) -> i32 { nvvm_popc_i(i) }
// Builds the Accelerator interface (see header) for CUDA device `dev` using
// the NVVM backend: kernel launch, synchronization, and buffer allocation.
fn @nvvm_accelerator(dev: i32) = Accelerator {
    // exec(body)(grid, block): launches `body` over the given grid/block
    // configuration, passing a WorkItem whose accessors map to PTX special
    // registers (tid/ctaid/ntid/nctaid).
    exec = @|body| |grid, block| {
        let work_item = WorkItem {
            tidx = nvvm_read_ptx_sreg_tid_x,
            tidy = nvvm_read_ptx_sreg_tid_y,
            tidz = nvvm_read_ptx_sreg_tid_z,
            bidx = nvvm_read_ptx_sreg_ctaid_x,
            bidy = nvvm_read_ptx_sreg_ctaid_y,
            bidz = nvvm_read_ptx_sreg_ctaid_z,
            // global thread index = tid + block_dim * block_id
            gidx = @|| nvvm_read_ptx_sreg_tid_x() + nvvm_read_ptx_sreg_ntid_x() * nvvm_read_ptx_sreg_ctaid_x(),
            gidy = @|| nvvm_read_ptx_sreg_tid_y() + nvvm_read_ptx_sreg_ntid_y() * nvvm_read_ptx_sreg_ctaid_y(),
            gidz = @|| nvvm_read_ptx_sreg_tid_z() + nvvm_read_ptx_sreg_ntid_z() * nvvm_read_ptx_sreg_ctaid_z(),
            bdimx = nvvm_read_ptx_sreg_ntid_x,
            bdimy = nvvm_read_ptx_sreg_ntid_y,
            bdimz = nvvm_read_ptx_sreg_ntid_z,
            // total grid extent in threads = blocks * block_dim
            gdimx = @|| nvvm_read_ptx_sreg_nctaid_x() * nvvm_read_ptx_sreg_ntid_x(),
            gdimy = @|| nvvm_read_ptx_sreg_nctaid_y() * nvvm_read_ptx_sreg_ntid_y(),
            gdimz = @|| nvvm_read_ptx_sreg_nctaid_z() * nvvm_read_ptx_sreg_ntid_z(),
            nblkx = nvvm_read_ptx_sreg_nctaid_x,
            nblky = nvvm_read_ptx_sreg_nctaid_y,
            nblkz = nvvm_read_ptx_sreg_nctaid_z
        };
        nvvm(dev, grid, block, || @body(work_item))
    },
    // host-side synchronization and device/unified memory allocation
    sync = @|| synchronize_cuda(dev),
    alloc = @|size| alloc_cuda(dev, size),
    alloc_unified = @|size| alloc_cuda_unified(dev, size),
    // block-wide barrier usable inside kernels
    barrier = nvvm_barrier
};
// Intrinsics table (see header) for the NVVM backend: every math entry is a
// libdevice (__nv_*) wrapper declared above; min/max are defined elsewhere
// in this file.
static nvvm_intrinsics = Intrinsics {
    expf = nvvm_expf,
    exp2f = nvvm_exp2f,
    logf = nvvm_logf,
    log2f = nvvm_log2f,
    powf = nvvm_powf,
    rsqrtf = nvvm_rsqrtf,
    sqrtf = nvvm_sqrtf,
    fabsf = nvvm_fabsf,
    sinf = nvvm_sinf,
    cosf = nvvm_cosf,
    tanf = nvvm_tanf,
    asinf = nvvm_asinf,
    acosf = nvvm_acosf,
    atanf = nvvm_atanf,
    erff = nvvm_erff,
    atan2f = nvvm_atan2f,
    copysignf = nvvm_copysignf,
    fmaf = nvvm_fmaf,
    fmaxf = nvvm_fmaxf,
    fminf = nvvm_fminf,
    fmodf = nvvm_fmodf,
    floorf = nvvm_floorf,
    isinff = nvvm_isinff,
    isnanf = nvvm_isnanf,
    isfinitef = nvvm_isfinitef,
    exp = nvvm_exp,
    exp2 = nvvm_exp2,
    log = nvvm_log,
    log2 = nvvm_log2,
    pow = nvvm_pow,
    rsqrt = nvvm_rsqrt,
    sqrt = nvvm_sqrt,
    fabs = nvvm_fabs,
    sin = nvvm_sin,
    cos = nvvm_cos,
    tan = nvvm_tan,
    asin = nvvm_asin,
    acos = nvvm_acos,
    atan = nvvm_atan,
    erf = nvvm_erf,
    atan2 = nvvm_atan2,
    copysign = nvvm_copysign,
    fma = nvvm_fma,
    fmax = nvvm_fmax,
    fmin = nvvm_fmin,
    fmod = nvvm_fmod,
    floor = nvvm_floor,
    isinf = nvvm_isinf,
    isnan = nvvm_isnan,
    isfinite = nvvm_isfinite,
    min = nvvm_min,
    max = nvvm_max,
};
// AMDGCN device intrinsics (lowered to the matching llvm.amdgcn.* calls).
// Dispatch/kernarg introspection (addrspace(4) = constant memory).
#[import(cc = "device", name = "llvm.amdgcn.dispatch.id")] fn amdgcn_dispatch_id() -> i64;
#[import(cc = "device", name = "llvm.amdgcn.dispatch.ptr")] fn amdgcn_dispatch_ptr() -> &addrspace(4)i8;
#[import(cc = "device", name = "llvm.amdgcn.implicitarg.ptr")] fn amdgcn_implicitarg_ptr() -> &addrspace(4)i8;
// Barriers: global wave sync (GWS), workgroup, wavefront, and scheduling barrier.
#[import(cc = "device", name = "llvm.amdgcn.ds.gws.barrier")] fn amdgcn_ds_gws_barrier(i32, i32) -> ();
#[import(cc = "device", name = "llvm.amdgcn.s.barrier")] fn amdgcn_s_barrier() -> ();
#[import(cc = "device", name = "llvm.amdgcn.wave.barrier")] fn amdgcn_wave_barrier() -> ();
#[import(cc = "device", name = "llvm.amdgcn.sched.barrier")] fn amdgcn_sched_barrier(i32) -> ();
#[import(cc = "device", name = "llvm.amdgcn.s.sethalt")] fn amdgcn_s_sethalt(i32) -> ();
#[import(cc = "device", name = "llvm.amdgcn.wavefrontsize")] fn amdgcn_wavefrontsize() -> i32;
// mbcnt hi/lo compose the lane index within a wavefront.
#[import(cc = "device", name = "llvm.amdgcn.mbcnt.hi")] fn amdgcn_mbcnt_hi(i32, i32) -> i32;
#[import(cc = "device", name = "llvm.amdgcn.mbcnt.lo")] fn amdgcn_mbcnt_lo(i32, i32) -> i32;
// LDS backwards-permute: cross-lane shuffle primitive.
#[import(cc = "device", name = "llvm.amdgcn.ds.bpermute")] fn amdgcn_ds_bpermute(i32, i32) -> i32;
// Workgroup/workitem indices (the AMDGCN analog of ctaid/tid).
#[import(cc = "device", name = "llvm.amdgcn.workgroup.id.x")] fn amdgcn_workgroup_id_x() -> i32;
#[import(cc = "device", name = "llvm.amdgcn.workgroup.id.y")] fn amdgcn_workgroup_id_y() -> i32;
#[import(cc = "device", name = "llvm.amdgcn.workgroup.id.z")] fn amdgcn_workgroup_id_z() -> i32;
#[import(cc = "device", name = "llvm.amdgcn.workitem.id.x")] fn amdgcn_workitem_id_x() -> i32;
#[import(cc = "device", name = "llvm.amdgcn.workitem.id.y")] fn amdgcn_workitem_id_y() -> i32;
#[import(cc = "device", name = "llvm.amdgcn.workitem.id.z")] fn amdgcn_workitem_id_z() -> i32;
// Hardware sin/cos approximations (fast, reduced precision vs. OCML below).
#[import(cc = "device", name = "llvm.amdgcn.sin.f32")] fn amdgcn_sinf(f32) -> f32;
#[import(cc = "device", name = "llvm.amdgcn.cos.f32")] fn amdgcn_cosf(f32) -> f32;
#[import(cc = "device", name = "llvm.amdgcn.sin.f64")] fn amdgcn_sin(f64) -> f64;
#[import(cc = "device", name = "llvm.amdgcn.cos.f64")] fn amdgcn_cos(f64) -> f64;
#[import(cc = "device", name = "llvm.amdgcn.s.sleep")] fn amdgcn_s_sleep(i32) -> ();
// Wave-wide integer compare ballots (result mask width depends on wave size).
#[import(cc = "device", name = "llvm.amdgcn.icmp.i32.i32")] fn amdgcn_icmp_i32(i32, i32, i32) -> i32;
#[import(cc = "device", name = "llvm.amdgcn.icmp.i64.i32")] fn amdgcn_icmp_i64(i32, i32, i32) -> i64;
// atomicInc/atomicDec on global memory.
// NOTE(review): llvm.amdgcn.atomic.inc/dec were removed in recent LLVM in
// favor of atomicrmw uinc_wrap/udec_wrap — verify against the target LLVM.
#[import(cc = "device", name = "llvm.amdgcn.atomic.inc.i32.p1")] fn amdgcn_atomic_inc_global_u32(&mut addrspace(1)u32, u32, u32, u32, bool) -> u32;
#[import(cc = "device", name = "llvm.amdgcn.atomic.dec.i32.p1")] fn amdgcn_atomic_dec_global_u32(&mut addrspace(1)u32, u32, u32, u32, bool) -> u32;
#[import(cc = "device", name = "llvm.amdgcn.s.memrealtime")] fn amdgcn_s_memrealtime() -> u64;
// https://github.com/RadeonOpenCompute/ROCm-Device-Libs/blob/master/doc/OCML.md
// Math functions resolved against ROCm's OCML device library
// (__ocml_<name>_f32 / __ocml_<name>_f64).
#[import(cc = "C", name = "__ocml_exp_f32")] fn ocml_expf(f32) -> f32;
#[import(cc = "C", name = "__ocml_exp2_f32")] fn ocml_exp2f(f32) -> f32;
#[import(cc = "C", name = "__ocml_log_f32")] fn ocml_logf(f32) -> f32;
#[import(cc = "C", name = "__ocml_log2_f32")] fn ocml_log2f(f32) -> f32;
// pow variants: powr assumes x >= 0, pown takes an integer exponent.
#[import(cc = "C", name = "__ocml_powr_f32")] fn ocml_powrf(f32, f32) -> f32;
#[import(cc = "C", name = "__ocml_pown_f32")] fn ocml_pownf(f32, i32) -> f32;
#[import(cc = "C", name = "__ocml_pow_f32")] fn ocml_powf(f32, f32) -> f32;
#[import(cc = "C", name = "__ocml_rsqrt_f32")] fn ocml_rsqrtf(f32) -> f32;
#[import(cc = "C", name = "__ocml_sqrt_f32")] fn ocml_sqrtf(f32) -> f32;
#[import(cc = "C", name = "__ocml_fabs_f32")] fn ocml_fabsf(f32) -> f32;
#[import(cc = "C", name = "__ocml_sin_f32")] fn ocml_sinf(f32) -> f32;
#[import(cc = "C", name = "__ocml_cos_f32")] fn ocml_cosf(f32) -> f32;
#[import(cc = "C", name = "__ocml_tan_f32")] fn ocml_tanf(f32) -> f32;
#[import(cc = "C", name = "__ocml_asin_f32")] fn ocml_asinf(f32) -> f32;
#[import(cc = "C", name = "__ocml_acos_f32")] fn ocml_acosf(f32) -> f32;
#[import(cc = "C", name = "__ocml_atan_f32")] fn ocml_atanf(f32) -> f32;
#[import(cc = "C", name = "__ocml_erf_f32")] fn ocml_erff(f32) -> f32;
#[import(cc = "C", name = "__ocml_atan2_f32")] fn ocml_atan2f(f32, f32) -> f32;
#[import(cc = "C", name = "__ocml_copysign_f32")] fn ocml_copysignf(f32, f32) -> f32;
#[import(cc = "C", name = "__ocml_fma_f32")] fn ocml_fmaf(f32, f32, f32) -> f32;
#[import(cc = "C", name = "__ocml_fmax_f32")] fn ocml_fmaxf(f32, f32) -> f32;
#[import(cc = "C", name = "__ocml_fmin_f32")] fn ocml_fminf(f32, f32) -> f32;
#[import(cc = "C", name = "__ocml_fmod_f32")] fn ocml_fmodf(f32, f32) -> f32;
#[import(cc = "C", name = "__ocml_floor_f32")] fn ocml_floorf(f32) -> f32;
#[import(cc = "C", name = "__ocml_isinf_f32")] fn ocml_isinff(f32) -> i32;
#[import(cc = "C", name = "__ocml_isnan_f32")] fn ocml_isnanf(f32) -> i32;
#[import(cc = "C", name = "__ocml_isfinite_f32")] fn ocml_isfinitef(f32) -> i32;
// unlike libdevice, OCML provides a genuine mad entry point
#[import(cc = "C", name = "__ocml_mad_f32")] fn ocml_madf(f32, f32, f32) -> f32;
#[import(cc = "C", name = "__ocml_exp_f64")] fn ocml_exp(f64) -> f64;
#[import(cc = "C", name = "__ocml_exp2_f64")] fn ocml_exp2(f64) -> f64;
#[import(cc = "C", name = "__ocml_log_f64")] fn ocml_log(f64) -> f64;
#[import(cc = "C", name = "__ocml_log2_f64")] fn ocml_log2(f64) -> f64;
#[import(cc = "C", name = "__ocml_powr_f64")] fn ocml_powr(f64, f64) -> f64;
#[import(cc = "C", name = "__ocml_pown_f64")] fn ocml_pown(f64, i32) -> f64;
#[import(cc = "C", name = "__ocml_pow_f64")] fn ocml_pow(f64, f64) -> f64;
#[import(cc = "C", name = "__ocml_rsqrt_f64")] fn ocml_rsqrt(f64) -> f64;
#[import(cc = "C", name = "__ocml_sqrt_f64")] fn ocml_sqrt(f64) -> f64;
#[import(cc = "C", name = "__ocml_fabs_f64")] fn ocml_fabs(f64) -> f64;
#[import(cc = "C", name = "__ocml_sin_f64")] fn ocml_sin(f64) -> f64;
#[import(cc = "C", name = "__ocml_cos_f64")] fn ocml_cos(f64) -> f64;
#[import(cc = "C", name = "__ocml_tan_f64")] fn ocml_tan(f64) -> f64;
#[import(cc = "C", name = "__ocml_asin_f64")] fn ocml_asin(f64) -> f64;
#[import(cc = "C", name = "__ocml_acos_f64")] fn ocml_acos(f64) -> f64;
#[import(cc = "C", name = "__ocml_atan_f64")] fn ocml_atan(f64) -> f64;
#[import(cc = "C", name = "__ocml_erf_f64")] fn ocml_erf(f64) -> f64;
#[import(cc = "C", name = "__ocml_atan2_f64")] fn ocml_atan2(f64, f64) -> f64;
#[import(cc = "C", name = "__ocml_copysign_f64")] fn ocml_copysign(f64, f64) -> f64;
#[import(cc = "C", name = "__ocml_fma_f64")] fn ocml_fma(f64, f64, f64) -> f64;
#[import(cc = "C", name = "__ocml_fmax_f64")] fn ocml_fmax(f64, f64) -> f64;
#[import(cc = "C", name = "__ocml_fmin_f64")] fn ocml_fmin(f64, f64) -> f64;
#[import(cc = "C", name = "__ocml_fmod_f64")] fn ocml_fmod(f64, f64) -> f64;
#[import(cc = "C", name = "__ocml_floor_f64")] fn ocml_floor(f64) -> f64;
#[import(cc = "C", name = "__ocml_isinf_f64")] fn ocml_isinf(f64) -> i32;
#[import(cc = "C", name = "__ocml_isnan_f64")] fn ocml_isnan(f64) -> i32;
#[import(cc = "C", name = "__ocml_isfinite_f64")] fn ocml_isfinite(f64) -> i32;
#[import(cc = "C", name = "__ocml_mad_f64")] fn ocml_mad(f64, f64, f64) -> f64;
//
// atomics
// 0 1 2 3 4 5 6 7 8 9 10 11 12
// operation: Xchg Add Sub And Nand Or Xor Max Min UMax UMin FAdd FSub
// 0 1 2 4 5 6 7
// ordering: NotAtomic Unordered Monotonic Acquire Release AcquireRelease SequentiallyConsistent
// syncscope: agent workgroup wavefront one-as agent-one-as workgroup-one-as wavefront-one-as singlethread-one-as
//
// AMDGCN counterparts of the NVVM atomic wrappers above. Same operation and
// ordering encoding (the tables above); the difference is the syncscope:
// "agent" (whole GPU) for global memory, "workgroup" for shared/LDS.
fn @amdgcn_atomic_load_global_i32(addr: &addrspace(1)i32) = atomic_load_p1[i32](addr, 2, "agent");
fn @amdgcn_atomic_load_global_u32(addr: &addrspace(1)u32) = atomic_load_p1[u32](addr, 2, "agent");
fn @amdgcn_atomic_load_global_u64(addr: &addrspace(1)u64) = atomic_load_p1[u64](addr, 2, "agent");
fn @amdgcn_atomic_store_global_i32(addr: &mut addrspace(1)i32, val: i32) = atomic_store_p1[i32](addr, val, 2, "agent");
fn @amdgcn_atomic_store_global_u32(addr: &mut addrspace(1)u32, val: u32) = atomic_store_p1[u32](addr, val, 2, "agent");
fn @amdgcn_atomic_store_global_u64(addr: &mut addrspace(1)u64, val: u64) = atomic_store_p1[u64](addr, val, 2, "agent");
fn @amdgcn_atomic_xchg_global(addr: &mut addrspace(1)i32, val: i32) = atomic_p1[i32](0, addr, val, 2, "agent");
fn @amdgcn_atomic_xchg_shared(addr: &mut addrspace(3)i32, val: i32) = atomic_p3[i32](0, addr, val, 2, "workgroup");
fn @amdgcn_atomic_add_global(addr: &mut addrspace(1)i32, val: i32) = atomic_p1[i32](1, addr, val, 2, "agent");
fn @amdgcn_atomic_add_shared(addr: &mut addrspace(3)i32, val: i32) = atomic_p3[i32](1, addr, val, 2, "workgroup");
fn @amdgcn_atomic_sub_global(addr: &mut addrspace(1)i32, val: i32) = atomic_p1[i32](2, addr, val, 2, "agent");
fn @amdgcn_atomic_sub_shared(addr: &mut addrspace(3)i32, val: i32) = atomic_p3[i32](2, addr, val, 2, "workgroup");
fn @amdgcn_atomic_max_global(addr: &mut addrspace(1)i32, val: i32) = atomic_p1[i32](7, addr, val, 2, "agent");
fn @amdgcn_atomic_max_shared(addr: &mut addrspace(3)i32, val: i32) = atomic_p3[i32](7, addr, val, 2, "workgroup");
fn @amdgcn_atomic_min_global(addr: &mut addrspace(1)i32, val: i32) = atomic_p1[i32](8, addr, val, 2, "agent");
fn @amdgcn_atomic_min_shared(addr: &mut addrspace(3)i32, val: i32) = atomic_p3[i32](8, addr, val, 2, "workgroup");
// Typed variants; signed Max=7/Min=8, unsigned UMax=9/UMin=10, FAdd=11/FSub=12.
fn @amdgcn_atomic_add_global_i32(addr: &mut addrspace(1)i32, val: i32) = atomic_p1[i32]( 1, addr, val, 2, "agent");
fn @amdgcn_atomic_add_global_u32(addr: &mut addrspace(1)u32, val: u32) = atomic_p1[u32]( 1, addr, val, 2, "agent");
fn @amdgcn_atomic_add_global_u64(addr: &mut addrspace(1)u64, val: u64) = atomic_p1[u64]( 1, addr, val, 2, "agent");
fn @amdgcn_atomic_add_global_f32(addr: &mut addrspace(1)f32, val: f32) = atomic_p1[f32](11, addr, val, 2, "agent");
fn @amdgcn_atomic_add_global_f64(addr: &mut addrspace(1)f64, val: f64) = atomic_p1[f64](11, addr, val, 2, "agent");
fn @amdgcn_atomic_sub_global_i32(addr: &mut addrspace(1)i32, val: i32) = atomic_p1[i32]( 2, addr, val, 2, "agent");
fn @amdgcn_atomic_sub_global_u32(addr: &mut addrspace(1)u32, val: u32) = atomic_p1[u32]( 2, addr, val, 2, "agent");
fn @amdgcn_atomic_sub_global_u64(addr: &mut addrspace(1)u64, val: u64) = atomic_p1[u64]( 2, addr, val, 2, "agent");
fn @amdgcn_atomic_sub_global_f32(addr: &mut addrspace(1)f32, val: f32) = atomic_p1[f32](12, addr, val, 2, "agent");
fn @amdgcn_atomic_sub_global_f64(addr: &mut addrspace(1)f64, val: f64) = atomic_p1[f64](12, addr, val, 2, "agent");
fn @amdgcn_atomic_and_global_i32(addr: &mut addrspace(1)i32, val: i32) = atomic_p1[i32](3, addr, val, 2, "agent");
fn @amdgcn_atomic_and_global_u32(addr: &mut addrspace(1)u32, val: u32) = atomic_p1[u32](3, addr, val, 2, "agent");
fn @amdgcn_atomic_and_global_u64(addr: &mut addrspace(1)u64, val: u64) = atomic_p1[u64](3, addr, val, 2, "agent");
fn @amdgcn_atomic_or_global_i32(addr: &mut addrspace(1)i32, val: i32) = atomic_p1[i32](5, addr, val, 2, "agent");
fn @amdgcn_atomic_or_global_u32(addr: &mut addrspace(1)u32, val: u32) = atomic_p1[u32](5, addr, val, 2, "agent");
fn @amdgcn_atomic_or_global_u64(addr: &mut addrspace(1)u64, val: u64) = atomic_p1[u64](5, addr, val, 2, "agent");
fn @amdgcn_atomic_xor_global_i32(addr: &mut addrspace(1)i32, val: i32) = atomic_p1[i32](6, addr, val, 2, "agent");
fn @amdgcn_atomic_xor_global_u32(addr: &mut addrspace(1)u32, val: u32) = atomic_p1[u32](6, addr, val, 2, "agent");
fn @amdgcn_atomic_xor_global_u64(addr: &mut addrspace(1)u64, val: u64) = atomic_p1[u64](6, addr, val, 2, "agent");
fn @amdgcn_atomic_exch_global_i32(addr: &mut addrspace(1)i32, val: i32) = atomic_p1[i32](0, addr, val, 2, "agent");
fn @amdgcn_atomic_exch_global_u32(addr: &mut addrspace(1)u32, val: u32) = atomic_p1[u32](0, addr, val, 2, "agent");
fn @amdgcn_atomic_exch_global_u64(addr: &mut addrspace(1)u64, val: u64) = atomic_p1[u64](0, addr, val, 2, "agent");
fn @amdgcn_atomic_exch_global_f32(addr: &mut addrspace(1)f32, val: f32) = atomic_p1[f32](0, addr, val, 2, "agent");
fn @amdgcn_atomic_min_global_i32(addr: &mut addrspace(1)i32, val: i32) = atomic_p1[i32]( 8, addr, val, 2, "agent");
fn @amdgcn_atomic_min_global_u32(addr: &mut addrspace(1)u32, val: u32) = atomic_p1[u32](10, addr, val, 2, "agent");
fn @amdgcn_atomic_min_global_u64(addr: &mut addrspace(1)u64, val: u64) = atomic_p1[u64](10, addr, val, 2, "agent");
fn @amdgcn_atomic_max_global_i32(addr: &mut addrspace(1)i32, val: i32) = atomic_p1[i32](7, addr, val, 2, "agent");
fn @amdgcn_atomic_max_global_u32(addr: &mut addrspace(1)u32, val: u32) = atomic_p1[u32](9, addr, val, 2, "agent");
fn @amdgcn_atomic_max_global_u64(addr: &mut addrspace(1)u64, val: u64) = atomic_p1[u64](9, addr, val, 2, "agent");
fn @amdgcn_atomic_cas_global_u16(addr: &mut addrspace(1)u16, cmp: u16, new: u16) = match cmpxchg_p1[u16](addr, cmp, new, 2, 2, "agent") { (old, _success) => old }; | |
fn @amdgcn_atomic_cas_global_i32(addr: &mut addrspace(1)i32, cmp: i32, new: i32) = match cmpxchg_p1[i32](addr, cmp, new, 2, 2, "agent") { (old, _success) => old }; | |
fn @amdgcn_atomic_cas_global_u32(addr: &mut addrspace(1)u32, cmp: u32, new: u32) = match cmpxchg_p1[u32](addr, cmp, new, 2, 2, "agent") { (old, _success) => old }; | |
fn @amdgcn_atomic_cas_global_u64(addr: &mut addrspace(1)u64, cmp: u64, new: u64) = match cmpxchg_p1[u64](addr, cmp, new, 2, 2, "agent") { (old, _success) => old }; | |
fn @amdgcn_threadfence() = fence(7, "agent"); | |
// Id of the current lane within its wavefront (0 .. wavefrontsize-1).
// Uses the AMD mbcnt intrinsics with an all-ones mask (-1): mbcnt counts the
// set mask bits below the current lane, which with a full mask yields the lane
// index. Wave64 needs the hi/lo pair; wave32 only needs the low half.
fn @amdgcn_lane_id() -> i32 {
    if amdgcn_wavefrontsize() == 64 {
        amdgcn_mbcnt_hi(-1, amdgcn_mbcnt_lo(-1, 0))
    } else {
        amdgcn_mbcnt_lo(-1, 0)
    }
}
// Rank of the current lane among the *active* lanes of the wavefront: the
// number of exec-mask bits set below this lane. Unlike `amdgcn_lane_id`, this
// feeds the actual exec mask (read via inline asm below) into mbcnt instead of
// an all-ones mask.
fn @amdgcn_activelane() -> i32 {
    if amdgcn_wavefrontsize() == 64 {
        amdgcn_mbcnt_hi(amdgcn_read_exec_hi(), amdgcn_mbcnt_lo(amdgcn_read_exec_lo(), 0))
    } else {
        amdgcn_mbcnt_lo(amdgcn_read_exec_lo(), 0)
    }
}
// LLVM icmp predicate code for "not equal", passed to the amdgcn_icmp_* intrinsics.
static ICMP_NE = 33;
// Ballot: a bitmask with one bit per active lane whose predicate `p` is non-zero.
// Wave64 uses the 64-bit cross-lane compare directly; wave32 uses the 32-bit
// compare and zero-extends the result to u64.
fn @amdgcn_ballot(p: i32) -> u64 {
    if amdgcn_wavefrontsize() == 64 {
        amdgcn_icmp_i64(p, 0, ICMP_NE) as u64
    } else {
        amdgcn_icmp_i32(p, 0, ICMP_NE) as u32 as u64
    }
}
// Bitmask with exactly one bit set: the bit of the current lane's id.
fn @amdgcn_lanemask_eq() -> u64 {
    1:u64 << amdgcn_lane_id() as u64
}
// Mask of *active* lanes whose id is strictly lower than this lane's id:
// all bits below the current lane, intersected with the current ballot mask.
fn @amdgcn_lanemask_lt() -> u64 {
    let lane = amdgcn_lane_id();
    let active = amdgcn_ballot(1);
    let below = (1:u64 << lane as u64) - 1:u64;
    active & below
}
// Mask of active lanes with id lower than or equal to this lane's id:
// the strict "lower" mask plus this lane's own bit.
fn @amdgcn_lanemask_le() -> u64 {
    let lower = amdgcn_lanemask_lt();
    lower | amdgcn_lanemask_eq()
}
// Mask of active lanes whose id is strictly greater than this lane's id.
fn @amdgcn_lanemask_gt() -> u64 {
    let lane_id = amdgcn_lane_id();
    // The last lane of the wavefront has no higher lanes; returning early also
    // avoids the shift below being by the full bit width (lane_id + 1 == 64).
    if (amdgcn_wavefrontsize() == 64 && lane_id == 63) || (amdgcn_wavefrontsize() == 32 && lane_id == 31) {
        return(0:u64)
    }
    let ballot = amdgcn_ballot(1);
    // All-ones above the current lane, restricted to the active lanes.
    let mask = (!0:u64) << (lane_id as u64 + 1:u64);
    mask & ballot
}
// Mask of active lanes with id greater than or equal to this lane's id:
// the strict "greater" mask plus this lane's own bit.
fn @amdgcn_lanemask_ge() -> u64 {
    let greater = amdgcn_lanemask_gt();
    greater | amdgcn_lanemask_eq()
}
// Wavefront vote helpers: each issues a wave barrier, then evaluates the
// predicate `p` across all lanes via a cross-lane compare (ICMP_NE against 0
// yields a per-lane ballot bitmask).
// NOTE(review): all four use the 32-bit compare (and exec_lo) only, so on
// wave64 hardware the upper 32 lanes appear to be ignored — confirm whether
// wave32 is assumed here (cf. amdgcn_ballot, which handles both widths).
fn @amdgcn_sync_all(p: i32) -> i32 {
    amdgcn_wave_barrier();
    // 1 iff every active lane (per exec_lo) voted non-zero.
    if amdgcn_icmp_i32(p, 0, ICMP_NE) == amdgcn_read_exec_lo() { 1 } else { 0 }
}
fn @amdgcn_sync_any(p: i32) -> i32 {
    amdgcn_wave_barrier();
    // 1 iff at least one lane voted non-zero.
    if amdgcn_icmp_i32(p, 0, ICMP_NE) != 0 { 1 } else { 0 }
}
fn @amdgcn_sync_count(p: i32) -> i32 {
    amdgcn_wave_barrier();
    // Number of lanes that voted non-zero (popcount of the ballot mask).
    cpu_popcount32(amdgcn_icmp_i32(p, 0, ICMP_NE))
}
fn @amdgcn_sync_vote(p: i32) -> u64 {
    amdgcn_wave_barrier();
    // Raw ballot mask, zero-extended to 64 bits.
    (amdgcn_icmp_i32(p, 0, ICMP_NE) as u32) as u64
}
// CUDA-style shuffle operations emulated with ds_bpermute. `width` partitions
// the wavefront into sub-groups of `width` lanes (width is presumably a power
// of two — the `& !(width - 1)` masks rely on that; TODO confirm callers).
// ds_bpermute takes a *byte* address, hence the `idx << 2` scaling.
fn @amdgcn_shfl_i32(var: i32, src_lane: i32, width: i32) {
    let lane_id = amdgcn_lane_id();
    // Source lane is taken relative to the start of this lane's sub-group.
    let idx = src_lane + (lane_id & !(width - 1));
    amdgcn_ds_bpermute(idx << 2, var)
}
// u32/f32 variants reuse the i32 shuffle through bit-level reinterpretation.
fn @amdgcn_shfl_u32(var: u32, src_lane: i32, width: i32) { bitcast[u32](amdgcn_shfl_i32(bitcast[i32](var), src_lane, width)) }
fn @amdgcn_shfl_f32(var: f32, src_lane: i32, width: i32) { bitcast[f32](amdgcn_shfl_i32(bitcast[i32](var), src_lane, width)) }
// Shuffle-up: read from `lane_id - lane_delta`; lanes whose source would fall
// below their sub-group's first lane keep their own value.
fn @amdgcn_shfl_up_i32(var: i32, lane_delta: u32, width: i32) {
    let lane_id = amdgcn_lane_id();
    let idx = if lane_id - lane_delta as i32 < lane_id & !(width - 1) { lane_id } else { lane_id - lane_delta as i32 };
    amdgcn_ds_bpermute(idx << 2, var)
}
fn @amdgcn_shfl_up_u32(var: u32, lane_delta: u32, width: i32) { bitcast[u32](amdgcn_shfl_up_i32(bitcast[i32](var), lane_delta, width)) }
fn @amdgcn_shfl_up_f32(var: f32, lane_delta: u32, width: i32) { bitcast[f32](amdgcn_shfl_up_i32(bitcast[i32](var), lane_delta, width)) }
// Shuffle-down: read from `lane_id + lane_delta`; lanes whose source would
// cross their sub-group's end keep their own value.
fn @amdgcn_shfl_down_i32(var: i32, lane_delta: u32, width: i32) {
    let lane_id = amdgcn_lane_id();
    let idx = if (lane_id & (width - 1)) + lane_delta as i32 >= width { lane_id } else { lane_id + lane_delta as i32 };
    amdgcn_ds_bpermute(idx << 2, var)
}
fn @amdgcn_shfl_down_u32(var: u32, lane_delta: u32, width: i32) { bitcast[u32](amdgcn_shfl_down_i32(bitcast[i32](var), lane_delta, width)) }
fn @amdgcn_shfl_down_f32(var: f32, lane_delta: u32, width: i32) { bitcast[f32](amdgcn_shfl_down_i32(bitcast[i32](var), lane_delta, width)) }
// Butterfly shuffle: read from `lane_id ^ lane_mask`; lanes whose XOR-ed source
// would land past the end of their sub-group keep their own value.
fn @amdgcn_shfl_xor_i32(var: i32, lane_mask: i32, width: i32) {
    let lane_id = amdgcn_lane_id();
    let idx = if lane_id ^ lane_mask >= (lane_id + width) & !(width - 1) { lane_id } else { lane_id ^ lane_mask };
    amdgcn_ds_bpermute(idx << 2, var)
}
fn @amdgcn_shfl_xor_u32(var: u32, lane_mask: i32, width: i32) { bitcast[u32](amdgcn_shfl_xor_i32(bitcast[i32](var), lane_mask, width)) }
fn @amdgcn_shfl_xor_f32(var: f32, lane_mask: i32, width: i32) { bitcast[f32](amdgcn_shfl_xor_i32(bitcast[i32](var), lane_mask, width)) }
// Read the full 64-bit exec (active-lane) mask by copying exec_lo/exec_hi into
// VGPRs via inline asm and recombining them.
fn @amdgcn_read_exec() -> i64 {
    let mut exec_lo: i32;
    let mut exec_hi: i32;
    asm("v_mov_b32_e32 $0, exec_lo\n"
        "v_mov_b32_e32 $1, exec_hi" : "=v"(exec_lo), "=v"(exec_hi) : :: "volatile");
    exec_hi as i64 << 32:i64 | exec_lo as i64
}
// Lower 32 bits of the exec mask.
fn @amdgcn_read_exec_lo() -> i32 {
    let mut res: i32;
    asm("v_mov_b32_e32 $0, exec_lo" : "=v"(res) : :: "volatile");
    res
}
// Upper 32 bits of the exec mask (only meaningful on wave64).
fn @amdgcn_read_exec_hi() -> i32 {
    let mut res: i32;
    asm("v_mov_b32_e32 $0, exec_hi" : "=v"(res) : :: "volatile");
    res
}
// Three-way minimum via the GCN v_min3_i32 instruction: min(a, b, c) in a
// single instruction instead of two chained v_min ops.
fn @amdgcn_minmin(a: i32, b: i32, c: i32) -> i32 {
    let mut res: i32;
    asm("v_min3_i32 $0, $1, $2, $3"
        : "=v"(res)
        : "v"(a), "v"(b), "v"(c)
    );
    res
}
// Three-way maximum via v_max3_i32, analogous to amdgcn_minmin.
fn @amdgcn_maxmax(a: i32, b: i32, c: i32) -> i32 {
    let mut res: i32;
    asm("v_max3_i32 $0, $1, $2, $3"
        : "=v"(res)
        : "v"(a), "v"(b), "v"(c)
    );
    res
}
// Software breakpoint: halts the wave via s_sethalt(1). The surrounding
// sched_barrier(0) calls keep the scheduler from moving instructions across
// the halt, so execution stops exactly at this point.
fn @amdpal_breakpoint() -> () {
    amdgcn_sched_barrier(0);
    amdgcn_s_sethalt(1);
    amdgcn_sched_barrier(0);
}
// Opaque HSA signal handle, mirroring the C `hsa_signal_t` from the HSA runtime.
struct hsa_signal_t {
    handle : u64
}
// Mirrors the HSA kernel dispatch packet (`hsa_kernel_dispatch_packet_t`).
// The accelerator below indexes into this layout through amdgcn_dispatch_ptr:
// as a u16 array, indices 2..4 are workgroup_size_x/y/z; as a u32 array,
// indices 3..5 are grid_size_x/y/z.
struct hsa_dispatch_packet_t {
    header : u16,
    setup : u16,
    workgroup_size_x : u16,
    workgroup_size_y : u16,
    workgroup_size_z : u16,
    reserved0 : u16,
    grid_size_x : u32,
    grid_size_y : u32,
    grid_size_z : u32,
    private_segment_size : u32,
    group_segment_size : u32,
    kernel_object : u64,
    kernarg_address : &[i8], // HSA_LARGE_MODEL
    reserved2 : u64,
    completion_signal : hsa_signal_t
}
// Accelerator implementation for AMD GPUs driven through the HSA runtime.
// Workgroup/grid geometry is read out of the in-flight dispatch packet
// (see hsa_dispatch_packet_t above): viewing the packet as a u16 array,
// indices 2/3/4 are workgroup_size_x/y/z; viewing it as a u32 array,
// indices 3/4/5 are grid_size_x/y/z (in work-items, HSA convention).
fn @amdgpu_hsa_accelerator(dev: i32) = Accelerator {
    exec = @|body| |grid, block| {
        // Ceiling division; used to derive the block count from grid size
        // (work-items) and workgroup size.
        fn @div_round_up(num: i32, multiple: i32) -> i32 { (num + multiple - 1) / multiple }
        let work_item = WorkItem {
            tidx = amdgcn_workitem_id_x,
            tidy = amdgcn_workitem_id_y,
            tidz = amdgcn_workitem_id_z,
            bidx = amdgcn_workgroup_id_x,
            bidy = amdgcn_workgroup_id_y,
            bidz = amdgcn_workgroup_id_z,
            // Global id = local id + workgroup size * workgroup id.
            gidx = @|| amdgcn_workitem_id_x() + bitcast[&addrspace(4)[u16]](amdgcn_dispatch_ptr())(2) as i32 * amdgcn_workgroup_id_x(),
            gidy = @|| amdgcn_workitem_id_y() + bitcast[&addrspace(4)[u16]](amdgcn_dispatch_ptr())(3) as i32 * amdgcn_workgroup_id_y(),
            gidz = @|| amdgcn_workitem_id_z() + bitcast[&addrspace(4)[u16]](amdgcn_dispatch_ptr())(4) as i32 * amdgcn_workgroup_id_z(),
            bdimx = @|| bitcast[&addrspace(4)[u16]](amdgcn_dispatch_ptr())(2) as i32,
            bdimy = @|| bitcast[&addrspace(4)[u16]](amdgcn_dispatch_ptr())(3) as i32,
            bdimz = @|| bitcast[&addrspace(4)[u16]](amdgcn_dispatch_ptr())(4) as i32,
            // Total grid extent in work-items.
            gdimx = @|| bitcast[&addrspace(4)[u32]](amdgcn_dispatch_ptr())(3) as i32,
            gdimy = @|| bitcast[&addrspace(4)[u32]](amdgcn_dispatch_ptr())(4) as i32,
            gdimz = @|| bitcast[&addrspace(4)[u32]](amdgcn_dispatch_ptr())(5) as i32,
            // Number of workgroups = ceil(grid size / workgroup size).
            nblkx = @|| div_round_up(bitcast[&addrspace(4)[u32]](amdgcn_dispatch_ptr())(3) as i32, bitcast[&addrspace(4)[u16]](amdgcn_dispatch_ptr())(2) as i32),
            nblky = @|| div_round_up(bitcast[&addrspace(4)[u32]](amdgcn_dispatch_ptr())(4) as i32, bitcast[&addrspace(4)[u16]](amdgcn_dispatch_ptr())(3) as i32),
            nblkz = @|| div_round_up(bitcast[&addrspace(4)[u32]](amdgcn_dispatch_ptr())(5) as i32, bitcast[&addrspace(4)[u16]](amdgcn_dispatch_ptr())(4) as i32)
        };
        amdgpu_hsa(dev, grid, block, || @body(work_item))
    },
    sync = @|| synchronize_hsa(dev),
    alloc = @|size| alloc_hsa(dev, size),
    alloc_unified = @|size| alloc_hsa_unified(dev, size),
    barrier = amdgcn_s_barrier,
};
// Device-side index/geometry queries for the AMD PAL backend, resolved by the
// AnyDSL code generator (names in the "anydsl.amdpal.*" namespace).
#[import(cc = "device", name = "anydsl.amdpal.workitem.id.x")] fn amdpal_workitem_id_x() -> i32;
#[import(cc = "device", name = "anydsl.amdpal.workitem.id.y")] fn amdpal_workitem_id_y() -> i32;
#[import(cc = "device", name = "anydsl.amdpal.workitem.id.z")] fn amdpal_workitem_id_z() -> i32;
#[import(cc = "device", name = "anydsl.amdpal.workgroup.id.x")] fn amdpal_workgroup_id_x() -> i32;
#[import(cc = "device", name = "anydsl.amdpal.workgroup.id.y")] fn amdpal_workgroup_id_y() -> i32;
#[import(cc = "device", name = "anydsl.amdpal.workgroup.id.z")] fn amdpal_workgroup_id_z() -> i32;
#[import(cc = "device", name = "anydsl.amdpal.workgroup.size.x")] fn amdpal_workgroup_size_x() -> i32;
#[import(cc = "device", name = "anydsl.amdpal.workgroup.size.y")] fn amdpal_workgroup_size_y() -> i32;
#[import(cc = "device", name = "anydsl.amdpal.workgroup.size.z")] fn amdpal_workgroup_size_z() -> i32;
#[import(cc = "device", name = "anydsl.amdpal.nblk.x")] fn amdpal_nblk_x() -> i32;
#[import(cc = "device", name = "anydsl.amdpal.nblk.y")] fn amdpal_nblk_y() -> i32;
#[import(cc = "device", name = "anydsl.amdpal.nblk.z")] fn amdpal_nblk_z() -> i32;
// Accelerator implementation for AMD GPUs driven through the PAL backend.
// Unlike the HSA variant, geometry comes from dedicated amdpal_* device
// queries instead of the dispatch packet. Note that here nblk_* is the block
// count, so the global extent (gdim*) is nblk * workgroup_size — the inverse
// of the HSA derivation.
fn @amdgpu_pal_accelerator(dev: i32) = Accelerator {
    exec = @|body| |grid, block| {
        let work_item = WorkItem {
            tidx = amdpal_workitem_id_x,
            tidy = amdpal_workitem_id_y,
            tidz = amdpal_workitem_id_z,
            bidx = amdpal_workgroup_id_x,
            bidy = amdpal_workgroup_id_y,
            bidz = amdpal_workgroup_id_z,
            // Global id = workgroup id * workgroup size + local id.
            gidx = @|| amdpal_workgroup_id_x() * amdpal_workgroup_size_x() + amdpal_workitem_id_x(),
            gidy = @|| amdpal_workgroup_id_y() * amdpal_workgroup_size_y() + amdpal_workitem_id_y(),
            gidz = @|| amdpal_workgroup_id_z() * amdpal_workgroup_size_z() + amdpal_workitem_id_z(),
            bdimx = amdpal_workgroup_size_x,
            bdimy = amdpal_workgroup_size_y,
            bdimz = amdpal_workgroup_size_z,
            // Total grid extent in work-items.
            gdimx = @|| amdpal_nblk_x() * amdpal_workgroup_size_x(),
            gdimy = @|| amdpal_nblk_y() * amdpal_workgroup_size_y(),
            gdimz = @|| amdpal_nblk_z() * amdpal_workgroup_size_z(),
            nblkx = amdpal_nblk_x,
            nblky = amdpal_nblk_y,
            nblkz = amdpal_nblk_z
        };
        amdgpu_pal(dev, grid, block, || @body(work_item))
    },
    sync = @|| synchronize_pal(dev),
    alloc = @|size| alloc_pal(dev, size),
    alloc_unified = @|size| alloc_pal_unified(dev, size),
    barrier = amdgcn_s_barrier,
};
// Math intrinsics table for AMD GPUs, backed by the OCML device library.
static amdgpu_intrinsics = Intrinsics {
    expf = ocml_expf,
    exp2f = ocml_exp2f,
    logf = ocml_logf,
    log2f = ocml_log2f,
    // pow specialization based on partial evaluation: `?(e)` tests whether `e`
    // is statically known. A known integral exponent uses pown (integer power),
    // a known non-negative exponent uses powr (no sign handling), otherwise the
    // general powf is emitted.
    powf = @ |x, p| {
        if ?(p == ((p as i32) as f32)) { ocml_pownf(x, p as i32) }
        else if ?(p >= 0) { ocml_powrf(x, p) }
        else { ocml_powf(x, p) }
    },
    rsqrtf = ocml_rsqrtf,
    sqrtf = ocml_sqrtf,
    fabsf = ocml_fabsf,
    sinf = ocml_sinf,
    cosf = ocml_cosf,
    tanf = ocml_tanf,
    asinf = ocml_asinf,
    acosf = ocml_acosf,
    atanf = ocml_atanf,
    erff = ocml_erff,
    atan2f = ocml_atan2f,
    copysignf = ocml_copysignf,
    fmaf = ocml_fmaf,
    fmaxf = ocml_fmaxf,
    fminf = ocml_fminf,
    fmodf = ocml_fmodf,
    floorf = ocml_floorf,
    isinff = ocml_isinff,
    isnanf = ocml_isnanf,
    isfinitef = ocml_isfinitef,
    exp = ocml_exp,
    exp2 = ocml_exp2,
    log = ocml_log,
    log2 = ocml_log2,
    // Same pown/powr/pow specialization as powf, for f64.
    pow = @ |x, p| {
        if ?(p == ((p as i32) as f64)) { ocml_pown(x, p as i32) }
        else if ?(p >= 0) { ocml_powr(x, p) }
        else { ocml_pow(x, p) }
    },
    rsqrt = ocml_rsqrt,
    sqrt = ocml_sqrt,
    fabs = ocml_fabs,
    sin = ocml_sin,
    cos = ocml_cos,
    tan = ocml_tan,
    asin = ocml_asin,
    acos = ocml_acos,
    atan = ocml_atan,
    erf = ocml_erf,
    atan2 = ocml_atan2,
    copysign = ocml_copysign,
    fma = ocml_fma,
    fmax = ocml_fmax,
    fmin = ocml_fmin,
    fmod = ocml_fmod,
    floor = ocml_floor,
    isinf = ocml_isinf,
    isnan = ocml_isnan,
    isfinite = ocml_isfinite,
    // OCML has no integer min/max entry points; use plain comparisons.
    min = @|a, b| { if a < b { a } else { b } },
    max = @|a, b| { if a > b { a } else { b } },
};
// no declarations are emitted for "device" functions | |
// OpenCL built-in functions, bound by name to the OpenCL C built-ins.
// Float (f32) and double (f64) variants map to the same overloaded OpenCL
// name; the local `*f` suffix distinguishes them on the AnyDSL side.
#[import(cc = "device", name = "barrier")] fn opencl_barrier(u32) -> ();
#[import(cc = "device", name = "exp")] fn opencl_expf(f32) -> f32;
#[import(cc = "device", name = "exp2")] fn opencl_exp2f(f32) -> f32;
#[import(cc = "device", name = "log")] fn opencl_logf(f32) -> f32;
#[import(cc = "device", name = "log2")] fn opencl_log2f(f32) -> f32;
#[import(cc = "device", name = "pow")] fn opencl_powf(f32, f32) -> f32;
#[import(cc = "device", name = "rsqrt")] fn opencl_rsqrtf(f32) -> f32;
#[import(cc = "device", name = "sqrt")] fn opencl_sqrtf(f32) -> f32;
#[import(cc = "device", name = "fabs")] fn opencl_fabsf(f32) -> f32;
#[import(cc = "device", name = "sin")] fn opencl_sinf(f32) -> f32;
#[import(cc = "device", name = "cos")] fn opencl_cosf(f32) -> f32;
#[import(cc = "device", name = "tan")] fn opencl_tanf(f32) -> f32;
#[import(cc = "device", name = "asin")] fn opencl_asinf(f32) -> f32;
#[import(cc = "device", name = "acos")] fn opencl_acosf(f32) -> f32;
#[import(cc = "device", name = "atan")] fn opencl_atanf(f32) -> f32;
#[import(cc = "device", name = "erf")] fn opencl_erff(f32) -> f32;
#[import(cc = "device", name = "atan2")] fn opencl_atan2f(f32, f32) -> f32;
#[import(cc = "device", name = "fmod")] fn opencl_fmodf(f32, f32) -> f32;
#[import(cc = "device", name = "floor")] fn opencl_floorf(f32) -> f32;
#[import(cc = "device", name = "isinf")] fn opencl_isinff(f32) -> i32;
#[import(cc = "device", name = "isnan")] fn opencl_isnanf(f32) -> i32;
#[import(cc = "device", name = "isfinite")] fn opencl_isfinitef(f32) -> i32;
#[import(cc = "device", name = "fma")] fn opencl_fmaf(f32, f32, f32) -> f32;
#[import(cc = "device", name = "mad")] fn opencl_madf(f32, f32, f32) -> f32;
#[import(cc = "device", name = "copysign")] fn opencl_copysignf(f32, f32) -> f32;
#[import(cc = "device", name = "exp")] fn opencl_exp(f64) -> f64;
#[import(cc = "device", name = "exp2")] fn opencl_exp2(f64) -> f64;
#[import(cc = "device", name = "log")] fn opencl_log(f64) -> f64;
#[import(cc = "device", name = "log2")] fn opencl_log2(f64) -> f64;
#[import(cc = "device", name = "pow")] fn opencl_pow(f64, f64) -> f64;
#[import(cc = "device", name = "rsqrt")] fn opencl_rsqrt(f64) -> f64;
#[import(cc = "device", name = "sqrt")] fn opencl_sqrt(f64) -> f64;
#[import(cc = "device", name = "fabs")] fn opencl_fabs(f64) -> f64;
#[import(cc = "device", name = "sin")] fn opencl_sin(f64) -> f64;
#[import(cc = "device", name = "cos")] fn opencl_cos(f64) -> f64;
#[import(cc = "device", name = "tan")] fn opencl_tan(f64) -> f64;
#[import(cc = "device", name = "asin")] fn opencl_asin(f64) -> f64;
#[import(cc = "device", name = "acos")] fn opencl_acos(f64) -> f64;
#[import(cc = "device", name = "atan")] fn opencl_atan(f64) -> f64;
#[import(cc = "device", name = "erf")] fn opencl_erf(f64) -> f64;
#[import(cc = "device", name = "atan2")] fn opencl_atan2(f64, f64) -> f64;
#[import(cc = "device", name = "fmod")] fn opencl_fmod(f64, f64) -> f64;
#[import(cc = "device", name = "floor")] fn opencl_floor(f64) -> f64;
#[import(cc = "device", name = "isinf")] fn opencl_isinf(f64) -> i32;
#[import(cc = "device", name = "isnan")] fn opencl_isnan(f64) -> i32;
#[import(cc = "device", name = "isfinite")] fn opencl_isfinite(f64) -> i32;
#[import(cc = "device", name = "fma")] fn opencl_fma(f64, f64, f64) -> f64;
#[import(cc = "device", name = "mad")] fn opencl_mad(f64, f64, f64) -> f64;
#[import(cc = "device", name = "copysign")] fn opencl_copysign(f64, f64) -> f64;
#[import(cc = "device", name = "fmin")] fn opencl_fminf(f32, f32) -> f32;
#[import(cc = "device", name = "fmax")] fn opencl_fmaxf(f32, f32) -> f32;
#[import(cc = "device", name = "fmin")] fn opencl_fmin(f64, f64) -> f64;
#[import(cc = "device", name = "fmax")] fn opencl_fmax(f64, f64) -> f64;
#[import(cc = "device", name = "min")] fn opencl_min(i32, i32) -> i32;
#[import(cc = "device", name = "max")] fn opencl_max(i32, i32) -> i32;
// Atomics: the same OpenCL built-in name covers both __global (addrspace 1)
// and __local (addrspace 3) pointers.
#[import(cc = "device", name = "atomic_add")] fn opencl_atomic_add_global(&mut addrspace(1)i32, i32) -> i32;
#[import(cc = "device", name = "atomic_add")] fn opencl_atomic_add_shared(&mut addrspace(3)i32, i32) -> i32;
#[import(cc = "device", name = "atomic_min")] fn opencl_atomic_min_global(&mut addrspace(1)i32, i32) -> i32;
#[import(cc = "device", name = "atomic_min")] fn opencl_atomic_min_shared(&mut addrspace(3)i32, i32) -> i32;
// Work-item geometry queries (OpenCL C work-item built-ins).
#[import(cc = "device", name = "get_work_dim")] fn opencl_get_work_dim() -> u32;
#[import(cc = "device", name = "get_global_size")] fn opencl_get_global_size(u32) -> u64;
#[import(cc = "device", name = "get_global_id")] fn opencl_get_global_id(u32) -> u64;
#[import(cc = "device", name = "get_local_size")] fn opencl_get_local_size(u32) -> u64;
#[import(cc = "device", name = "get_local_id")] fn opencl_get_local_id(u32) -> u64;
#[import(cc = "device", name = "get_num_groups")] fn opencl_get_num_groups(u32) -> u64;
#[import(cc = "device", name = "get_group_id")] fn opencl_get_group_id(u32) -> u64;
#[import(cc = "device", name = "get_global_offset")] fn opencl_get_global_offset(u32) -> u64;
// Flags for opencl_barrier, matching the OpenCL CLK_*_MEM_FENCE constants.
static CLK_LOCAL_MEM_FENCE = 1:u32;
static CLK_GLOBAL_MEM_FENCE = 2:u32;
// Accelerator implementation backed by the OpenCL runtime. All geometry
// queries forward to the OpenCL work-item built-ins (which return u64 and are
// narrowed to i32 to match the WorkItem interface).
fn @opencl_accelerator(dev: i32) = Accelerator {
    exec = @|body| |grid, block| {
        let work_item = WorkItem {
            tidx = @|| opencl_get_local_id(0) as i32,
            tidy = @|| opencl_get_local_id(1) as i32,
            tidz = @|| opencl_get_local_id(2) as i32,
            bidx = @|| opencl_get_group_id(0) as i32,
            bidy = @|| opencl_get_group_id(1) as i32,
            bidz = @|| opencl_get_group_id(2) as i32,
            gidx = @|| opencl_get_global_id(0) as i32,
            gidy = @|| opencl_get_global_id(1) as i32,
            gidz = @|| opencl_get_global_id(2) as i32,
            bdimx = @|| opencl_get_local_size(0) as i32,
            bdimy = @|| opencl_get_local_size(1) as i32,
            bdimz = @|| opencl_get_local_size(2) as i32,
            gdimx = @|| opencl_get_global_size(0) as i32,
            gdimy = @|| opencl_get_global_size(1) as i32,
            gdimz = @|| opencl_get_global_size(2) as i32,
            nblkx = @|| opencl_get_num_groups(0) as i32,
            nblky = @|| opencl_get_num_groups(1) as i32,
            nblkz = @|| opencl_get_num_groups(2) as i32
        };
        opencl(dev, grid, block, || @body(work_item))
    },
    sync = @|| synchronize_opencl(dev),
    alloc = @|size| alloc_opencl(dev, size),
    alloc_unified = @|size| alloc_opencl_unified(dev, size),
    // Work-group barrier with a local-memory fence.
    barrier = @|| opencl_barrier(CLK_LOCAL_MEM_FENCE),
};
// Math intrinsics table for the OpenCL backend: a direct one-to-one mapping
// onto the OpenCL built-ins imported above (no pow specialization here, unlike
// the amdgpu_intrinsics table).
static opencl_intrinsics = Intrinsics {
    expf = opencl_expf,
    exp2f = opencl_exp2f,
    logf = opencl_logf,
    log2f = opencl_log2f,
    powf = opencl_powf,
    rsqrtf = opencl_rsqrtf,
    sqrtf = opencl_sqrtf,
    fabsf = opencl_fabsf,
    sinf = opencl_sinf,
    cosf = opencl_cosf,
    tanf = opencl_tanf,
    asinf = opencl_asinf,
    acosf = opencl_acosf,
    atanf = opencl_atanf,
    erff = opencl_erff,
    atan2f = opencl_atan2f,
    copysignf = opencl_copysignf,
    fmaf = opencl_fmaf,
    fmaxf = opencl_fmaxf,
    fminf = opencl_fminf,
    fmodf = opencl_fmodf,
    floorf = opencl_floorf,
    isinff = opencl_isinff,
    isnanf = opencl_isnanf,
    isfinitef = opencl_isfinitef,
    exp = opencl_exp,
    exp2 = opencl_exp2,
    log = opencl_log,
    log2 = opencl_log2,
    pow = opencl_pow,
    rsqrt = opencl_rsqrt,
    sqrt = opencl_sqrt,
    fabs = opencl_fabs,
    sin = opencl_sin,
    cos = opencl_cos,
    tan = opencl_tan,
    asin = opencl_asin,
    acos = opencl_acos,
    atan = opencl_atan,
    erf = opencl_erf,
    atan2 = opencl_atan2,
    copysign = opencl_copysign,
    fma = opencl_fma,
    fmax = opencl_fmax,
    fmin = opencl_fmin,
    fmod = opencl_fmod,
    floor = opencl_floor,
    isinf = opencl_isinf,
    isnan = opencl_isnan,
    isfinite = opencl_isfinite,
    min = opencl_min,
    max = opencl_max,
};
// Compiler built-ins: resolved directly by the Artic front end.
#[import(cc = "builtin")] fn undef[T]() -> T;
#[import(cc = "builtin")] fn sizeof[_]() -> i64;
#[import(cc = "builtin")] fn alignof[_]() -> i64;
#[import(cc = "builtin")] fn bitcast[T, U](_src: U) -> T;
#[import(cc = "builtin")] fn select[T, U](_cond: T, _true: U, _false: U) -> U;
#[import(cc = "builtin")] fn insert[T, U](_tuple: T, _index: i32, _value: U) -> T;
// Thorin IR intrinsics: atomics/fences take a binop code (listed below), a
// memory-ordering code, and a scope string.
#[import(cc = "thorin")] fn atomic[T](_binop: u32, _addr: &mut T, _val: T, _order: u32, _scope: &[u8]) -> T; // Xchg Add Sub And Nand Or Xor Max Min UMax UMin FAdd FSub
#[import(cc = "thorin")] fn atomic_load[T](_addr: &T, _order: u32, _scope: &[u8]) -> T;
#[import(cc = "thorin")] fn atomic_store[T](_addr: &mut T, _val: T, _order: u32, _scope: &[u8]) -> ();
#[import(cc = "thorin")] fn cmpxchg[T](_addr: &mut T, _cmp: T, _new: T, _success_order: u32, _failure_order: u32, _scope: &[u8]) -> (T, bool); // only for integer data types
#[import(cc = "thorin")] fn cmpxchg_weak[T](_addr: &mut T, _cmp: T, _new: T, _success_order: u32, _failure_order: u32, _scope: &[u8]) -> (T, bool); // only for integer data types
#[import(cc = "thorin")] fn fence(_order: u32, _scope: &[u8]) -> ();
#[import(cc = "thorin")] fn pe_info[T](_src: &[u8], _val: T) -> ();
// Kernel-launch intrinsics: run _body as a kernel on the given device with the
// given grid/block configuration.
#[import(cc = "thorin")] fn cuda(_dev: i32, _grid: (i32, i32, i32), _block: (i32, i32, i32), _body: fn() -> ()) -> ();
#[import(cc = "thorin")] fn nvvm(_dev: i32, _grid: (i32, i32, i32), _block: (i32, i32, i32), _body: fn() -> ()) -> ();
#[import(cc = "thorin")] fn opencl(_dev: i32, _grid: (i32, i32, i32), _block: (i32, i32, i32), _body: fn() -> ()) -> ();
#[import(cc = "thorin")] fn amdgpu_hsa(_dev: i32, _grid: (i32, i32, i32), _block: (i32, i32, i32), _body: fn() -> ()) -> ();
#[import(cc = "thorin")] fn amdgpu_pal(_dev: i32, _grid: (i32, i32, i32), _block: (i32, i32, i32), _body: fn() -> ()) -> ();
// Allocates shared/local (addrspace 3) memory inside a kernel.
#[import(cc = "thorin")] fn reserve_shared[T](_size: i32) -> &mut addrspace(3)[T];
#[import(cc = "thorin")] fn hls(_dev: i32, _body: fn() -> ()) -> ();
#[import(cc = "thorin", name = "pipeline")] fn thorin_pipeline(_initiation_interval: i32, _lower: i32, _upper: i32, _body: fn(i32) -> ()) -> (); // only for HLS/OpenCL backend
#[import(cc = "thorin", name = "parallel")] fn thorin_parallel(_num_threads: i32, _lower: i32, _upper: i32, _body: fn(i32) -> ()) -> ();
#[import(cc = "thorin", name = "spawn")] fn thorin_spawn(_body: fn() -> ()) -> i32;
#[import(cc = "thorin")] fn sync(_id: i32) -> ();
#[import(cc = "thorin")] fn vectorize(_vector_length: i32, _body: fn(i32) -> ()) -> ();
// Address-space-qualified variants of the atomic intrinsics above:
// _p1 = global memory (addrspace 1), _p3 = shared/local memory (addrspace 3).
#[import(cc = "thorin", name = "atomic")] fn atomic_p1[T](_binop: u32, _addr: &mut addrspace(1)T, _val: T, _order: u32, _scope: &[u8]) -> T;
#[import(cc = "thorin", name = "atomic")] fn atomic_p3[T](_binop: u32, _addr: &mut addrspace(3)T, _val: T, _order: u32, _scope: &[u8]) -> T;
#[import(cc = "thorin", name = "atomic_load")] fn atomic_load_p1[T](_addr: &addrspace(1)T, _order: u32, _scope: &[u8]) -> T;
#[import(cc = "thorin", name = "atomic_load")] fn atomic_load_p3[T](_addr: &addrspace(3)T, _order: u32, _scope: &[u8]) -> T;
#[import(cc = "thorin", name = "atomic_store")] fn atomic_store_p1[T](_addr: &mut addrspace(1)T, _val: T, _order: u32, _scope: &[u8]) -> ();
#[import(cc = "thorin", name = "atomic_store")] fn atomic_store_p3[T](_addr: &mut addrspace(3)T, _val: T, _order: u32, _scope: &[u8]) -> ();
#[import(cc = "thorin", name = "cmpxchg")] fn cmpxchg_p1[T](_addr: &mut addrspace(1)T, _cmp: T, _new: T, _success_order: u32, _failure_order: u32, _scope: &[u8]) -> (T, bool);
#[import(cc = "thorin", name = "cmpxchg")] fn cmpxchg_p3[T](_addr: &mut addrspace(3)T, _cmp: T, _new: T, _success_order: u32, _failure_order: u32, _scope: &[u8]) -> (T, bool);
#[import(cc = "thorin", name = "cmpxchg_weak")] fn cmpxchg_weak_p1[T](_addr: &mut addrspace(1)T, _cmp: T, _new: T, _success_order: u32, _failure_order: u32, _scope: &[u8]) -> (T, bool);
#[import(cc = "thorin", name = "cmpxchg_weak")] fn cmpxchg_weak_p3[T](_addr: &mut addrspace(3)T, _cmp: T, _new: T, _success_order: u32, _failure_order: u32, _scope: &[u8]) -> (T, bool);
// Curried convenience wrappers around the thorin_* intrinsics: take the body
// first and return a function expecting the loop/launch parameters.
fn @pipeline(body: fn(i32) -> ()) = @|initiation_interval: i32, lower: i32, upper: i32| thorin_pipeline(initiation_interval, lower, upper, body);
fn @parallel(body: fn(i32) -> ()) = @|num_threads: i32, lower: i32, upper: i32| thorin_parallel(num_threads, lower, upper, body);
fn @spawn(body: fn() -> ()) = @|| thorin_spawn(body);
// Host-side AnyDSL runtime API (C ABI, "anydsl_*" symbols): device queries,
// memory management, data transfer, timing, RNG, and printing helpers.
#[import(cc = "C", name = "anydsl_info")] fn runtime_info() -> ();
#[import(cc = "C", name = "anydsl_device_name")] fn runtime_device_name(_device: i32) -> &[u8];
#[import(cc = "C", name = "anydsl_device_check_feature_support")] fn runtime_device_check_feature_support(_device: i32, _feature: &[u8]) -> bool;
// Allocation: device memory, page-locked host memory, and unified memory.
#[import(cc = "C", name = "anydsl_alloc")] fn runtime_alloc(_device: i32, _size: i64) -> &mut [i8];
#[import(cc = "C", name = "anydsl_alloc_host")] fn runtime_alloc_host(_device: i32, _size: i64) -> &mut [i8];
#[import(cc = "C", name = "anydsl_alloc_unified")] fn runtime_alloc_unified(_device: i32, _size: i64) -> &mut [i8];
#[import(cc = "C", name = "anydsl_copy")] fn runtime_copy(_src_device: i32, _src_ptr: &[i8], _src_offset: i64, _dst_device: i32, _dst_ptr: &mut [i8], _dst_offset: i64, _size: i64) -> ();
#[import(cc = "C", name = "anydsl_get_device_ptr")] fn runtime_get_device_ptr(_device: i32, _ptr: &[i8]) -> &[i8];
#[import(cc = "C", name = "anydsl_synchronize")] fn runtime_synchronize(_device: i32) -> ();
#[import(cc = "C", name = "anydsl_release")] fn runtime_release(_device: i32, _ptr: &[i8]) -> ();
#[import(cc = "C", name = "anydsl_release_host")] fn runtime_release_host(_device: i32, _ptr: &[i8]) -> ();
// Random number generation.
#[import(cc = "C", name = "anydsl_random_seed")] fn random_seed(_: u32) -> ();
#[import(cc = "C", name = "anydsl_random_val_f32")] fn random_val_f32() -> f32;
#[import(cc = "C", name = "anydsl_random_val_u64")] fn random_val_u64() -> u64;
// Timers.
#[import(cc = "C", name = "anydsl_get_micro_time")] fn get_micro_time() -> i64;
#[import(cc = "C", name = "anydsl_get_nano_time")] fn get_nano_time() -> i64;
#[import(cc = "C", name = "anydsl_get_kernel_time")] fn get_kernel_time() -> i64;
// Printing helpers for the supported scalar types.
#[import(cc = "C", name = "anydsl_print_i16")] fn print_i16(_: i16) -> ();
#[import(cc = "C", name = "anydsl_print_i32")] fn print_i32(_: i32) -> ();
#[import(cc = "C", name = "anydsl_print_i64")] fn print_i64(_: i64) -> ();
#[import(cc = "C", name = "anydsl_print_u16")] fn print_u16(_: u16) -> ();
#[import(cc = "C", name = "anydsl_print_u32")] fn print_u32(_: u32) -> ();
#[import(cc = "C", name = "anydsl_print_u64")] fn print_u64(_: u64) -> ();
#[import(cc = "C", name = "anydsl_print_f32")] fn print_f32(_: f32) -> ();
#[import(cc = "C", name = "anydsl_print_f64")] fn print_f64(_: f64) -> ();
#[import(cc = "C", name = "anydsl_print_char")] fn print_char(_: u8) -> ();
#[import(cc = "C", name = "anydsl_print_string")] fn print_string(_: &[u8]) -> ();
#[import(cc = "C", name = "anydsl_print_flush")] fn print_flush() -> ();
// TODO | |
//struct Buffer[T] { | |
// data : &mut [T], | |
// size : i64, | |
// device : i32 | |
//} | |
// | |
//fn @alloc[T](device: i32, size: i64) = Buffer[T] { | |
// data = runtime_alloc(device, size * sizeof[T]()) as &mut [T], | |
// size = size, | |
// device = device | |
//}; | |
//fn @alloc_host[T](device: i32, size: i64) = Buffer[T] { | |
// data = runtime_alloc_host(device, size * sizeof[T]()) as &mut [T], | |
// size = size, | |
// device = device | |
//}; | |
//fn @alloc_unified[T](device: i32, size: i64) = Buffer[T] { | |
// data = runtime_alloc_unified(device, size * sizeof[T]()) as &mut [T], | |
// size = size, | |
// device = device | |
//}; | |
// | |
//fn @release[T](buf: Buffer[T]) = runtime_release(buf.device, buf.data as &[i8]); | |
//fn @alloc_cpu[T](size: i64) = alloc[T](0, size); | |
//fn @alloc_cuda[T](dev: i32, size: i64) = alloc[T](runtime_device(1, dev), size); | |
//fn @alloc_cuda_host[T](dev: i32, size: i64) = alloc_host[T](runtime_device(1, dev), size); | |
//fn @alloc_cuda_unified[T](dev: i32, size: i64) = alloc_unified[T](runtime_device(1, dev), size); | |
//fn @synchronize_cuda(dev: i32) = runtime_synchronize(runtime_device(1, dev)); | |
//fn @alloc_opencl[T](dev: i32, size: i64) = alloc[T](runtime_device(2, dev), size); | |
//fn @alloc_opencl_unified[T](dev: i32, size: i64) = alloc_unified[T](runtime_device(2, dev), size); | |
//fn @synchronize_opencl(dev: i32) = runtime_synchronize(runtime_device(2, dev)); | |
//fn @alloc_hsa[T](dev: i32, size: i64) = alloc[T](runtime_device(3, dev), size); | |
//fn @alloc_hsa_host[T](dev: i32, size: i64) = alloc_host[T](runtime_device(3, dev), size); | |
//fn @alloc_hsa_unified[T](dev: i32, size: i64) = alloc_unified[T](runtime_device(3, dev), size); | |
//fn @synchronize_hsa(dev: i32) = runtime_synchronize(runtime_device(3, dev)); | |
//fn @alloc_pal[T](dev: i32, size: i64) = alloc[T](runtime_device(4, dev), size); | |
//fn @alloc_pal_host[T](dev: i32, size: i64) = alloc_host[T](runtime_device(4, dev), size); | |
//fn @alloc_pal_unified[T](dev: i32, size: i64) = alloc_unified[T](runtime_device(4, dev), size); | |
//fn @synchronize_pal(dev: i32) = runtime_synchronize(runtime_device(4, dev)); | |
// | |
//fn @copy[T](src: Buffer[T], dst: Buffer[T]) = runtime_copy(src.device, src.data as &[i8], 0, dst.device, dst.data as &mut [i8], 0, src.size); | |
//fn @copy_offset[T](src: Buffer[T], off_src: i64, dst: Buffer[T], off_dst: i64, size: i64) = runtime_copy(src.device, src.data as &[i8], off_src, dst.device, dst.data as &mut [i8], off_dst, size); | |
// Raw, untyped device buffer handle returned by the alloc* helpers below.
struct Buffer {
	data : &mut [i8],	// pointer to the allocation, byte-addressed
	size : i64,		// allocation size in bytes
	device : i32		// encoded platform/device id (see runtime_device)
}
// Allocate `size` bytes of device-local memory on `device` via the runtime.
fn @alloc(device: i32, size: i64) = Buffer {
	data = runtime_alloc(device, size),
	size = size,
	device = device
};
// Allocate `size` bytes of host-side memory associated with `device`
// (presumably page-locked/pinned — defined by the runtime, confirm there).
fn @alloc_host(device: i32, size: i64) = Buffer {
	data = runtime_alloc_host(device, size),
	size = size,
	device = device
};
// Allocate `size` bytes of unified (host+device visible) memory on `device`.
fn @alloc_unified(device: i32, size: i64) = Buffer {
	data = runtime_alloc_unified(device, size),
	size = size,
	device = device
};
// Free a buffer previously obtained from any of the alloc* helpers.
fn @release(buf: Buffer) = runtime_release(buf.device, buf.data);
// Pack a (platform, device) pair into one id: platform in the low 4 bits,
// device index in the bits above.
fn @runtime_device(platform: i32, device: i32) -> i32 { platform | (device << 4) }
// Per-platform convenience wrappers. Platform ids: 0 = host CPU, 1 = CUDA,
// 2 = OpenCL (as used consistently below).
fn @alloc_cpu(size: i64) = alloc(0, size);
fn @alloc_cuda(dev: i32, size: i64) = alloc(runtime_device(1, dev), size);
fn @alloc_cuda_host(dev: i32, size: i64) = alloc_host(runtime_device(1, dev), size);
fn @alloc_cuda_unified(dev: i32, size: i64) = alloc_unified(runtime_device(1, dev), size);
fn @synchronize_cuda(dev: i32) = runtime_synchronize(runtime_device(1, dev));
fn @alloc_opencl(dev: i32, size: i64) = alloc(runtime_device(2, dev), size);
fn @alloc_opencl_unified(dev: i32, size: i64) = alloc_unified(runtime_device(2, dev), size);
fn @synchronize_opencl(dev: i32) = runtime_synchronize(runtime_device(2, dev));
// HLS wrappers. They share platform id 2 with OpenCL — presumably HLS is
// driven through the OpenCL runtime backend; confirm against the runtime's
// platform table. Rewritten in the `=` shorthand used by every sibling
// wrapper above/below for consistency (behavior unchanged).
fn @alloc_hls(dev: i32, size: i64) = alloc(runtime_device(2, dev), size);
fn @alloc_hls_unified(dev: i32, size: i64) = alloc_unified(runtime_device(2, dev), size);
fn @synchronize_hls(dev: i32) = runtime_synchronize(runtime_device(2, dev));
// Per-platform convenience wrappers, continued. Platform ids: 3 = HSA (AMD),
// 4 = PAL.
fn @alloc_hsa(dev: i32, size: i64) = alloc(runtime_device(3, dev), size);
fn @alloc_hsa_host(dev: i32, size: i64) = alloc_host(runtime_device(3, dev), size);
fn @alloc_hsa_unified(dev: i32, size: i64) = alloc_unified(runtime_device(3, dev), size);
fn @synchronize_hsa(dev: i32) = runtime_synchronize(runtime_device(3, dev));
fn @alloc_pal(dev: i32, size: i64) = alloc(runtime_device(4, dev), size);
fn @alloc_pal_host(dev: i32, size: i64) = alloc_host(runtime_device(4, dev), size);
fn @alloc_pal_unified(dev: i32, size: i64) = alloc_unified(runtime_device(4, dev), size);
fn @synchronize_pal(dev: i32) = runtime_synchronize(runtime_device(4, dev));
// Copy the whole of `src` (src.size bytes) into `dst`, possibly across devices.
fn @copy(src: Buffer, dst: Buffer) = runtime_copy(src.device, src.data, 0, dst.device, dst.data, 0, src.size);
// Copy `size` bytes from `src` at byte offset `off_src` into `dst` at `off_dst`.
fn @copy_offset(src: Buffer, off_src: i64, dst: Buffer, off_dst: i64, size: i64) = runtime_copy(src.device, src.data, off_src, dst.device, dst.data, off_dst, size);
// range, range_step, unroll, unroll_step, etc. | |
// Loop combinator: runs body(i) for i in [beg, end) with stride `step`.
// The @(?beg & ?end & ?step) filter tells the partial evaluator to unfold the
// recursion (i.e. fully unroll) when all three bounds are compile-time known.
fn @unroll_step(body: fn(i32) -> ()) {
	fn @(?beg & ?end & ?step) loop(beg: i32, end: i32, step: i32) -> () {
		if beg < end {
			@body(beg);
			loop(beg + step, end, step)
		}
	}
	loop
}
// Reverse loop combinator: runs body(i) for i from `end` down to just above
// `beg` with stride `step` — i.e. the interval is (beg, end], the mirror of
// unroll_step's [beg, end). NOTE(review): the bound asymmetry (start value
// inclusive here, exclusive there) is the established convention of this file;
// callers must pass bounds accordingly.
fn @unroll_step_rev(body: fn(i32) -> ()) {
	fn @(?beg & ?end & ?step) loop(end: i32, beg: i32, step: i32) -> () {
		if end > beg {
			@body(end);
			loop(end - step, beg, step)
		}
	}
	loop
}
// Public loop entry points. `$x` blocks the partial evaluator from
// specializing on that value, so range/range_step stay dynamic loops while
// unroll/unroll_rev may be fully unfolded for constant bounds.
// NOTE(review): range_rev does not `$`-protect its bounds, making it
// effectively identical to unroll_rev — possibly a missing `$`; confirm intent.
fn @range(body: fn(i32) -> ()) = @|lower: i32, upper: i32| unroll_step(body)($lower, $upper, 1);
fn @range_step(body: fn(i32) -> ()) = @|lower: i32, upper: i32, step: i32| unroll_step(body)($lower, $upper, step);
fn @range_rev(body: fn(i32) -> ()) = @|upper: i32, lower: i32| unroll_step_rev(body)(upper, lower, 1);
fn @unroll(body: fn(i32) -> ()) = @|lower: i32, upper: i32| unroll_step(body)(lower, upper, 1);
fn @unroll_rev(body: fn(i32) -> ()) = @|upper: i32, lower: i32| unroll_step_rev(body)(upper, lower, 1);
// Type-generic math intrinsics resolved by the compiler (cc = "builtin");
// each maps to the corresponding backend/LLVM builtin for the instantiated
// floating-point type.
mod math_builtins {
	#[import(cc = "builtin")] fn fabs[T](T) -> T;
	#[import(cc = "builtin")] fn copysign[T](T, T) -> T;
	#[import(cc = "builtin")] fn signbit[T](T) -> bool;
	#[import(cc = "builtin")] fn round[T](T) -> T;
	#[import(cc = "builtin")] fn ceil[T](T) -> T;
	#[import(cc = "builtin")] fn floor[T](T) -> T;
	#[import(cc = "builtin")] fn fmin[T](T, T) -> T;
	#[import(cc = "builtin")] fn fmax[T](T, T) -> T;
	#[import(cc = "builtin")] fn cos[T](T) -> T;
	#[import(cc = "builtin")] fn sin[T](T) -> T;
	#[import(cc = "builtin")] fn tan[T](T) -> T;
	#[import(cc = "builtin")] fn acos[T](T) -> T;
	#[import(cc = "builtin")] fn asin[T](T) -> T;
	#[import(cc = "builtin")] fn atan[T](T) -> T;
	#[import(cc = "builtin")] fn atan2[T](T, T) -> T;
	#[import(cc = "builtin")] fn sqrt[T](T) -> T;
	#[import(cc = "builtin")] fn cbrt[T](T) -> T;
	#[import(cc = "builtin")] fn pow[T](T, T) -> T;
	#[import(cc = "builtin")] fn exp[T](T) -> T;
	#[import(cc = "builtin")] fn exp2[T](T) -> T;
	#[import(cc = "builtin")] fn log[T](T) -> T;
	#[import(cc = "builtin")] fn log2[T](T) -> T;
	#[import(cc = "builtin")] fn log10[T](T) -> T;
	#[import(cc = "builtin")] fn isnan[T](T) -> bool;
	#[import(cc = "builtin")] fn isfinite[T](T) -> bool;
}
// C++11-style atomic memory orderings, ranked weakest (relaxed) to strongest
// (seq_cst); see builtin_memory_order for the numeric encoding.
enum memory_order {
	relaxed,
	acquire,
	release,
	acq_rel,
	seq_cst
}
// True iff ordering `a` is strictly weaker than ordering `b`
// (compared via their numeric encodings).
fn @is_weaker_memory_order(a: memory_order, b: memory_order) -> bool {
	builtin_memory_order(b) > builtin_memory_order(a)
}
// Return the stronger of the two orderings; on a tie either argument is
// equivalent.
fn @stronger_memory_order(a: memory_order, b: memory_order) -> memory_order {
	if is_weaker_memory_order(b, a) { a } else { b }
}
// Map a memory_order to its numeric code. The values match LLVM's
// AtomicOrdering encoding (monotonic=2, acquire=4, release=5, acq_rel=6,
// seq_cst=7), which also makes numeric `<` a valid strength comparison.
fn @builtin_memory_order(order: memory_order) -> u32 {
	match order {
		memory_order::relaxed => 2,
		memory_order::acquire => 4,
		memory_order::release => 5,
		memory_order::acq_rel => 6,
		memory_order::seq_cst => 7,
	}
}
// Top-level execution context for a launched grid. The groups/waves/threads
// members are loop combinators: they take a body and return a nullary
// function that iterates the body over the respective hierarchy level.
struct grid_context {
	device: i32,				// encoded platform/device id
	max_concurrency: fn() -> i32,
	groups: fn(fn(group_context) -> ()) -> fn() -> (),
	waves: fn(fn(wave_context) -> ()) -> fn() -> (),
	threads: fn(fn(thread_context) -> ()) -> fn() -> (),
	num_groups: fn(i32) -> u32,		// i32 arg presumably a dimension index — confirm at call sites
	num_waves: fn() -> u32,
	num_threads: fn(i32) -> u32,		// i32 arg presumably a dimension index — confirm at call sites
}
// Execution context for one group (CUDA thread block) inside a grid.
struct group_context {
	idx: fn(i32) -> u32,			// group index per dimension
	waves: fn(fn(wave_context) -> ()) -> fn() -> (),
	threads: fn(fn(thread_context) -> ()) -> fn() -> (),
	num_waves: fn() -> u32,
	num_threads: fn(i32) -> u32,
	barrier: fn() -> (),			// group-wide barrier
	// barrier variants that also reduce a per-thread predicate:
	barrier_all: fn(bool) -> bool,
	barrier_any: fn(bool) -> bool,
	barrier_count: fn(bool) -> i32,
}
// Execution context for one wave (CUDA warp / HSA wavefront). Commented-out
// members document the intended full interface that is not wired up yet.
struct wave_context {
	idx: fn() -> u32,
	membermask: fn() -> u64,		// bitmask of participating lanes
	threads: fn(fn(thread_context) -> ()) -> fn() -> (),
	num_threads: fn() -> u32,
	barrier: fn() -> (),			// wave-wide sync
	// barrier variants reducing a per-lane predicate across the wave:
	barrier_all: fn(bool) -> bool,
	barrier_any: fn(bool) -> bool,
	barrier_count: fn(bool) -> i32,
	barrier_vote: fn(bool) -> u64,		// per-lane predicate ballot as a bitmask
	// activemask: fn() -> u32,
	// lane shuffles: (value, source lane, logical width)
	shfl_i32: fn(i32, i32, u32) -> i32,
	shfl_u32: fn(u32, i32, u32) -> u32,
	// shfl_i64: fn(i64, i32, u32) -> i64,
	// shfl_u64: fn(u64, i32, u32) -> u64,
	// shfl_f32: fn(f32, i32, u32) -> f32,
	// shfl_f64: fn(f64, i32, u32) -> f64,
	// shuffles relative to the own lane: (value, delta, logical width)
	shfl_up_i32: fn(i32, u32, u32) -> i32,
	shfl_up_u32: fn(u32, u32, u32) -> u32,
	// shfl_up_i64: fn(i64, u32, u32) -> i64,
	// shfl_up_u64: fn(u64, u32, u32) -> u64,
	// shfl_up_f32: fn(f32, u32, u32) -> f32,
	// shfl_up_f64: fn(f64, u32, u32) -> f64,
	shfl_down_i32: fn(i32, u32, u32) -> i32,
	shfl_down_u32: fn(u32, u32, u32) -> u32,
	// shfl_down_i64: fn(i64, u32, u32) -> i64,
	// shfl_down_u64: fn(u64, u32, u32) -> u64,
	// shfl_down_f32: fn(f32, u32, u32) -> f32,
	// shfl_down_f64: fn(f64, u32, u32) -> f64,
	// butterfly (xor) shuffles: (value, lane mask, logical width)
	shfl_bfly_i32: fn(i32, i32, u32) -> i32,
	shfl_bfly_u32: fn(u32, i32, u32) -> u32,
	// shfl_bfly_i64: fn(i64, i32, u32) -> i64,
	// shfl_bfly_u64: fn(u64, i32, u32) -> u64,
	// shfl_bfly_f32: fn(f32, i32, u32) -> f32,
	// shfl_bfly_f64: fn(f64, i32, u32) -> f64,
	// match_any_i32: fn(i32) -> u32,
	// match_any_u32: fn(u32) -> u32,
	// match_any_i64: fn(i64) -> u32,
	// match_any_u64: fn(u64) -> u32,
	// match_any_f32: fn(f32) -> u32,
	// match_any_f64: fn(f64) -> u32,
	// match_all_i32: fn(i32, &mut i32) -> u32,
	// match_all_u32: fn(u32, &mut i32) -> u32,
	// match_all_i64: fn(i64, &mut i32) -> u32,
	// match_all_u64: fn(u64, &mut i32) -> u32,
	// match_all_f32: fn(f32, &mut i32) -> u32,
	// match_all_f64: fn(f64, &mut i32) -> u32,
	// masks of lanes relative to the own lane id:
	lanemask: fn() -> u64,
	lanemask_le: fn() -> u64,
	lanemask_lt: fn() -> u64,
	lanemask_ge: fn() -> u64,
	lanemask_gt: fn() -> u64
}
// Per-thread execution context: thread identity plus the device's atomic
// operations on global memory, each parameterized by a memory_order.
// The *_coalesced load/store variants are the relaxed "plain volatile access"
// forms intended for per-lane coalescable traffic (see cuda_thread's wiring).
// NOTE(review): the i64 variants of sub/and/or/xor/exch/min/max are absent
// while add/load/store/cas have them — confirm whether that asymmetry is
// intentional or just not yet implemented.
struct thread_context {
	idx: fn(i32) -> u32,			// thread index per dimension
	gid: fn() -> u32,			// globally unique thread id within the grid
	uid: fn() -> i32,			// unique id as signed int (see cuda_thread: gid as i32)
	atomic_load_global_i32: fn(&addrspace(1) i32, memory_order) -> i32,
	atomic_load_global_u32: fn(&addrspace(1) u32, memory_order) -> u32,
	atomic_load_global_i64: fn(&addrspace(1) i64, memory_order) -> i64,
	atomic_load_global_u64: fn(&addrspace(1) u64, memory_order) -> u64,
	atomic_load_global_i32_coalesced: fn(&addrspace(1) i32, memory_order) -> i32,
	atomic_load_global_u32_coalesced: fn(&addrspace(1) u32, memory_order) -> u32,
	atomic_load_global_i64_coalesced: fn(&addrspace(1) i64, memory_order) -> i64,
	atomic_load_global_u64_coalesced: fn(&addrspace(1) u64, memory_order) -> u64,
	atomic_store_global_i32: fn(&mut addrspace(1) i32, i32, memory_order) -> (),
	atomic_store_global_u32: fn(&mut addrspace(1) u32, u32, memory_order) -> (),
	atomic_store_global_i64: fn(&mut addrspace(1) i64, i64, memory_order) -> (),
	atomic_store_global_u64: fn(&mut addrspace(1) u64, u64, memory_order) -> (),
	atomic_store_global_i32_coalesced: fn(&mut addrspace(1) i32, i32, memory_order) -> (),
	atomic_store_global_u32_coalesced: fn(&mut addrspace(1) u32, u32, memory_order) -> (),
	atomic_store_global_i64_coalesced: fn(&mut addrspace(1) i64, i64, memory_order) -> (),
	atomic_store_global_u64_coalesced: fn(&mut addrspace(1) u64, u64, memory_order) -> (),
	// read-modify-write atomics: return the previous value
	atomic_add_global_i32: fn(&mut addrspace(1) i32, i32, memory_order) -> i32,
	atomic_add_global_u32: fn(&mut addrspace(1) u32, u32, memory_order) -> u32,
	atomic_add_global_i64: fn(&mut addrspace(1) i64, i64, memory_order) -> i64,
	atomic_add_global_u64: fn(&mut addrspace(1) u64, u64, memory_order) -> u64,
	atomic_sub_global_i32: fn(&mut addrspace(1) i32, i32, memory_order) -> i32,
	atomic_sub_global_u32: fn(&mut addrspace(1) u32, u32, memory_order) -> u32,
	atomic_sub_global_u64: fn(&mut addrspace(1) u64, u64, memory_order) -> u64,
	atomic_and_global_i32: fn(&mut addrspace(1) i32, i32, memory_order) -> i32,
	atomic_and_global_u32: fn(&mut addrspace(1) u32, u32, memory_order) -> u32,
	atomic_and_global_u64: fn(&mut addrspace(1) u64, u64, memory_order) -> u64,
	atomic_or_global_i32: fn(&mut addrspace(1) i32, i32, memory_order) -> i32,
	atomic_or_global_u32: fn(&mut addrspace(1) u32, u32, memory_order) -> u32,
	atomic_or_global_u64: fn(&mut addrspace(1) u64, u64, memory_order) -> u64,
	atomic_xor_global_i32: fn(&mut addrspace(1) i32, i32, memory_order) -> i32,
	atomic_xor_global_u32: fn(&mut addrspace(1) u32, u32, memory_order) -> u32,
	atomic_xor_global_u64: fn(&mut addrspace(1) u64, u64, memory_order) -> u64,
	atomic_exch_global_i32: fn(&mut addrspace(1) i32, i32, memory_order) -> i32,
	atomic_exch_global_u32: fn(&mut addrspace(1) u32, u32, memory_order) -> u32,
	atomic_exch_global_u64: fn(&mut addrspace(1) u64, u64, memory_order) -> u64,
	atomic_min_global_i32: fn(&mut addrspace(1) i32, i32, memory_order) -> i32,
	atomic_min_global_u32: fn(&mut addrspace(1) u32, u32, memory_order) -> u32,
	atomic_min_global_u64: fn(&mut addrspace(1) u64, u64, memory_order) -> u64,
	atomic_max_global_i32: fn(&mut addrspace(1) i32, i32, memory_order) -> i32,
	atomic_max_global_u32: fn(&mut addrspace(1) u32, u32, memory_order) -> u32,
	atomic_max_global_u64: fn(&mut addrspace(1) u64, u64, memory_order) -> u64,
	// compare-and-swap: (location, expected, desired, success order, failure
	// order) -> (previous value, success flag)
	atomic_cas_global_i32: fn(&mut addrspace(1) i32, i32, i32, memory_order, memory_order) -> (i32, bool),
	atomic_cas_global_u32: fn(&mut addrspace(1) u32, u32, u32, memory_order, memory_order) -> (u32, bool),
	atomic_cas_global_i64: fn(&mut addrspace(1) i64, i64, i64, memory_order, memory_order) -> (i64, bool),
	atomic_cas_global_u64: fn(&mut addrspace(1) u64, u64, u64, memory_order, memory_order) -> (u64, bool),
	// weak CAS: may fail spuriously (on CUDA these are wired to the same
	// implementation as the strong variants — see cuda_thread)
	atomic_cas_global_i32_weak: fn(&mut addrspace(1) i32, i32, i32, memory_order, memory_order) -> (i32, bool),
	atomic_cas_global_u32_weak: fn(&mut addrspace(1) u32, u32, u32, memory_order, memory_order) -> (u32, bool),
	atomic_cas_global_i64_weak: fn(&mut addrspace(1) i64, i64, i64, memory_order, memory_order) -> (i64, bool),
	atomic_cas_global_u64_weak: fn(&mut addrspace(1) u64, u64, u64, memory_order, memory_order) -> (u64, bool),
	atomic_inc_global_u32: fn(&mut addrspace(1) u32, u32) -> u32,
	memory_barrier: fn(memory_order) -> (),
	timestamp: fn() -> i64,			// device clock (CUDA: globaltimer)
	timestamp32: fn() -> i32,		// low 32 bits of the device clock
	// atomic_wait_and_transition_global_u32: fn(&mut addrspace(1) u32, u32, u32, memory_order, &[u8]) -> (),
	// spin until the predicate holds; second arg is a debug message
	wait: fn(fn() -> bool, &[u8]) -> (),
}
// Interface of a generic concurrent producer/consumer queue on the device.
struct ProducerConsumerQueue[T] {
	// [THREAD-SAFETY]: only functions that receive a thread_context (i.e. push/pop/size) are thread-safe
	// push/pop take an element producer/consumer and return a function of the
	// calling thread's context; the i32 result is presumably the number of
	// elements transferred (0 on failure) — confirm against implementations.
	push: fn(fn() -> T) -> fn(thread_context) -> i32,
	pop: fn(fn(T) -> ()) -> fn(thread_context) -> i32,
	// pop_wave: fn(wave_context, i32, fn(T) -> ()) -> i32,
	size: fn(thread_context) -> i32,
	reset: fn(grid_context) -> (),			// NOT thread-safe (see above)
	validate: fn(&mut addrspace(1) u32, grid_context) -> (),
	release: fn() -> ()				// free backing storage
}
// Result of constructing a queue: the queue, or an error message string.
enum create_queue_result[T] {
	Ok(ProducerConsumerQueue[T]),
	Err(&[u8])
}
// A queue factory: given a device and a capacity (i32), build a queue.
type queue_constructor[T] = fn(AccDevice, i32) -> create_queue_result[T];
// Fixed-size element allocator interface (see createPool for the pool-backed
// implementation). alloc/free work on raw addresses, alloc_ptr/free_ptr on
// typed pointers.
struct Allocator[T] {
	alloc: fn() -> u64,		// address of a fresh element slot
	free: fn(u64) -> (),		// return a slot obtained from alloc
	alloc_ptr: fn() -> &mut T,
	free_ptr: fn(&mut T) -> (),
	clear: fn()->(),		// reset: all slots become available again
	release: fn()->(),		// free the backing storage
	alignment: i64,
}
// Index type of a slot inside a pool allocator.
type pool_idx = i32;
// Build a pool Allocator[T] with `capacity` slots in host (CPU) memory.
// Layout: [next_alloc: pool_idx][next_free: pool_idx][padding][slot 0..capacity).
// Each slot is `width` bytes (element size rounded up to the alignment) and a
// freed slot stores the index of the next free slot, forming an intrusive
// free list (Treiber-stack style: next_free is the head, CAS-updated).
// NOTE(review): the alloc/free CAS loops have the classic ABA hazard if slots
// are concurrently freed and re-allocated — confirm the intended concurrency
// level for this host-side pool.
fn createPool[T](capacity: pool_idx, alignment: pool_idx) -> Allocator[T] {
	// slot must be able to hold either a T or a free-list index
	let size = max_i64(sizeof[T](), sizeof[pool_idx]());
	let align = max_i64(max_i64(alignment as i64, alignof[T]()), alignof[pool_idx]());
	let width = round_up_i64(size, align);
	let offset = 2*sizeof[pool_idx]();	// room for the two header indices
	//print_string("size: "); print_i64(size); print_char('\n');
	//print_string("align: "); print_i64(align); print_char('\n');
	//print_string("width: "); print_i64(width); print_char('\n');
	//print_string("offset: "); print_i64(offset); print_char('\n');
	let buffer = alloc_cpu(offset + capacity as i64 * width + align);
	let begin = buffer.data as u64;
	// first slot address, aligned up past the header
	let data:u64 = (begin + offset as u64 + align as u64) & !(align as u64 - 1);
	let next_alloc:&mut pool_idx = begin as &mut pool_idx;	// bump counter for never-used slots
	let next_free:&mut pool_idx = (begin + sizeof[pool_idx]() as u64) as &mut pool_idx;	// free-list head, -1 = empty
	fn clear() -> () {
		// magic numbers are LLVM atomic orderings (5 = release)
		atomic_store[pool_idx](next_alloc, 0, 5 /* rel */, "");
		atomic_store[pool_idx](next_free, -1, 5 /* rel */, "");
	}
	clear();
	fn pool_alloc() -> u64 {
		// first try to pop a recycled slot off the free list
		while true {
			let idx = atomic_load[pool_idx](next_free, 4 /* acq */, "");
			if (idx == -1) {
				break()
			}
			let ptr = data + (idx as u64) * (width as u64);
			let next:pool_idx = *(ptr as &pool_idx);	// freed slot stores successor index
			if cmpxchg[pool_idx](next_free, idx, next, 5 /* rel */, 2 /* rlx */, "").1 {
				return(ptr)
			}
		}
		// free list empty: bump-allocate a fresh slot
		let i = atomic[pool_idx](1, next_alloc, 1, 2 /* rlx */, "");
		assert(0 <= i && i < capacity, "pool is running out of memory");
		data + (i as u64) * (width as u64)
	}
	fn @pool_free(t: u64) -> () {
		let j = t - data;
		let i = (j / width as u64) as pool_idx;
		assert(j % width as u64 == 0, "invalid pointer to element");
		assert(0 <= i && i < capacity, "invalid pool element index");
		// push slot i onto the free list
		while true {
			let next = atomic_load[pool_idx](next_free, 2 /* rlx */, "");
			*(t as &mut pool_idx) = next;
			if cmpxchg[pool_idx](next_free, next, i, 5 /* rel */, 2 /* rlx */, "").1 {
				break()
			}
		}
	}
	Allocator[T] {
		alloc = pool_alloc,
		alloc_ptr = @|| { pool_alloc() as &mut T },
		free = pool_free,
		free_ptr = @|t: &mut T| { pool_free(t as u64); },
		clear = @|| { clear(); },
		release = @|| { release(buffer); },
		// NOTE(review): this stores the slot stride, not the alignment the
		// field name suggests — confirm which one consumers expect.
		alignment = width
	}
}
// Verbose-logging hook, compiled out: discards the body and returns a no-op.
fn @anyq_verbose(_body: fn()->()) -> fn()->() {
	|| { }
}
// Assertion hook, deliberately a no-op in this (release) configuration.
fn @assert(_condition: bool, _msg: &[u8]) -> () {
	// skip assertions in release
}
#[import(cc = "device", name = "[]{return __CUDA_ARCH__;}")] fn cuda_device_arch() -> i32; // HACK | |
// Construct the CUDA-backed AccDevice for CUDA device index `device`:
// queries ITS (independent thread scheduling) support, and wires launch,
// sync, alloc and device-side printf helpers to the CUDA runtime.
fn @createAccDevice(device: i32) {
	let platform_device = runtime_device(1, device);	// platform 1 = CUDA
	let its = runtime_device_check_feature_support(platform_device, "ITS");
	AccDevice {
		supports_its = its,
		supports_npot_atomic_inc = true,
		launch_1d = @|body|@|num_groups, group_size| cuda_launch(device, (num_groups, 1, 1), (group_size, 1, 1), wrap_index_1d, wrap_dim_1d, body),
		synchronize = || synchronize_cuda(device),
		alloc = |size| alloc_cuda(device, size),
		platform_device = platform_device,
		platform_name = "cuda",
		// vprintf takes its arguments as a packed byte buffer
		print_i32 = @|format: &[u8], arg: i32| {
			cuda_vprintf(format, &arg as &[u8]);
		},
		print_2xi32 = @|format: &[u8], arg1: i32, arg2: i32| {
			let args:&[i32] = [arg1, arg2];
			cuda_vprintf(format, args as &[u8]);
		},
		print_3xi32 = @|format: &[u8], arg1: i32, arg2: i32, arg3: i32| {
			let args:&[i32] = [arg1, arg2, arg3];
			cuda_vprintf(format, args as &[u8]);
		}
	}
}
// Default accelerator: CUDA device 0.
fn @createDefaultAccDevice() = createAccDevice(0);
// Lower a bool to the 0/1 integer predicate form that the CUDA warp-vote
// wrappers below expect.
fn @cuda_pred(b: bool) -> i32 {
	if !b { 0 } else { 1 }
}
// Device-wide memory fence for the given ordering. On sm_70+ this emits the
// PTX fence instructions (acquire/release/acq_rel all map to fence.acq_rel.gpu,
// which subsumes the one-sided orderings; relaxed emits nothing). Pre-sm_70
// falls back to __threadfence.
fn @cuda_memory_barrier(order: memory_order) {
	if cuda_device_arch() >= 700 {
		(@|| {
			match order {
				memory_order::acquire => asm("fence.acq_rel.gpu;" :::: "volatile"),
				memory_order::release => asm("fence.acq_rel.gpu;" :::: "volatile"),
				memory_order::acq_rel => asm("fence.acq_rel.gpu;" :::: "volatile"),
				memory_order::seq_cst => asm("fence.sc.gpu;" :::: "volatile"),
				_ => ()	// relaxed: no fence needed
			}
		})()
	}
	else {
		cuda_threadfence() // TODO: seq_cst not supported?
	}
}
// Fence to issue BEFORE a legacy (pre-sm_70, plain volatile) atomic access so
// that release-or-stronger orderings are honored; relaxed/acquire emit nothing
// here (acquire is handled by the post-fence).
fn @cuda_legacy_atomic_memory_barrier_pre(order:memory_order) {
	match order {
		memory_order::release => cuda_memory_barrier(memory_order::release),
		memory_order::acq_rel => cuda_memory_barrier(memory_order::acq_rel),
		memory_order::seq_cst => cuda_memory_barrier(memory_order::seq_cst),
		_ => ()
	}
}
// Fence to issue AFTER a legacy atomic access so that acquire-or-stronger
// orderings are honored; relaxed/release emit nothing here (release is
// handled by the pre-fence).
fn @cuda_legacy_atomic_memory_barrier_post(order:memory_order) {
	match order {
		memory_order::acquire => cuda_memory_barrier(memory_order::acquire),
		memory_order::acq_rel => cuda_memory_barrier(memory_order::acq_rel),
		memory_order::seq_cst => cuda_memory_barrier(memory_order::seq_cst),
		_ => ()
	}
}
// Adapters that add memory_order semantics to plain (ordering-oblivious)
// device operations by bracketing them with the legacy pre/post fences above.
// One adapter per operation shape: load, store, read-modify-write, CAS.

// Wrap a plain load into a memory_order-aware load.
fn @cuda_legacy_atomic_memory_order_wrap_load[T](f: fn(&addrspace(1) T) -> T) {
	@|location:&addrspace(1) T, order:memory_order| -> T {
		cuda_legacy_atomic_memory_barrier_pre(order);
		let res = f(location);
		cuda_legacy_atomic_memory_barrier_post(order);
		res
	}
}
// Wrap a plain store into a memory_order-aware store.
fn @cuda_legacy_atomic_memory_order_wrap_store[T](f: fn(&mut addrspace(1) T, T) -> ()) {
	@|location:&mut addrspace(1) T, value:T, order:memory_order| -> () {
		cuda_legacy_atomic_memory_barrier_pre(order);
		f(location, value);
		cuda_legacy_atomic_memory_barrier_post(order);
	}
}
// Wrap a plain RMW atomic (returns the previous value).
fn @cuda_legacy_atomic_memory_order_wrap_rmw[T](f: fn(&mut addrspace(1) T, T) -> T) {
	@|location:&mut addrspace(1) T, value:T, order:memory_order| -> T {
		cuda_legacy_atomic_memory_barrier_pre(order);
		let old = f(location, value);
		cuda_legacy_atomic_memory_barrier_post(order);
		old
	}
}
// Wrap a plain CAS into the (old, success) form with separate success/failure
// orderings; `cmp` decides success by comparing the old value to `expected`.
// The pre-fence uses the stronger of the two orderings since the outcome is
// not yet known at that point.
fn @cuda_legacy_atomic_memory_order_wrap_cas[T](f: fn(&mut addrspace(1) T, T, T) -> T, cmp: fn(T, T) -> bool) {
	@|location:&mut addrspace(1) T, expected:T, desired:T, memory_order_succ:memory_order, memory_order_fail:memory_order| {
		cuda_legacy_atomic_memory_barrier_pre(stronger_memory_order(memory_order_succ, memory_order_fail));
		let old = f(location, expected, desired);
		if cmp(old, expected) {
			cuda_legacy_atomic_memory_barrier_post(memory_order_succ);
			(old, true)
		}
		else {
			cuda_legacy_atomic_memory_barrier_post(memory_order_fail);
			(old, false)
		}
	}
}
// 32-bit atomic loads from global memory. The i32 variant delegates to the
// u32 implementation via a bit-preserving cast.
fn @atomic_load_global_i32(location: &addrspace(1) i32, order: memory_order) -> i32 = atomic_load_global_u32(location as &addrspace(1) u32, order) as i32;
fn @atomic_load_global_u32(location: &addrspace(1) u32, order: memory_order) -> u32 {
	let mut value: u32;
	if cuda_device_arch() >= 700 {
		// sm_70+: PTX loads with native ordering semantics; seq_cst is a
		// fence.sc followed by a relaxed load. Unsupported orderings
		// (release/acq_rel are invalid for loads) trap.
		(@|| {
			match order {
				memory_order::relaxed => asm("ld.relaxed.gpu.b32 %0, [%1];" : "=r"(value) : "l"(location) : "memory"),
				memory_order::acquire => asm("ld.acquire.gpu.b32 %0, [%1];" : "=r"(value) : "l"(location) : "memory"),
				memory_order::seq_cst => asm("fence.sc.gpu; ld.relaxed.gpu.b32 %0, [%1];" : "=r"(value) : "l"(location) : "memory"),
				_ => cuda_trap()
			}
		})();
	}
	else {
		// pre-sm_70: volatile load bracketed by legacy fences
		cuda_legacy_atomic_memory_barrier_pre(order);
		asm("ld.volatile.global.b32 %0, [%1];" : "=r"(value) : "l"(location) : "memory");
		cuda_legacy_atomic_memory_barrier_post(order);
	}
	value
}
// 64-bit atomic loads from global memory; same structure as the 32-bit
// versions, using the b64 forms and "l" registers.
fn @atomic_load_global_i64(location: &addrspace(1) i64, order: memory_order) = atomic_load_global_u64(location as &addrspace(1) u64, order) as i64;
fn @atomic_load_global_u64(location: &addrspace(1) u64, order: memory_order) -> u64 {
	let mut value: u64;
	if cuda_device_arch() >= 700 {
		(@|| {
			match order {
				memory_order::relaxed => asm("ld.relaxed.gpu.b64 %0, [%1];" : "=l"(value) : "l"(location) : "memory"),
				memory_order::acquire => asm("ld.acquire.gpu.b64 %0, [%1];" : "=l"(value) : "l"(location) : "memory"),
				memory_order::seq_cst => asm("fence.sc.gpu; ld.relaxed.gpu.b64 %0, [%1];" : "=l"(value) : "l"(location) : "memory"),
				_ => cuda_trap()
			}
		})();
	}
	else {
		cuda_legacy_atomic_memory_barrier_pre(order);
		asm("ld.volatile.global.b64 %0, [%1];" : "=l"(value) : "l"(location) : "memory");
		cuda_legacy_atomic_memory_barrier_post(order);
	}
	value
}
// 32-bit atomic stores to global memory.
// NOTE(review): `location` is declared as a non-mut reference here although
// thread_context declares stores over `&mut addrspace(1)` — presumably fine
// via mut-to-const coercion at the wiring site, but confirm it is intended.
fn @atomic_store_global_i32(location: &addrspace(1) i32, value: i32, order: memory_order) = atomic_store_global_u32(location as &addrspace(1) u32, value as u32, order);
fn @atomic_store_global_u32(location: &addrspace(1) u32, value: u32, order: memory_order) -> () {
	if cuda_device_arch() >= 700 {
		// sm_70+: PTX stores with native ordering; seq_cst is a relaxed
		// store followed by fence.sc. Invalid orderings for stores
		// (acquire/acq_rel) trap.
		(@|| {
			match order {
				memory_order::relaxed => asm("st.relaxed.gpu.b32 [%0], %1;" : : "l"(location), "r"(value) : "memory"),
				memory_order::release => asm("st.release.gpu.b32 [%0], %1;" : : "l"(location), "r"(value) : "memory"),
				memory_order::seq_cst => asm("st.relaxed.gpu.b32 [%0], %1; fence.sc.gpu;" : : "l"(location), "r"(value) : "memory"),
				_ => cuda_trap()
			}
		})()
	}
	else {
		cuda_legacy_atomic_memory_barrier_pre(order);
		asm("st.volatile.global.b32 [%0], %1;" : : "l"(location), "r"(value) : "memory");
		cuda_legacy_atomic_memory_barrier_post(order);
	}
}
// 64-bit atomic stores to global memory; same structure as the 32-bit
// versions, using the b64 forms and "l" registers.
fn @atomic_store_global_i64(location: &addrspace(1) i64, value: i64, order: memory_order) = atomic_store_global_u64(location as &addrspace(1) u64, value as u64, order);
fn @atomic_store_global_u64(location: &addrspace(1) u64, value: u64, order: memory_order) -> () {
	if cuda_device_arch() >= 700 {
		(@|| {
			match order {
				memory_order::relaxed => asm("st.relaxed.gpu.b64 [%0], %1;" : : "l"(location), "l"(value) : "memory"),
				memory_order::release => asm("st.release.gpu.b64 [%0], %1;" : : "l"(location), "l"(value) : "memory"),
				memory_order::seq_cst => asm("st.relaxed.gpu.b64 [%0], %1; fence.sc.gpu;" : : "l"(location), "l"(value) : "memory"),
				_ => cuda_trap()
			}
		})()
	}
	else {
		cuda_legacy_atomic_memory_barrier_pre(order);
		asm("st.volatile.global.b64 [%0], %1;" : : "l"(location), "l"(value) : "memory");
		cuda_legacy_atomic_memory_barrier_post(order);
	}
}
// fn @cuda_atomic_wait_and_transition[T](f: fn(&mut addrspace(1) T, T, T) -> T, cmp: fn(T, T) -> bool) { | |
// @|location:&mut addrspace(1) T, expected:T, desired:T, order:memory_order, _debug_msg:&[u8]| -> () { | |
// cuda_legacy_atomic_memory_barrier_pre(order); | |
// while !cmp(f(location, expected, desired), expected) { | |
// cuda_threadfence(); | |
// } | |
// cuda_legacy_atomic_memory_barrier_post(order); | |
// } | |
// } | |
// Build the thread_context for a CUDA thread and invoke `body` with it.
// `idx` yields the per-dimension thread index, `gid` a grid-unique id.
// Atomic members are wired as follows:
//  - loads/stores: the ordering-aware PTX implementations above;
//  - *_coalesced: volatile accesses wrapped with the legacy fences;
//  - RMW/CAS: CUDA atomic intrinsics wrapped with the legacy fences
//    (64-bit signed variants reuse the unsigned intrinsic via bit casts);
//  - the *_weak CAS members reuse the strong implementations.
fn @cuda_thread(idx: fn(i32) -> u32, gid: fn() -> u32, body: fn(thread_context) -> ()) -> () {
	let sleep = @|t: u32| {
		if cuda_device_arch() >= 700 {
			asm("nanosleep.u32 %0;\n" :: "r"(t) :: "volatile"); // use asm to avoid compilation error on devices < 700 due to __nanosleep not being defined
		}
		else {
			// pre-sm_70: no sleep available; wait() degrades to busy-spinning
		}
	};
	@body(thread_context {
		idx = idx,
		gid = gid,
		uid = @|| gid() as i32,
		atomic_load_global_i32 = atomic_load_global_i32,
		atomic_load_global_u32 = atomic_load_global_u32,
		atomic_load_global_i64 = atomic_load_global_i64,
		atomic_load_global_u64 = atomic_load_global_u64,
		atomic_load_global_i32_coalesced = cuda_legacy_atomic_memory_order_wrap_load[i32](@|location| { let mut res:i32; asm("ld.volatile.global.b32 %0, [%1];" : "=r"(res) : "l"(location) : "memory"); res }),
		atomic_load_global_u32_coalesced = cuda_legacy_atomic_memory_order_wrap_load[u32](@|location| { let mut res:u32; asm("ld.volatile.global.b32 %0, [%1];" : "=r"(res) : "l"(location) : "memory"); res }),
		atomic_load_global_i64_coalesced = cuda_legacy_atomic_memory_order_wrap_load[i64](@|location| { let mut res:i64; asm("ld.volatile.global.b64 %0, [%1];" : "=l"(res) : "l"(location) : "memory"); res }),
		atomic_load_global_u64_coalesced = cuda_legacy_atomic_memory_order_wrap_load[u64](@|location| { let mut res:u64; asm("ld.volatile.global.b64 %0, [%1];" : "=l"(res) : "l"(location) : "memory"); res }),
		atomic_store_global_i32 = atomic_store_global_i32,
		atomic_store_global_u32 = atomic_store_global_u32,
		atomic_store_global_i64 = atomic_store_global_i64,
		atomic_store_global_u64 = atomic_store_global_u64,
		atomic_store_global_i32_coalesced = cuda_legacy_atomic_memory_order_wrap_store[i32](@|location, value| asm("st.volatile.global.b32 [%0], %1;" : : "l"(location), "r"(value) : "memory")),
		atomic_store_global_u32_coalesced = cuda_legacy_atomic_memory_order_wrap_store[u32](@|location, value| asm("st.volatile.global.b32 [%0], %1;" : : "l"(location), "r"(value) : "memory")),
		atomic_store_global_i64_coalesced = cuda_legacy_atomic_memory_order_wrap_store[i64](@|location, value| asm("st.volatile.global.b64 [%0], %1;" : : "l"(location), "l"(value) : "memory")),
		atomic_store_global_u64_coalesced = cuda_legacy_atomic_memory_order_wrap_store[u64](@|location, value| asm("st.volatile.global.b64 [%0], %1;" : : "l"(location), "l"(value) : "memory")),
		atomic_add_global_i32 = cuda_legacy_atomic_memory_order_wrap_rmw[i32](cuda_atomic_add_global_i32),
		atomic_add_global_u32 = cuda_legacy_atomic_memory_order_wrap_rmw[u32](cuda_atomic_add_global_u32),
		atomic_add_global_i64 = cuda_legacy_atomic_memory_order_wrap_rmw[i64](@|location, value| cuda_atomic_add_global_u64(location as &mut addrspace(1) u64, value as u64) as i64),
		atomic_add_global_u64 = cuda_legacy_atomic_memory_order_wrap_rmw[u64](cuda_atomic_add_global_u64),
		atomic_sub_global_i32 = cuda_legacy_atomic_memory_order_wrap_rmw[i32](cuda_atomic_sub_global_i32),
		atomic_sub_global_u32 = cuda_legacy_atomic_memory_order_wrap_rmw[u32](cuda_atomic_sub_global_u32),
		atomic_sub_global_u64 = cuda_legacy_atomic_memory_order_wrap_rmw[u64](cuda_atomic_sub_global_u64),
		atomic_and_global_i32 = cuda_legacy_atomic_memory_order_wrap_rmw[i32](cuda_atomic_and_global_i32),
		atomic_and_global_u32 = cuda_legacy_atomic_memory_order_wrap_rmw[u32](cuda_atomic_and_global_u32),
		atomic_and_global_u64 = cuda_legacy_atomic_memory_order_wrap_rmw[u64](cuda_atomic_and_global_u64),
		atomic_or_global_i32 = cuda_legacy_atomic_memory_order_wrap_rmw[i32](cuda_atomic_or_global_i32),
		atomic_or_global_u32 = cuda_legacy_atomic_memory_order_wrap_rmw[u32](cuda_atomic_or_global_u32),
		atomic_or_global_u64 = cuda_legacy_atomic_memory_order_wrap_rmw[u64](cuda_atomic_or_global_u64),
		atomic_xor_global_i32 = cuda_legacy_atomic_memory_order_wrap_rmw[i32](cuda_atomic_xor_global_i32),
		atomic_xor_global_u32 = cuda_legacy_atomic_memory_order_wrap_rmw[u32](cuda_atomic_xor_global_u32),
		atomic_xor_global_u64 = cuda_legacy_atomic_memory_order_wrap_rmw[u64](cuda_atomic_xor_global_u64),
		atomic_exch_global_i32 = cuda_legacy_atomic_memory_order_wrap_rmw[i32](cuda_atomic_exch_global_i32),
		atomic_exch_global_u32 = cuda_legacy_atomic_memory_order_wrap_rmw[u32](cuda_atomic_exch_global_u32),
		atomic_exch_global_u64 = cuda_legacy_atomic_memory_order_wrap_rmw[u64](cuda_atomic_exch_global_u64),
		atomic_min_global_i32 = cuda_legacy_atomic_memory_order_wrap_rmw[i32](cuda_atomic_min_global_i32),
		atomic_min_global_u32 = cuda_legacy_atomic_memory_order_wrap_rmw[u32](cuda_atomic_min_global_u32),
		atomic_min_global_u64 = cuda_legacy_atomic_memory_order_wrap_rmw[u64](cuda_atomic_min_global_u64),
		atomic_max_global_i32 = cuda_legacy_atomic_memory_order_wrap_rmw[i32](cuda_atomic_max_global_i32),
		atomic_max_global_u32 = cuda_legacy_atomic_memory_order_wrap_rmw[u32](cuda_atomic_max_global_u32),
		atomic_max_global_u64 = cuda_legacy_atomic_memory_order_wrap_rmw[u64](cuda_atomic_max_global_u64),
		atomic_cas_global_i32 = cuda_legacy_atomic_memory_order_wrap_cas[i32](cuda_atomic_cas_global_i32, @|a, b| a == b),
		atomic_cas_global_u32 = cuda_legacy_atomic_memory_order_wrap_cas[u32](cuda_atomic_cas_global_u32, @|a, b| a == b),
		atomic_cas_global_i64 = cuda_legacy_atomic_memory_order_wrap_cas[i64](@|location, expected, desired| cuda_atomic_cas_global_u64(location as &mut addrspace(1) u64, expected as u64, desired as u64) as i64, @|a, b| a == b),
		atomic_cas_global_u64 = cuda_legacy_atomic_memory_order_wrap_cas[u64](cuda_atomic_cas_global_u64, @|a, b| a == b),
		atomic_cas_global_i32_weak = cuda_legacy_atomic_memory_order_wrap_cas[i32](cuda_atomic_cas_global_i32, @|a, b| a == b),
		atomic_cas_global_u32_weak = cuda_legacy_atomic_memory_order_wrap_cas[u32](cuda_atomic_cas_global_u32, @|a, b| a == b),
		atomic_cas_global_i64_weak = cuda_legacy_atomic_memory_order_wrap_cas[i64](@|location, expected, desired| cuda_atomic_cas_global_u64(location as &mut addrspace(1) u64, expected as u64, desired as u64) as i64, @|a, b| a == b),
		atomic_cas_global_u64_weak = cuda_legacy_atomic_memory_order_wrap_cas[u64](cuda_atomic_cas_global_u64, @|a, b| a == b),
		atomic_inc_global_u32 = cuda_atomic_inc_global_u32,
		memory_barrier = cuda_memory_barrier,
		timestamp = @|| cuda_globaltimer() as i64,
		timestamp32 = @|| cuda_globaltimer_lo() as i32,
		wait = @|f, _debug_msg| {
			// spin until f() holds, sleeping 1ns between polls (sm_70+)
			while !f() { sleep(1); }
			// for t in exponential_backoff(2, 128) {
			//	if f() { false } else { sleep(t as u32); true }
			// }
		}
	})
}
// Runs `body` with a `wave_context` describing a sub-warp of `num_threads`
// CUDA lanes selected by `membermask`. `idx` yields this wave's index and
// `gid` a grid-wide wave id used to derive per-thread global ids. Every
// sync/vote/shuffle primitive below is restricted to `membermask`.
fn @cuda_subwarp(idx: fn() -> u32, gid: fn() -> u32, membermask: u32, num_threads: u32, body: fn(wave_context) -> ()) -> () {
    // The lane id is the only meaningful thread coordinate inside a warp;
    // dimensions 1 and 2 collapse to 0.
    let thread_idx = @|i: i32| match i { 0 => cuda_laneid(), _ => 0 };
    @body(wave_context {
        idx = idx,
        membermask = @|| membermask as u64,
        // Per-thread nesting: global thread id = wave id * wave width + lane.
        threads = @|body|@|| cuda_thread(thread_idx, @|| gid() * num_threads + thread_idx(0), body),
        num_threads = @|| num_threads,
        barrier = @|| cuda_warp_sync(membermask),
        barrier_all = @|predicate| cuda_warp_sync_all(membermask, cuda_pred(predicate)) != 0,
        barrier_any = @|predicate| cuda_warp_sync_any(membermask, cuda_pred(predicate)) != 0,
        // Count of lanes with a true predicate via popcount of the ballot.
        barrier_count = @|predicate| cuda_popc_u32(cuda_warp_sync_vote(membermask, cuda_pred(predicate))),
        barrier_vote = @|predicate| cuda_warp_sync_vote(membermask, cuda_pred(predicate)) as u64,
        // activemask = cuda_warp_activemask,
        shfl_i32 = @|x:i32, src_lane:i32, width:u32| cuda_warp_shfl_i32(membermask, x, src_lane, width as i32),
        shfl_u32 = @|x:u32, src_lane:i32, width:u32| cuda_warp_shfl_u32(membermask, x, src_lane, width as i32),
        // shfl_i64 = @|x:i64, src_lane:i32, width:u32| cuda_warp_shfl_i64(membermask, x, src_lane, width as i32),
        // shfl_u64 = @|x:u64, src_lane:i32, width:u32| cuda_warp_shfl_u64(membermask, x, src_lane, width as i32),
        // shfl_f32 = @|x:f32, src_lane:i32, width:u32| cuda_warp_shfl_f32(membermask, x, src_lane, width as i32),
        // shfl_f64 = @|x:f64, src_lane:i32, width:u32| cuda_warp_shfl_f64(membermask, x, src_lane, width as i32),
        shfl_up_i32 = @|x:i32, delta:u32, width:u32| cuda_warp_shfl_up_i32(membermask, x, delta, width as i32),
        shfl_up_u32 = @|x:u32, delta:u32, width:u32| cuda_warp_shfl_up_u32(membermask, x, delta, width as i32),
        // shfl_up_i64 = @|x:i64, delta:u32, width:u32| cuda_warp_shfl_up_i64(membermask, x, delta, width as i32),
        // shfl_up_u64 = @|x:u64, delta:u32, width:u32| cuda_warp_shfl_up_u64(membermask, x, delta, width as i32),
        // shfl_up_f32 = @|x:f32, delta:u32, width:u32| cuda_warp_shfl_up_f32(membermask, x, delta, width as i32),
        // shfl_up_f64 = @|x:f64, delta:u32, width:u32| cuda_warp_shfl_up_f64(membermask, x, delta, width as i32),
        shfl_down_i32 = @|x:i32, delta:u32, width:u32| cuda_warp_shfl_down_i32(membermask, x, delta, width as i32),
        shfl_down_u32 = @|x:u32, delta:u32, width:u32| cuda_warp_shfl_down_u32(membermask, x, delta, width as i32),
        // shfl_down_i64 = @|x:i64, delta:u32, width:u32| cuda_warp_shfl_down_i64(membermask, x, delta, width as i32),
        // shfl_down_u64 = @|x:u64, delta:u32, width:u32| cuda_warp_shfl_down_u64(membermask, x, delta, width as i32),
        // shfl_down_f32 = @|x:f32, delta:u32, width:u32| cuda_warp_shfl_down_f32(membermask, x, delta, width as i32),
        // shfl_down_f64 = @|x:f64, delta:u32, width:u32| cuda_warp_shfl_down_f64(membermask, x, delta, width as i32),
        shfl_bfly_i32 = @|x:i32, lane_mask:i32, width:u32| cuda_warp_shfl_xor_i32(membermask, x, lane_mask, width as i32),
        shfl_bfly_u32 = @|x:u32, lane_mask:i32, width:u32| cuda_warp_shfl_xor_u32(membermask, x, lane_mask, width as i32),
        // shfl_bfly_i64 = @|x:i64, lane_mask:i32, width:u32| cuda_warp_shfl_xor_i64(membermask, x, lane_mask, width as i32),
        // shfl_bfly_u64 = @|x:u64, lane_mask:i32, width:u32| cuda_warp_shfl_xor_u64(membermask, x, lane_mask, width as i32),
        // shfl_bfly_f32 = @|x:f32, lane_mask:i32, width:u32| cuda_warp_shfl_xor_f32(membermask, x, lane_mask, width as i32),
        // shfl_bfly_f64 = @|x:f64, lane_mask:i32, width:u32| cuda_warp_shfl_xor_f64(membermask, x, lane_mask, width as i32),
        // match_any_i32 = @|x:i32| cuda_warp_match_any_i32(membermask, x),
        // match_any_u32 = @|x:u32| cuda_warp_match_any_u32(membermask, x),
        // match_any_i64 = @|x:i64| cuda_warp_match_any_i64(membermask, x),
        // match_any_u64 = @|x:u64| cuda_warp_match_any_u64(membermask, x),
        // match_any_f32 = @|x:f32| cuda_warp_match_any_f32(membermask, x),
        // match_any_f64 = @|x:f64| cuda_warp_match_any_f64(membermask, x),
        // match_all_i32 = @|x:i32, predicate:&mut i32| cuda_warp_match_all_i32(membermask, x, predicate),
        // match_all_u32 = @|x:u32, predicate:&mut i32| cuda_warp_match_all_u32(membermask, x, predicate),
        // match_all_i64 = @|x:i64, predicate:&mut i32| cuda_warp_match_all_i64(membermask, x, predicate),
        // match_all_u64 = @|x:u64, predicate:&mut i32| cuda_warp_match_all_u64(membermask, x, predicate),
        // match_all_f32 = @|x:f32, predicate:&mut i32| cuda_warp_match_all_f32(membermask, x, predicate),
        // match_all_f64 = @|x:f64, predicate:&mut i32| cuda_warp_match_all_f64(membermask, x, predicate),
        // Lane masks are 32-bit on CUDA; widened to u64 for the generic interface.
        lanemask = @|| cuda_lanemask() as u64,
        lanemask_le = @|| cuda_lanemask_le() as u64,
        lanemask_lt = @|| cuda_lanemask_lt() as u64,
        lanemask_ge = @|| cuda_lanemask_ge() as u64,
        lanemask_gt = @|| cuda_lanemask_gt() as u64
    })
}
// Runs `body` with a `group_context` for one CUDA thread block.
// `idx`/`thread_idx` are per-dimension block/thread coordinates, `gid` a
// linear block id used to derive grid-global wave and thread ids, and
// `block_size(i)` the block extent in dimension i.
fn @cuda_block(idx: fn(i32) -> u32, gid: fn() -> u32, thread_idx: fn(i32) -> u32, block_size: fn(i32) -> u32, warp_size: u32, body: fn(group_context) -> ()) -> () {
    // Row-major linearization: z-major, then y, then x.
    let linear_thread_idx = @|| {
        (thread_idx(2) * block_size(1) + thread_idx(1)) * block_size(0) + thread_idx(0)
    };
    let warp_idx = @|| {
        linear_thread_idx() / warp_size
    };
    let num_threads = @|| block_size(0) * block_size(1) * block_size(2);
    // Round up: a partially-filled trailing warp still counts as a warp.
    let num_warps = @|| (num_threads() + warp_size - 1) / warp_size;
    @body(group_context {
        idx = idx,
        waves = @|body|@|| cuda_subwarp(warp_idx, @|| gid() * num_warps() + warp_idx(), get_member_mask_u32(warp_size), warp_size, body),
        threads = @|body|@|| cuda_thread(thread_idx, @|| gid() * num_threads() + linear_thread_idx(), body),
        num_waves = num_warps,
        num_threads = block_size,
        barrier = cuda_block_sync,
        barrier_all = @|predicate| cuda_block_sync_all(cuda_pred(predicate)) != 0,
        barrier_any = @|predicate| cuda_block_sync_any(cuda_pred(predicate)) != 0,
        barrier_count = @|predicate| cuda_block_sync_count(cuda_pred(predicate))
    })
}
// Launches `body` as a CUDA grid on `device` and hands it a `grid_context`.
// Grid/block dimensions that are compile-time constants (tested with `?`)
// are folded in directly; otherwise the runtime gridDim/blockDim intrinsics
// are queried. `wrap_index`/`wrap_dim` let callers fix the dimensionality
// (see wrap_index_1d/2d/3d below) so unused dimensions fold to constants.
fn @cuda_launch(device: i32, (grid_dim_x: i32, grid_dim_y: i32, grid_dim_z: i32), (block_dim_x: i32, block_dim_y: i32, block_dim_z: i32), wrap_index: index_wrapper, wrap_dim: dim_wrapper, body: fn(grid_context) -> ()) -> () {
    // TODO: assert(warp_size == 32)
    let warp_size: u32 = 32;
    let block_size = wrap_dim(@|i: i32| {
        match i {
            0 => if ?block_dim_x { block_dim_x as u32 } else { cuda_blockDim_x() as u32 },
            1 => if ?block_dim_y { block_dim_y as u32 } else { cuda_blockDim_y() as u32 },
            2 => if ?block_dim_z { block_dim_z as u32 } else { cuda_blockDim_z() as u32 },
            _ => 1
        }
    });
    let num_blocks = wrap_dim(@|i: i32| {
        match i {
            0 => if ?grid_dim_x { grid_dim_x as u32 } else { cuda_gridDim_x() as u32 },
            1 => if ?grid_dim_y { grid_dim_y as u32 } else { cuda_gridDim_y() as u32 },
            // BUG FIX: this previously tested `?grid_dim_y` (copy-paste from the
            // line above), so the z component picked between constant and
            // runtime value based on whether *y* was statically known.
            2 => if ?grid_dim_z { grid_dim_z as u32 } else { cuda_gridDim_z() as u32 },
            _ => 1
        }
    });
    let num_threads_per_block = @|| block_size(0) * block_size(1) * block_size(2);
    let num_warps_per_block = @|| (num_threads_per_block() + warp_size - 1) / warp_size;
    let num_warps = @|| (num_blocks(0) * num_blocks(1) * num_blocks(2)) * num_warps_per_block();
    let num_threads = @|i: i32| num_blocks(i) * block_size(i);
    let block_idx = wrap_index(@|i: i32| {
        match i { 0 => cuda_blockIdx_x() as u32, 1 => cuda_blockIdx_y() as u32, 2 => cuda_blockIdx_z() as u32, _ => 0 }
    });
    let linear_block_idx = @|| (block_idx(2) * num_blocks(1) + block_idx(1)) * num_blocks(0) + block_idx(0);
    let thread_idx = wrap_index(@|i: i32| {
        match i { 0 => cuda_threadIdx_x() as u32, 1 => cuda_threadIdx_y() as u32, 2 => cuda_threadIdx_z() as u32, _ => 0 }
    });
    let global_thread_idx = @|i: i32| block_idx(i) * block_size(i) + thread_idx(i);
    let linear_thread_idx = @|| (thread_idx(2) * block_size(1) + thread_idx(1)) * block_size(0) + thread_idx(0);
    let global_linear_thread_idx = @|| linear_block_idx() * num_threads_per_block() + linear_thread_idx();
    let global_warp_idx = @|| linear_block_idx() * num_warps_per_block() + linear_thread_idx() / warp_size;
    // NOTE(review): the first tuple passed to `cuda` is grid_dim * block_dim per
    // axis — presumably the backend expects total threads, not blocks; confirm
    // against the runtime's `cuda` signature.
    cuda(device, (grid_dim_x * block_dim_x, grid_dim_y * block_dim_y, grid_dim_z * block_dim_z), (block_dim_x, block_dim_y, block_dim_z), @|| @body(grid_context {
        device = device,
        max_concurrency = @|| (num_blocks(0) * num_blocks(1) * num_blocks(2)) as i32 * num_threads_per_block() as i32,
        groups = @|body|@|| cuda_block(block_idx, linear_block_idx, thread_idx, block_size, warp_size, body),
        waves = @|body|@|| cuda_subwarp(global_warp_idx, global_warp_idx, get_member_mask_u32(warp_size), warp_size, body),
        threads = @|body|@|| cuda_thread(global_thread_idx, global_linear_thread_idx, body),
        num_groups = num_blocks,
        num_waves = num_warps,
        num_threads = num_threads
    }))
}
// Bitmask with the low `size` bits set (warp membership mask).
// The full-width case is special-cased because shifting by 32 is undefined.
fn @get_member_mask_u32(size: u32) -> u32 {
    if size != 32 { (1 << size) - 1 } else { -1 }
}
// 64-bit variant: mask with the low `size` bits set.
// Shifting a u64 by 64 is undefined, hence the special case.
fn @get_member_mask_u64(size: u32) -> u64 {
    if size != 64 { (1 << size as u64) - 1 } else { -1 }
}
// Adapters that clamp a per-dimension accessor to a fixed dimensionality so
// the partial evaluator can fold away unused dimensions (see wrap_*_1d/2d/3d).
type index_wrapper = fn(fn(i32) -> u32) -> fn(i32) -> u32;
type dim_wrapper = fn(fn(i32) -> u32) -> fn(i32) -> u32;
// 1D index adapter: statically-known dimensions other than 0 fold to 0;
// a runtime dimension falls through to the wrapped accessor.
fn @wrap_index_1d(idx: fn(i32) -> u32) -> fn(i32) -> u32 {
    @|i: i32| -> u32 {
        if ?i {
            if i == 0 { idx(0) } else { 0 }
        }
        else {
            idx(i)
        }
    }
}
// 2D index adapter: dimension 2 and above fold to 0 when `i` is constant.
fn @wrap_index_2d(idx: fn(i32) -> u32) -> fn(i32) -> u32 {
    @|i: i32| -> u32 {
        if ?i {
            if i == 0 { idx(0) } else if i == 1 { idx(1) } else { 0 }
        }
        else {
            idx(i)
        }
    }
}
// 3D index adapter: all three dimensions pass through; anything else is 0.
fn @wrap_index_3d(idx: fn(i32) -> u32) -> fn(i32) -> u32 {
    @|i: i32| -> u32 {
        if ?i {
            if i == 0 { idx(0) } else if i == 1 { idx(1) } else if i == 2 { idx(2) } else { 0 }
        }
        else {
            idx(i)
        }
    }
}
// 1D extent adapter: unused dimensions have extent 1 (not 0), so products
// over all three dimensions stay correct.
fn @wrap_dim_1d(idx: fn(i32) -> u32) -> fn(i32) -> u32 {
    @|i: i32| -> u32 {
        if ?i {
            if i == 0 { idx(0) } else { 1 }
        }
        else {
            idx(i)
        }
    }
}
// 2D extent adapter: dimension 2 and above collapse to extent 1.
fn @wrap_dim_2d(idx: fn(i32) -> u32) -> fn(i32) -> u32 {
    @|i: i32| -> u32 {
        if ?i {
            if i == 0 { idx(0) } else if i == 1 { idx(1) } else { 1 }
        }
        else {
            idx(i)
        }
    }
}
// 3D extent adapter: all three dimensions pass through; others are extent 1.
fn @wrap_dim_3d(idx: fn(i32) -> u32) -> fn(i32) -> u32 {
    @|i: i32| -> u32 {
        if ?i {
            if i == 0 { idx(0) } else if i == 1 { idx(1) } else if i == 2 { idx(2) } else { 1 }
        }
        else {
            idx(i)
        }
    }
}
// Prints `val` as "0x…" by reinterpreting its bytes and walking the nibbles
// from most- to least-significant. When `skip_leading_zeros` is set, zero
// nibbles before the first significant digit are suppressed (the last digit
// is always printed, so 0 renders as "0x0").
// NOTE(review): nibble order assumes the low byte sits at offset 0
// (little-endian host) — confirm for the targets this runs on.
fn @print_hex[T](val: T, skip_leading_zeros: bool) -> () {
    print_char('0'); print_char('x');
    let mut leading = skip_leading_zeros;
    // Two hex digits per byte of T.
    let l = 2 * sizeof[T]() as i32;
    let p = &val as &[u8];
    for i in range(0, l) {
        // j counts nibbles from the most significant end.
        let j = l - i - 1;
        let v:u8 = p(j / 2);
        // High nibble for odd j, low nibble for even j.
        let curr = (v >> (4*(j%2) as u8) & 0xf);
        // Force the final digit out even if everything was zero.
        leading = leading && (curr == 0) && i < l-1;
        if leading {
            continue()
        }
        let off = if curr < 10 { '0' } else { 'A' - 10 };
        print_char(off + curr);
    }
}
// Prints the low 16 bits of `value` most-significant-first, in groups of 4
// separated (and terminated) by a space. The upper 16 bits are not shown.
fn @print_binary(value: u32) -> () {
    let group = 4;
    let length = 16;
    // i runs 1..length; bit weight is 2^(length - i).
    for j in range_step(1, length + 1, group) {
        for i in range(j, j + group) {
            let b = 1 << (length - i) as u32;
            if value & b == b {
                print_char('1');
            } else {
                print_char('0');
            }
        }
        print_char(' ');
    }
}
// Platform-independent handle for one accelerator device, bundling launch,
// synchronization, allocation, and device-side printf-style helpers.
struct AccDevice {
    supports_its: bool,               // independent thread scheduling available
    supports_npot_atomic_inc: bool,   // atomic inc with non-power-of-two wrap
    // launch_1d(body)(grid, block): run `body` with a grid_context.
    launch_1d: fn(fn(grid_context) -> ()) -> fn(i32, i32) -> (),
    synchronize: fn() -> (),
    alloc: fn(i64) -> Buffer,         // device memory allocation (bytes)
    platform_device: i32,             // runtime platform/device id
    platform_name: &[u8],
    // Device-side formatted printing with 1-3 i32 arguments.
    print_i32: fn(&[u8], i32) -> (),
    print_2xi32: fn(&[u8], i32, i32) -> (),
    print_3xi32: fn(&[u8], i32, i32, i32) -> ()
}
// Minimal test harness: `run_test` executes one test body against device
// memory plus a device-side failure flag; `finish` collects the verdict
// (0 on success, negative on failure).
struct BaseTest {
    run_test: fn(fn(&mut addrspace(1) [i8], &mut addrspace(1) u32) -> bool) -> fn() -> (),
    finish: fn() -> i32
}
// Sets up a BaseTest: allocates `additional_mem` bytes of scratch plus a
// trailing u32 failure flag on the device, zeroes the flag, and aggregates
// host-side pass/fail across all run_test invocations.
fn createBaseTest(device: AccDevice, additional_mem: i64) -> BaseTest {
    // Round up so the trailing flag is u32-aligned.
    let memory_size = round_up_i64(additional_mem + sizeof[u32](), sizeof[u32]());
    let device_test_state_alloc = device.alloc(memory_size);
    let device_memory = device_test_state_alloc.data as &mut addrspace(1) [i8];
    // The failure flag lives in the last 4 bytes of the allocation.
    let device_failed_flag = &mut device_memory((memory_size - sizeof[u32]()) as i32) as &mut addrspace(1) u32;
    let gpu_result = runtime_alloc(0, sizeof[u32]()) as &mut u32;
    *gpu_result = 0;
    // Clear the device flag by copying the zeroed host word over.
    runtime_copy(0, gpu_result as &[i8], 0, device.platform_device, device_failed_flag as &mut[i8], 0, sizeof[u32]());
    let mut cpu_result: bool = true;
    BaseTest {
        run_test = @|body|@|| {
            // Non-short-circuit `&` on purpose: `body` must run even after an
            // earlier test already failed.
            cpu_result = cpu_result & body(device_memory, device_failed_flag);
        },
        finish = @|| {
            device.synchronize();
            // Pull the device-side failure bits back to the host.
            runtime_copy(device.platform_device, device_failed_flag as &[i8], 0, 0, gpu_result as &mut[i8], 0, sizeof[u32]());
            print_string("gpu_result ");
            print_binary(*gpu_result);
            print_char('\n');
            print_string("cpu_result ");
            // Printed as an exit-code style value: 0 = pass, 1 = fail.
            print_i32((if cpu_result { 0 } else { 1 }));
            print_char('\n');
            let result = if cpu_result && *gpu_result == 0 { 0 } else { -1 };
            runtime_release(0, gpu_result as &[i8]);
            release(device_test_state_alloc);
            result
        }
    }
}
// Accessors handed to intrinsics tests: read/write local slots, obtain a
// pointer to a global slot, and validate globals/locals after the run.
type ReadFn = fn(i32) -> i32;
type WriteFn = fn(i32, i32) -> ();
type GlobalFn = fn(i32) -> &mut addrspace(1) i32;
type ValidateFn = fn(fn(i32,i32)->bool, fn(i32,i32)->bool, bool) -> bool;
// Harness for intrinsics tests over a set of "global" and "local" i32 slots
// mirrored between host and device (see createIntrinsicsTest).
struct IntrinsicsTest {
    run_test: fn(fn(ReadFn, WriteFn, GlobalFn, &mut addrspace(1) u32) -> bool) -> fn() -> (),
    globals: fn(i32) -> i32,   // host-side copy of global slot `i` after a run
    locals: fn(i32) -> i32,    // host-side copy of local slot `i` after a run
    validate: ValidateFn,      // check all slots with per-slot predicates
    finish: fn() -> i32        // overall verdict; -2 flags validation failure
}
// Builds an IntrinsicsTest on top of createBaseTest. Device scratch holds
// `num_globals` followed by `num_locals` i32 slots; a host mirror buffer is
// (re)initialized before and copied back after every run_test invocation.
fn createIntrinsicsTest(device: AccDevice, num_globals: i32, init_global: fn(i32)->i32, num_locals: i32, init_local: fn(i32)->i32) -> IntrinsicsTest {
    let additional_mem = (num_globals + num_locals) as i64 * sizeof[i32]();
    let test = createBaseTest(device, additional_mem);
    let buffer = alloc_cpu(additional_mem);
    let data = buffer.data as &mut[i32];
    let mut validation_result = true;
    IntrinsicsTest {
        run_test = @|body|@|| {
            // Re-seed the host mirror: globals first, then locals.
            for idx in range(0, num_globals) {
                data(idx) = init_global(idx);
            }
            for idx in range(0, num_locals) {
                data(num_globals + idx) = init_local(idx);
            }
            for device_memory, device_failed_flag in test.run_test() {
                // Upload initial slot values to the device...
                runtime_copy(0, buffer.data, 0, device.platform_device, device_memory as &mut[i8], 0, additional_mem);
                // Locals are addressed past the globals region.
                let read = @|idx:i32| -> i32 { let ptr = device_memory as & addrspace(1) [i32]; ptr(idx + num_globals) };
                let write = @|idx:i32, value:i32| -> () { let ptr = device_memory as &mut addrspace(1) [i32]; ptr(idx + num_globals) = value; };
                let global = @|idx:i32| -> &mut addrspace(1) i32 { let ptr = device_memory as &mut addrspace(1) [i32]; &mut(ptr(idx)) };
                let result = body(read, write, global, device_failed_flag);
                device.synchronize();
                // ...and download the results for host-side validation.
                runtime_copy(device.platform_device, device_memory as &mut[i8], 0, 0, buffer.data, 0, additional_mem);
                result
            }
        },
        globals = @|idx| { data(idx) },
        locals = @|idx| { data(num_globals + idx) },
        validate = @|validate_global: fn(i32,i32)->bool, validate_local: fn(i32,i32)->bool, stop_on_first_fail:bool| -> bool {
            // Inner closure so `return(false)` can abort early on first failure.
            let result = (@|| -> bool {
                let mut res = true;
                for idx in range(0, num_globals) {
                    let check = validate_global(idx, data(idx));
                    res &= check;
                    if !check {
                        print_string("validation failed! global("); print_hex(idx, false); print_string(") = "); print_i32(data(idx)); print_char('\n');
                        if stop_on_first_fail { return(false) }
                    }
                }
                for idx in range(0, num_locals) {
                    let check = validate_local(idx, data(num_globals + idx));
                    res &= check;
                    if !check {
                        print_string("validation failed! local("); print_hex(idx, false); print_string(") = "); print_i32(data(num_globals + idx)); print_char('\n');
                        if stop_on_first_fail { return(false) }
                    }
                }
                res
            })();
            // Remember failures so finish() can report them.
            validation_result &= result;
            result
        },
        finish = @|| {
            release(buffer);
            let test_result = test.finish();
            // -2 distinguishes host-side validation failure from device failure.
            if !validation_result { -2 } else { test_result }
        }
    }
}
// Error continuation: `throw` aborts via the bound continuation, the
// `throw_print_*` variants print a device-side message first.
struct error_cont
{
    throw: fn() -> (),
    throw_print_i32: fn(&[u8], i32) -> (),
    throw_print_2xi32: fn(&[u8], i32, i32) -> (),
    throw_print_3xi32: fn(&[u8], i32, i32, i32) -> ()
}
// Builds an error_cont that prints through `device` and then invokes the
// caller-supplied continuation `cont` (typically an abort/exit path).
static error_handler = @|device: AccDevice, cont: fn() -> ()| error_cont {
    throw = @|| cont(),
    throw_print_i32 = @|format, arg| -> () { device.print_i32(format, arg); cont(); },
    throw_print_2xi32 = @|format, arg_1, arg_2| -> () { device.print_2xi32(format, arg_1, arg_2); cont(); },
    throw_print_3xi32 = @|format, arg_1, arg_2, arg_3| -> () { device.print_3xi32(format, arg_1, arg_2, arg_3); cont(); }
};
// Scalar helpers used throughout the benchmarks.
fn @min(a: i32, b: i32) -> i32 { if b <= a { b } else { a } }
fn @min_i64(a: i64, b: i64) -> i64 { if b <= a { b } else { a } }
fn @max(a: i32, b: i32) -> i32 { if b >= a { b } else { a } }
fn @max_i64(a: i64, b: i64) -> i64 { if b >= a { b } else { a } }
fn @abs(a: i32) -> i32 { if 0 <= a { a } else { -a } }
// Ceiling division.
fn @div_up(a: i32, b: i32) -> i32 { (a + b - 1) / b }
fn @div_up_u32(a: u32, b: u32) -> u32 { (a + b - 1) / b }
fn @div_up_i64(a: i64, b: i64) -> i64 { (a + b - 1) / b }
// Round `num` up/down to the nearest multiple of `multiple`.
fn @round_up(num: i32, multiple: i32) -> i32 { div_up(num, multiple) * multiple }
fn @round_up_u32(num: u32, multiple: u32) -> u32 { div_up_u32(num, multiple) * multiple }
fn @round_up_i64(num: i64, multiple: i64) -> i64 { div_up_i64(num, multiple) * multiple }
fn @round_down(num: i32, multiple: i32) -> i32 { (num / multiple) * multiple }
fn @round_down_u32(num: u32, multiple: u32) -> u32 { (num / multiple) * multiple }
fn @round_down_i64(num: i64, multiple: i64) -> i64 { (num / multiple) * multiple }
// Power-of-two test; the `value != 0` term also guards the u32 wrap in value - 1.
fn @is_pot(value: u32) = ((value & (value - 1)) == 0) && (value != 0);
// Retry loop with 1.5x growing wait parameter: calls body(min) and, while it
// returns true (i.e. "keep waiting"), grows min by factor 3/2 up to max, then
// spins at max. Intended usage: `for t in exponential_backoff(lo, hi) { ... }`.
// NOTE(review): min * 3 / 2 does not grow for min <= 1 — presumably callers
// pass min >= 2; confirm.
fn @exponential_backoff(body: fn(i32) -> bool) {
    fn @loop(min: i32, max: i32) -> () {
        if @body(min) {
            if min < max {
                loop(min * 3 / 2, max);
            }
            else {
                // Saturated: keep polling at the maximum interval.
                while @body(min) {}
            }
        }
    }
    loop
}
// Accumulated benchmark timings (microsecond-scale ticks), reported by
// print_total_timing().
static mut total_kernel_timing: i64 = 0;
static mut total_cpu_timing: i64 = 0;
// Times `body` (followed by `sync`) `num_iter` times using `get_time`,
// prints median | min | max in milliseconds, and returns the median in the
// raw time unit. NOTE(review): assumes num_iter >= 1; num_iter == 0 would
// index an empty buffer.
fn @benchmark(get_time: fn() -> i64, num_iter: i32, body: fn() -> (), sync: fn() -> ()) -> i64 {
    let times_buf = alloc_cpu(num_iter as i64 * sizeof[i64]());
    let times = times_buf.data as &mut[i64];
    for i in range(0, num_iter) {
        let start = get_time();
        body();
        sync(); // include completion of async work in the measurement
        times(i) = get_time() - start;
    }
    // Sort so index num_iter/2 is the median, 0 the min, num_iter-1 the max.
    sort_i64(num_iter, times);
    print_string("Timing: ");
    print_f64(times(num_iter/2) as f64 / 1000.0);
    print_string(" | ");
    print_f64(times(0) as f64 / 1000.0);
    print_string(" | ");
    print_f64(times(num_iter-1) as f64 / 1000.0);
    print_string(" (median(");
    print_i32(num_iter);
    print_string(") | minimum | maximum) ms\n");
    let median = times(num_iter/2);
    release(times_buf);
    median
}
// Default iteration counts for accelerator and CPU benchmarks.
static iter_acc = 30;
static iter_cpu = 270;
// Convenience wrappers that accumulate the median into the global totals.
fn @benchmark_acc( body: fn() -> ()) = @|acc: Accelerator| total_kernel_timing += benchmark(get_kernel_time, iter_acc, body, acc.sync);
fn @benchmark_cpu( body: fn() -> ()) = @|| total_cpu_timing += benchmark(get_micro_time, iter_cpu, body, @||{});
fn @benchmark_cuda(body: fn() -> ()) = @|dev: i32, N: i32| total_kernel_timing += benchmark(get_kernel_time, N, body, @|| synchronize_cuda(dev));
// Prints the accumulated CPU and kernel benchmark totals in milliseconds.
fn print_total_timing() -> () {
    print_string("Total timing for cpu / kernel: ");
    print_f64(total_cpu_timing as f64 / 1000.0);
    print_string(" / ");
    print_f64(total_kernel_timing as f64 / 1000.0);
    print_string(" ms\n")
}
// In-place insertion sort over the first `num` elements: grow a sorted
// prefix one element at a time, shifting larger entries right.
fn @(?num) sort_i64(num: i32, arr: &mut[i64]) -> () {
    for pos in range(1, num) {
        let key = arr(pos);
        let mut slot = pos;
        while slot > 0 && arr(slot - 1) > key {
            arr(slot) = arr(slot - 1);
            slot = slot - 1;
        }
        arr(slot) = key;
    }
}
// In-place insertion sort for f32, mirroring sort_i64.
fn @(?num) sort_f32(num: i32, arr: &mut[f32]) -> () {
    for pos in range(1, num) {
        let key = arr(pos);
        let mut slot = pos;
        while slot > 0 && arr(slot - 1) > key {
            arr(slot) = arr(slot - 1);
            slot = slot - 1;
        }
        arr(slot) = key;
    }
}
fn print_endl() -> () { print_string("\n"); } | |
mod rng {
    // One step of Marsaglia's 32-bit xorshift generator (shifts 13, 17, 5).
    fn @xorshift32(state: u32) -> u32 {
        let a = state ^ (state << 13);
        let b = a ^ (a >> 17);
        b ^ (b << 5)
    }
    // Derives a scrambled per-index seed by double-stepping the generator.
    fn @xorseed32(i: u32) -> u32 {
        xorshift32(xorshift32((i + 23) * 42))
    }
}
// Semicolon-separated CSV writer: `row` runs a body that enumerates the
// row's values through a ValueVisitor, then terminates the line.
struct CSVPrinter {
    row: fn(fn(ValueVisitor) -> ()) -> fn() -> (),
    flush: fn() -> ()
}
fn @make_csv_printer() = CSVPrinter {
    row = @|body|@|| {
        // `b` tracks whether a separator is needed before the next value.
        let mut b = false;
        @body(ValueVisitor {
            enum_string = @|v| { if b { print_char(';'); } print_string(v); b = true; },
            enum_i32 = @|v| { if b { print_char(';'); } print_i32(v); b = true; },
            enum_i64 = @|v| { if b { print_char(';'); } print_i64(v); b = true; },
            enum_f32 = @|v| { if b { print_char(';'); } print_f32(v); b = true; }
        });
        print_char('\n');
    },
    flush = print_flush
};
#[import(cc = "C")] fn FINGERPRINT() -> &[u8]; | |
#[import(cc = "C")] fn parse_int_arg(&mut u8, i32) -> i32; | |
#[import(cc = "C")] fn parse_float_arg(&mut u8, i32) -> f32; | |
#[import(cc = "C")] fn enum_int_arg(&mut u8, &[u8]) -> (); | |
#[import(cc = "C")] fn enum_float_arg(&mut u8, &[u8]) -> (); | |
#[import(cc = "C")] fn throw_usage_error(&[u8]) -> (); | |
// Sequential command-line argument reader; each call consumes the next
// positional argument (see make_cmd_args).
struct CmdArgs {
    parse_int: fn() -> i32,
    parse_float: fn() -> f32
}
// Builds a CmdArgs cursor over `argc` positional arguments held in the
// opaque `argv` context. Running past the end raises a usage error through
// the external handler.
fn @make_cmd_args(argc: i32, argv: &mut u8) {
    let mut cursor = 0;
    let advance = @|| {
        if cursor >= argc {
            throw_usage_error("missing arguments");
        }
        let slot = cursor;
        cursor = cursor + 1;
        slot
    };
    CmdArgs {
        parse_int = @|| parse_int_arg(argv, advance()),
        parse_float = @|| parse_float_arg(argv, advance())
    }
}
// Visitor over a benchmark's declared parameters; each callback returns
// whether enumeration should continue.
struct BenchmarkParamsVisitor {
    visit_int_param: fn(&[u8]) -> bool,
    visit_float_param: fn(&[u8]) -> bool
}
// Chain a parameter visit onto an existing visit function (int variant).
fn wrap_visit_func_int(visit_func: fn(BenchmarkParamsVisitor) -> bool, name: &[u8]) -> fn(BenchmarkParamsVisitor) -> bool {
    @|visitor| if visit_func(visitor) { visitor.visit_int_param(name) } else { false }
}
// Chain a parameter visit onto an existing visit function (float variant).
fn wrap_visit_func_float(visit_func: fn(BenchmarkParamsVisitor) -> bool, name: &[u8]) -> fn(BenchmarkParamsVisitor) -> bool {
    @|visitor| if visit_func(visitor) { visitor.visit_float_param(name) } else { false }
}
// End of the typed parameter-builder chain (7 parameters is the maximum;
// the add_* extensions are intentionally disabled).
struct BenchmarkParams7[A1, A2, A3, A4, A5, A6, A7] {
    // add_int: fn(&[u8]) -> BenchmarkParams8[A1, A2, A3, A4, A5, A6, A7, i32],
    // add_float: fn(&[u8]) -> BenchmarkParams8[A1, A2, A3, A4, A5, A6, A7, f32],
    visit: fn(BenchmarkParamsVisitor) -> BenchmarkParamsVisitor,
    bind: fn(fn(AccDevice, A1, A2, A3, A4, A5, A6, A7) -> i32) -> fn(AccDevice, CmdArgs) -> (i32, CmdArgs)
}
fn @make_cmd_args_7[A1, A2, A3, A4, A5, A6, A7](visit: fn(BenchmarkParamsVisitor) -> bool, bind: fn(fn(AccDevice, A1, A2, A3, A4, A5, A6, A7) -> i32) -> fn(AccDevice, CmdArgs) -> (i32, CmdArgs)) = BenchmarkParams7[A1, A2, A3, A4, A5, A6, A7] {
    // add_int = @|name| make_cmd_args_8[A1, A2, A3, A4, A5, A6, A7, i32](wrap_visit_func_int(visit, name), @|f| @|device, args| bind(@|device, a1, a2, a3, a4, a5, a6, a7| f(device, a1, a2, a3, a4, a5, a6, a7, args.parse_int()))(device, args)),
    // add_float = @|name| make_cmd_args_8[A1, A2, A3, A4, A5, A6, A7, f32](wrap_visit_func_float(visit, name), @|f| @|device, args| bind(@|device, a1, a2, a3, a4, a5, a6, a7| f(device, a1, a2, a3, a4, a5, a6, a7, args.parse_float()))(device, args)),
    visit = @|visitor| { visit(visitor); visitor },
    bind = bind
};
// Builder stage with 6 bound parameter types; add_int/add_float append a
// 7th parameter that is parsed from the command line at bind time.
struct BenchmarkParams6[A1, A2, A3, A4, A5, A6] {
    add_int: fn(&[u8]) -> BenchmarkParams7[A1, A2, A3, A4, A5, A6, i32],
    add_float: fn(&[u8]) -> BenchmarkParams7[A1, A2, A3, A4, A5, A6, f32],
    visit: fn(BenchmarkParamsVisitor) -> BenchmarkParamsVisitor,
    bind: fn(fn(AccDevice, A1, A2, A3, A4, A5, A6) -> i32) -> fn(AccDevice, CmdArgs) -> (i32, CmdArgs)
}
fn @make_cmd_args_6[A1, A2, A3, A4, A5, A6](visit: fn(BenchmarkParamsVisitor) -> bool, bind: fn(fn(AccDevice, A1, A2, A3, A4, A5, A6) -> i32) -> fn(AccDevice, CmdArgs) -> (i32, CmdArgs)) = BenchmarkParams6[A1, A2, A3, A4, A5, A6] {
    add_int = @|name| make_cmd_args_7[A1, A2, A3, A4, A5, A6, i32](wrap_visit_func_int(visit, name), @|f| @|device, args| bind(@|device, a1, a2, a3, a4, a5, a6| f(device, a1, a2, a3, a4, a5, a6, args.parse_int()))(device, args)),
    add_float = @|name| make_cmd_args_7[A1, A2, A3, A4, A5, A6, f32](wrap_visit_func_float(visit, name), @|f| @|device, args| bind(@|device, a1, a2, a3, a4, a5, a6| f(device, a1, a2, a3, a4, a5, a6, args.parse_float()))(device, args)),
    visit = @|visitor| { visit(visitor); visitor },
    bind = bind
};
// Builder stage with 5 bound parameter types (see BenchmarkParams6).
struct BenchmarkParams5[A1, A2, A3, A4, A5] {
    add_int: fn(&[u8]) -> BenchmarkParams6[A1, A2, A3, A4, A5, i32],
    add_float: fn(&[u8]) -> BenchmarkParams6[A1, A2, A3, A4, A5, f32],
    visit: fn(BenchmarkParamsVisitor) -> BenchmarkParamsVisitor,
    bind: fn(fn(AccDevice, A1, A2, A3, A4, A5) -> i32) -> fn(AccDevice, CmdArgs) -> (i32, CmdArgs)
}
fn @make_cmd_args_5[A1, A2, A3, A4, A5](visit: fn(BenchmarkParamsVisitor) -> bool, bind: fn(fn(AccDevice, A1, A2, A3, A4, A5) -> i32) -> fn(AccDevice, CmdArgs) -> (i32, CmdArgs)) = BenchmarkParams5[A1, A2, A3, A4, A5] {
    add_int = @|name| make_cmd_args_6[A1, A2, A3, A4, A5, i32](wrap_visit_func_int(visit, name), @|f| @|device, args| bind(@|device, a1, a2, a3, a4, a5| f(device, a1, a2, a3, a4, a5, args.parse_int()))(device, args)),
    add_float = @|name| make_cmd_args_6[A1, A2, A3, A4, A5, f32](wrap_visit_func_float(visit, name), @|f| @|device, args| bind(@|device, a1, a2, a3, a4, a5| f(device, a1, a2, a3, a4, a5, args.parse_float()))(device, args)),
    visit = @|visitor| { visit(visitor); visitor },
    bind = bind
};
// Builder stage with 4 bound parameter types (see BenchmarkParams6).
struct BenchmarkParams4[A1, A2, A3, A4] {
    add_int: fn(&[u8]) -> BenchmarkParams5[A1, A2, A3, A4, i32],
    add_float: fn(&[u8]) -> BenchmarkParams5[A1, A2, A3, A4, f32],
    visit: fn(BenchmarkParamsVisitor) -> BenchmarkParamsVisitor,
    bind: fn(fn(AccDevice, A1, A2, A3, A4) -> i32) -> fn(AccDevice, CmdArgs) -> (i32, CmdArgs)
}
fn @make_cmd_args_4[A1, A2, A3, A4](visit: fn(BenchmarkParamsVisitor) -> bool, bind: fn(fn(AccDevice, A1, A2, A3, A4) -> i32) -> fn(AccDevice, CmdArgs) -> (i32, CmdArgs)) = BenchmarkParams4[A1, A2, A3, A4] {
    add_int = @|name| make_cmd_args_5[A1, A2, A3, A4, i32](wrap_visit_func_int(visit, name), @|f| @|device, args| bind(@|device, a1, a2, a3, a4| f(device, a1, a2, a3, a4, args.parse_int()))(device, args)),
    add_float = @|name| make_cmd_args_5[A1, A2, A3, A4, f32](wrap_visit_func_float(visit, name), @|f| @|device, args| bind(@|device, a1, a2, a3, a4| f(device, a1, a2, a3, a4, args.parse_float()))(device, args)),
    visit = @|visitor| { visit(visitor); visitor },
    bind = bind
};
// Builder stage with 3 bound parameter types (see BenchmarkParams6).
struct BenchmarkParams3[A1, A2, A3] {
    add_int: fn(&[u8]) -> BenchmarkParams4[A1, A2, A3, i32],
    add_float: fn(&[u8]) -> BenchmarkParams4[A1, A2, A3, f32],
    visit: fn(BenchmarkParamsVisitor) -> BenchmarkParamsVisitor,
    bind: fn(fn(AccDevice, A1, A2, A3) -> i32) -> fn(AccDevice, CmdArgs) -> (i32, CmdArgs)
}
fn @make_cmd_args_3[A1, A2, A3](visit: fn(BenchmarkParamsVisitor) -> bool, bind: fn(fn(AccDevice, A1, A2, A3) -> i32) -> fn(AccDevice, CmdArgs) -> (i32, CmdArgs)) = BenchmarkParams3[A1, A2, A3] {
    add_int = @|name| make_cmd_args_4[A1, A2, A3, i32](wrap_visit_func_int(visit, name), @|f| @|device, args| bind(@|device, a1, a2, a3| f(device, a1, a2, a3, args.parse_int()))(device, args)),
    add_float = @|name| make_cmd_args_4[A1, A2, A3, f32](wrap_visit_func_float(visit, name), @|f| @|device, args| bind(@|device, a1, a2, a3| f(device, a1, a2, a3, args.parse_float()))(device, args)),
    visit = @|visitor| { visit(visitor); visitor },
    bind = bind
};
// Builder stage with 2 bound parameter types (see BenchmarkParams6).
struct BenchmarkParams2[A1, A2] {
    add_int: fn(&[u8]) -> BenchmarkParams3[A1, A2, i32],
    add_float: fn(&[u8]) -> BenchmarkParams3[A1, A2, f32],
    visit: fn(BenchmarkParamsVisitor) -> BenchmarkParamsVisitor,
    bind: fn(fn(AccDevice, A1, A2) -> i32) -> fn(AccDevice, CmdArgs) -> (i32, CmdArgs)
}
fn @make_cmd_args_2[A1, A2](visit: fn(BenchmarkParamsVisitor) -> bool, bind: fn(fn(AccDevice, A1, A2) -> i32) -> fn(AccDevice, CmdArgs) -> (i32, CmdArgs)) = BenchmarkParams2[A1, A2] {
    add_int = @|name| make_cmd_args_3[A1, A2, i32](wrap_visit_func_int(visit, name), @|f| @|device, args| bind(@|device, a1, a2| f(device, a1, a2, args.parse_int()))(device, args)),
    add_float = @|name| make_cmd_args_3[A1, A2, f32](wrap_visit_func_float(visit, name), @|f| @|device, args| bind(@|device, a1, a2| f(device, a1, a2, args.parse_float()))(device, args)),
    visit = @|visitor| { visit(visitor); visitor },
    bind = bind
};
// Builder stage with 1 bound parameter type (see BenchmarkParams6).
struct BenchmarkParams1[A1] {
    add_int: fn(&[u8]) -> BenchmarkParams2[A1, i32],
    add_float: fn(&[u8]) -> BenchmarkParams2[A1, f32],
    visit: fn(BenchmarkParamsVisitor) -> BenchmarkParamsVisitor,
    bind: fn(fn(AccDevice, A1) -> i32) -> fn(AccDevice, CmdArgs) -> (i32, CmdArgs)
}
fn @make_cmd_args_1[A1](visit: fn(BenchmarkParamsVisitor) -> bool, bind: fn(fn(AccDevice, A1) -> i32) -> fn(AccDevice, CmdArgs) -> (i32, CmdArgs)) = BenchmarkParams1[A1] {
    add_int = @|name| make_cmd_args_2[A1, i32](wrap_visit_func_int(visit, name), @|f| @|device, args| bind(@|device, a1| f(device, a1, args.parse_int()))(device, args)),
    add_float = @|name| make_cmd_args_2[A1, f32](wrap_visit_func_float(visit, name), @|f| @|device, args| bind(@|device, a1| f(device, a1, args.parse_float()))(device, args)),
    visit = @|visitor| { visit(visitor); visitor },
    bind = bind
};
// Root of the parameter-builder chain: a benchmark starts here and chains
// add_int/add_float for each command-line parameter, then binds its
// entry point to obtain a fn(AccDevice, CmdArgs) runner.
struct BenchmarkParams {
    add_int: fn(&[u8]) -> BenchmarkParams1[i32],
    add_float: fn(&[u8]) -> BenchmarkParams1[f32],
    visit: fn(BenchmarkParamsVisitor) -> BenchmarkParamsVisitor,
    bind: fn(fn(AccDevice) -> i32) -> fn(AccDevice, CmdArgs) -> (i32, CmdArgs)
}
fn @make_benchmark_params() = BenchmarkParams {
    add_int = @|name| make_cmd_args_1[i32](@|visitor| visitor.visit_int_param(name), @|f| @|device, args| (f(device, args.parse_int()), args)),
    add_float = @|name| make_cmd_args_1[f32](@|visitor| visitor.visit_float_param(name), @|f| @|device, args| (f(device, args.parse_float()), args)),
    visit = @|visitor| visitor,
    bind = @|f| @|device, args| (f(device), args)
};
// Exported: enumerates the benchmark's parameter names/types into the host
// context via the enum_*_arg callbacks. `describe_benchmark_params` is
// supplied by the concrete benchmark translation unit.
#[export]
fn benchmark_enum_args(ctx: &mut u8) {
    let params = describe_benchmark_params(make_benchmark_params());
    params.visit(BenchmarkParamsVisitor {
        visit_int_param = @|name| { enum_int_arg(ctx, name); true },
        visit_float_param = @|name| { enum_float_arg(ctx, name); true },
    });
}
// Exported: prints the device name and build fingerprint; returns 0.
#[export]
fn benchmark_print_info(device: i32) {
    print_string(runtime_device_name(createAccDevice(device).platform_device)); print_char('\n');
    print_string(FINGERPRINT()); print_char('\n');
    0
}
// Exported entry point: counts the declared parameters, checks the argument
// count, binds the benchmark entry point and runs it on `device`.
// Returns the benchmark's result code.
#[export]
fn benchmark_run(device: i32, argc: i32, argv: &mut u8) {
    let params = describe_benchmark_params(make_benchmark_params());
    // Count parameters by visiting them with counting callbacks.
    let num_params = {
        let mut n = 0;
        params.visit(BenchmarkParamsVisitor {
            visit_int_param = @|_name| { n = n + 1; true },
            visit_float_param = @|_name| { n = n + 1; true },
        });
        n
    };
    if argc != num_params {
        throw_usage_error("incorrect number of arguments");
    }
    let run = params.bind(run_benchmark);
    // `.0` drops the threaded CmdArgs and keeps the result code.
    run(createAccDevice(device), make_cmd_args(argc, argv)).0
}
#[import(cc = "C")] fn instrumentation_create(i32) -> &mut u8; | |
#[import(cc = "C")] fn instrumentation_begin(&mut u8) -> (); | |
#[import(cc = "C")] fn instrumentation_end(&mut u8) -> f32; | |
#[import(cc = "C")] fn instrumentation_destroy(&mut u8) -> (); | |
// Host-side timing scope around a body; `time(body)()` returns the elapsed
// time reported by the external instrumentation library.
struct Instrumentation {
    time: fn(fn() -> ()) -> fn() -> f32,
    destroy: fn() -> ()
}
fn @create_instrumentation(device: AccDevice) {
    // `platform_device >> 4` recovers the raw device index from the packed
    // platform/device id (marked as a hack by the original author).
    let ctx = instrumentation_create(device.platform_device >> 4);
    // ^ HACK!
    Instrumentation {
        time = @|body|@|| {
            instrumentation_begin(ctx);
            @body();
            instrumentation_end(ctx)
        },
        destroy = @|| instrumentation_destroy(ctx)
    }
}
// Geometric sweep: calls body(min), body(min*exp), ... while the value stays
// <= max. Fully unrolled when min/max/exp are compile-time constants.
// Usage: `for v in upsweep_exponential(lo, hi, factor) { ... }`.
fn @upsweep_exponential(body: fn(i32) -> ()) {
    fn @(?min & ?max & ?exp) loop(min: i32, max: i32, exp: i32) -> () {
        if min <= max {
            @body(min);
            loop(min * exp, max, exp);
        }
    }
    loop
}
// Typed value sink used by CSV row emission (see make_csv_printer).
struct ValueVisitor {
    enum_string: fn(&[u8]) -> (),
    enum_i32: fn(i32) -> (),
    enum_i64: fn(i64) -> (),
    enum_f32: fn(f32) -> (),
}
// Interface a concrete queue benchmark implements: parameter reporting,
// per-run device-side reset, and the measured run itself.
struct QueueBenchmark {
    enum_param_names: fn(fn(&[u8]) -> ()) -> (),
    enum_param_values: fn(ValueVisitor) -> (),
    reset: fn(grid_context) -> (),
    run: fn(QueueInstrumentationScope, i32, i32) -> ()
}
// Drives one queue benchmark: constructs the queue, emits CSV header rows
// (queue/params, platform info, column names), then for each thread count in
// a doubling sweep runs N=10 timed repetitions, printing one CSV row per
// repetition. Returns 0 on success, -1 if queue construction failed.
fn @run_queue_benchmark(device: AccDevice, block_size: i32, benchmark: QueueBenchmark, queue_name: &[u8], create_queue: queue_constructor[u32], queue_size: i32, num_threads_min: i32, num_threads_max: i32, create_queue_instrumentation: fn(AccDevice) -> QueueInstrumentation) -> i32 {
    match create_queue(device, queue_size) {
        create_queue_result[u32]::Ok(queue) => {
            let instrumentation = create_instrumentation(device);
            let queue_instrumentation = create_queue_instrumentation(device);
            let csv = make_csv_printer();
            // doing this during printing messes with the PE, probably due to the external calls
            let device_name = runtime_device_name(device.platform_device);
            let fingerprint = FINGERPRINT();
            for r in csv.row() { r.enum_string("queue_type"); r.enum_string("queue_size"); benchmark.enum_param_names(r.enum_string); }
            for r in csv.row() { r.enum_string(queue_name); r.enum_i32(queue_size); benchmark.enum_param_values(r); }
            for _ in csv.row() { }
            for r in csv.row() { r.enum_string("platform"); r.enum_string("device_name"); r.enum_string("fingerprint"); }
            for r in csv.row() { r.enum_string(device.platform_name); r.enum_string(device_name); r.enum_string(fingerprint); }
            for _ in csv.row() { }
            for r in csv.row() { r.enum_string("num_threads"); r.enum_string("t/ms"); queue_instrumentation.enum_result_names(r.enum_string); }
            // Repetitions per thread count.
            let N = 10;
            for num_threads in upsweep_exponential(num_threads_min, num_threads_max, 2) {
                for i in range(0, N) {
                    for r in csv.row() {
                        r.enum_i32(num_threads);
                        // TODO: better launch config (launch based on occupancy?)
                        // TODO: the queue reset may require launch config of benchmark
                        // Reset queue, instrumentation and benchmark state on-device.
                        for grid in device.launch_1d(1, block_size) {
                            queue.reset(grid);
                            queue_instrumentation.reset(grid);
                            benchmark.reset(grid);
                        }
                        device.synchronize();
                        // Time the instrumented benchmark run.
                        let t = for instrumentation.time() {
                            for qis in queue_instrumentation.record(queue) {
                                benchmark.run(qis, num_threads, i);
                            }
                        };
                        device.synchronize();
                        r.enum_f32(t);
                        queue_instrumentation.results().enum_values(r);
                    }
                    csv.flush();
                }
            }
            0
        },
        create_queue_result[u32]::Err(msg) => {
            print_string(msg);
            print_endl();
            -1
        }
    }
}
// Aggregate timing for one category of queue operation (durations are in
// ticks of thread.timestamp32 — see wrap_queue_instrumentation).
struct QueueOperationStatistics {
    num_operations: i64,  // number of recorded operations
    t_total: i64,         // sum of all durations
    t_min: i32,           // shortest observed duration
    t_max: i32,           // longest observed duration
}
// Per-benchmark statistics, split by operation kind and success/failure.
struct QueueBenchmarkStatistics {
    enqueue_stats_succ: QueueOperationStatistics,
    enqueue_stats_fail: QueueOperationStatistics,
    dequeue_stats_succ: QueueOperationStatistics,
    dequeue_stats_fail: QueueOperationStatistics
}
// Fresh, empty accumulator: zero counts and totals; t_min starts at the
// largest i32 so the first recorded sample always becomes the minimum.
fn @init_queue_operation_stats() = QueueOperationStatistics {
    num_operations = 0,
    t_total = 0,
    t_min = 2147483647,  // i32 maximum
    t_max = 0
};
// Folds one timed queue operation (duration t, in timestamp ticks) into stats.
// Not thread-safe — intended for a thread-local accumulator, which is merged
// atomically later (see accumulate_queue_operation_stats).
fn @record_queue_operation(stats: &mut QueueOperationStatistics, t: i32) {
    ++stats.num_operations;
    stats.t_total += t as i64;
    stats.t_min = min(stats.t_min, t);
    stats.t_max = max(stats.t_max, t);
}
// All four operation categories start out empty.
fn @init_benchmark_stats() = QueueBenchmarkStatistics {
    enqueue_stats_succ = init_queue_operation_stats(),
    enqueue_stats_fail = init_queue_operation_stats(),
    dequeue_stats_succ = init_queue_operation_stats(),
    dequeue_stats_fail = init_queue_operation_stats(),
};
// Merges a thread-local accumulator into the global one in device memory.
// Relaxed ordering is used for each component; only the final aggregate is
// read back on the host, after synchronization.
fn @accumulate_queue_operation_stats(dest: &mut addrspace(1) QueueOperationStatistics, stats: QueueOperationStatistics, thread: thread_context) {
    thread.atomic_add_global_i64(dest.num_operations, stats.num_operations, memory_order::relaxed);
    thread.atomic_add_global_i64(dest.t_total, stats.t_total, memory_order::relaxed);
    thread.atomic_min_global_i32(dest.t_min, stats.t_min, memory_order::relaxed);
    thread.atomic_max_global_i32(dest.t_max, stats.t_max, memory_order::relaxed);
}
// Accumulates all four categories of a thread-local QueueBenchmarkStatistics.
fn @accumulate_benchmark_stats(dest: &mut addrspace(1) QueueBenchmarkStatistics, stats: QueueBenchmarkStatistics, thread: thread_context) {
    accumulate_queue_operation_stats(&mut dest.enqueue_stats_succ, stats.enqueue_stats_succ, thread);
    accumulate_queue_operation_stats(&mut dest.enqueue_stats_fail, stats.enqueue_stats_fail, thread);
    accumulate_queue_operation_stats(&mut dest.dequeue_stats_succ, stats.dequeue_stats_succ, thread);
    accumulate_queue_operation_stats(&mut dest.dequeue_stats_fail, stats.dequeue_stats_fail, thread);
}
// Decorates a queue so that every push/pop is timed with the per-thread
// timestamp counter and recorded into stats, classified by whether the
// operation actually transferred an element. All other queue operations
// (size/reset/validate/release) pass straight through unchanged.
fn @wrap_queue_instrumentation[T](queue: ProducerConsumerQueue[T], stats: &mut QueueBenchmarkStatistics) -> ProducerConsumerQueue[T] {
    ProducerConsumerQueue[T] {
        push = @|source:fn()->T| @|thread:thread_context| -> i32 {
            let start = thread.timestamp32();
            let pushed = for queue.push(thread) {
                @source()
            };
            let elapsed = thread.timestamp32() - start;
            // a push that enqueued nothing counts as a failed enqueue
            if pushed > 0 {
                record_queue_operation(stats.enqueue_stats_succ, elapsed);
            }
            else {
                record_queue_operation(stats.enqueue_stats_fail, elapsed);
            }
            pushed
        },
        pop = @|sink:fn(T)->()| @|thread:thread_context| -> i32 {
            let start = thread.timestamp32();
            let popped = for el in queue.pop(thread) {
                @sink(el);
            };
            let elapsed = thread.timestamp32() - start;
            // a pop that dequeued nothing counts as a failed dequeue
            if popped > 0 {
                record_queue_operation(stats.dequeue_stats_succ, elapsed);
            }
            else {
                record_queue_operation(stats.dequeue_stats_fail, elapsed);
            }
            popped
        },
        size = queue.size,
        reset = queue.reset,
        validate = queue.validate,
        release = queue.release
    }
}
// Read-back handle for instrumentation results.
struct QueueInstrumentationResults {
    enum_values: fn(ValueVisitor) -> ()  // emits values in the order of enum_result_names
}
// Per-thread recording scope handed to the benchmark: record(body)(thread)
// runs body with the (possibly wrapped) queue for that thread's work.
struct QueueInstrumentationScope {
    record: fn(fn(ProducerConsumerQueue[u32]) -> ()) -> fn(thread_context) -> ()
}
// Host-side instrumentation interface for one benchmark run.
struct QueueInstrumentation {
    enum_result_names: fn(fn(&[u8]) -> ()) -> (),  // column names of the results
    reset: fn(grid_context) -> (),                 // clears device-side counters
    record: fn(fn(QueueInstrumentationScope) -> ()) -> fn(ProducerConsumerQueue[u32]) -> (),
    results: fn() -> QueueInstrumentationResults,  // fetches the accumulated stats
}
// Full instrumentation: keeps a QueueBenchmarkStatistics in device memory,
// wraps the queue per thread so every push/pop is timed, and copies the
// aggregate back to the host in results().
fn @create_queue_instrumentation_full(device: AccDevice) {
    let stats_buffer_alloc = device.alloc(sizeof[QueueBenchmarkStatistics]());
    let stats_buffer = stats_buffer_alloc.data as &mut addrspace(1) QueueBenchmarkStatistics;
    // TODO: release resources
    QueueInstrumentation {
        enum_result_names = enum_queue_instrumentation_result_names,
        reset = @|grid| {
            // a single thread re-initializes the device-side accumulator
            for thread in grid.threads() {
                if thread.idx(0) == 0 {
                    *stats_buffer = init_benchmark_stats();
                }
            }
        },
        record = @|body|@|queue| {
            @body(QueueInstrumentationScope {
                record = @|body|@|thread| {
                    // thread-local stats avoid atomic traffic during the run;
                    // they are merged into the device buffer once, at the end
                    let mut stats = init_benchmark_stats();
                    let q = wrap_queue_instrumentation(queue, &mut stats);
                    @body(q);
                    accumulate_benchmark_stats(stats_buffer, stats, thread);
                }
            });
        },
        results = @|| {
            // copy the device-side aggregate into host memory
            let mut stats: QueueBenchmarkStatistics;
            runtime_copy(device.platform_device, stats_buffer_alloc.data, 0, 0, &mut stats as &mut[i8], 0, sizeof[QueueBenchmarkStatistics]());
            QueueInstrumentationResults {
                enum_values = enum_queue_instrumentation_result_values(stats)
            }
        }
    }
}
// CSV column names for the instrumentation results. The order must stay in
// sync with enum_queue_instrumentation_result_values below.
fn @enum_queue_instrumentation_result_names(enum_name: fn(&[u8]) -> ()) {
    enum_name("num_enqueues"); enum_name("t_enqueues"); enum_name("t_enqueue_min"); enum_name("t_enqueue_max");
    enum_name("num_enqueues_failed"); enum_name("t_enqueues_failed"); enum_name("t_enqueue_failed_min"); enum_name("t_enqueue_failed_max");
    enum_name("num_dequeues"); enum_name("t_dequeues"); enum_name("t_dequeue_min"); enum_name("t_dequeue_max");
    enum_name("num_dequeues_failed"); enum_name("t_dequeues_failed"); enum_name("t_dequeue_failed_min"); enum_name("t_dequeue_failed_max");
}
// Emits the result values in the same order as the column names above
// (enqueue succ/fail, then dequeue succ/fail; count, total, min, max each).
fn @enum_queue_instrumentation_result_values(stats: QueueBenchmarkStatistics) {
    @|v: ValueVisitor| {
        v.enum_i64(stats.enqueue_stats_succ.num_operations); v.enum_i64(stats.enqueue_stats_succ.t_total); v.enum_i32(stats.enqueue_stats_succ.t_min); v.enum_i32(stats.enqueue_stats_succ.t_max);
        v.enum_i64(stats.enqueue_stats_fail.num_operations); v.enum_i64(stats.enqueue_stats_fail.t_total); v.enum_i32(stats.enqueue_stats_fail.t_min); v.enum_i32(stats.enqueue_stats_fail.t_max);
        v.enum_i64(stats.dequeue_stats_succ.num_operations); v.enum_i64(stats.dequeue_stats_succ.t_total); v.enum_i32(stats.dequeue_stats_succ.t_min); v.enum_i32(stats.dequeue_stats_succ.t_max);
        v.enum_i64(stats.dequeue_stats_fail.num_operations); v.enum_i64(stats.dequeue_stats_fail.t_total); v.enum_i32(stats.dequeue_stats_fail.t_min); v.enum_i32(stats.dequeue_stats_fail.t_max);
    }
}
// No-op instrumentation: hands the raw, unwrapped queue to the benchmark and
// reports no result columns — i.e. the timed run has no per-operation
// measurement overhead.
fn @create_queue_instrumentation_none(_device: AccDevice) {
    QueueInstrumentation {
        enum_result_names = @|_| {},
        reset = @|_| {},
        record = @|body|@|queue| {
            @body(QueueInstrumentationScope {
                record = @|body|@|_| {
                    @body(queue);
                }
            });
        },
        results = @|| {
            QueueInstrumentationResults {
                enum_values = @|_| {}
            }
        }
    }
}
// Declares the command-line parameters of this benchmark. The declaration
// order must match the argument order of run_benchmark below.
fn @describe_benchmark_params(params: BenchmarkParams) {
    params.add_int("num-threads-min")
    .add_int("num-threads-max")
    .add_int("block-size")
    .add_int("input-elements")
    .add_int("workload-size-producer")
    .add_int("workload-size-consumer")
}
// Benchmark entry point: runs the simple pipeline benchmark against the
// "CPCQ" concurrent producer/consumer index queue with full per-operation
// instrumentation and a fixed queue capacity of 16384.
fn @run_benchmark(device: AccDevice, num_threads_min: i32, num_threads_max: i32, block_size: i32, num_input_elements: i32, workload_size_producer: i32, workload_size_consumer: i32) {
    run_queue_benchmark(
        device,
        block_size,
        simple_pipeline_benchmark(device, block_size, num_input_elements, workload_size_producer, workload_size_consumer),
        "CPCQ",
        createConcurrentProducerConsumerIndexQueue,
        16384,  // queue capacity in elements
        num_threads_min,
        num_threads_max,
        create_queue_instrumentation_full)
}
// Device-memory layout types of the concurrent producer/consumer queue.
mod CPCQ {
    // Control block: current element count plus head/tail ring indices.
    struct Queue {
        size: i32,
        head: u32,
        tail: u32
    }
    // Ring-buffer slot for the generic element layout: a lock word guarding data.
    struct Element[T] {
        lock: u32,
        data: T
    }
}
// Operations on a single ring-buffer slot.
struct QueueElement[T] {
    clear: fn() -> (),                           // reset the slot to its free state
    is_free: fn() -> bool,                       // true if the slot holds no element
    store: fn(fn() -> T, thread_context) -> (),  // blocking: waits for a free slot, stores source()
    load: fn(fn(T) -> (), thread_context) -> (), // blocking: waits for a full slot, passes value to sink
    debug_print: fn(AccDevice) -> ()             // dumps slot state on validation failure
}
// Strategy for laying out and accessing queue slots in a raw device buffer.
struct QueueElementType[T] {
    buffer_size: fn(i32) -> i64,    // bytes needed for the given element count
    buffer_alignment: fn() -> i64,  // required alignment of the slot buffer
    buffer_element: fn(&mut addrspace(1) [u8], u32) -> QueueElement[T]  // accessor for slot i
}
// note (cc < 7.0): threads within the same warp must only ever either enqueue or dequeue stuff concurrently
// this is fine as long as a warp only ever acts as either a producer or consumer at a time
//
// Ring-buffer producer/consumer queue in device global memory. One allocation
// holds the control block (CPCQ::Queue) followed by the aligned slot buffer.
// `size` is reserved/released optimistically with atomic add/sub (with
// rollback on over-reservation); head/tail advance via atomic_inc wrapping at
// num_elements. `opt = true` inserts seq_cst fences around the control
// atomics and keeps the given orderings; `opt = false` runs fully relaxed.
fn @createConcurrentProducerConsumerQueue[T](device: AccDevice, element_type: QueueElementType[T], num_elements: i32, opt: bool) -> create_queue_result[T] {
    if num_elements < 0 {
        return(create_queue_result[T]::Err("invalid queue size"))
    }
    // atomic_inc with a non-power-of-two wrap value needs hardware support
    if !device.supports_npot_atomic_inc && !is_pot(num_elements as u32) {
        return(create_queue_result[T]::Err("queue size must be power of two"))
    }
    // single allocation: control block first, slot buffer at its required alignment
    let buffer_size = element_type.buffer_size(num_elements);
    let buffer_alignment = element_type.buffer_alignment();
    let buffer_data_offset = round_up_i64(sizeof[CPCQ::Queue](), buffer_alignment);
    let queue_device_state_alloc = device.alloc(buffer_data_offset + buffer_size);
    let queue_device_memory = queue_device_state_alloc.data as &mut addrspace(1) [u8];
    let queue = &mut queue_device_memory(0) as &mut addrspace(1) CPCQ::Queue;
    let buffer = &mut queue_device_memory(buffer_data_offset) as &mut addrspace(1) [u8];
    let buffer_element = @|i:u32| -> QueueElement[T] { element_type.buffer_element(buffer, i) };
    // helpers switching the control atomics between relaxed and fenced operation
    let seq_cst = @|thread: thread_context| if opt { thread.memory_barrier(memory_order::seq_cst); } else {};
    let order = @|x: memory_order| if opt { x } else { memory_order::relaxed };
    create_queue_result[T]::Ok(ProducerConsumerQueue[T] {
        push = @|source| @|thread| {
            seq_cst(thread);
            // cheap pre-check: bail out without an RMW if the queue looks full
            let current_size = thread.atomic_load_global_i32(queue.size, order(memory_order::relaxed));
            if current_size >= num_elements {
                0
            }
            else {
                // optimistically reserve a slot; roll back if we overshot
                let new_size = thread.atomic_add_global_i32(queue.size, 1, order(memory_order::relaxed));
                seq_cst(thread);
                for anyq_verbose() {
                    device.print_3xi32("%d | QUEUE: reserve for push %d -> %d\n", thread.gid() as i32, current_size, new_size);
                }
                if new_size >= num_elements {
                    for anyq_verbose() {
                        device.print_2xi32("%d | QUEUE: reject push %d\n", thread.gid() as i32, new_size);
                    }
                    seq_cst(thread);
                    thread.atomic_sub_global_i32(queue.size, 1, order(memory_order::relaxed));
                    seq_cst(thread);
                    0
                }
                else {
                    seq_cst(thread);
                    // claim a tail index; atomic_inc wraps back to 0 past num_elements - 1
                    let i = thread.atomic_inc_global_u32(queue.tail, (num_elements - 1) as u32);
                    seq_cst(thread);
                    for anyq_verbose() {
                        device.print_2xi32("%d | QUEUE: move tail %d\n", thread.gid() as i32, i as i32);
                    }
                    // the slot's store blocks until the previous occupant was consumed
                    buffer_element(i).store(source, thread);
                    1
                }
            }
        },
        pop = @|sink| @|thread| {
            seq_cst(thread);
            // cheap pre-check: bail out without an RMW if the queue looks empty
            let current_size = thread.atomic_load_global_i32(queue.size, order(memory_order::relaxed));
            if current_size <= 0 {
                0
            }
            else {
                // optimistically take an element; roll back if there was none
                let available = thread.atomic_sub_global_i32(queue.size, 1, order(memory_order::relaxed));
                seq_cst(thread);
                for anyq_verbose() {
                    device.print_3xi32("%d | QUEUE: allocate for pop %d -> %d\n", thread.gid() as i32, current_size, available);
                }
                if available <= 0 {
                    for anyq_verbose() {
                        device.print_2xi32("%d | QUEUE: reject pop %d\n", thread.gid() as i32, available);
                    }
                    seq_cst(thread);
                    thread.atomic_add_global_i32(queue.size, 1, order(memory_order::relaxed));
                    seq_cst(thread);
                    0
                }
                else {
                    seq_cst(thread);
                    // claim a head index; wraps like tail above
                    let i = thread.atomic_inc_global_u32(queue.head, (num_elements - 1) as u32);
                    seq_cst(thread);
                    for anyq_verbose() {
                        device.print_2xi32("%d | QUEUE: move head %d\n", thread.gid() as i32, i as i32);
                    }
                    // the slot's load blocks until the producer finished writing it
                    buffer_element(i).load(sink, thread);
                    1
                }
            }
        },
        size = @|thread| {
            thread.atomic_load_global_i32(queue.size, memory_order::relaxed)
        },
        reset = @|grid| {
            for thread in grid.threads() {
                if thread.idx(0) == 0 {
                    queue.size = 0;
                    queue.head = 0;
                    queue.tail = 0;
                }
                // grid-stride clear of all slots
                for i in range_step(thread.idx(0) as i32, num_elements, grid.num_threads(0) as i32) {
                    buffer_element(i as u32).clear();
                }
            }
        },
        validate = @|corrupted, grid| {
            // NOTE(review): each thread checks at most one slot — presumably the
            // validation launch covers at least num_elements threads; confirm at call site
            for thread in grid.threads() {
                let idx = thread.idx(0);
                if idx == 0 {
                    if (queue.size != 0) {
                        device.print_i32("VALIDATION ERROR: queue size (%d) is not zero!\n", queue.size);
                        thread.atomic_store_global_u32(corrupted, 1, memory_order::relaxed);
                    }
                }
                if idx < num_elements as u32 {
                    let element = buffer_element(idx);
                    if !element.is_free() {
                        thread.atomic_store_global_u32(corrupted, 1, memory_order::relaxed);
                        element.debug_print(device);
                    }
                }
            }
        },
        release = @|| {
            release(queue_device_state_alloc);
        }
    })
}
// Slot layout for arbitrary T: each slot is a CPCQ::Element[T] whose lock
// word encodes the slot state (0 = free, 1 = being written, 2 = full,
// 3 = being read), driven by the CAS transitions below.
fn @genericQueueElementType[T]() {
    let element_alignment = alignof[CPCQ::Element[T]]();
    let element_size = round_up_i64(sizeof[CPCQ::Element[T]](), element_alignment);
    QueueElementType[T] {
        buffer_size = @|num_elements| num_elements as i64 * element_size,
        buffer_alignment = @|| element_alignment,
        buffer_element = @|buffer, i| {
            let element = &mut (buffer as &mut addrspace(1) [CPCQ::Element[T]])(i);
            QueueElement[T] {
                clear = @|| element.lock = 0,
                is_free = @|| element.lock == 0,
                load = @|sink, thread| {
                    // spin until the slot is full (2 -> 3: mark as being read)
                    thread.wait(@|| thread.atomic_cas_global_u32_weak(element.lock, 2, 3, memory_order::acquire, memory_order::relaxed).1, "wait for element lock to read");
                    sink(element.data);
                    // hand the slot back to producers
                    thread.atomic_store_global_u32(element.lock, 0, memory_order::release);
                },
                store = @|source, thread| {
                    // spin until the slot is free (0 -> 1: mark as being written)
                    thread.wait(@|| thread.atomic_cas_global_u32_weak(element.lock, 0, 1, memory_order::acquire, memory_order::relaxed).1, "wait for element lock to write");
                    element.data = source();
                    // publish the slot as full
                    thread.atomic_store_global_u32(element.lock, 2, memory_order::release);
                },
                debug_print = @|device| {
                    device.print_2xi32("VALIDATION ERROR: inconsistent queue state: buffer[%d] = %d\n", i as i32, element.lock as i32);
                }
            }
        }
    }
}
// Slot layout specialized for u32 payloads: the slot IS the value, with
// 0xFFFFFFFF (FREE) as the empty sentinel — so payload value 0xFFFFFFFF must
// never be enqueued. No separate lock word is needed.
// NOTE(review): here `opt` forces all orderings up to seq_cst, the inverse of
// the queue's `order` helper (which relaxes when !opt) — presumably
// intentional pairing, confirm against the Opt constructor below.
fn @indexQueueElementType(opt: bool) = QueueElementType[u32] {
    buffer_size = @|num_elements| num_elements as i64 * sizeof[u32](),
    buffer_alignment = @|| alignof[u32](),
    buffer_element = @|buffer, i| {
        let FREE = -1 as u32;  // 0xFFFFFFFF empty-slot sentinel
        let element = &mut (buffer as &mut addrspace(1) [u32])(i);
        let order = @|x: memory_order| if opt { memory_order::seq_cst } else { x };
        QueueElement[u32] {
            clear = @|| *element = FREE,
            is_free = @|| *element == FREE,
            load = @|sink, thread| {
                // spin: swap FREE in; a non-FREE result is the consumed value
                thread.wait(@|| {
                    let el = thread.atomic_exch_global_u32(element, FREE, order(memory_order::relaxed));
                    if el != FREE {
                        sink(el);
                        true
                    }
                    else {
                        false
                    }
                }, "wait for element");
            },
            store = @|source, thread| {
                let value = source();
                // spin: CAS the value into the slot once it becomes FREE
                thread.wait(@|| thread.atomic_cas_global_u32_weak(element, FREE, value, order(memory_order::relaxed), order(memory_order::relaxed)).1, "wait for a successful indexQueueElementType.store()");
            },
            debug_print = @|device| {
                device.print_2xi32("VALIDATION ERROR: inconsistent queue state: buffer[%d] = %d\n", i as i32, *element as i32);
            }
        }
    }
};
// Convenience constructors pairing the queue with a slot layout.
// Relaxed u32 index queue (no extra fences).
fn @createConcurrentProducerConsumerIndexQueue(device: AccDevice, num_elements: i32) = createConcurrentProducerConsumerQueue[u32](device, indexQueueElementType(false), num_elements, false);
// Strongly-ordered u32 index queue (seq_cst element atomics + fences).
fn @createConcurrentProducerConsumerIndexQueueOpt(device: AccDevice, num_elements: i32) = createConcurrentProducerConsumerQueue[u32](device, indexQueueElementType(true), num_elements, true);
// Generic-payload queue using the lock-word slot layout, relaxed ordering.
fn @createConcurrentProducerConsumerQueueGeneric[T](device: AccDevice, num_elements: i32) = createConcurrentProducerConsumerQueue[T](device, genericQueueElementType[T](), num_elements, false);
// Producer/consumer pipeline benchmark. Each wave dynamically decides per
// iteration whether to drain (pop a value, simulate consumer work, count it
// in completed_elements) or produce (take a ticket from input_elements,
// simulate producer work, push a value). The run terminates once all
// num_input_elements have been consumed.
fn @simple_pipeline_benchmark(device: AccDevice, block_size: i32, num_input_elements: i32, workload_size_producer: i32, workload_size_consumer: i32) {
    // BUGFIX: the pipeline state holds TWO i32 counters (input_elements at
    // index 0, completed_elements at index 1); the previous allocation of
    // 1 * sizeof[i32]() made every access to completed_elements read/write
    // past the end of the buffer.
    let pipeline_state_alloc = device.alloc(2 * sizeof[i32]());
    let pipeline_state = pipeline_state_alloc.data as &mut addrspace(1) [i32];
    let input_elements = &mut pipeline_state(0);      // production tickets remaining
    let completed_elements = &mut pipeline_state(1);  // elements fully consumed
    // TODO: release resources
    QueueBenchmark {
        enum_param_names = @|enum_param| {
            enum_param("block_size"); enum_param("num_input_elements"); enum_param("workload_size_producer"); enum_param("workload_size_consumer");
        },
        enum_param_values = @|v| {
            v.enum_i32(block_size); v.enum_i32(num_input_elements); v.enum_i32(workload_size_producer); v.enum_i32(workload_size_consumer);
        },
        reset = @|grid| {
            // one thread re-initializes both counters
            for thread in grid.threads() {
                if thread.idx(0) == 0 {
                    *input_elements = num_input_elements;
                    *completed_elements = 0;
                }
            }
        },
        run = @|queue_instrumentation, num_threads, i| {
            for grid in device.launch_1d(div_up(num_threads, block_size), block_size) {
                for wave in grid.waves() {
                    for thread in wave.threads() {
                        let thread_id = wave.idx() * wave.num_threads() + thread.idx(0);
                        if thread_id < num_threads as u32 {
                            for q in queue_instrumentation.record(thread) {
                                // per-thread RNG state, seeded differently per repetition i
                                let mut next_value = rng::xorseed32(thread_id + i as u32 * num_threads as u32);
                                let simulate_workload = @|workload_size: i32| {
                                    for _ in range(0, workload_size) {
                                        next_value = rng::xorshift32(next_value);
                                    }
                                    next_value
                                };
                                while (thread.atomic_load_global_i32(completed_elements, memory_order::relaxed) < num_input_elements) {
                                    // drain when the queue holds a full wave's worth of
                                    // elements or there is no input left to produce
                                    let should_drain = @|thread: thread_context| q.size(thread) >= wave.num_threads() as i32 || thread.atomic_load_global_i32(input_elements, memory_order::relaxed) <= 0;
                                    // lane 0 decides for the whole wave so the wave acts
                                    // uniformly as either producer or consumer
                                    if wave.barrier_any(if thread.idx(0) == 0 { should_drain(thread) } else { false }) {
                                        if for value in q.pop(thread) {
                                            next_value = value;
                                        } > 0 {
                                            simulate_workload(workload_size_consumer);
                                            thread.atomic_add_global_i32(completed_elements, 1, memory_order::relaxed);
                                        }
                                    }
                                    else {
                                        // a positive previous value means this thread owns
                                        // one element to produce
                                        if thread.atomic_sub_global_i32(input_elements, 1, memory_order::relaxed) > 0 {
                                            next_value = simulate_workload(workload_size_producer);
                                            // retry until the push succeeds
                                            while for q.push(thread) {
                                                next_value
                                            } < 1 {};
                                        }
                                        else {
                                            // NOTE(review): adds 0 — a no-op RMW on the exhausted
                                            // counter; possibly meant to undo the decrement above
                                            // (add 1)? Left as-is; the drain condition tolerates
                                            // negative values, so behavior is unchanged. Confirm intent.
                                            thread.atomic_add_global_i32(input_elements, 0, memory_order::relaxed);
                                        }
                                    }
                                }
                            }
                        }
                    }
                }
            }
        }
    }
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment