Created
May 11, 2023 14:55
-
-
Save alexanderguzhva/1f7dc7352e71824fd4a3b668ec5c9750 to your computer and use it in GitHub Desktop.
raft version with compose op
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
cvt.u32.u64 %r6, %rd5; | |
cvt.u32.u64 %r7, %rd4; | |
rem.u32 %r8, %r7, %r6; | |
cvt.u64.u32 %rd38, %r8; | |
$L__BB141_4: | |
add.s64 %rd9, %rd4, 1; | |
and.b64 %rd27, %rd9, -4294967296; | |
setp.eq.s64 %p3, %rd27, 0; | |
@%p3 bra $L__BB141_6; | |
rem.u64 %rd39, %rd9, %rd5; | |
bra.uni $L__BB141_7; | |
$L__BB141_6: | |
cvt.u32.u64 %r9, %rd5; | |
cvt.u32.u64 %r10, %rd9; | |
rem.u32 %r11, %r10, %r9; | |
cvt.u64.u32 %rd39, %r11; | |
$L__BB141_7: | |
cvt.u32.u64 %r12, %rd39; | |
cvt.u32.u64 %r13, %rd38; | |
shl.b64 %rd28, %rd4, 2; | |
add.s64 %rd29, %rd1, %rd28; | |
st.global.v2.u32 [%rd29], {%r13, %r12}; | |
$L__BB141_8: | |
setp.le.u64 %p4, %rd3, %rd2; | |
@%p4 bra $L__BB141_10; | |
cvt.u32.u64 %r14, %rd2; | |
rem.u32 %r15, %r14, %r1; | |
shl.b64 %rd30, %rd2, 2; | |
add.s64 %rd31, %rd1, %rd30; | |
st.global.u32 [%rd31], %r15; | |
$L__BB141_10: | |
sub.s64 %rd32, %rd18, %rd3; | |
and.b64 %rd33, %rd32, 1; | |
sub.s64 %rd34, %rd18, %rd33; | |
add.s64 %rd13, %rd34, %rd2; | |
setp.ge.u64 %p5, %rd13, %rd18; | |
@%p5 bra $L__BB141_15; | |
cvt.u64.u32 %rd14, %r1; | |
and.b64 %rd35, %rd13, -4294967296; | |
setp.eq.s64 %p6, %rd35, 0; | |
@%p6 bra $L__BB141_13; | |
rem.u64 %rd40, %rd13, %rd14; | |
bra.uni $L__BB141_14; | |
$L__BB141_13: | |
cvt.u32.u64 %r16, %rd14; | |
cvt.u32.u64 %r17, %rd13; | |
rem.u32 %r18, %r17, %r16; | |
cvt.u64.u32 %rd40, %r18; | |
$L__BB141_14: | |
shl.b64 %rd36, %rd13, 2; | |
add.s64 %rd37, %rd1, %rd36; | |
st.global.u32 [%rd37], %rd40; | |
$L__BB141_15: | |
ret; | |
} | |
// map_kernel<1, false, float, unsigned long,
//            compose_op<plug_const_op<float, mul_op>, cast_op<float>>, float>
// (namespace raft::linalg::detail; demangled from the Itanium-ABI symbol below)
//
// One element per thread: out[i] = c * in[i], with i = ctaid.x * ntid.x + tid.x.
// Threads with i >= n return without touching memory.
//
// Parameters:
//   param_0  u64  output pointer (f32 elements)
//   param_1  u64  element count n
//   param_2  12-byte by-value object; only the f32 at byte offset 4 is read (c)
//   param_3  u64  input pointer (f32 elements)
.visible .entry _ZN4raft6linalg6detail10map_kernelILi1ELb0EfmNS_10compose_opIJNS_13plug_const_opIfNS_6mul_opEEENS_7cast_opIfEEEEEJfEEEvPT1_T2_T3_DpPKT4_(
    .param .u64 _ZN4raft6linalg6detail10map_kernelILi1ELb0EfmNS_10compose_opIJNS_13plug_const_opIfNS_6mul_opEEENS_7cast_opIfEEEEEJfEEEvPT1_T2_T3_DpPKT4__param_0,
    .param .u64 _ZN4raft6linalg6detail10map_kernelILi1ELb0EfmNS_10compose_opIJNS_13plug_const_opIfNS_6mul_opEEENS_7cast_opIfEEEEEJfEEEvPT1_T2_T3_DpPKT4__param_1,
    .param .align 4 .b8 _ZN4raft6linalg6detail10map_kernelILi1ELb0EfmNS_10compose_opIJNS_13plug_const_opIfNS_6mul_opEEENS_7cast_opIfEEEEEJfEEEvPT1_T2_T3_DpPKT4__param_2[12],
    .param .u64 _ZN4raft6linalg6detail10map_kernelILi1ELb0EfmNS_10compose_opIJNS_13plug_const_opIfNS_6mul_opEEENS_7cast_opIfEEEEEJfEEEvPT1_T2_T3_DpPKT4__param_3
)
{
    .reg .pred %p<2>;
    .reg .b16 %rs<10>;                      // declared but never referenced in this kernel
    .reg .f32 %f<4>;
    .reg .b32 %r<5>;
    .reg .b64 %rd<10>;

    // Load arguments: %rd2 = out, %rd4 = n, %rd3 = in, %f1 = c.
    ld.param.u64 %rd2, [_ZN4raft6linalg6detail10map_kernelILi1ELb0EfmNS_10compose_opIJNS_13plug_const_opIfNS_6mul_opEEENS_7cast_opIfEEEEEJfEEEvPT1_T2_T3_DpPKT4__param_0];
    ld.param.u64 %rd4, [_ZN4raft6linalg6detail10map_kernelILi1ELb0EfmNS_10compose_opIJNS_13plug_const_opIfNS_6mul_opEEENS_7cast_opIfEEEEEJfEEEvPT1_T2_T3_DpPKT4__param_1];
    ld.param.u64 %rd3, [_ZN4raft6linalg6detail10map_kernelILi1ELb0EfmNS_10compose_opIJNS_13plug_const_opIfNS_6mul_opEEENS_7cast_opIfEEEEEJfEEEvPT1_T2_T3_DpPKT4__param_3];
    ld.param.f32 %f1, [_ZN4raft6linalg6detail10map_kernelILi1ELb0EfmNS_10compose_opIJNS_13plug_const_opIfNS_6mul_opEEENS_7cast_opIfEEEEEJfEEEvPT1_T2_T3_DpPKT4__param_2+4];

    // i = ctaid.x * ntid.x + tid.x, computed in 32 bits and zero-extended.
    mov.u32 %r1, %ctaid.x;
    mov.u32 %r2, %ntid.x;
    mov.u32 %r3, %tid.x;
    mad.lo.s32 %r4, %r1, %r2, %r3;
    cvt.u64.u32 %rd1, %r4;

    // Bounds check: nothing to do when i >= n.
    setp.ge.u64 %p1, %rd1, %rd4;
    @%p1 bra $L__BB142_2;

    // out[i] = c * in[i]; the same byte offset (i * 4) is used for both arrays.
    cvta.to.global.u64 %rd5, %rd3;
    shl.b64 %rd6, %rd1, 2;
    add.s64 %rd7, %rd5, %rd6;
    ld.global.f32 %f2, [%rd7];
    mul.ftz.f32 %f3, %f1, %f2;
    cvta.to.global.u64 %rd8, %rd2;
    add.s64 %rd9, %rd8, %rd6;
    st.global.f32 [%rd9], %f3;
$L__BB142_2:
    ret;
}
// map_kernel<4, false, float, unsigned long,
//            compose_op<plug_const_op<float, mul_op>, cast_op<float>>, float>
// (namespace raft::linalg::detail; demangled from the Itanium-ABI symbol below)
//
// Computes out[i] = c * in[i] over n f32 elements, four per thread in the main
// body. With t = ctaid.x * ntid.x + tid.x the work is split into three parts:
//   head : h = min(f32 slots before the next 16-byte boundary of the output
//          pointer, n); thread t handles element t when t < h.
//   body : thread t handles elements [h + 4t, h + 4t + 4) with one v4 load and
//          one v4 store, provided h + 4t + 4 <= n.
//   tail : thread t handles element n - ((n - h) & 3) + t when that index < n.
//
// Parameters:
//   param_0  u64  output pointer (f32 elements)
//   param_1  u64  element count n
//   param_2  12-byte by-value object; only the f32 at byte offset 4 is read (c)
//   param_3  u64  input pointer (f32 elements)
//
// NOTE(review): only the output pointer's alignment is examined here; the v4
// load presumably relies on the input having the same 16-byte phase -- confirm
// against the launcher.
.visible .entry _ZN4raft6linalg6detail10map_kernelILi4ELb0EfmNS_10compose_opIJNS_13plug_const_opIfNS_6mul_opEEENS_7cast_opIfEEEEEJfEEEvPT1_T2_T3_DpPKT4_(
    .param .u64 _ZN4raft6linalg6detail10map_kernelILi4ELb0EfmNS_10compose_opIJNS_13plug_const_opIfNS_6mul_opEEENS_7cast_opIfEEEEEJfEEEvPT1_T2_T3_DpPKT4__param_0,
    .param .u64 _ZN4raft6linalg6detail10map_kernelILi4ELb0EfmNS_10compose_opIJNS_13plug_const_opIfNS_6mul_opEEENS_7cast_opIfEEEEEJfEEEvPT1_T2_T3_DpPKT4__param_1,
    .param .align 4 .b8 _ZN4raft6linalg6detail10map_kernelILi4ELb0EfmNS_10compose_opIJNS_13plug_const_opIfNS_6mul_opEEENS_7cast_opIfEEEEEJfEEEvPT1_T2_T3_DpPKT4__param_2[12],
    .param .u64 _ZN4raft6linalg6detail10map_kernelILi4ELb0EfmNS_10compose_opIJNS_13plug_const_opIfNS_6mul_opEEENS_7cast_opIfEEEEEJfEEEvPT1_T2_T3_DpPKT4__param_3
)
{
    .reg .pred %p<4>;
    .reg .f32 %f<14>;
    .reg .b32 %r<5>;
    .reg .b64 %rd<28>;

    // Load arguments: %rd9 = out, %rd7 = n, %rd8 = in; %rd1/%rd2 = global-space in/out.
    ld.param.u64 %rd9, [_ZN4raft6linalg6detail10map_kernelILi4ELb0EfmNS_10compose_opIJNS_13plug_const_opIfNS_6mul_opEEENS_7cast_opIfEEEEEJfEEEvPT1_T2_T3_DpPKT4__param_0];
    ld.param.u64 %rd7, [_ZN4raft6linalg6detail10map_kernelILi4ELb0EfmNS_10compose_opIJNS_13plug_const_opIfNS_6mul_opEEENS_7cast_opIfEEEEEJfEEEvPT1_T2_T3_DpPKT4__param_1];
    ld.param.u64 %rd8, [_ZN4raft6linalg6detail10map_kernelILi4ELb0EfmNS_10compose_opIJNS_13plug_const_opIfNS_6mul_opEEENS_7cast_opIfEEEEEJfEEEvPT1_T2_T3_DpPKT4__param_3];
    cvta.to.global.u64 %rd1, %rd8;
    cvta.to.global.u64 %rd2, %rd9;

    // t = ctaid.x * ntid.x + tid.x (32-bit, zero-extended into %rd3).
    mov.u32 %r1, %ntid.x;
    mov.u32 %r2, %ctaid.x;
    mov.u32 %r3, %tid.x;
    mad.lo.s32 %r4, %r2, %r1, %r3;
    cvt.u64.u32 %rd3, %r4;

    // h (%rd4) = min((round_up(out, 16) - out) / 4, n).
    add.s64 %rd10, %rd9, 15;
    and.b64 %rd11, %rd10, -16;
    sub.s64 %rd12, %rd11, %rd9;
    shr.s64 %rd13, %rd12, 2;
    min.u64 %rd4, %rd13, %rd7;

    // %rd5 = h + 4t: first element of this thread's vector group.
    mul.wide.u32 %rd14, %r4, 4;
    add.s64 %rd5, %rd4, %rd14;
    ld.param.f32 %f1, [_ZN4raft6linalg6detail10map_kernelILi4ELb0EfmNS_10compose_opIJNS_13plug_const_opIfNS_6mul_opEEENS_7cast_opIfEEEEEJfEEEvPT1_T2_T3_DpPKT4__param_2+4];

    // Body: skip unless the full group of four lies inside [0, n).
    add.s64 %rd15, %rd5, 4;
    setp.gt.u64 %p1, %rd15, %rd7;
    @%p1 bra $L__BB143_2;
    shl.b64 %rd17, %rd5, 2;
    add.s64 %rd16, %rd8, %rd17;             // address built from the raw param value, not %rd1
    ld.global.nc.v4.f32 {%f2,%f3,%f4,%f5}, [%rd16];
    mul.ftz.f32 %f6, %f1, %f5;
    mul.ftz.f32 %f7, %f1, %f4;
    mul.ftz.f32 %f8, %f1, %f3;
    mul.ftz.f32 %f9, %f1, %f2;
    add.s64 %rd18, %rd2, %rd17;
    st.global.v4.f32 [%rd18], {%f9, %f8, %f7, %f6};   // lane order matches the load
$L__BB143_2:
    // Head: scalar element t for t < h.
    setp.le.u64 %p2, %rd4, %rd3;
    @%p2 bra $L__BB143_4;
    shl.b64 %rd19, %rd3, 2;
    add.s64 %rd20, %rd1, %rd19;
    ld.global.f32 %f10, [%rd20];
    mul.ftz.f32 %f11, %f1, %f10;
    add.s64 %rd21, %rd2, %rd19;
    st.global.f32 [%rd21], %f11;
$L__BB143_4:
    // Tail: %rd6 = n - ((n - h) & 3) + t; scalar element when %rd6 < n.
    sub.s64 %rd22, %rd7, %rd4;
    and.b64 %rd23, %rd22, 3;
    sub.s64 %rd24, %rd7, %rd23;
    add.s64 %rd6, %rd24, %rd3;
    setp.ge.u64 %p3, %rd6, %rd7;
    @%p3 bra $L__BB143_6;
    shl.b64 %rd25, %rd6, 2;
    add.s64 %rd26, %rd1, %rd25;
    ld.global.f32 %f12, [%rd26];
    mul.ftz.f32 %f13, %f1, %f12;
    add.s64 %rd27, %rd2, %rd25;
    st.global.f32 [%rd27], %f13;
$L__BB143_6:
    ret;
}
// map_kernel<2, false, float, unsigned long,
//            compose_op<plug_const_op<float, mul_op>, cast_op<float>>, float>
// (namespace raft::linalg::detail; demangled from the Itanium-ABI symbol below)
//
// Computes out[i] = c * in[i] over n f32 elements, two per thread in the main
// body. With t = ctaid.x * ntid.x + tid.x the work is split into three parts:
//   head : h = min(f32 slots before the next 8-byte boundary of the output
//          pointer, n); thread t handles element t when t < h.
//   body : thread t handles elements h + 2t and h + 2t + 1 with one v2 load and
//          one v2 store, provided h + 2t + 2 <= n.
//   tail : thread t handles element n - ((n - h) & 1) + t when that index < n.
//
// Parameters:
//   param_0  u64  output pointer (f32 elements)
//   param_1  u64  element count n
//   param_2  12-byte by-value object; only the f32 at byte offset 4 is read (c)
//   param_3  u64  input pointer (f32 elements)
//
// NOTE(review): only the output pointer's alignment is examined here; the v2
// load presumably relies on the input having the same 8-byte phase -- confirm
// against the launcher.
.visible .entry _ZN4raft6linalg6detail10map_kernelILi2ELb0EfmNS_10compose_opIJNS_13plug_const_opIfNS_6mul_opEEENS_7cast_opIfEEEEEJfEEEvPT1_T2_T3_DpPKT4_(
    .param .u64 _ZN4raft6linalg6detail10map_kernelILi2ELb0EfmNS_10compose_opIJNS_13plug_const_opIfNS_6mul_opEEENS_7cast_opIfEEEEEJfEEEvPT1_T2_T3_DpPKT4__param_0,
    .param .u64 _ZN4raft6linalg6detail10map_kernelILi2ELb0EfmNS_10compose_opIJNS_13plug_const_opIfNS_6mul_opEEENS_7cast_opIfEEEEEJfEEEvPT1_T2_T3_DpPKT4__param_1,
    .param .align 4 .b8 _ZN4raft6linalg6detail10map_kernelILi2ELb0EfmNS_10compose_opIJNS_13plug_const_opIfNS_6mul_opEEENS_7cast_opIfEEEEEJfEEEvPT1_T2_T3_DpPKT4__param_2[12],
    .param .u64 _ZN4raft6linalg6detail10map_kernelILi2ELb0EfmNS_10compose_opIJNS_13plug_const_opIfNS_6mul_opEEENS_7cast_opIfEEEEEJfEEEvPT1_T2_T3_DpPKT4__param_3
)
{
    .reg .pred %p<4>;
    .reg .f32 %f<10>;
    .reg .b32 %r<5>;
    .reg .b64 %rd<28>;

    // Load arguments: %rd9 = out, %rd7 = n, %rd8 = in; %rd1/%rd2 = global-space in/out.
    ld.param.u64 %rd9, [_ZN4raft6linalg6detail10map_kernelILi2ELb0EfmNS_10compose_opIJNS_13plug_const_opIfNS_6mul_opEEENS_7cast_opIfEEEEEJfEEEvPT1_T2_T3_DpPKT4__param_0];
    ld.param.u64 %rd7, [_ZN4raft6linalg6detail10map_kernelILi2ELb0EfmNS_10compose_opIJNS_13plug_const_opIfNS_6mul_opEEENS_7cast_opIfEEEEEJfEEEvPT1_T2_T3_DpPKT4__param_1];
    ld.param.u64 %rd8, [_ZN4raft6linalg6detail10map_kernelILi2ELb0EfmNS_10compose_opIJNS_13plug_const_opIfNS_6mul_opEEENS_7cast_opIfEEEEEJfEEEvPT1_T2_T3_DpPKT4__param_3];
    cvta.to.global.u64 %rd1, %rd8;
    cvta.to.global.u64 %rd2, %rd9;

    // t = ctaid.x * ntid.x + tid.x (32-bit, zero-extended into %rd3).
    mov.u32 %r1, %ntid.x;
    mov.u32 %r2, %ctaid.x;
    mov.u32 %r3, %tid.x;
    mad.lo.s32 %r4, %r2, %r1, %r3;
    cvt.u64.u32 %rd3, %r4;

    // h (%rd4) = min((round_up(out, 8) - out) / 4, n).
    add.s64 %rd10, %rd9, 7;
    and.b64 %rd11, %rd10, -8;
    sub.s64 %rd12, %rd11, %rd9;
    shr.s64 %rd13, %rd12, 2;
    min.u64 %rd4, %rd13, %rd7;

    // %rd5 = h + 2t: first element of this thread's vector pair.
    mul.wide.u32 %rd14, %r4, 2;
    add.s64 %rd5, %rd4, %rd14;
    ld.param.f32 %f1, [_ZN4raft6linalg6detail10map_kernelILi2ELb0EfmNS_10compose_opIJNS_13plug_const_opIfNS_6mul_opEEENS_7cast_opIfEEEEEJfEEEvPT1_T2_T3_DpPKT4__param_2+4];

    // Body: skip unless both elements of the pair lie inside [0, n).
    add.s64 %rd15, %rd5, 2;
    setp.gt.u64 %p1, %rd15, %rd7;
    @%p1 bra $L__BB144_2;
    shl.b64 %rd17, %rd5, 2;
    add.s64 %rd16, %rd8, %rd17;             // address built from the raw param value, not %rd1
    ld.global.nc.v2.f32 {%f2,%f3}, [%rd16];
    mul.ftz.f32 %f4, %f1, %f3;
    mul.ftz.f32 %f5, %f1, %f2;
    add.s64 %rd18, %rd2, %rd17;
    st.global.v2.f32 [%rd18], {%f5, %f4};   // lane order matches the load
$L__BB144_2:
    // Head: scalar element t for t < h.
    setp.le.u64 %p2, %rd4, %rd3;
    @%p2 bra $L__BB144_4;
    shl.b64 %rd19, %rd3, 2;
    add.s64 %rd20, %rd1, %rd19;
    ld.global.f32 %f6, [%rd20];
    mul.ftz.f32 %f7, %f1, %f6;
    add.s64 %rd21, %rd2, %rd19;
    st.global.f32 [%rd21], %f7;
$L__BB144_4:
    // Tail: %rd6 = n - ((n - h) & 1) + t; scalar element when %rd6 < n.
    sub.s64 %rd22, %rd7, %rd4;
    and.b64 %rd23, %rd22, 1;
    sub.s64 %rd24, %rd7, %rd23;
    add.s64 %rd6, %rd24, %rd3;
    setp.ge.u64 %p3, %rd6, %rd7;
    @%p3 bra $L__BB144_6;
    shl.b64 %rd25, %rd6, 2;
    add.s64 %rd26, %rd1, %rd25;
    ld.global.f32 %f8, [%rd26];
    mul.ftz.f32 %f9, %f1, %f8;
    add.s64 %rd27, %rd2, %rd25;
    st.global.f32 [%rd27], %f9;
$L__BB144_6:
    ret;
}
// map_kernel<1, false, float, unsigned long,
//            compose_op<plug_const_op<float, mul_op>, sqrt_op, cast_op<float>>, float>
// (namespace raft::linalg::detail; demangled from the Itanium-ABI symbol below)
//
// One element per thread: out[i] = c * sqrt(in[i]), with
// i = ctaid.x * ntid.x + tid.x. The square root is the approximate,
// flush-to-zero form (sqrt.approx.ftz). Threads with i >= n return immediately.
//
// Parameters:
//   param_0  u64  output pointer (f32 elements)
//   param_1  u64  element count n
//   param_2  12-byte by-value object; only the f32 at byte offset 4 is read (c)
//   param_3  u64  input pointer (f32 elements)
.visible .entry _ZN4raft6linalg6detail10map_kernelILi1ELb0EfmNS_10compose_opIJNS_13plug_const_opIfNS_6mul_opEEENS_7sqrt_opENS_7cast_opIfEEEEEJfEEEvPT1_T2_T3_DpPKT4_(
    .param .u64 _ZN4raft6linalg6detail10map_kernelILi1ELb0EfmNS_10compose_opIJNS_13plug_const_opIfNS_6mul_opEEENS_7sqrt_opENS_7cast_opIfEEEEEJfEEEvPT1_T2_T3_DpPKT4__param_0,
    .param .u64 _ZN4raft6linalg6detail10map_kernelILi1ELb0EfmNS_10compose_opIJNS_13plug_const_opIfNS_6mul_opEEENS_7sqrt_opENS_7cast_opIfEEEEEJfEEEvPT1_T2_T3_DpPKT4__param_1,
    .param .align 4 .b8 _ZN4raft6linalg6detail10map_kernelILi1ELb0EfmNS_10compose_opIJNS_13plug_const_opIfNS_6mul_opEEENS_7sqrt_opENS_7cast_opIfEEEEEJfEEEvPT1_T2_T3_DpPKT4__param_2[12],
    .param .u64 _ZN4raft6linalg6detail10map_kernelILi1ELb0EfmNS_10compose_opIJNS_13plug_const_opIfNS_6mul_opEEENS_7sqrt_opENS_7cast_opIfEEEEEJfEEEvPT1_T2_T3_DpPKT4__param_3
)
{
    .reg .pred %p<2>;
    .reg .b16 %rs<13>;                      // declared but never referenced in this kernel
    .reg .f32 %f<5>;
    .reg .b32 %r<5>;
    .reg .b64 %rd<10>;

    // Load arguments: %rd2 = out, %rd4 = n, %rd3 = in, %f1 = c.
    ld.param.u64 %rd2, [_ZN4raft6linalg6detail10map_kernelILi1ELb0EfmNS_10compose_opIJNS_13plug_const_opIfNS_6mul_opEEENS_7sqrt_opENS_7cast_opIfEEEEEJfEEEvPT1_T2_T3_DpPKT4__param_0];
    ld.param.u64 %rd4, [_ZN4raft6linalg6detail10map_kernelILi1ELb0EfmNS_10compose_opIJNS_13plug_const_opIfNS_6mul_opEEENS_7sqrt_opENS_7cast_opIfEEEEEJfEEEvPT1_T2_T3_DpPKT4__param_1];
    ld.param.u64 %rd3, [_ZN4raft6linalg6detail10map_kernelILi1ELb0EfmNS_10compose_opIJNS_13plug_const_opIfNS_6mul_opEEENS_7sqrt_opENS_7cast_opIfEEEEEJfEEEvPT1_T2_T3_DpPKT4__param_3];
    ld.param.f32 %f1, [_ZN4raft6linalg6detail10map_kernelILi1ELb0EfmNS_10compose_opIJNS_13plug_const_opIfNS_6mul_opEEENS_7sqrt_opENS_7cast_opIfEEEEEJfEEEvPT1_T2_T3_DpPKT4__param_2+4];

    // i = ctaid.x * ntid.x + tid.x, computed in 32 bits and zero-extended.
    mov.u32 %r1, %ctaid.x;
    mov.u32 %r2, %ntid.x;
    mov.u32 %r3, %tid.x;
    mad.lo.s32 %r4, %r1, %r2, %r3;
    cvt.u64.u32 %rd1, %r4;

    // Bounds check: nothing to do when i >= n.
    setp.ge.u64 %p1, %rd1, %rd4;
    @%p1 bra $L__BB145_2;

    // out[i] = c * sqrt(in[i]); the same byte offset (i * 4) serves both arrays.
    cvta.to.global.u64 %rd5, %rd3;
    shl.b64 %rd6, %rd1, 2;
    add.s64 %rd7, %rd5, %rd6;
    ld.global.f32 %f2, [%rd7];
    sqrt.approx.ftz.f32 %f3, %f2;
    mul.ftz.f32 %f4, %f1, %f3;
    cvta.to.global.u64 %rd8, %rd2;
    add.s64 %rd9, %rd8, %rd6;
    st.global.f32 [%rd9], %f4;
$L__BB145_2:
    ret;
}
// map_kernel<4, false, float, unsigned long,
//            compose_op<plug_const_op<float, mul_op>, sqrt_op, cast_op<float>>, float>
// (namespace raft::linalg::detail; demangled from the Itanium-ABI symbol below)
//
// Computes out[i] = c * sqrt(in[i]) over n f32 elements (sqrt.approx.ftz), four
// per thread in the main body. With t = ctaid.x * ntid.x + tid.x:
//   head : h = min(f32 slots before the next 16-byte boundary of the output
//          pointer, n); thread t handles element t when t < h.
//   body : thread t handles elements [h + 4t, h + 4t + 4) with one v4 load and
//          one v4 store, provided h + 4t + 4 <= n.
//   tail : thread t handles element n - ((n - h) & 3) + t when that index < n.
//
// Parameters:
//   param_0  u64  output pointer (f32 elements)
//   param_1  u64  element count n
//   param_2  12-byte by-value object; only the f32 at byte offset 4 is read (c)
//   param_3  u64  input pointer (f32 elements)
//
// NOTE(review): only the output pointer's alignment is examined here; the v4
// load presumably relies on the input having the same 16-byte phase -- confirm
// against the launcher.
.visible .entry _ZN4raft6linalg6detail10map_kernelILi4ELb0EfmNS_10compose_opIJNS_13plug_const_opIfNS_6mul_opEEENS_7sqrt_opENS_7cast_opIfEEEEEJfEEEvPT1_T2_T3_DpPKT4_(
    .param .u64 _ZN4raft6linalg6detail10map_kernelILi4ELb0EfmNS_10compose_opIJNS_13plug_const_opIfNS_6mul_opEEENS_7sqrt_opENS_7cast_opIfEEEEEJfEEEvPT1_T2_T3_DpPKT4__param_0,
    .param .u64 _ZN4raft6linalg6detail10map_kernelILi4ELb0EfmNS_10compose_opIJNS_13plug_const_opIfNS_6mul_opEEENS_7sqrt_opENS_7cast_opIfEEEEEJfEEEvPT1_T2_T3_DpPKT4__param_1,
    .param .align 4 .b8 _ZN4raft6linalg6detail10map_kernelILi4ELb0EfmNS_10compose_opIJNS_13plug_const_opIfNS_6mul_opEEENS_7sqrt_opENS_7cast_opIfEEEEEJfEEEvPT1_T2_T3_DpPKT4__param_2[12],
    .param .u64 _ZN4raft6linalg6detail10map_kernelILi4ELb0EfmNS_10compose_opIJNS_13plug_const_opIfNS_6mul_opEEENS_7sqrt_opENS_7cast_opIfEEEEEJfEEEvPT1_T2_T3_DpPKT4__param_3
)
{
    .reg .pred %p<4>;
    .reg .f32 %f<20>;
    .reg .b32 %r<5>;
    .reg .b64 %rd<28>;

    // Load arguments: %rd9 = out, %rd7 = n, %rd8 = in; %rd1/%rd2 = global-space in/out.
    ld.param.u64 %rd9, [_ZN4raft6linalg6detail10map_kernelILi4ELb0EfmNS_10compose_opIJNS_13plug_const_opIfNS_6mul_opEEENS_7sqrt_opENS_7cast_opIfEEEEEJfEEEvPT1_T2_T3_DpPKT4__param_0];
    ld.param.u64 %rd7, [_ZN4raft6linalg6detail10map_kernelILi4ELb0EfmNS_10compose_opIJNS_13plug_const_opIfNS_6mul_opEEENS_7sqrt_opENS_7cast_opIfEEEEEJfEEEvPT1_T2_T3_DpPKT4__param_1];
    ld.param.u64 %rd8, [_ZN4raft6linalg6detail10map_kernelILi4ELb0EfmNS_10compose_opIJNS_13plug_const_opIfNS_6mul_opEEENS_7sqrt_opENS_7cast_opIfEEEEEJfEEEvPT1_T2_T3_DpPKT4__param_3];
    cvta.to.global.u64 %rd1, %rd8;
    cvta.to.global.u64 %rd2, %rd9;

    // t = ctaid.x * ntid.x + tid.x (32-bit, zero-extended into %rd3).
    mov.u32 %r1, %ntid.x;
    mov.u32 %r2, %ctaid.x;
    mov.u32 %r3, %tid.x;
    mad.lo.s32 %r4, %r2, %r1, %r3;
    cvt.u64.u32 %rd3, %r4;

    // h (%rd4) = min((round_up(out, 16) - out) / 4, n).
    add.s64 %rd10, %rd9, 15;
    and.b64 %rd11, %rd10, -16;
    sub.s64 %rd12, %rd11, %rd9;
    shr.s64 %rd13, %rd12, 2;
    min.u64 %rd4, %rd13, %rd7;

    // %rd5 = h + 4t: first element of this thread's vector group.
    mul.wide.u32 %rd14, %r4, 4;
    add.s64 %rd5, %rd4, %rd14;
    ld.param.f32 %f1, [_ZN4raft6linalg6detail10map_kernelILi4ELb0EfmNS_10compose_opIJNS_13plug_const_opIfNS_6mul_opEEENS_7sqrt_opENS_7cast_opIfEEEEEJfEEEvPT1_T2_T3_DpPKT4__param_2+4];

    // Body: skip unless the full group of four lies inside [0, n).
    add.s64 %rd15, %rd5, 4;
    setp.gt.u64 %p1, %rd15, %rd7;
    @%p1 bra $L__BB146_2;
    shl.b64 %rd17, %rd5, 2;
    add.s64 %rd16, %rd8, %rd17;             // address built from the raw param value, not %rd1
    ld.global.nc.v4.f32 {%f2,%f3,%f4,%f5}, [%rd16];
    sqrt.approx.ftz.f32 %f6, %f2;
    sqrt.approx.ftz.f32 %f7, %f3;
    sqrt.approx.ftz.f32 %f8, %f4;
    sqrt.approx.ftz.f32 %f9, %f5;
    mul.ftz.f32 %f10, %f1, %f9;
    mul.ftz.f32 %f11, %f1, %f8;
    mul.ftz.f32 %f12, %f1, %f7;
    mul.ftz.f32 %f13, %f1, %f6;
    add.s64 %rd18, %rd2, %rd17;
    st.global.v4.f32 [%rd18], {%f13, %f12, %f11, %f10};   // lane order matches the load
$L__BB146_2:
    // Head: scalar element t for t < h.
    setp.le.u64 %p2, %rd4, %rd3;
    @%p2 bra $L__BB146_4;
    shl.b64 %rd19, %rd3, 2;
    add.s64 %rd20, %rd1, %rd19;
    ld.global.f32 %f14, [%rd20];
    sqrt.approx.ftz.f32 %f15, %f14;
    mul.ftz.f32 %f16, %f1, %f15;
    add.s64 %rd21, %rd2, %rd19;
    st.global.f32 [%rd21], %f16;
$L__BB146_4:
    // Tail: %rd6 = n - ((n - h) & 3) + t; scalar element when %rd6 < n.
    sub.s64 %rd22, %rd7, %rd4;
    and.b64 %rd23, %rd22, 3;
    sub.s64 %rd24, %rd7, %rd23;
    add.s64 %rd6, %rd24, %rd3;
    setp.ge.u64 %p3, %rd6, %rd7;
    @%p3 bra $L__BB146_6;
    shl.b64 %rd25, %rd6, 2;
    add.s64 %rd26, %rd1, %rd25;
    ld.global.f32 %f17, [%rd26];
    sqrt.approx.ftz.f32 %f18, %f17;
    mul.ftz.f32 %f19, %f1, %f18;
    add.s64 %rd27, %rd2, %rd25;
    st.global.f32 [%rd27], %f19;
$L__BB146_6:
    ret;
}
// map_kernel<2, false, float, unsigned long,
//            compose_op<plug_const_op<float, mul_op>, sqrt_op, cast_op<float>>, float>
// (namespace raft::linalg::detail; demangled from the Itanium-ABI symbol below)
//
// Computes out[i] = c * sqrt(in[i]) over n f32 elements (sqrt.approx.ftz), two
// per thread in the main body. With t = ctaid.x * ntid.x + tid.x:
//   head : h = min(f32 slots before the next 8-byte boundary of the output
//          pointer, n); thread t handles element t when t < h.
//   body : thread t handles elements h + 2t and h + 2t + 1 with one v2 load and
//          one v2 store, provided h + 2t + 2 <= n.
//   tail : thread t handles element n - ((n - h) & 1) + t when that index < n.
//
// Parameters:
//   param_0  u64  output pointer (f32 elements)
//   param_1  u64  element count n
//   param_2  12-byte by-value object; only the f32 at byte offset 4 is read (c)
//   param_3  u64  input pointer (f32 elements)
//
// NOTE(review): only the output pointer's alignment is examined here; the v2
// load presumably relies on the input having the same 8-byte phase -- confirm
// against the launcher.
.visible .entry _ZN4raft6linalg6detail10map_kernelILi2ELb0EfmNS_10compose_opIJNS_13plug_const_opIfNS_6mul_opEEENS_7sqrt_opENS_7cast_opIfEEEEEJfEEEvPT1_T2_T3_DpPKT4_(
    .param .u64 _ZN4raft6linalg6detail10map_kernelILi2ELb0EfmNS_10compose_opIJNS_13plug_const_opIfNS_6mul_opEEENS_7sqrt_opENS_7cast_opIfEEEEEJfEEEvPT1_T2_T3_DpPKT4__param_0,
    .param .u64 _ZN4raft6linalg6detail10map_kernelILi2ELb0EfmNS_10compose_opIJNS_13plug_const_opIfNS_6mul_opEEENS_7sqrt_opENS_7cast_opIfEEEEEJfEEEvPT1_T2_T3_DpPKT4__param_1,
    .param .align 4 .b8 _ZN4raft6linalg6detail10map_kernelILi2ELb0EfmNS_10compose_opIJNS_13plug_const_opIfNS_6mul_opEEENS_7sqrt_opENS_7cast_opIfEEEEEJfEEEvPT1_T2_T3_DpPKT4__param_2[12],
    .param .u64 _ZN4raft6linalg6detail10map_kernelILi2ELb0EfmNS_10compose_opIJNS_13plug_const_opIfNS_6mul_opEEENS_7sqrt_opENS_7cast_opIfEEEEEJfEEEvPT1_T2_T3_DpPKT4__param_3
)
{
    .reg .pred %p<4>;
    .reg .f32 %f<14>;
    .reg .b32 %r<5>;
    .reg .b64 %rd<28>;

    // Load arguments: %rd9 = out, %rd7 = n, %rd8 = in; %rd1/%rd2 = global-space in/out.
    ld.param.u64 %rd9, [_ZN4raft6linalg6detail10map_kernelILi2ELb0EfmNS_10compose_opIJNS_13plug_const_opIfNS_6mul_opEEENS_7sqrt_opENS_7cast_opIfEEEEEJfEEEvPT1_T2_T3_DpPKT4__param_0];
    ld.param.u64 %rd7, [_ZN4raft6linalg6detail10map_kernelILi2ELb0EfmNS_10compose_opIJNS_13plug_const_opIfNS_6mul_opEEENS_7sqrt_opENS_7cast_opIfEEEEEJfEEEvPT1_T2_T3_DpPKT4__param_1];
    ld.param.u64 %rd8, [_ZN4raft6linalg6detail10map_kernelILi2ELb0EfmNS_10compose_opIJNS_13plug_const_opIfNS_6mul_opEEENS_7sqrt_opENS_7cast_opIfEEEEEJfEEEvPT1_T2_T3_DpPKT4__param_3];
    cvta.to.global.u64 %rd1, %rd8;
    cvta.to.global.u64 %rd2, %rd9;

    // t = ctaid.x * ntid.x + tid.x (32-bit, zero-extended into %rd3).
    mov.u32 %r1, %ntid.x;
    mov.u32 %r2, %ctaid.x;
    mov.u32 %r3, %tid.x;
    mad.lo.s32 %r4, %r2, %r1, %r3;
    cvt.u64.u32 %rd3, %r4;

    // h (%rd4) = min((round_up(out, 8) - out) / 4, n).
    add.s64 %rd10, %rd9, 7;
    and.b64 %rd11, %rd10, -8;
    sub.s64 %rd12, %rd11, %rd9;
    shr.s64 %rd13, %rd12, 2;
    min.u64 %rd4, %rd13, %rd7;

    // %rd5 = h + 2t: first element of this thread's vector pair.
    mul.wide.u32 %rd14, %r4, 2;
    add.s64 %rd5, %rd4, %rd14;
    ld.param.f32 %f1, [_ZN4raft6linalg6detail10map_kernelILi2ELb0EfmNS_10compose_opIJNS_13plug_const_opIfNS_6mul_opEEENS_7sqrt_opENS_7cast_opIfEEEEEJfEEEvPT1_T2_T3_DpPKT4__param_2+4];

    // Body: skip unless both elements of the pair lie inside [0, n).
    add.s64 %rd15, %rd5, 2;
    setp.gt.u64 %p1, %rd15, %rd7;
    @%p1 bra $L__BB147_2;
    shl.b64 %rd17, %rd5, 2;
    add.s64 %rd16, %rd8, %rd17;             // address built from the raw param value, not %rd1
    ld.global.nc.v2.f32 {%f2,%f3}, [%rd16];
    sqrt.approx.ftz.f32 %f4, %f2;
    sqrt.approx.ftz.f32 %f5, %f3;
    mul.ftz.f32 %f6, %f1, %f5;
    mul.ftz.f32 %f7, %f1, %f4;
    add.s64 %rd18, %rd2, %rd17;
    st.global.v2.f32 [%rd18], {%f7, %f6};   // lane order matches the load
$L__BB147_2:
    // Head: scalar element t for t < h.
    setp.le.u64 %p2, %rd4, %rd3;
    @%p2 bra $L__BB147_4;
    shl.b64 %rd19, %rd3, 2;
    add.s64 %rd20, %rd1, %rd19;
    ld.global.f32 %f8, [%rd20];
    sqrt.approx.ftz.f32 %f9, %f8;
    mul.ftz.f32 %f10, %f1, %f9;
    add.s64 %rd21, %rd2, %rd19;
    st.global.f32 [%rd21], %f10;
$L__BB147_4:
    // Tail: %rd6 = n - ((n - h) & 1) + t; scalar element when %rd6 < n.
    sub.s64 %rd22, %rd7, %rd4;
    and.b64 %rd23, %rd22, 1;
    sub.s64 %rd24, %rd7, %rd23;
    add.s64 %rd6, %rd24, %rd3;
    setp.ge.u64 %p3, %rd6, %rd7;
    @%p3 bra $L__BB147_6;
    shl.b64 %rd25, %rd6, 2;
    add.s64 %rd26, %rd1, %rd25;
    ld.global.f32 %f11, [%rd26];
    sqrt.approx.ftz.f32 %f12, %f11;
    mul.ftz.f32 %f13, %f1, %f12;
    add.s64 %rd27, %rd2, %rd25;
    st.global.f32 [%rd27], %f13;
$L__BB147_6:
    ret;
}
// map_kernel<1, false, float, unsigned long,
//            compose_op<plug_const_op<float, mul_op>, cast_op<float>>, __half>
// (namespace raft::linalg::detail; demangled from the Itanium-ABI symbol below)
//
// One element per thread: out[i] = c * float(in[i]), with
// i = ctaid.x * ntid.x + tid.x. The input is read as 16-bit values and widened
// with cvt.f32.f16; the output is f32. Threads with i >= n return immediately.
//
// Parameters:
//   param_0  u64  output pointer (f32 elements, 4-byte stride)
//   param_1  u64  element count n
//   param_2  12-byte by-value object; only the f32 at byte offset 4 is read (c)
//   param_3  u64  input pointer (f16 elements, 2-byte stride)
.visible .entry _ZN4raft6linalg6detail10map_kernelILi1ELb0EfmNS_10compose_opIJNS_13plug_const_opIfNS_6mul_opEEENS_7cast_opIfEEEEEJ6__halfEEEvPT1_T2_T3_DpPKT4_(
    .param .u64 _ZN4raft6linalg6detail10map_kernelILi1ELb0EfmNS_10compose_opIJNS_13plug_const_opIfNS_6mul_opEEENS_7cast_opIfEEEEEJ6__halfEEEvPT1_T2_T3_DpPKT4__param_0,
    .param .u64 _ZN4raft6linalg6detail10map_kernelILi1ELb0EfmNS_10compose_opIJNS_13plug_const_opIfNS_6mul_opEEENS_7cast_opIfEEEEEJ6__halfEEEvPT1_T2_T3_DpPKT4__param_1,
    .param .align 4 .b8 _ZN4raft6linalg6detail10map_kernelILi1ELb0EfmNS_10compose_opIJNS_13plug_const_opIfNS_6mul_opEEENS_7cast_opIfEEEEEJ6__halfEEEvPT1_T2_T3_DpPKT4__param_2[12],
    .param .u64 _ZN4raft6linalg6detail10map_kernelILi1ELb0EfmNS_10compose_opIJNS_13plug_const_opIfNS_6mul_opEEENS_7cast_opIfEEEEEJ6__halfEEEvPT1_T2_T3_DpPKT4__param_3
)
{
    .reg .pred %p<2>;
    .reg .b16 %rs<11>;                      // only %rs10 is referenced
    .reg .f32 %f<4>;
    .reg .b32 %r<5>;
    .reg .b64 %rd<11>;

    // Load arguments: %rd2 = out, %rd4 = n, %rd3 = in, %f1 = c.
    ld.param.u64 %rd2, [_ZN4raft6linalg6detail10map_kernelILi1ELb0EfmNS_10compose_opIJNS_13plug_const_opIfNS_6mul_opEEENS_7cast_opIfEEEEEJ6__halfEEEvPT1_T2_T3_DpPKT4__param_0];
    ld.param.u64 %rd4, [_ZN4raft6linalg6detail10map_kernelILi1ELb0EfmNS_10compose_opIJNS_13plug_const_opIfNS_6mul_opEEENS_7cast_opIfEEEEEJ6__halfEEEvPT1_T2_T3_DpPKT4__param_1];
    ld.param.u64 %rd3, [_ZN4raft6linalg6detail10map_kernelILi1ELb0EfmNS_10compose_opIJNS_13plug_const_opIfNS_6mul_opEEENS_7cast_opIfEEEEEJ6__halfEEEvPT1_T2_T3_DpPKT4__param_3];
    ld.param.f32 %f1, [_ZN4raft6linalg6detail10map_kernelILi1ELb0EfmNS_10compose_opIJNS_13plug_const_opIfNS_6mul_opEEENS_7cast_opIfEEEEEJ6__halfEEEvPT1_T2_T3_DpPKT4__param_2+4];

    // i = ctaid.x * ntid.x + tid.x, computed in 32 bits and zero-extended.
    mov.u32 %r1, %ctaid.x;
    mov.u32 %r2, %ntid.x;
    mov.u32 %r3, %tid.x;
    mad.lo.s32 %r4, %r1, %r2, %r3;
    cvt.u64.u32 %rd1, %r4;

    // Bounds check: nothing to do when i >= n.
    setp.ge.u64 %p1, %rd1, %rd4;
    @%p1 bra $L__BB148_2;

    // Read in[i] at byte offset i * 2, widen to f32, scale by c.
    cvta.to.global.u64 %rd5, %rd3;
    shl.b64 %rd6, %rd1, 1;
    add.s64 %rd7, %rd5, %rd6;
    ld.global.u16 %rs10, [%rd7];
    { cvt.f32.f16 %f2, %rs10;}
    mul.ftz.f32 %f3, %f1, %f2;

    // Write out[i] at byte offset i * 4.
    cvta.to.global.u64 %rd8, %rd2;
    shl.b64 %rd9, %rd1, 2;
    add.s64 %rd10, %rd8, %rd9;
    st.global.f32 [%rd10], %f3;
$L__BB148_2:
    ret;
}
// map_kernel<4, false, float, unsigned long,
//            compose_op<plug_const_op<float, mul_op>, cast_op<float>>, __half>
// (namespace raft::linalg::detail; demangled from the Itanium-ABI symbol below)
//
// Computes out[i] = c * float(in[i]) for n elements (f16 in, f32 out), four per
// thread in the main body. With t = ctaid.x * ntid.x + tid.x:
//   head : h = min(f32 slots before the next 16-byte boundary of the output
//          pointer, n); thread t handles element t when t < h.
//   body : thread t handles elements [h + 4t, h + 4t + 4): one 8-byte load
//          (v2.u32, unpacked into four f16 values) and one v4.f32 store,
//          provided h + 4t + 4 <= n.
//   tail : thread t handles element n - ((n - h) & 3) + t when that index < n.
//
// Parameters:
//   param_0  u64  output pointer (f32 elements, 4-byte stride)
//   param_1  u64  element count n
//   param_2  12-byte by-value object; only the f32 at byte offset 4 is read (c)
//   param_3  u64  input pointer (f16 elements, 2-byte stride)
//
// NOTE(review): only the output pointer's alignment is examined here; the
// 8-byte input load presumably relies on a matching input alignment -- confirm
// against the launcher.
.visible .entry _ZN4raft6linalg6detail10map_kernelILi4ELb0EfmNS_10compose_opIJNS_13plug_const_opIfNS_6mul_opEEENS_7cast_opIfEEEEEJ6__halfEEEvPT1_T2_T3_DpPKT4_(
    .param .u64 _ZN4raft6linalg6detail10map_kernelILi4ELb0EfmNS_10compose_opIJNS_13plug_const_opIfNS_6mul_opEEENS_7cast_opIfEEEEEJ6__halfEEEvPT1_T2_T3_DpPKT4__param_0,
    .param .u64 _ZN4raft6linalg6detail10map_kernelILi4ELb0EfmNS_10compose_opIJNS_13plug_const_opIfNS_6mul_opEEENS_7cast_opIfEEEEEJ6__halfEEEvPT1_T2_T3_DpPKT4__param_1,
    .param .align 4 .b8 _ZN4raft6linalg6detail10map_kernelILi4ELb0EfmNS_10compose_opIJNS_13plug_const_opIfNS_6mul_opEEENS_7cast_opIfEEEEEJ6__halfEEEvPT1_T2_T3_DpPKT4__param_2[12],
    .param .u64 _ZN4raft6linalg6detail10map_kernelILi4ELb0EfmNS_10compose_opIJNS_13plug_const_opIfNS_6mul_opEEENS_7cast_opIfEEEEEJ6__halfEEEvPT1_T2_T3_DpPKT4__param_3
)
{
    .reg .pred %p<4>;
    .reg .b16 %rs<7>;
    .reg .f32 %f<14>;
    .reg .b32 %r<7>;
    .reg .b64 %rd<31>;

    // Load arguments: %rd9 = out, %rd7 = n, %rd8 = in; %rd1/%rd2 = global-space in/out.
    ld.param.u64 %rd9, [_ZN4raft6linalg6detail10map_kernelILi4ELb0EfmNS_10compose_opIJNS_13plug_const_opIfNS_6mul_opEEENS_7cast_opIfEEEEEJ6__halfEEEvPT1_T2_T3_DpPKT4__param_0];
    ld.param.u64 %rd7, [_ZN4raft6linalg6detail10map_kernelILi4ELb0EfmNS_10compose_opIJNS_13plug_const_opIfNS_6mul_opEEENS_7cast_opIfEEEEEJ6__halfEEEvPT1_T2_T3_DpPKT4__param_1];
    ld.param.u64 %rd8, [_ZN4raft6linalg6detail10map_kernelILi4ELb0EfmNS_10compose_opIJNS_13plug_const_opIfNS_6mul_opEEENS_7cast_opIfEEEEEJ6__halfEEEvPT1_T2_T3_DpPKT4__param_3];
    cvta.to.global.u64 %rd1, %rd8;
    cvta.to.global.u64 %rd2, %rd9;

    // t = ctaid.x * ntid.x + tid.x (32-bit, zero-extended into %rd3).
    mov.u32 %r1, %ntid.x;
    mov.u32 %r2, %ctaid.x;
    mov.u32 %r3, %tid.x;
    mad.lo.s32 %r4, %r2, %r1, %r3;
    cvt.u64.u32 %rd3, %r4;

    // h (%rd4) = min((round_up(out, 16) - out) / 4, n).
    add.s64 %rd10, %rd9, 15;
    and.b64 %rd11, %rd10, -16;
    sub.s64 %rd12, %rd11, %rd9;
    shr.s64 %rd13, %rd12, 2;
    min.u64 %rd4, %rd13, %rd7;

    // %rd5 = h + 4t: first element of this thread's vector group.
    mul.wide.u32 %rd14, %r4, 4;
    add.s64 %rd5, %rd4, %rd14;
    ld.param.f32 %f1, [_ZN4raft6linalg6detail10map_kernelILi4ELb0EfmNS_10compose_opIJNS_13plug_const_opIfNS_6mul_opEEENS_7cast_opIfEEEEEJ6__halfEEEvPT1_T2_T3_DpPKT4__param_2+4];

    // Body: skip unless the full group of four lies inside [0, n).
    add.s64 %rd15, %rd5, 4;
    setp.gt.u64 %p1, %rd15, %rd7;
    @%p1 bra $L__BB149_2;
    shl.b64 %rd17, %rd5, 1;                 // input byte offset: 2 bytes per element
    add.s64 %rd16, %rd8, %rd17;             // address built from the raw param value, not %rd1
    ld.global.nc.v2.u32 {%r5,%r6}, [%rd16];
    mov.b32 {%rs1, %rs2}, %r5;              // split each 32-bit word into two 16-bit halves
    { cvt.f32.f16 %f2, %rs1;}
    { cvt.f32.f16 %f3, %rs2;}
    mov.b32 {%rs3, %rs4}, %r6;
    { cvt.f32.f16 %f4, %rs3;}
    { cvt.f32.f16 %f5, %rs4;}
    mul.ftz.f32 %f6, %f1, %f5;
    mul.ftz.f32 %f7, %f1, %f4;
    mul.ftz.f32 %f8, %f1, %f3;
    mul.ftz.f32 %f9, %f1, %f2;
    shl.b64 %rd18, %rd5, 2;                 // output byte offset: 4 bytes per element
    add.s64 %rd19, %rd2, %rd18;
    st.global.v4.f32 [%rd19], {%f9, %f8, %f7, %f6};   // lane order matches the load
$L__BB149_2:
    // Head: scalar element t for t < h.
    setp.le.u64 %p2, %rd4, %rd3;
    @%p2 bra $L__BB149_4;
    shl.b64 %rd20, %rd3, 1;
    add.s64 %rd21, %rd1, %rd20;
    ld.global.u16 %rs5, [%rd21];
    { cvt.f32.f16 %f10, %rs5;}
    mul.ftz.f32 %f11, %f1, %f10;
    shl.b64 %rd22, %rd3, 2;
    add.s64 %rd23, %rd2, %rd22;
    st.global.f32 [%rd23], %f11;
$L__BB149_4:
    // Tail: %rd6 = n - ((n - h) & 3) + t; scalar element when %rd6 < n.
    sub.s64 %rd24, %rd7, %rd4;
    and.b64 %rd25, %rd24, 3;
    sub.s64 %rd26, %rd7, %rd25;
    add.s64 %rd6, %rd26, %rd3;
    setp.ge.u64 %p3, %rd6, %rd7;
    @%p3 bra $L__BB149_6;
    shl.b64 %rd27, %rd6, 1;
    add.s64 %rd28, %rd1, %rd27;
    ld.global.u16 %rs6, [%rd28];
    { cvt.f32.f16 %f12, %rs6;}
    mul.ftz.f32 %f13, %f1, %f12;
    shl.b64 %rd29, %rd6, 2;
    add.s64 %rd30, %rd2, %rd29;
    st.global.f32 [%rd30], %f13;
$L__BB149_6:
    ret;
}
// map_kernel<2, false, float, unsigned long,
//            compose_op<plug_const_op<float, mul_op>, cast_op<float>>, __half>
// (namespace raft::linalg::detail; demangled from the Itanium-ABI symbol below)
//
// Computes out[i] = c * float(in[i]) for n elements (f16 in, f32 out), two per
// thread in the main body. With t = ctaid.x * ntid.x + tid.x:
//   head : h = min(f32 slots before the next 8-byte boundary of the output
//          pointer, n); thread t handles element t when t < h.
//   body : thread t handles elements h + 2t and h + 2t + 1: one 4-byte load
//          (b32, unpacked into two f16 values) and one v2.f32 store, provided
//          h + 2t + 2 <= n.
//   tail : thread t handles element n - ((n - h) & 1) + t when that index < n.
//
// Parameters:
//   param_0  u64  output pointer (f32 elements, 4-byte stride)
//   param_1  u64  element count n
//   param_2  12-byte by-value object; only the f32 at byte offset 4 is read (c)
//   param_3  u64  input pointer (f16 elements, 2-byte stride)
//
// NOTE(review): only the output pointer's alignment is examined here; the
// 4-byte input load presumably relies on a matching input alignment -- confirm
// against the launcher.
.visible .entry _ZN4raft6linalg6detail10map_kernelILi2ELb0EfmNS_10compose_opIJNS_13plug_const_opIfNS_6mul_opEEENS_7cast_opIfEEEEEJ6__halfEEEvPT1_T2_T3_DpPKT4_(
    .param .u64 _ZN4raft6linalg6detail10map_kernelILi2ELb0EfmNS_10compose_opIJNS_13plug_const_opIfNS_6mul_opEEENS_7cast_opIfEEEEEJ6__halfEEEvPT1_T2_T3_DpPKT4__param_0,
    .param .u64 _ZN4raft6linalg6detail10map_kernelILi2ELb0EfmNS_10compose_opIJNS_13plug_const_opIfNS_6mul_opEEENS_7cast_opIfEEEEEJ6__halfEEEvPT1_T2_T3_DpPKT4__param_1,
    .param .align 4 .b8 _ZN4raft6linalg6detail10map_kernelILi2ELb0EfmNS_10compose_opIJNS_13plug_const_opIfNS_6mul_opEEENS_7cast_opIfEEEEEJ6__halfEEEvPT1_T2_T3_DpPKT4__param_2[12],
    .param .u64 _ZN4raft6linalg6detail10map_kernelILi2ELb0EfmNS_10compose_opIJNS_13plug_const_opIfNS_6mul_opEEENS_7cast_opIfEEEEEJ6__halfEEEvPT1_T2_T3_DpPKT4__param_3
)
{
    .reg .pred %p<4>;
    .reg .b16 %rs<5>;
    .reg .f32 %f<10>;
    .reg .b32 %r<6>;
    .reg .b64 %rd<31>;

    // Load arguments: %rd9 = out, %rd7 = n, %rd8 = in; %rd1/%rd2 = global-space in/out.
    ld.param.u64 %rd9, [_ZN4raft6linalg6detail10map_kernelILi2ELb0EfmNS_10compose_opIJNS_13plug_const_opIfNS_6mul_opEEENS_7cast_opIfEEEEEJ6__halfEEEvPT1_T2_T3_DpPKT4__param_0];
    ld.param.u64 %rd7, [_ZN4raft6linalg6detail10map_kernelILi2ELb0EfmNS_10compose_opIJNS_13plug_const_opIfNS_6mul_opEEENS_7cast_opIfEEEEEJ6__halfEEEvPT1_T2_T3_DpPKT4__param_1];
    ld.param.u64 %rd8, [_ZN4raft6linalg6detail10map_kernelILi2ELb0EfmNS_10compose_opIJNS_13plug_const_opIfNS_6mul_opEEENS_7cast_opIfEEEEEJ6__halfEEEvPT1_T2_T3_DpPKT4__param_3];
    cvta.to.global.u64 %rd1, %rd8;
    cvta.to.global.u64 %rd2, %rd9;

    // t = ctaid.x * ntid.x + tid.x (32-bit, zero-extended into %rd3).
    mov.u32 %r1, %ntid.x;
    mov.u32 %r2, %ctaid.x;
    mov.u32 %r3, %tid.x;
    mad.lo.s32 %r4, %r2, %r1, %r3;
    cvt.u64.u32 %rd3, %r4;

    // h (%rd4) = min((round_up(out, 8) - out) / 4, n).
    add.s64 %rd10, %rd9, 7;
    and.b64 %rd11, %rd10, -8;
    sub.s64 %rd12, %rd11, %rd9;
    shr.s64 %rd13, %rd12, 2;
    min.u64 %rd4, %rd13, %rd7;

    // %rd5 = h + 2t: first element of this thread's vector pair.
    mul.wide.u32 %rd14, %r4, 2;
    add.s64 %rd5, %rd4, %rd14;
    ld.param.f32 %f1, [_ZN4raft6linalg6detail10map_kernelILi2ELb0EfmNS_10compose_opIJNS_13plug_const_opIfNS_6mul_opEEENS_7cast_opIfEEEEEJ6__halfEEEvPT1_T2_T3_DpPKT4__param_2+4];

    // Body: skip unless both elements of the pair lie inside [0, n).
    add.s64 %rd15, %rd5, 2;
    setp.gt.u64 %p1, %rd15, %rd7;
    @%p1 bra $L__BB150_2;
    shl.b64 %rd17, %rd5, 1;                 // input byte offset: 2 bytes per element
    add.s64 %rd16, %rd8, %rd17;             // address built from the raw param value, not %rd1
    ld.global.nc.b32 %r5, [%rd16];
    mov.b32 {%rs1, %rs2}, %r5;              // split the 32-bit word into two 16-bit halves
    { cvt.f32.f16 %f2, %rs1;}
    { cvt.f32.f16 %f3, %rs2;}
    mul.ftz.f32 %f4, %f1, %f3;
    mul.ftz.f32 %f5, %f1, %f2;
    shl.b64 %rd18, %rd5, 2;                 // output byte offset: 4 bytes per element
    add.s64 %rd19, %rd2, %rd18;
    st.global.v2.f32 [%rd19], {%f5, %f4};   // lane order matches the load
$L__BB150_2:
    // Head: scalar element t for t < h.
    setp.le.u64 %p2, %rd4, %rd3;
    @%p2 bra $L__BB150_4;
    shl.b64 %rd20, %rd3, 1;
    add.s64 %rd21, %rd1, %rd20;
    ld.global.u16 %rs3, [%rd21];
    { cvt.f32.f16 %f6, %rs3;}
    mul.ftz.f32 %f7, %f1, %f6;
    shl.b64 %rd22, %rd3, 2;
    add.s64 %rd23, %rd2, %rd22;
    st.global.f32 [%rd23], %f7;
$L__BB150_4:
    // Tail: %rd6 = n - ((n - h) & 1) + t; scalar element when %rd6 < n.
    sub.s64 %rd24, %rd7, %rd4;
    and.b64 %rd25, %rd24, 1;
    sub.s64 %rd26, %rd7, %rd25;
    add.s64 %rd6, %rd26, %rd3;
    setp.ge.u64 %p3, %rd6, %rd7;
    @%p3 bra $L__BB150_6;
    shl.b64 %rd27, %rd6, 1;
    add.s64 %rd28, %rd1, %rd27;
    ld.global.u16 %rs4, [%rd28];
    { cvt.f32.f16 %f8, %rs4;}
    mul.ftz.f32 %f9, %f1, %f8;
    shl.b64 %rd29, %rd6, 2;
    add.s64 %rd30, %rd2, %rd29;
    st.global.f32 [%rd30], %f9;
$L__BB150_6:
    ret;
}
// map_kernel<1, false, float, unsigned long,
//            compose_op<plug_const_op<float, mul_op>, sqrt_op, cast_op<float>>, __half>
// (namespace raft::linalg::detail; demangled from the Itanium-ABI symbol below)
//
// One element per thread: out[i] = c * sqrt(float(in[i])), with
// i = ctaid.x * ntid.x + tid.x. The input is read as 16-bit values, widened
// with cvt.f32.f16, and the square root is the approximate, flush-to-zero form
// (sqrt.approx.ftz). Threads with i >= n return immediately.
//
// Parameters:
//   param_0  u64  output pointer (f32 elements, 4-byte stride)
//   param_1  u64  element count n
//   param_2  12-byte by-value object; only the f32 at byte offset 4 is read (c)
//   param_3  u64  input pointer (f16 elements, 2-byte stride)
.visible .entry _ZN4raft6linalg6detail10map_kernelILi1ELb0EfmNS_10compose_opIJNS_13plug_const_opIfNS_6mul_opEEENS_7sqrt_opENS_7cast_opIfEEEEEJ6__halfEEEvPT1_T2_T3_DpPKT4_(
    .param .u64 _ZN4raft6linalg6detail10map_kernelILi1ELb0EfmNS_10compose_opIJNS_13plug_const_opIfNS_6mul_opEEENS_7sqrt_opENS_7cast_opIfEEEEEJ6__halfEEEvPT1_T2_T3_DpPKT4__param_0,
    .param .u64 _ZN4raft6linalg6detail10map_kernelILi1ELb0EfmNS_10compose_opIJNS_13plug_const_opIfNS_6mul_opEEENS_7sqrt_opENS_7cast_opIfEEEEEJ6__halfEEEvPT1_T2_T3_DpPKT4__param_1,
    .param .align 4 .b8 _ZN4raft6linalg6detail10map_kernelILi1ELb0EfmNS_10compose_opIJNS_13plug_const_opIfNS_6mul_opEEENS_7sqrt_opENS_7cast_opIfEEEEEJ6__halfEEEvPT1_T2_T3_DpPKT4__param_2[12],
    .param .u64 _ZN4raft6linalg6detail10map_kernelILi1ELb0EfmNS_10compose_opIJNS_13plug_const_opIfNS_6mul_opEEENS_7sqrt_opENS_7cast_opIfEEEEEJ6__halfEEEvPT1_T2_T3_DpPKT4__param_3
)
{
    .reg .pred %p<2>;
    .reg .b16 %rs<14>;                      // only %rs13 is referenced
    .reg .f32 %f<5>;
    .reg .b32 %r<5>;
    .reg .b64 %rd<11>;

    // Load arguments: %rd2 = out, %rd4 = n, %rd3 = in, %f1 = c.
    ld.param.u64 %rd2, [_ZN4raft6linalg6detail10map_kernelILi1ELb0EfmNS_10compose_opIJNS_13plug_const_opIfNS_6mul_opEEENS_7sqrt_opENS_7cast_opIfEEEEEJ6__halfEEEvPT1_T2_T3_DpPKT4__param_0];
    ld.param.u64 %rd4, [_ZN4raft6linalg6detail10map_kernelILi1ELb0EfmNS_10compose_opIJNS_13plug_const_opIfNS_6mul_opEEENS_7sqrt_opENS_7cast_opIfEEEEEJ6__halfEEEvPT1_T2_T3_DpPKT4__param_1];
    ld.param.u64 %rd3, [_ZN4raft6linalg6detail10map_kernelILi1ELb0EfmNS_10compose_opIJNS_13plug_const_opIfNS_6mul_opEEENS_7sqrt_opENS_7cast_opIfEEEEEJ6__halfEEEvPT1_T2_T3_DpPKT4__param_3];
    ld.param.f32 %f1, [_ZN4raft6linalg6detail10map_kernelILi1ELb0EfmNS_10compose_opIJNS_13plug_const_opIfNS_6mul_opEEENS_7sqrt_opENS_7cast_opIfEEEEEJ6__halfEEEvPT1_T2_T3_DpPKT4__param_2+4];

    // i = ctaid.x * ntid.x + tid.x, computed in 32 bits and zero-extended.
    mov.u32 %r1, %ctaid.x;
    mov.u32 %r2, %ntid.x;
    mov.u32 %r3, %tid.x;
    mad.lo.s32 %r4, %r1, %r2, %r3;
    cvt.u64.u32 %rd1, %r4;

    // Bounds check: nothing to do when i >= n.
    setp.ge.u64 %p1, %rd1, %rd4;
    @%p1 bra $L__BB151_2;

    // Read in[i] at byte offset i * 2, widen to f32, take sqrt, scale by c.
    cvta.to.global.u64 %rd5, %rd3;
    shl.b64 %rd6, %rd1, 1;
    add.s64 %rd7, %rd5, %rd6;
    ld.global.u16 %rs13, [%rd7];
    { cvt.f32.f16 %f2, %rs13;}
    sqrt.approx.ftz.f32 %f3, %f2;
    mul.ftz.f32 %f4, %f1, %f3;

    // Write out[i] at byte offset i * 4.
    cvta.to.global.u64 %rd8, %rd2;
    shl.b64 %rd9, %rd1, 2;
    add.s64 %rd10, %rd8, %rd9;
    st.global.f32 [%rd10], %f4;
$L__BB151_2:
    ret;
}
// map_kernel<4, false, float, unsigned long,
//            compose_op<plug_const_op<float, mul_op>, sqrt_op, cast_op<float>>, __half>
// (namespace raft::linalg::detail; demangled from the Itanium-ABI symbol below)
//
// Computes out[i] = c * sqrt(float(in[i])) for n elements (f16 in, f32 out,
// sqrt.approx.ftz), four per thread in the main body. With
// t = ctaid.x * ntid.x + tid.x:
//   head : h = min(f32 slots before the next 16-byte boundary of the output
//          pointer, n); thread t handles element t when t < h.
//   body : thread t handles elements [h + 4t, h + 4t + 4): one 8-byte load
//          (v2.u32, unpacked into four f16 values) and one v4.f32 store,
//          provided h + 4t + 4 <= n.
//   tail : thread t handles element n - ((n - h) & 3) + t when that index < n.
//
// Parameters:
//   param_0  u64  output pointer (f32 elements, 4-byte stride)
//   param_1  u64  element count n
//   param_2  12-byte by-value object; only the f32 at byte offset 4 is read (c)
//   param_3  u64  input pointer (f16 elements, 2-byte stride)
//
// NOTE(review): only the output pointer's alignment is examined here; the
// 8-byte input load presumably relies on a matching input alignment -- confirm
// against the launcher.
.visible .entry _ZN4raft6linalg6detail10map_kernelILi4ELb0EfmNS_10compose_opIJNS_13plug_const_opIfNS_6mul_opEEENS_7sqrt_opENS_7cast_opIfEEEEEJ6__halfEEEvPT1_T2_T3_DpPKT4_(
    .param .u64 _ZN4raft6linalg6detail10map_kernelILi4ELb0EfmNS_10compose_opIJNS_13plug_const_opIfNS_6mul_opEEENS_7sqrt_opENS_7cast_opIfEEEEEJ6__halfEEEvPT1_T2_T3_DpPKT4__param_0,
    .param .u64 _ZN4raft6linalg6detail10map_kernelILi4ELb0EfmNS_10compose_opIJNS_13plug_const_opIfNS_6mul_opEEENS_7sqrt_opENS_7cast_opIfEEEEEJ6__halfEEEvPT1_T2_T3_DpPKT4__param_1,
    .param .align 4 .b8 _ZN4raft6linalg6detail10map_kernelILi4ELb0EfmNS_10compose_opIJNS_13plug_const_opIfNS_6mul_opEEENS_7sqrt_opENS_7cast_opIfEEEEEJ6__halfEEEvPT1_T2_T3_DpPKT4__param_2[12],
    .param .u64 _ZN4raft6linalg6detail10map_kernelILi4ELb0EfmNS_10compose_opIJNS_13plug_const_opIfNS_6mul_opEEENS_7sqrt_opENS_7cast_opIfEEEEEJ6__halfEEEvPT1_T2_T3_DpPKT4__param_3
)
{
    .reg .pred %p<4>;
    .reg .b16 %rs<7>;
    .reg .f32 %f<20>;
    .reg .b32 %r<7>;
    .reg .b64 %rd<31>;

    // Load arguments: %rd9 = out, %rd7 = n, %rd8 = in; %rd1/%rd2 = global-space in/out.
    ld.param.u64 %rd9, [_ZN4raft6linalg6detail10map_kernelILi4ELb0EfmNS_10compose_opIJNS_13plug_const_opIfNS_6mul_opEEENS_7sqrt_opENS_7cast_opIfEEEEEJ6__halfEEEvPT1_T2_T3_DpPKT4__param_0];
    ld.param.u64 %rd7, [_ZN4raft6linalg6detail10map_kernelILi4ELb0EfmNS_10compose_opIJNS_13plug_const_opIfNS_6mul_opEEENS_7sqrt_opENS_7cast_opIfEEEEEJ6__halfEEEvPT1_T2_T3_DpPKT4__param_1];
    ld.param.u64 %rd8, [_ZN4raft6linalg6detail10map_kernelILi4ELb0EfmNS_10compose_opIJNS_13plug_const_opIfNS_6mul_opEEENS_7sqrt_opENS_7cast_opIfEEEEEJ6__halfEEEvPT1_T2_T3_DpPKT4__param_3];
    cvta.to.global.u64 %rd1, %rd8;
    cvta.to.global.u64 %rd2, %rd9;

    // t = ctaid.x * ntid.x + tid.x (32-bit, zero-extended into %rd3).
    mov.u32 %r1, %ntid.x;
    mov.u32 %r2, %ctaid.x;
    mov.u32 %r3, %tid.x;
    mad.lo.s32 %r4, %r2, %r1, %r3;
    cvt.u64.u32 %rd3, %r4;

    // h (%rd4) = min((round_up(out, 16) - out) / 4, n).
    add.s64 %rd10, %rd9, 15;
    and.b64 %rd11, %rd10, -16;
    sub.s64 %rd12, %rd11, %rd9;
    shr.s64 %rd13, %rd12, 2;
    min.u64 %rd4, %rd13, %rd7;

    // %rd5 = h + 4t: first element of this thread's vector group.
    mul.wide.u32 %rd14, %r4, 4;
    add.s64 %rd5, %rd4, %rd14;
    ld.param.f32 %f1, [_ZN4raft6linalg6detail10map_kernelILi4ELb0EfmNS_10compose_opIJNS_13plug_const_opIfNS_6mul_opEEENS_7sqrt_opENS_7cast_opIfEEEEEJ6__halfEEEvPT1_T2_T3_DpPKT4__param_2+4];

    // Body: skip unless the full group of four lies inside [0, n).
    add.s64 %rd15, %rd5, 4;
    setp.gt.u64 %p1, %rd15, %rd7;
    @%p1 bra $L__BB152_2;
    shl.b64 %rd17, %rd5, 1;                 // input byte offset: 2 bytes per element
    add.s64 %rd16, %rd8, %rd17;             // address built from the raw param value, not %rd1
    ld.global.nc.v2.u32 {%r5,%r6}, [%rd16];
    mov.b32 {%rs1, %rs2}, %r5;              // split each 32-bit word into two 16-bit halves
    { cvt.f32.f16 %f2, %rs1;}
    sqrt.approx.ftz.f32 %f6, %f2;
    { cvt.f32.f16 %f3, %rs2;}
    sqrt.approx.ftz.f32 %f7, %f3;
    mov.b32 {%rs3, %rs4}, %r6;
    { cvt.f32.f16 %f4, %rs3;}
    sqrt.approx.ftz.f32 %f8, %f4;
    { cvt.f32.f16 %f5, %rs4;}
    sqrt.approx.ftz.f32 %f9, %f5;
    mul.ftz.f32 %f10, %f1, %f9;
    mul.ftz.f32 %f11, %f1, %f8;
    mul.ftz.f32 %f12, %f1, %f7;
    mul.ftz.f32 %f13, %f1, %f6;
    shl.b64 %rd18, %rd5, 2;                 // output byte offset: 4 bytes per element
    add.s64 %rd19, %rd2, %rd18;
    st.global.v4.f32 [%rd19], {%f13, %f12, %f11, %f10};   // lane order matches the load
$L__BB152_2:
    // Head: scalar element t for t < h.
    setp.le.u64 %p2, %rd4, %rd3;
    @%p2 bra $L__BB152_4;
    shl.b64 %rd20, %rd3, 1;
    add.s64 %rd21, %rd1, %rd20;
    ld.global.u16 %rs5, [%rd21];
    { cvt.f32.f16 %f14, %rs5;}
    sqrt.approx.ftz.f32 %f15, %f14;
    mul.ftz.f32 %f16, %f1, %f15;
    shl.b64 %rd22, %rd3, 2;
    add.s64 %rd23, %rd2, %rd22;
    st.global.f32 [%rd23], %f16;
$L__BB152_4:
    // Tail: %rd6 = n - ((n - h) & 3) + t; scalar element when %rd6 < n.
    sub.s64 %rd24, %rd7, %rd4;
    and.b64 %rd25, %rd24, 3;
    sub.s64 %rd26, %rd7, %rd25;
    add.s64 %rd6, %rd26, %rd3;
    setp.ge.u64 %p3, %rd6, %rd7;
    @%p3 bra $L__BB152_6;
    shl.b64 %rd27, %rd6, 1;
    add.s64 %rd28, %rd1, %rd27;
    ld.global.u16 %rs6, [%rd28];
    { cvt.f32.f16 %f17, %rs6;}
    sqrt.approx.ftz.f32 %f18, %f17;
    mul.ftz.f32 %f19, %f1, %f18;
    shl.b64 %rd29, %rd6, 2;
    add.s64 %rd30, %rd2, %rd29;
    st.global.f32 [%rd30], %f19;
$L__BB152_6:
    ret;
}
// Entry name demangles to raft::linalg::detail::map_kernel<2, false, float, unsigned long,
//   raft::compose_op<raft::plug_const_op<float, raft::mul_op>, raft::sqrt_op, raft::cast_op<float>>, __half>.
// param_0: 64-bit address written with f32 stores (output).
// param_1: 64-bit element count.
// param_2: 12-byte operator object, only the f32 at byte offset 4 is read (multiplier, kept in %f1).
// param_3: 64-bit address read with 16-bit and 32-bit loads (f16 input).
// Every element is computed as out[i] = %f1 * sqrt.approx.ftz(f32(in[i])).
// A thread handles at most one 2-element vector, one leading scalar and one trailing scalar.
.visible .entry _ZN4raft6linalg6detail10map_kernelILi2ELb0EfmNS_10compose_opIJNS_13plug_const_opIfNS_6mul_opEEENS_7sqrt_opENS_7cast_opIfEEEEEJ6__halfEEEvPT1_T2_T3_DpPKT4_( | |
.param .u64 _ZN4raft6linalg6detail10map_kernelILi2ELb0EfmNS_10compose_opIJNS_13plug_const_opIfNS_6mul_opEEENS_7sqrt_opENS_7cast_opIfEEEEEJ6__halfEEEvPT1_T2_T3_DpPKT4__param_0, | |
.param .u64 _ZN4raft6linalg6detail10map_kernelILi2ELb0EfmNS_10compose_opIJNS_13plug_const_opIfNS_6mul_opEEENS_7sqrt_opENS_7cast_opIfEEEEEJ6__halfEEEvPT1_T2_T3_DpPKT4__param_1, | |
.param .align 4 .b8 _ZN4raft6linalg6detail10map_kernelILi2ELb0EfmNS_10compose_opIJNS_13plug_const_opIfNS_6mul_opEEENS_7sqrt_opENS_7cast_opIfEEEEEJ6__halfEEEvPT1_T2_T3_DpPKT4__param_2[12], | |
.param .u64 _ZN4raft6linalg6detail10map_kernelILi2ELb0EfmNS_10compose_opIJNS_13plug_const_opIfNS_6mul_opEEENS_7sqrt_opENS_7cast_opIfEEEEEJ6__halfEEEvPT1_T2_T3_DpPKT4__param_3 | |
) | |
{ | |
.reg .pred %p<4>; | |
.reg .b16 %rs<5>; | |
.reg .f32 %f<14>; | |
.reg .b32 %r<6>; | |
.reg .b64 %rd<31>; | |
// %rd9 = output address, %rd7 = element count, %rd8 = input address (all as passed in).
ld.param.u64 %rd9, [_ZN4raft6linalg6detail10map_kernelILi2ELb0EfmNS_10compose_opIJNS_13plug_const_opIfNS_6mul_opEEENS_7sqrt_opENS_7cast_opIfEEEEEJ6__halfEEEvPT1_T2_T3_DpPKT4__param_0]; | |
ld.param.u64 %rd7, [_ZN4raft6linalg6detail10map_kernelILi2ELb0EfmNS_10compose_opIJNS_13plug_const_opIfNS_6mul_opEEENS_7sqrt_opENS_7cast_opIfEEEEEJ6__halfEEEvPT1_T2_T3_DpPKT4__param_1]; | |
ld.param.u64 %rd8, [_ZN4raft6linalg6detail10map_kernelILi2ELb0EfmNS_10compose_opIJNS_13plug_const_opIfNS_6mul_opEEENS_7sqrt_opENS_7cast_opIfEEEEEJ6__halfEEEvPT1_T2_T3_DpPKT4__param_3]; | |
// %rd1 / %rd2 = input / output addresses converted to the global state space.
cvta.to.global.u64 %rd1, %rd8; | |
cvta.to.global.u64 %rd2, %rd9; | |
// %r4 = ctaid.x * ntid.x + tid.x (32-bit), zero-extended into %rd3.
mov.u32 %r1, %ntid.x; | |
mov.u32 %r2, %ctaid.x; | |
mov.u32 %r3, %tid.x; | |
mad.lo.s32 %r4, %r2, %r1, %r3; | |
cvt.u64.u32 %rd3, %r4; | |
// %rd4 = min((((out + 7) & -8) - out) >> 2, count):
// number of 4-byte output elements that precede the next 8-byte boundary, capped at count.
add.s64 %rd10, %rd9, 7; | |
and.b64 %rd11, %rd10, -8; | |
sub.s64 %rd12, %rd11, %rd9; | |
shr.s64 %rd13, %rd12, 2; | |
min.u64 %rd4, %rd13, %rd7; | |
// %rd5 = %rd4 + 2 * thread index = first element of this thread's pair.
mul.wide.u32 %rd14, %r4, 2; | |
add.s64 %rd5, %rd4, %rd14; | |
ld.param.f32 %f1, [_ZN4raft6linalg6detail10map_kernelILi2ELb0EfmNS_10compose_opIJNS_13plug_const_opIfNS_6mul_opEEENS_7sqrt_opENS_7cast_opIfEEEEEJ6__halfEEEvPT1_T2_T3_DpPKT4__param_2+4]; | |
// Skip the vector body when the pair would run past count (%rd5 + 2 > count).
add.s64 %rd15, %rd5, 2; | |
setp.gt.u64 %p1, %rd15, %rd7; | |
@%p1 bra $L__BB153_2; | |
// One 32-bit non-coherent load fetches two 16-bit inputs at byte offset 2 * %rd5.
// NOTE(review): the address is built from the raw parameter %rd8, not the cvta'd %rd1
// that the scalar paths below use - confirm this is intended by the generating source.
shl.b64 %rd17, %rd5, 1; | |
add.s64 %rd16, %rd8, %rd17; | |
ld.global.nc.b32 %r5, [%rd16]; | |
// Split into two 16-bit halves, widen each to f32, take approximate sqrt, scale by %f1.
mov.b32 {%rs1, %rs2}, %r5; | |
{ cvt.f32.f16 %f2, %rs1;} | |
sqrt.approx.ftz.f32 %f4, %f2; | |
{ cvt.f32.f16 %f3, %rs2;} | |
sqrt.approx.ftz.f32 %f5, %f3; | |
mul.ftz.f32 %f6, %f1, %f5; | |
mul.ftz.f32 %f7, %f1, %f4; | |
// Store both results with one 8-byte vector store at byte offset 4 * %rd5.
shl.b64 %rd18, %rd5, 2; | |
add.s64 %rd19, %rd2, %rd18; | |
st.global.v2.f32 [%rd19], {%f7, %f6}; | |
$L__BB153_2: | |
// Leading scalar: only threads with index < %rd4 process element [thread index].
setp.le.u64 %p2, %rd4, %rd3; | |
@%p2 bra $L__BB153_4; | |
shl.b64 %rd20, %rd3, 1; | |
add.s64 %rd21, %rd1, %rd20; | |
ld.global.u16 %rs3, [%rd21]; | |
{ cvt.f32.f16 %f8, %rs3;} | |
sqrt.approx.ftz.f32 %f9, %f8; | |
mul.ftz.f32 %f10, %f1, %f9; | |
shl.b64 %rd22, %rd3, 2; | |
add.s64 %rd23, %rd2, %rd22; | |
st.global.f32 [%rd23], %f10; | |
$L__BB153_4: | |
// Trailing scalar: %rd6 = count - ((count - %rd4) & 1) + thread index, processed when %rd6 < count.
sub.s64 %rd24, %rd7, %rd4; | |
and.b64 %rd25, %rd24, 1; | |
sub.s64 %rd26, %rd7, %rd25; | |
add.s64 %rd6, %rd26, %rd3; | |
setp.ge.u64 %p3, %rd6, %rd7; | |
@%p3 bra $L__BB153_6; | |
shl.b64 %rd27, %rd6, 1; | |
add.s64 %rd28, %rd1, %rd27; | |
ld.global.u16 %rs4, [%rd28]; | |
{ cvt.f32.f16 %f11, %rs4;} | |
sqrt.approx.ftz.f32 %f12, %f11; | |
mul.ftz.f32 %f13, %f1, %f12; | |
shl.b64 %rd29, %rd6, 2; | |
add.s64 %rd30, %rd2, %rd29; | |
st.global.f32 [%rd30], %f13; | |
$L__BB153_6: | |
ret; | |
} | |
// Entry name demangles to raft::linalg::detail::map_kernel<1, true, float, unsigned int, L>, where L is a
// lambda taking unsigned int, defined inside raft::neighbors::ivf_pq::detail::select_clusters<float>(...).
// param_0: 64-bit address written with f32 stores (output).
// param_1: 32-bit element count.
// param_2: 24-byte lambda object: 64-bit source address at offset 0, u32 A at offset 8 (%r8),
//          u32 B at offset 12 (%r9), f32 fill value at offset 16 (%f5).
// For i = ctaid.x * ntid.x + tid.x with i < count:
//   out[i] = (i % B < A) ? src[(i / B) * A + (i % B)] : fill
// Presumably A is the source row width and B the padded output row width - confirm against select_clusters.
.visible .entry _ZN4raft6linalg6detail10map_kernelILi1ELb1EfjZNS_9neighbors6ivf_pq6detail15select_clustersIfEEvRKNS_16device_resourcesEPjPfjjjjjNS_8distance12DistanceTypeEPKT_PKfPN3rmm2mr22device_memory_resourceEEUljE_JEEEvPT1_T2_T3_DpPKT4_( | |
.param .u64 _ZN4raft6linalg6detail10map_kernelILi1ELb1EfjZNS_9neighbors6ivf_pq6detail15select_clustersIfEEvRKNS_16device_resourcesEPjPfjjjjjNS_8distance12DistanceTypeEPKT_PKfPN3rmm2mr22device_memory_resourceEEUljE_JEEEvPT1_T2_T3_DpPKT4__param_0, | |
.param .u32 _ZN4raft6linalg6detail10map_kernelILi1ELb1EfjZNS_9neighbors6ivf_pq6detail15select_clustersIfEEvRKNS_16device_resourcesEPjPfjjjjjNS_8distance12DistanceTypeEPKT_PKfPN3rmm2mr22device_memory_resourceEEUljE_JEEEvPT1_T2_T3_DpPKT4__param_1, | |
.param .align 8 .b8 _ZN4raft6linalg6detail10map_kernelILi1ELb1EfjZNS_9neighbors6ivf_pq6detail15select_clustersIfEEvRKNS_16device_resourcesEPjPfjjjjjNS_8distance12DistanceTypeEPKT_PKfPN3rmm2mr22device_memory_resourceEEUljE_JEEEvPT1_T2_T3_DpPKT4__param_2[24] | |
) | |
{ | |
.reg .pred %p<3>; | |
.reg .b16 %rs<9>; | |
.reg .f32 %f<6>; | |
.reg .b32 %r<15>; | |
.reg .b64 %rd<9>; | |
// %rd1 = output address, %r7 = count, %r8 = A, %r9 = B, %f5 = fill, %rd2 = source address.
ld.param.u64 %rd1, [_ZN4raft6linalg6detail10map_kernelILi1ELb1EfjZNS_9neighbors6ivf_pq6detail15select_clustersIfEEvRKNS_16device_resourcesEPjPfjjjjjNS_8distance12DistanceTypeEPKT_PKfPN3rmm2mr22device_memory_resourceEEUljE_JEEEvPT1_T2_T3_DpPKT4__param_0]; | |
ld.param.u32 %r7, [_ZN4raft6linalg6detail10map_kernelILi1ELb1EfjZNS_9neighbors6ivf_pq6detail15select_clustersIfEEvRKNS_16device_resourcesEPjPfjjjjjNS_8distance12DistanceTypeEPKT_PKfPN3rmm2mr22device_memory_resourceEEUljE_JEEEvPT1_T2_T3_DpPKT4__param_1]; | |
ld.param.v2.u32 {%r8, %r9}, [_ZN4raft6linalg6detail10map_kernelILi1ELb1EfjZNS_9neighbors6ivf_pq6detail15select_clustersIfEEvRKNS_16device_resourcesEPjPfjjjjjNS_8distance12DistanceTypeEPKT_PKfPN3rmm2mr22device_memory_resourceEEUljE_JEEEvPT1_T2_T3_DpPKT4__param_2+8]; | |
ld.param.f32 %f5, [_ZN4raft6linalg6detail10map_kernelILi1ELb1EfjZNS_9neighbors6ivf_pq6detail15select_clustersIfEEvRKNS_16device_resourcesEPjPfjjjjjNS_8distance12DistanceTypeEPKT_PKfPN3rmm2mr22device_memory_resourceEEUljE_JEEEvPT1_T2_T3_DpPKT4__param_2+16]; | |
ld.param.u64 %rd2, [_ZN4raft6linalg6detail10map_kernelILi1ELb1EfjZNS_9neighbors6ivf_pq6detail15select_clustersIfEEvRKNS_16device_resourcesEPjPfjjjjjNS_8distance12DistanceTypeEPKT_PKfPN3rmm2mr22device_memory_resourceEEUljE_JEEEvPT1_T2_T3_DpPKT4__param_2]; | |
// %r1 = i = ctaid.x * ntid.x + tid.x, threads with i >= count return immediately.
mov.u32 %r10, %ntid.x; | |
mov.u32 %r11, %ctaid.x; | |
mov.u32 %r12, %tid.x; | |
mad.lo.s32 %r1, %r11, %r10, %r12; | |
setp.ge.u32 %p1, %r1, %r7; | |
@%p1 bra $L__BB154_4; | |
// %r4 = i % B. When %r4 < A the value is loaded from src, otherwise %f5 keeps the fill value.
rem.u32 %r4, %r1, %r9; | |
setp.lt.u32 %p2, %r4, %r8; | |
@%p2 bra $L__BB154_2; | |
bra.uni $L__BB154_3; | |
$L__BB154_2: | |
// Load src[(i / B) * A + (i % B)] (4-byte elements) into %f5.
cvta.to.global.u64 %rd3, %rd2; | |
div.u32 %r13, %r1, %r9; | |
mad.lo.s32 %r14, %r13, %r8, %r4; | |
mul.wide.u32 %rd4, %r14, 4; | |
add.s64 %rd5, %rd3, %rd4; | |
ld.global.f32 %f5, [%rd5]; | |
$L__BB154_3: | |
// out[i] = %f5.
cvta.to.global.u64 %rd6, %rd1; | |
mul.wide.u32 %rd7, %r1, 4; | |
add.s64 %rd8, %rd6, %rd7; | |
st.global.f32 [%rd8], %f5; | |
$L__BB154_4: | |
ret; | |
} | |
// Entry name demangles to raft::linalg::detail::map_kernel<4, true, float, unsigned int, L>, where L is a
// lambda taking unsigned int, defined inside raft::neighbors::ivf_pq::detail::select_clusters<float>(...).
// param_0: 64-bit address written with f32 stores (output).
// param_1: 32-bit element count.
// param_2: 24-byte lambda object: 64-bit source address at offset 0, u32 A at offset 8 (%r23),
//          u32 B at offset 12 (%r24), f32 fill value at offset 16 (%f22).
// Each output is value(i) = (i % B < A) ? src[(i / B) * A + (i % B)] : fill.
// A thread handles at most one 4-element vector, one leading scalar and one trailing scalar.
// Presumably A is the source row width and B the padded output row width - confirm against select_clusters.
.visible .entry _ZN4raft6linalg6detail10map_kernelILi4ELb1EfjZNS_9neighbors6ivf_pq6detail15select_clustersIfEEvRKNS_16device_resourcesEPjPfjjjjjNS_8distance12DistanceTypeEPKT_PKfPN3rmm2mr22device_memory_resourceEEUljE_JEEEvPT1_T2_T3_DpPKT4_( | |
.param .u64 _ZN4raft6linalg6detail10map_kernelILi4ELb1EfjZNS_9neighbors6ivf_pq6detail15select_clustersIfEEvRKNS_16device_resourcesEPjPfjjjjjNS_8distance12DistanceTypeEPKT_PKfPN3rmm2mr22device_memory_resourceEEUljE_JEEEvPT1_T2_T3_DpPKT4__param_0, | |
.param .u32 _ZN4raft6linalg6detail10map_kernelILi4ELb1EfjZNS_9neighbors6ivf_pq6detail15select_clustersIfEEvRKNS_16device_resourcesEPjPfjjjjjNS_8distance12DistanceTypeEPKT_PKfPN3rmm2mr22device_memory_resourceEEUljE_JEEEvPT1_T2_T3_DpPKT4__param_1, | |
.param .align 8 .b8 _ZN4raft6linalg6detail10map_kernelILi4ELb1EfjZNS_9neighbors6ivf_pq6detail15select_clustersIfEEvRKNS_16device_resourcesEPjPfjjjjjNS_8distance12DistanceTypeEPKT_PKfPN3rmm2mr22device_memory_resourceEEUljE_JEEEvPT1_T2_T3_DpPKT4__param_2[24] | |
) | |
{ | |
.reg .pred %p<10>; | |
.reg .b16 %rs<9>; | |
.reg .f32 %f<23>; | |
.reg .b32 %r<47>; | |
.reg .b64 %rd<29>; | |
// %rd4 = output address, %r20 = count, %r23 = A, %r24 = B, %f22 = fill, %rd3 = source address.
ld.param.u64 %rd4, [_ZN4raft6linalg6detail10map_kernelILi4ELb1EfjZNS_9neighbors6ivf_pq6detail15select_clustersIfEEvRKNS_16device_resourcesEPjPfjjjjjNS_8distance12DistanceTypeEPKT_PKfPN3rmm2mr22device_memory_resourceEEUljE_JEEEvPT1_T2_T3_DpPKT4__param_0]; | |
ld.param.u32 %r20, [_ZN4raft6linalg6detail10map_kernelILi4ELb1EfjZNS_9neighbors6ivf_pq6detail15select_clustersIfEEvRKNS_16device_resourcesEPjPfjjjjjNS_8distance12DistanceTypeEPKT_PKfPN3rmm2mr22device_memory_resourceEEUljE_JEEEvPT1_T2_T3_DpPKT4__param_1]; | |
ld.param.v2.u32 {%r23, %r24}, [_ZN4raft6linalg6detail10map_kernelILi4ELb1EfjZNS_9neighbors6ivf_pq6detail15select_clustersIfEEvRKNS_16device_resourcesEPjPfjjjjjNS_8distance12DistanceTypeEPKT_PKfPN3rmm2mr22device_memory_resourceEEUljE_JEEEvPT1_T2_T3_DpPKT4__param_2+8]; | |
ld.param.f32 %f22, [_ZN4raft6linalg6detail10map_kernelILi4ELb1EfjZNS_9neighbors6ivf_pq6detail15select_clustersIfEEvRKNS_16device_resourcesEPjPfjjjjjNS_8distance12DistanceTypeEPKT_PKfPN3rmm2mr22device_memory_resourceEEUljE_JEEEvPT1_T2_T3_DpPKT4__param_2+16]; | |
ld.param.u64 %rd3, [_ZN4raft6linalg6detail10map_kernelILi4ELb1EfjZNS_9neighbors6ivf_pq6detail15select_clustersIfEEvRKNS_16device_resourcesEPjPfjjjjjNS_8distance12DistanceTypeEPKT_PKfPN3rmm2mr22device_memory_resourceEEUljE_JEEEvPT1_T2_T3_DpPKT4__param_2]; | |
cvta.to.global.u64 %rd1, %rd4; | |
// %r1 = thread index = ctaid.x * ntid.x + tid.x.
mov.u32 %r25, %ntid.x; | |
mov.u32 %r26, %ctaid.x; | |
mov.u32 %r27, %tid.x; | |
mad.lo.s32 %r1, %r26, %r25, %r27; | |
// %r2 = min(elements before the next 16-byte boundary of the output address, count).
// The mask 17179869168 is 0x3FFFFFFF0: it rounds down to 16 bytes and keeps only the low 34 bits,
// which are the only bits that survive the shift by 2 and the truncation to 32 bits below.
add.s64 %rd5, %rd4, 15; | |
and.b64 %rd6, %rd5, 17179869168; | |
sub.s64 %rd7, %rd6, %rd4; | |
shr.u64 %rd8, %rd7, 2; | |
cvt.u32.u64 %r28, %rd8; | |
min.u32 %r2, %r28, %r20; | |
// %r3 = %r2 + 4 * thread index = first element of this thread's group of four.
// Skip the vector body when %r3 + 4 > count.
shl.b32 %r29, %r1, 2; | |
add.s32 %r3, %r2, %r29; | |
add.s32 %r30, %r3, 4; | |
setp.gt.u32 %p1, %r30, %r20; | |
@%p1 bra $L__BB155_10; | |
cvta.to.global.u64 %rd2, %rd3; | |
// Element %r3: %f17 starts as fill and is replaced by a load when (%r3 % B) < A.
rem.u32 %r6, %r3, %r24; | |
setp.ge.u32 %p2, %r6, %r23; | |
mov.f32 %f17, %f22; | |
@%p2 bra $L__BB155_3; | |
div.u32 %r31, %r3, %r24; | |
mad.lo.s32 %r32, %r31, %r23, %r6; | |
mul.wide.u32 %rd9, %r32, 4; | |
add.s64 %rd10, %rd2, %rd9; | |
ld.global.f32 %f17, [%rd10]; | |
$L__BB155_3: | |
// Element %r3 + 1 -> %f18, same selection rule.
add.s32 %r7, %r3, 1; | |
rem.u32 %r8, %r7, %r24; | |
setp.ge.u32 %p3, %r8, %r23; | |
mov.f32 %f18, %f22; | |
@%p3 bra $L__BB155_5; | |
div.u32 %r33, %r7, %r24; | |
mad.lo.s32 %r34, %r33, %r23, %r8; | |
mul.wide.u32 %rd11, %r34, 4; | |
add.s64 %rd12, %rd2, %rd11; | |
ld.global.f32 %f18, [%rd12]; | |
$L__BB155_5: | |
// Element %r3 + 2 -> %f19, same selection rule.
add.s32 %r9, %r3, 2; | |
rem.u32 %r10, %r9, %r24; | |
setp.ge.u32 %p4, %r10, %r23; | |
mov.f32 %f19, %f22; | |
@%p4 bra $L__BB155_7; | |
div.u32 %r35, %r9, %r24; | |
mad.lo.s32 %r36, %r35, %r23, %r10; | |
mul.wide.u32 %rd13, %r36, 4; | |
add.s64 %rd14, %rd2, %rd13; | |
ld.global.f32 %f19, [%rd14]; | |
$L__BB155_7: | |
// Element %r3 + 3 -> %f20. Here the remainder is derived from the quotient (i - (i / B) * B).
add.s32 %r37, %r3, 3; | |
div.u32 %r12, %r37, %r24; | |
mul.lo.s32 %r38, %r12, %r24; | |
sub.s32 %r11, %r37, %r38; | |
setp.ge.u32 %p5, %r11, %r23; | |
mov.f32 %f20, %f22; | |
@%p5 bra $L__BB155_9; | |
mad.lo.s32 %r39, %r12, %r23, %r11; | |
mul.wide.u32 %rd15, %r39, 4; | |
add.s64 %rd16, %rd2, %rd15; | |
ld.global.f32 %f20, [%rd16]; | |
$L__BB155_9: | |
// Store the four results with one 16-byte vector store at byte offset 4 * %r3.
mul.wide.u32 %rd17, %r3, 4; | |
add.s64 %rd18, %rd1, %rd17; | |
st.global.v4.f32 [%rd18], {%f17, %f18, %f19, %f20}; | |
$L__BB155_10: | |
// Leading scalar: only threads with index < %r2 process element [thread index].
setp.ge.u32 %p6, %r1, %r2; | |
@%p6 bra $L__BB155_14; | |
rem.u32 %r15, %r1, %r24; | |
setp.lt.u32 %p7, %r15, %r23; | |
mov.f32 %f21, %f22; | |
@%p7 bra $L__BB155_12; | |
bra.uni $L__BB155_13; | |
$L__BB155_12: | |
cvta.to.global.u64 %rd19, %rd3; | |
div.u32 %r40, %r1, %r24; | |
mad.lo.s32 %r41, %r40, %r23, %r15; | |
mul.wide.u32 %rd20, %r41, 4; | |
add.s64 %rd21, %rd19, %rd20; | |
ld.global.f32 %f21, [%rd21]; | |
$L__BB155_13: | |
mul.wide.u32 %rd22, %r1, 4; | |
add.s64 %rd23, %rd1, %rd22; | |
st.global.f32 [%rd23], %f21; | |
$L__BB155_14: | |
// Trailing scalar: %r16 = count - ((count - %r2) & 3) + thread index, processed when %r16 < count.
sub.s32 %r42, %r20, %r2; | |
and.b32 %r43, %r42, 3; | |
sub.s32 %r44, %r20, %r43; | |
add.s32 %r16, %r44, %r1; | |
setp.ge.u32 %p8, %r16, %r20; | |
@%p8 bra $L__BB155_18; | |
rem.u32 %r19, %r16, %r24; | |
setp.lt.u32 %p9, %r19, %r23; | |
@%p9 bra $L__BB155_16; | |
bra.uni $L__BB155_17; | |
$L__BB155_16: | |
// Last use of the fill register, so the loaded value overwrites %f22 directly.
cvta.to.global.u64 %rd24, %rd3; | |
div.u32 %r45, %r16, %r24; | |
mad.lo.s32 %r46, %r45, %r23, %r19; | |
mul.wide.u32 %rd25, %r46, 4; | |
add.s64 %rd26, %rd24, %rd25; | |
ld.global.f32 %f22, [%rd26]; | |
$L__BB155_17: | |
mul.wide.u32 %rd27, %r16, 4; | |
add.s64 %rd28, %rd1, %rd27; | |
st.global.f32 [%rd28], %f22; | |
$L__BB155_18: | |
ret; | |
} | |
// Entry name demangles to raft::linalg::detail::map_kernel<2, true, float, unsigned int, L>, where L is a
// lambda taking unsigned int, defined inside raft::neighbors::ivf_pq::detail::select_clusters<float>(...).
// param_0: 64-bit address written with f32 stores (output).
// param_1: 32-bit element count.
// param_2: 24-byte lambda object: 64-bit source address at offset 0, u32 A at offset 8 (%r19),
//          u32 B at offset 12 (%r20), f32 fill value at offset 16 (%f16).
// Each output is value(i) = (i % B < A) ? src[(i / B) * A + (i % B)] : fill.
// A thread handles at most one 2-element vector, one leading scalar and one trailing scalar.
// Presumably A is the source row width and B the padded output row width - confirm against select_clusters.
.visible .entry _ZN4raft6linalg6detail10map_kernelILi2ELb1EfjZNS_9neighbors6ivf_pq6detail15select_clustersIfEEvRKNS_16device_resourcesEPjPfjjjjjNS_8distance12DistanceTypeEPKT_PKfPN3rmm2mr22device_memory_resourceEEUljE_JEEEvPT1_T2_T3_DpPKT4_( | |
.param .u64 _ZN4raft6linalg6detail10map_kernelILi2ELb1EfjZNS_9neighbors6ivf_pq6detail15select_clustersIfEEvRKNS_16device_resourcesEPjPfjjjjjNS_8distance12DistanceTypeEPKT_PKfPN3rmm2mr22device_memory_resourceEEUljE_JEEEvPT1_T2_T3_DpPKT4__param_0, | |
.param .u32 _ZN4raft6linalg6detail10map_kernelILi2ELb1EfjZNS_9neighbors6ivf_pq6detail15select_clustersIfEEvRKNS_16device_resourcesEPjPfjjjjjNS_8distance12DistanceTypeEPKT_PKfPN3rmm2mr22device_memory_resourceEEUljE_JEEEvPT1_T2_T3_DpPKT4__param_1, | |
.param .align 8 .b8 _ZN4raft6linalg6detail10map_kernelILi2ELb1EfjZNS_9neighbors6ivf_pq6detail15select_clustersIfEEvRKNS_16device_resourcesEPjPfjjjjjNS_8distance12DistanceTypeEPKT_PKfPN3rmm2mr22device_memory_resourceEEUljE_JEEEvPT1_T2_T3_DpPKT4__param_2[24] | |
) | |
{ | |
.reg .pred %p<8>; | |
.reg .b16 %rs<9>; | |
.reg .f32 %f<17>; | |
.reg .b32 %r<39>; | |
.reg .b64 %rd<25>; | |
// %rd4 = output address, %r16 = count, %r19 = A, %r20 = B, %f16 = fill, %rd3 = source address.
ld.param.u64 %rd4, [_ZN4raft6linalg6detail10map_kernelILi2ELb1EfjZNS_9neighbors6ivf_pq6detail15select_clustersIfEEvRKNS_16device_resourcesEPjPfjjjjjNS_8distance12DistanceTypeEPKT_PKfPN3rmm2mr22device_memory_resourceEEUljE_JEEEvPT1_T2_T3_DpPKT4__param_0]; | |
ld.param.u32 %r16, [_ZN4raft6linalg6detail10map_kernelILi2ELb1EfjZNS_9neighbors6ivf_pq6detail15select_clustersIfEEvRKNS_16device_resourcesEPjPfjjjjjNS_8distance12DistanceTypeEPKT_PKfPN3rmm2mr22device_memory_resourceEEUljE_JEEEvPT1_T2_T3_DpPKT4__param_1]; | |
ld.param.v2.u32 {%r19, %r20}, [_ZN4raft6linalg6detail10map_kernelILi2ELb1EfjZNS_9neighbors6ivf_pq6detail15select_clustersIfEEvRKNS_16device_resourcesEPjPfjjjjjNS_8distance12DistanceTypeEPKT_PKfPN3rmm2mr22device_memory_resourceEEUljE_JEEEvPT1_T2_T3_DpPKT4__param_2+8]; | |
ld.param.f32 %f16, [_ZN4raft6linalg6detail10map_kernelILi2ELb1EfjZNS_9neighbors6ivf_pq6detail15select_clustersIfEEvRKNS_16device_resourcesEPjPfjjjjjNS_8distance12DistanceTypeEPKT_PKfPN3rmm2mr22device_memory_resourceEEUljE_JEEEvPT1_T2_T3_DpPKT4__param_2+16]; | |
ld.param.u64 %rd3, [_ZN4raft6linalg6detail10map_kernelILi2ELb1EfjZNS_9neighbors6ivf_pq6detail15select_clustersIfEEvRKNS_16device_resourcesEPjPfjjjjjNS_8distance12DistanceTypeEPKT_PKfPN3rmm2mr22device_memory_resourceEEUljE_JEEEvPT1_T2_T3_DpPKT4__param_2]; | |
cvta.to.global.u64 %rd1, %rd4; | |
// %r1 = thread index = ctaid.x * ntid.x + tid.x.
mov.u32 %r21, %ntid.x; | |
mov.u32 %r22, %ctaid.x; | |
mov.u32 %r23, %tid.x; | |
mad.lo.s32 %r1, %r22, %r21, %r23; | |
// %r2 = min(elements before the next 8-byte boundary of the output address, count).
// The mask 17179869176 is 0x3FFFFFFF8: it rounds down to 8 bytes and keeps only the low 34 bits,
// which are the only bits that survive the shift by 2 and the truncation to 32 bits below.
add.s64 %rd5, %rd4, 7; | |
and.b64 %rd6, %rd5, 17179869176; | |
sub.s64 %rd7, %rd6, %rd4; | |
shr.u64 %rd8, %rd7, 2; | |
cvt.u32.u64 %r24, %rd8; | |
min.u32 %r2, %r24, %r16; | |
// %r3 = %r2 + 2 * thread index = first element of this thread's pair.
// Skip the vector body when %r3 + 2 > count.
shl.b32 %r25, %r1, 1; | |
add.s32 %r3, %r2, %r25; | |
add.s32 %r26, %r3, 2; | |
setp.gt.u32 %p1, %r26, %r16; | |
@%p1 bra $L__BB156_6; | |
cvta.to.global.u64 %rd2, %rd3; | |
// Element %r3: %f13 starts as fill and is replaced by a load when (%r3 % B) < A.
rem.u32 %r6, %r3, %r20; | |
setp.ge.u32 %p2, %r6, %r19; | |
mov.f32 %f13, %f16; | |
@%p2 bra $L__BB156_3; | |
div.u32 %r27, %r3, %r20; | |
mad.lo.s32 %r28, %r27, %r19, %r6; | |
mul.wide.u32 %rd9, %r28, 4; | |
add.s64 %rd10, %rd2, %rd9; | |
ld.global.f32 %f13, [%rd10]; | |
$L__BB156_3: | |
// Element %r3 + 1 -> %f14. Here the remainder is derived from the quotient (i - (i / B) * B).
add.s32 %r29, %r3, 1; | |
div.u32 %r8, %r29, %r20; | |
mul.lo.s32 %r30, %r8, %r20; | |
sub.s32 %r7, %r29, %r30; | |
setp.ge.u32 %p3, %r7, %r19; | |
mov.f32 %f14, %f16; | |
@%p3 bra $L__BB156_5; | |
mad.lo.s32 %r31, %r8, %r19, %r7; | |
mul.wide.u32 %rd11, %r31, 4; | |
add.s64 %rd12, %rd2, %rd11; | |
ld.global.f32 %f14, [%rd12]; | |
$L__BB156_5: | |
// Store both results with one 8-byte vector store at byte offset 4 * %r3.
mul.wide.u32 %rd13, %r3, 4; | |
add.s64 %rd14, %rd1, %rd13; | |
st.global.v2.f32 [%rd14], {%f13, %f14}; | |
$L__BB156_6: | |
// Leading scalar: only threads with index < %r2 process element [thread index].
setp.ge.u32 %p4, %r1, %r2; | |
@%p4 bra $L__BB156_10; | |
rem.u32 %r11, %r1, %r20; | |
setp.lt.u32 %p5, %r11, %r19; | |
mov.f32 %f15, %f16; | |
@%p5 bra $L__BB156_8; | |
bra.uni $L__BB156_9; | |
$L__BB156_8: | |
cvta.to.global.u64 %rd15, %rd3; | |
div.u32 %r32, %r1, %r20; | |
mad.lo.s32 %r33, %r32, %r19, %r11; | |
mul.wide.u32 %rd16, %r33, 4; | |
add.s64 %rd17, %rd15, %rd16; | |
ld.global.f32 %f15, [%rd17]; | |
$L__BB156_9: | |
mul.wide.u32 %rd18, %r1, 4; | |
add.s64 %rd19, %rd1, %rd18; | |
st.global.f32 [%rd19], %f15; | |
$L__BB156_10: | |
// Trailing scalar: %r12 = count - ((count - %r2) & 1) + thread index, processed when %r12 < count.
sub.s32 %r34, %r16, %r2; | |
and.b32 %r35, %r34, 1; | |
sub.s32 %r36, %r16, %r35; | |
add.s32 %r12, %r36, %r1; | |
setp.ge.u32 %p6, %r12, %r16; | |
@%p6 bra $L__BB156_14; | |
rem.u32 %r15, %r12, %r20; | |
setp.lt.u32 %p7, %r15, %r19; | |
@%p7 bra $L__BB156_12; | |
bra.uni $L__BB156_13; | |
$L__BB156_12: | |
// Last use of the fill register, so the loaded value overwrites %f16 directly.
cvta.to.global.u64 %rd20, %rd3; | |
div.u32 %r37, %r12, %r20; | |
mad.lo.s32 %r38, %r37, %r19, %r15; | |
mul.wide.u32 %rd21, %r38, 4; | |
add.s64 %rd22, %rd20, %rd21; | |
ld.global.f32 %f16, [%rd22]; | |
$L__BB156_13: | |
mul.wide.u32 %rd23, %r12, 4; | |
add.s64 %rd24, %rd1, %rd23; | |
st.global.f32 [%rd24], %f16; | |
$L__BB156_14: | |
ret; | |
} | |
// Entry name demangles to raft::linalg::detail::map_kernel<1, true, float, unsigned long, L>, where L is a
// lambda taking unsigned int, defined inside raft::neighbors::ivf_pq::detail::select_clusters<float>(...).
// param_0: 64-bit address written with f32 stores (output).
// param_1: 64-bit element count.
// param_2: 24-byte lambda object: 64-bit source address at offset 0, u32 A at offset 8 (%r6),
//          u32 B at offset 12 (%r7), f32 fill value at offset 16 (%f5).
// For i = ctaid.x * ntid.x + tid.x with i < count:
//   out[i] = (i % B < A) ? src[(i / B) * A + (i % B)] : fill
// Presumably A is the source row width and B the padded output row width - confirm against select_clusters.
.visible .entry _ZN4raft6linalg6detail10map_kernelILi1ELb1EfmZNS_9neighbors6ivf_pq6detail15select_clustersIfEEvRKNS_16device_resourcesEPjPfjjjjjNS_8distance12DistanceTypeEPKT_PKfPN3rmm2mr22device_memory_resourceEEUljE_JEEEvPT1_T2_T3_DpPKT4_( | |
.param .u64 _ZN4raft6linalg6detail10map_kernelILi1ELb1EfmZNS_9neighbors6ivf_pq6detail15select_clustersIfEEvRKNS_16device_resourcesEPjPfjjjjjNS_8distance12DistanceTypeEPKT_PKfPN3rmm2mr22device_memory_resourceEEUljE_JEEEvPT1_T2_T3_DpPKT4__param_0, | |
.param .u64 _ZN4raft6linalg6detail10map_kernelILi1ELb1EfmZNS_9neighbors6ivf_pq6detail15select_clustersIfEEvRKNS_16device_resourcesEPjPfjjjjjNS_8distance12DistanceTypeEPKT_PKfPN3rmm2mr22device_memory_resourceEEUljE_JEEEvPT1_T2_T3_DpPKT4__param_1, | |
.param .align 8 .b8 _ZN4raft6linalg6detail10map_kernelILi1ELb1EfmZNS_9neighbors6ivf_pq6detail15select_clustersIfEEvRKNS_16device_resourcesEPjPfjjjjjNS_8distance12DistanceTypeEPKT_PKfPN3rmm2mr22device_memory_resourceEEUljE_JEEEvPT1_T2_T3_DpPKT4__param_2[24] | |
) | |
{ | |
.reg .pred %p<3>; | |
.reg .b16 %rs<9>; | |
.reg .f32 %f<6>; | |
.reg .b32 %r<16>; | |
.reg .b64 %rd<11>; | |
// %rd2 = output address, %rd4 = count, %r6 = A, %r7 = B, %f5 = fill, %rd3 = source address.
ld.param.u64 %rd2, [_ZN4raft6linalg6detail10map_kernelILi1ELb1EfmZNS_9neighbors6ivf_pq6detail15select_clustersIfEEvRKNS_16device_resourcesEPjPfjjjjjNS_8distance12DistanceTypeEPKT_PKfPN3rmm2mr22device_memory_resourceEEUljE_JEEEvPT1_T2_T3_DpPKT4__param_0]; | |
ld.param.u64 %rd4, [_ZN4raft6linalg6detail10map_kernelILi1ELb1EfmZNS_9neighbors6ivf_pq6detail15select_clustersIfEEvRKNS_16device_resourcesEPjPfjjjjjNS_8distance12DistanceTypeEPKT_PKfPN3rmm2mr22device_memory_resourceEEUljE_JEEEvPT1_T2_T3_DpPKT4__param_1]; | |
ld.param.v2.u32 {%r6, %r7}, [_ZN4raft6linalg6detail10map_kernelILi1ELb1EfmZNS_9neighbors6ivf_pq6detail15select_clustersIfEEvRKNS_16device_resourcesEPjPfjjjjjNS_8distance12DistanceTypeEPKT_PKfPN3rmm2mr22device_memory_resourceEEUljE_JEEEvPT1_T2_T3_DpPKT4__param_2+8]; | |
ld.param.f32 %f5, [_ZN4raft6linalg6detail10map_kernelILi1ELb1EfmZNS_9neighbors6ivf_pq6detail15select_clustersIfEEvRKNS_16device_resourcesEPjPfjjjjjNS_8distance12DistanceTypeEPKT_PKfPN3rmm2mr22device_memory_resourceEEUljE_JEEEvPT1_T2_T3_DpPKT4__param_2+16]; | |
ld.param.u64 %rd3, [_ZN4raft6linalg6detail10map_kernelILi1ELb1EfmZNS_9neighbors6ivf_pq6detail15select_clustersIfEEvRKNS_16device_resourcesEPjPfjjjjjNS_8distance12DistanceTypeEPKT_PKfPN3rmm2mr22device_memory_resourceEEUljE_JEEEvPT1_T2_T3_DpPKT4__param_2]; | |
// %rd1 = i = ctaid.x * ntid.x + tid.x computed in 32 bits and zero-extended.
// Threads with i >= count return immediately.
mov.u32 %r8, %ntid.x; | |
mov.u32 %r9, %ctaid.x; | |
mov.u32 %r10, %tid.x; | |
mad.lo.s32 %r11, %r9, %r8, %r10; | |
cvt.u64.u32 %rd1, %r11; | |
setp.ge.u64 %p1, %rd1, %rd4; | |
@%p1 bra $L__BB157_4; | |
// %r12 = low 32 bits of i (lossless here because %rd1 was zero-extended from 32 bits).
// %r2 = i % B. When %r2 < A the value is loaded from src, otherwise %f5 keeps the fill value.
cvt.u32.u64 %r12, %rd1; | |
rem.u32 %r2, %r12, %r7; | |
setp.lt.u32 %p2, %r2, %r6; | |
@%p2 bra $L__BB157_2; | |
bra.uni $L__BB157_3; | |
$L__BB157_2: | |
// Load src[(i / B) * A + (i % B)] (4-byte elements) into %f5.
cvta.to.global.u64 %rd5, %rd3; | |
div.u32 %r14, %r12, %r7; | |
mad.lo.s32 %r15, %r14, %r6, %r2; | |
mul.wide.u32 %rd6, %r15, 4; | |
add.s64 %rd7, %rd5, %rd6; | |
ld.global.f32 %f5, [%rd7]; | |
$L__BB157_3: | |
// out[i] = %f5, addressed with the 64-bit index.
cvta.to.global.u64 %rd8, %rd2; | |
shl.b64 %rd9, %rd1, 2; | |
add.s64 %rd10, %rd8, %rd9; | |
st.global.f32 [%rd10], %f5; | |
$L__BB157_4: | |
ret; | |
} | |
// Entry name demangles to raft::linalg::detail::map_kernel<4, true, float, unsigned long, L>, where L is a
// lambda taking unsigned int, defined inside raft::neighbors::ivf_pq::detail::select_clusters<float>(...).
// param_0: 64-bit address written with f32 stores (output).
// param_1: 64-bit element count.
// param_2: 24-byte lambda object: 64-bit source address at offset 0, u32 A at offset 8 (%r19),
//          u32 B at offset 12 (%r20), f32 fill value at offset 16 (%f22).
// Each output is value(i) = (i % B < A) ? src[(i / B) * A + (i % B)] : fill, with i taken as 32 bits.
// A thread handles at most one 4-element vector, one leading scalar and one trailing scalar.
// Presumably A is the source row width and B the padded output row width - confirm against select_clusters.
.visible .entry _ZN4raft6linalg6detail10map_kernelILi4ELb1EfmZNS_9neighbors6ivf_pq6detail15select_clustersIfEEvRKNS_16device_resourcesEPjPfjjjjjNS_8distance12DistanceTypeEPKT_PKfPN3rmm2mr22device_memory_resourceEEUljE_JEEEvPT1_T2_T3_DpPKT4_( | |
.param .u64 _ZN4raft6linalg6detail10map_kernelILi4ELb1EfmZNS_9neighbors6ivf_pq6detail15select_clustersIfEEvRKNS_16device_resourcesEPjPfjjjjjNS_8distance12DistanceTypeEPKT_PKfPN3rmm2mr22device_memory_resourceEEUljE_JEEEvPT1_T2_T3_DpPKT4__param_0, | |
.param .u64 _ZN4raft6linalg6detail10map_kernelILi4ELb1EfmZNS_9neighbors6ivf_pq6detail15select_clustersIfEEvRKNS_16device_resourcesEPjPfjjjjjNS_8distance12DistanceTypeEPKT_PKfPN3rmm2mr22device_memory_resourceEEUljE_JEEEvPT1_T2_T3_DpPKT4__param_1, | |
.param .align 8 .b8 _ZN4raft6linalg6detail10map_kernelILi4ELb1EfmZNS_9neighbors6ivf_pq6detail15select_clustersIfEEvRKNS_16device_resourcesEPjPfjjjjjNS_8distance12DistanceTypeEPKT_PKfPN3rmm2mr22device_memory_resourceEEUljE_JEEEvPT1_T2_T3_DpPKT4__param_2[24] | |
) | |
{ | |
.reg .pred %p<10>; | |
.reg .b16 %rs<9>; | |
.reg .f32 %f<23>; | |
.reg .b32 %r<41>; | |
.reg .b64 %rd<39>; | |
// %rd9 = output address, %rd7 = count, %r19 = A, %r20 = B, %f22 = fill, %rd8 = source address.
ld.param.u64 %rd9, [_ZN4raft6linalg6detail10map_kernelILi4ELb1EfmZNS_9neighbors6ivf_pq6detail15select_clustersIfEEvRKNS_16device_resourcesEPjPfjjjjjNS_8distance12DistanceTypeEPKT_PKfPN3rmm2mr22device_memory_resourceEEUljE_JEEEvPT1_T2_T3_DpPKT4__param_0]; | |
ld.param.u64 %rd7, [_ZN4raft6linalg6detail10map_kernelILi4ELb1EfmZNS_9neighbors6ivf_pq6detail15select_clustersIfEEvRKNS_16device_resourcesEPjPfjjjjjNS_8distance12DistanceTypeEPKT_PKfPN3rmm2mr22device_memory_resourceEEUljE_JEEEvPT1_T2_T3_DpPKT4__param_1]; | |
ld.param.v2.u32 {%r19, %r20}, [_ZN4raft6linalg6detail10map_kernelILi4ELb1EfmZNS_9neighbors6ivf_pq6detail15select_clustersIfEEvRKNS_16device_resourcesEPjPfjjjjjNS_8distance12DistanceTypeEPKT_PKfPN3rmm2mr22device_memory_resourceEEUljE_JEEEvPT1_T2_T3_DpPKT4__param_2+8]; | |
ld.param.f32 %f22, [_ZN4raft6linalg6detail10map_kernelILi4ELb1EfmZNS_9neighbors6ivf_pq6detail15select_clustersIfEEvRKNS_16device_resourcesEPjPfjjjjjNS_8distance12DistanceTypeEPKT_PKfPN3rmm2mr22device_memory_resourceEEUljE_JEEEvPT1_T2_T3_DpPKT4__param_2+16]; | |
ld.param.u64 %rd8, [_ZN4raft6linalg6detail10map_kernelILi4ELb1EfmZNS_9neighbors6ivf_pq6detail15select_clustersIfEEvRKNS_16device_resourcesEPjPfjjjjjNS_8distance12DistanceTypeEPKT_PKfPN3rmm2mr22device_memory_resourceEEUljE_JEEEvPT1_T2_T3_DpPKT4__param_2]; | |
cvta.to.global.u64 %rd1, %rd9; | |
// %r24 = thread index = ctaid.x * ntid.x + tid.x, zero-extended into %rd2.
mov.u32 %r21, %ntid.x; | |
mov.u32 %r22, %ctaid.x; | |
mov.u32 %r23, %tid.x; | |
mad.lo.s32 %r24, %r22, %r21, %r23; | |
cvt.u64.u32 %rd2, %r24; | |
// %rd3 = min((((out + 15) & -16) - out) >> 2, count):
// number of 4-byte output elements that precede the next 16-byte boundary, capped at count.
add.s64 %rd10, %rd9, 15; | |
and.b64 %rd11, %rd10, -16; | |
sub.s64 %rd12, %rd11, %rd9; | |
shr.s64 %rd13, %rd12, 2; | |
min.u64 %rd3, %rd13, %rd7; | |
// %rd4 = %rd3 + 4 * thread index = first element of this thread's group of four.
// Skip the vector body when %rd4 + 4 > count.
mul.wide.u32 %rd14, %r24, 4; | |
add.s64 %rd4, %rd3, %rd14; | |
add.s64 %rd15, %rd4, 4; | |
setp.gt.u64 %p1, %rd15, %rd7; | |
@%p1 bra $L__BB158_10; | |
cvta.to.global.u64 %rd5, %rd8; | |
// The element index is truncated to 32 bits (%r2) before div/rem, matching the lambda's
// unsigned int parameter in the mangled name. The store address below still uses the 64-bit %rd4.
cvt.u32.u64 %r2, %rd4; | |
// Element %r2: %f17 starts as fill and is replaced by a load when (%r2 % B) < A.
rem.u32 %r4, %r2, %r20; | |
setp.ge.u32 %p2, %r4, %r19; | |
mov.f32 %f17, %f22; | |
@%p2 bra $L__BB158_3; | |
div.u32 %r25, %r2, %r20; | |
mad.lo.s32 %r26, %r25, %r19, %r4; | |
mul.wide.u32 %rd16, %r26, 4; | |
add.s64 %rd17, %rd5, %rd16; | |
ld.global.f32 %f17, [%rd17]; | |
$L__BB158_3: | |
// Element %r2 + 1 -> %f18, same selection rule.
add.s32 %r5, %r2, 1; | |
rem.u32 %r6, %r5, %r20; | |
setp.ge.u32 %p3, %r6, %r19; | |
mov.f32 %f18, %f22; | |
@%p3 bra $L__BB158_5; | |
div.u32 %r27, %r5, %r20; | |
mad.lo.s32 %r28, %r27, %r19, %r6; | |
mul.wide.u32 %rd18, %r28, 4; | |
add.s64 %rd19, %rd5, %rd18; | |
ld.global.f32 %f18, [%rd19]; | |
$L__BB158_5: | |
// Element %r2 + 2 -> %f19, same selection rule.
add.s32 %r7, %r2, 2; | |
rem.u32 %r8, %r7, %r20; | |
setp.ge.u32 %p4, %r8, %r19; | |
mov.f32 %f19, %f22; | |
@%p4 bra $L__BB158_7; | |
div.u32 %r29, %r7, %r20; | |
mad.lo.s32 %r30, %r29, %r19, %r8; | |
mul.wide.u32 %rd20, %r30, 4; | |
add.s64 %rd21, %rd5, %rd20; | |
ld.global.f32 %f19, [%rd21]; | |
$L__BB158_7: | |
// Element %r2 + 3 -> %f20. Here the remainder is derived from the quotient (i - (i / B) * B).
add.s32 %r31, %r2, 3; | |
div.u32 %r10, %r31, %r20; | |
mul.lo.s32 %r32, %r10, %r20; | |
sub.s32 %r9, %r31, %r32; | |
setp.ge.u32 %p5, %r9, %r19; | |
mov.f32 %f20, %f22; | |
@%p5 bra $L__BB158_9; | |
mad.lo.s32 %r33, %r10, %r19, %r9; | |
mul.wide.u32 %rd22, %r33, 4; | |
add.s64 %rd23, %rd5, %rd22; | |
ld.global.f32 %f20, [%rd23]; | |
$L__BB158_9: | |
// Store the four results with one 16-byte vector store at byte offset 4 * %rd4.
shl.b64 %rd24, %rd4, 2; | |
add.s64 %rd25, %rd1, %rd24; | |
st.global.v4.f32 [%rd25], {%f17, %f18, %f19, %f20}; | |
$L__BB158_10: | |
// Leading scalar: only threads with index < %rd3 process element [thread index].
setp.le.u64 %p6, %rd3, %rd2; | |
@%p6 bra $L__BB158_14; | |
cvt.u32.u64 %r34, %rd2; | |
rem.u32 %r12, %r34, %r20; | |
setp.lt.u32 %p7, %r12, %r19; | |
mov.f32 %f21, %f22; | |
@%p7 bra $L__BB158_12; | |
bra.uni $L__BB158_13; | |
$L__BB158_12: | |
cvta.to.global.u64 %rd26, %rd8; | |
div.u32 %r36, %r34, %r20; | |
mad.lo.s32 %r37, %r36, %r19, %r12; | |
mul.wide.u32 %rd27, %r37, 4; | |
add.s64 %rd28, %rd26, %rd27; | |
ld.global.f32 %f21, [%rd28]; | |
$L__BB158_13: | |
shl.b64 %rd29, %rd2, 2; | |
add.s64 %rd30, %rd1, %rd29; | |
st.global.f32 [%rd30], %f21; | |
$L__BB158_14: | |
// Trailing scalar: %rd6 = count - ((count - %rd3) & 3) + thread index, processed when %rd6 < count.
sub.s64 %rd31, %rd7, %rd3; | |
and.b64 %rd32, %rd31, 3; | |
sub.s64 %rd33, %rd7, %rd32; | |
add.s64 %rd6, %rd33, %rd2; | |
setp.ge.u64 %p8, %rd6, %rd7; | |
@%p8 bra $L__BB158_18; | |
// Index truncated to 32 bits, quotient %r16 and remainder %r15 computed from one division.
cvt.u32.u64 %r38, %rd6; | |
div.u32 %r16, %r38, %r20; | |
mul.lo.s32 %r39, %r16, %r20; | |
sub.s32 %r15, %r38, %r39; | |
setp.lt.u32 %p9, %r15, %r19; | |
@%p9 bra $L__BB158_16; | |
bra.uni $L__BB158_17; | |
$L__BB158_16: | |
// Last use of the fill register, so the loaded value overwrites %f22 directly.
cvta.to.global.u64 %rd34, %rd8; | |
mad.lo.s32 %r40, %r16, %r19, %r15; | |
mul.wide.u32 %rd35, %r40, 4; | |
add.s64 %rd36, %rd34, %rd35; | |
ld.global.f32 %f22, [%rd36]; | |
$L__BB158_17: | |
shl.b64 %rd37, %rd6, 2; | |
add.s64 %rd38, %rd1, %rd37; | |
st.global.f32 [%rd38], %f22; | |
$L__BB158_18: | |
ret; | |
} | |
// Entry name demangles to raft::linalg::detail::map_kernel<2, true, float, unsigned long, L>, where L is a
// lambda taking unsigned int, defined inside raft::neighbors::ivf_pq::detail::select_clusters<float>(...).
// param_0: 64-bit address written with f32 stores (output).
// param_1: 64-bit element count.
// param_2: 24-byte lambda object: 64-bit source address at offset 0, u32 A at offset 8 (%r15),
//          u32 B at offset 12 (%r16), f32 fill value at offset 16 (%f16).
// Each output is value(i) = (i % B < A) ? src[(i / B) * A + (i % B)] : fill, with i taken as 32 bits.
// A thread handles at most one 2-element vector, one leading scalar and one trailing scalar.
// Presumably A is the source row width and B the padded output row width - confirm against select_clusters.
.visible .entry _ZN4raft6linalg6detail10map_kernelILi2ELb1EfmZNS_9neighbors6ivf_pq6detail15select_clustersIfEEvRKNS_16device_resourcesEPjPfjjjjjNS_8distance12DistanceTypeEPKT_PKfPN3rmm2mr22device_memory_resourceEEUljE_JEEEvPT1_T2_T3_DpPKT4_( | |
.param .u64 _ZN4raft6linalg6detail10map_kernelILi2ELb1EfmZNS_9neighbors6ivf_pq6detail15select_clustersIfEEvRKNS_16device_resourcesEPjPfjjjjjNS_8distance12DistanceTypeEPKT_PKfPN3rmm2mr22device_memory_resourceEEUljE_JEEEvPT1_T2_T3_DpPKT4__param_0, | |
.param .u64 _ZN4raft6linalg6detail10map_kernelILi2ELb1EfmZNS_9neighbors6ivf_pq6detail15select_clustersIfEEvRKNS_16device_resourcesEPjPfjjjjjNS_8distance12DistanceTypeEPKT_PKfPN3rmm2mr22device_memory_resourceEEUljE_JEEEvPT1_T2_T3_DpPKT4__param_1, | |
.param .align 8 .b8 _ZN4raft6linalg6detail10map_kernelILi2ELb1EfmZNS_9neighbors6ivf_pq6detail15select_clustersIfEEvRKNS_16device_resourcesEPjPfjjjjjNS_8distance12DistanceTypeEPKT_PKfPN3rmm2mr22device_memory_resourceEEUljE_JEEEvPT1_T2_T3_DpPKT4__param_2[24] | |
) | |
{ | |
.reg .pred %p<8>; | |
.reg .b16 %rs<9>; | |
.reg .f32 %f<17>; | |
.reg .b32 %r<33>; | |
.reg .b64 %rd<35>; | |
// %rd9 = output address, %rd7 = count, %r15 = A, %r16 = B, %f16 = fill, %rd8 = source address.
ld.param.u64 %rd9, [_ZN4raft6linalg6detail10map_kernelILi2ELb1EfmZNS_9neighbors6ivf_pq6detail15select_clustersIfEEvRKNS_16device_resourcesEPjPfjjjjjNS_8distance12DistanceTypeEPKT_PKfPN3rmm2mr22device_memory_resourceEEUljE_JEEEvPT1_T2_T3_DpPKT4__param_0]; | |
ld.param.u64 %rd7, [_ZN4raft6linalg6detail10map_kernelILi2ELb1EfmZNS_9neighbors6ivf_pq6detail15select_clustersIfEEvRKNS_16device_resourcesEPjPfjjjjjNS_8distance12DistanceTypeEPKT_PKfPN3rmm2mr22device_memory_resourceEEUljE_JEEEvPT1_T2_T3_DpPKT4__param_1]; | |
ld.param.v2.u32 {%r15, %r16}, [_ZN4raft6linalg6detail10map_kernelILi2ELb1EfmZNS_9neighbors6ivf_pq6detail15select_clustersIfEEvRKNS_16device_resourcesEPjPfjjjjjNS_8distance12DistanceTypeEPKT_PKfPN3rmm2mr22device_memory_resourceEEUljE_JEEEvPT1_T2_T3_DpPKT4__param_2+8]; | |
ld.param.f32 %f16, [_ZN4raft6linalg6detail10map_kernelILi2ELb1EfmZNS_9neighbors6ivf_pq6detail15select_clustersIfEEvRKNS_16device_resourcesEPjPfjjjjjNS_8distance12DistanceTypeEPKT_PKfPN3rmm2mr22device_memory_resourceEEUljE_JEEEvPT1_T2_T3_DpPKT4__param_2+16]; | |
ld.param.u64 %rd8, [_ZN4raft6linalg6detail10map_kernelILi2ELb1EfmZNS_9neighbors6ivf_pq6detail15select_clustersIfEEvRKNS_16device_resourcesEPjPfjjjjjNS_8distance12DistanceTypeEPKT_PKfPN3rmm2mr22device_memory_resourceEEUljE_JEEEvPT1_T2_T3_DpPKT4__param_2]; | |
cvta.to.global.u64 %rd1, %rd9; | |
// %r20 = thread index = ctaid.x * ntid.x + tid.x, zero-extended into %rd2.
mov.u32 %r17, %ntid.x; | |
mov.u32 %r18, %ctaid.x; | |
mov.u32 %r19, %tid.x; | |
mad.lo.s32 %r20, %r18, %r17, %r19; | |
cvt.u64.u32 %rd2, %r20; | |
// %rd3 = min((((out + 7) & -8) - out) >> 2, count):
// number of 4-byte output elements that precede the next 8-byte boundary, capped at count.
add.s64 %rd10, %rd9, 7; | |
and.b64 %rd11, %rd10, -8; | |
sub.s64 %rd12, %rd11, %rd9; | |
shr.s64 %rd13, %rd12, 2; | |
min.u64 %rd3, %rd13, %rd7; | |
// %rd4 = %rd3 + 2 * thread index = first element of this thread's pair.
// Skip the vector body when %rd4 + 2 > count.
mul.wide.u32 %rd14, %r20, 2; | |
add.s64 %rd4, %rd3, %rd14; | |
add.s64 %rd15, %rd4, 2; | |
setp.gt.u64 %p1, %rd15, %rd7; | |
@%p1 bra $L__BB159_6; | |
cvta.to.global.u64 %rd5, %rd8; | |
// The element index is truncated to 32 bits (%r2) before div/rem, matching the lambda's
// unsigned int parameter in the mangled name. The store address below still uses the 64-bit %rd4.
cvt.u32.u64 %r2, %rd4; | |
// Element %r2: %f13 starts as fill and is replaced by a load when (%r2 % B) < A.
rem.u32 %r4, %r2, %r16; | |
setp.ge.u32 %p2, %r4, %r15; | |
mov.f32 %f13, %f16; | |
@%p2 bra $L__BB159_3; | |
div.u32 %r21, %r2, %r16; | |
mad.lo.s32 %r22, %r21, %r15, %r4; | |
mul.wide.u32 %rd16, %r22, 4; | |
add.s64 %rd17, %rd5, %rd16; | |
ld.global.f32 %f13, [%rd17]; | |
$L__BB159_3: | |
// Element %r2 + 1 -> %f14. Here the remainder is derived from the quotient (i - (i / B) * B).
add.s32 %r23, %r2, 1; | |
div.u32 %r6, %r23, %r16; | |
mul.lo.s32 %r24, %r6, %r16; | |
sub.s32 %r5, %r23, %r24; | |
setp.ge.u32 %p3, %r5, %r15; | |
mov.f32 %f14, %f16; | |
@%p3 bra $L__BB159_5; | |
mad.lo.s32 %r25, %r6, %r15, %r5; | |
mul.wide.u32 %rd18, %r25, 4; | |
add.s64 %rd19, %rd5, %rd18; | |
ld.global.f32 %f14, [%rd19]; | |
$L__BB159_5: | |
// Store both results with one 8-byte vector store at byte offset 4 * %rd4.
shl.b64 %rd20, %rd4, 2; | |
add.s64 %rd21, %rd1, %rd20; | |
st.global.v2.f32 [%rd21], {%f13, %f14}; | |
$L__BB159_6: | |
// Leading scalar: only threads with index < %rd3 process element [thread index].
setp.le.u64 %p4, %rd3, %rd2; | |
@%p4 bra $L__BB159_10; | |
cvt.u32.u64 %r26, %rd2; | |
rem.u32 %r8, %r26, %r16; | |
setp.lt.u32 %p5, %r8, %r15; | |
mov.f32 %f15, %f16; | |
@%p5 bra $L__BB159_8; | |
bra.uni $L__BB159_9; | |
$L__BB159_8: | |
cvta.to.global.u64 %rd22, %rd8; | |
div.u32 %r28, %r26, %r16; | |
mad.lo.s32 %r29, %r28, %r15, %r8; | |
mul.wide.u32 %rd23, %r29, 4; | |
add.s64 %rd24, %rd22, %rd23; | |
ld.global.f32 %f15, [%rd24]; | |
$L__BB159_9: | |
shl.b64 %rd25, %rd2, 2; | |
add.s64 %rd26, %rd1, %rd25; | |
st.global.f32 [%rd26], %f15; | |
$L__BB159_10: | |
// Trailing scalar: %rd6 = count - ((count - %rd3) & 1) + thread index, processed when %rd6 < count.
sub.s64 %rd27, %rd7, %rd3; | |
and.b64 %rd28, %rd27, 1; | |
sub.s64 %rd29, %rd7, %rd28; | |
add.s64 %rd6, %rd29, %rd2; | |
setp.ge.u64 %p6, %rd6, %rd7; | |
@%p6 bra $L__BB159_14; | |
// Index truncated to 32 bits, quotient %r12 and remainder %r11 computed from one division.
cvt.u32.u64 %r30, %rd6; | |
div.u32 %r12, %r30, %r16; | |
mul.lo.s32 %r31, %r12, %r16; | |
sub.s32 %r11, %r30, %r31; | |
setp.lt.u32 %p7, %r11, %r15; | |
@%p7 bra $L__BB159_12; | |
bra.uni $L__BB159_13; | |
$L__BB159_12: | |
// Last use of the fill register, so the loaded value overwrites %f16 directly.
cvta.to.global.u64 %rd30, %rd8; | |
mad.lo.s32 %r32, %r12, %r15, %r11; | |
mul.wide.u32 %rd31, %r32, 4; | |
add.s64 %rd32, %rd30, %rd31; | |
ld.global.f32 %f16, [%rd32]; | |
$L__BB159_13: | |
shl.b64 %rd33, %rd6, 2; | |
add.s64 %rd34, %rd1, %rd33; | |
st.global.f32 [%rd34], %f16; | |
$L__BB159_14: | |
ret; | |
} | |
// ---------------------------------------------------------------------------
// initKernel entry point.
//
// The mangled name decodes (Itanium ABI) to
//   raft::distance::detail::initKernel<float, KeyValuePair<long, float>, long,
//                                      MinAndDistanceReduceOpImpl<long, float>>
//
// Parameters, as used by the body below:
//   param_0 (.u64) : generic address of the output array; converted to a
//                    global-space address before the stores.
//   param_1 (.u64) : element count; compared as a SIGNED 64-bit value against
//                    the thread's flat index.
//   param_2 (.f32) : value written at byte offset 8 of each 16-byte slot.
//   param_3 (.b8)  : 1-byte by-value argument; never read in this body.
//
// Each thread whose flat index (ctaid.x * ntid.x + tid.x) is below param_1
// writes one 16-byte slot: a 64-bit zero at offset 0 and param_2 at offset 8.
// Bytes 12..15 of the slot are not written.
// NOTE(review): offsets 0 / 8 presumably correspond to the key / value fields
// of KeyValuePair<long, float> -- confirm against the struct definition.
// ---------------------------------------------------------------------------
.visible .entry _ZN4raft8distance6detail10initKernelIfNS_12KeyValuePairIlfEElNS1_26MinAndDistanceReduceOpImplIlfEEEEvPT0_T1_T_T2_(
    .param .u64 _ZN4raft8distance6detail10initKernelIfNS_12KeyValuePairIlfEElNS1_26MinAndDistanceReduceOpImplIlfEEEEvPT0_T1_T_T2__param_0,
    .param .u64 _ZN4raft8distance6detail10initKernelIfNS_12KeyValuePairIlfEElNS1_26MinAndDistanceReduceOpImplIlfEEEEvPT0_T1_T_T2__param_1,
    .param .f32 _ZN4raft8distance6detail10initKernelIfNS_12KeyValuePairIlfEElNS1_26MinAndDistanceReduceOpImplIlfEEEEvPT0_T1_T_T2__param_2,
    .param .align 1 .b8 _ZN4raft8distance6detail10initKernelIfNS_12KeyValuePairIlfEElNS1_26MinAndDistanceReduceOpImplIlfEEEEvPT0_T1_T_T2__param_3[1]
)
{
    .reg .pred  %p_in_range;
    .reg .f32   %f_fill;
    .reg .b32   %r_tid, %r_ntid, %r_cta;
    .reg .b64   %rd_lane, %rd_cta_base, %rd_gid, %rd_count;
    .reg .b64   %rd_out, %rd_out_glob, %rd_byte_off, %rd_slot, %rd_zero;

    // Flat 64-bit thread index: gid = ctaid.x * ntid.x + tid.x.
    // mul.wide keeps the full 64-bit product, so the block base cannot wrap
    // at 32 bits.
    mov.u32         %r_tid, %tid.x;
    mov.u32         %r_ntid, %ntid.x;
    mov.u32         %r_cta, %ctaid.x;
    cvt.u64.u32     %rd_lane, %r_tid;
    mul.wide.u32    %rd_cta_base, %r_cta, %r_ntid;
    add.s64         %rd_gid, %rd_cta_base, %rd_lane;

    // Bounds guard: threads with gid >= count (signed compare) do nothing.
    ld.param.u64    %rd_count, [_ZN4raft8distance6detail10initKernelIfNS_12KeyValuePairIlfEElNS1_26MinAndDistanceReduceOpImplIlfEEEEvPT0_T1_T_T2__param_1];
    setp.lt.s64     %p_in_range, %rd_gid, %rd_count;
    @!%p_in_range bra $L__init_done;

    // slot = global(out) + gid * 16   (shift by 4 == 16-byte element stride)
    ld.param.u64    %rd_out, [_ZN4raft8distance6detail10initKernelIfNS_12KeyValuePairIlfEElNS1_26MinAndDistanceReduceOpImplIlfEEEEvPT0_T1_T_T2__param_0];
    ld.param.f32    %f_fill, [_ZN4raft8distance6detail10initKernelIfNS_12KeyValuePairIlfEElNS1_26MinAndDistanceReduceOpImplIlfEEEEvPT0_T1_T_T2__param_2];
    cvta.to.global.u64 %rd_out_glob, %rd_out;
    shl.b64         %rd_byte_off, %rd_gid, 4;
    add.s64         %rd_slot, %rd_out_glob, %rd_byte_off;

    // Initialise the slot: 64-bit zero at +0, caller-supplied float at +8.
    mov.u64         %rd_zero, 0;
    st.global.u64   [%rd_slot], %rd_zero;
    st.global.f32   [%rd_slot+8], %f_fill;

$L__init_done:
    ret;
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment