Skip to content

Instantly share code, notes, and snippets.

Show Gist options
  • Save alexanderguzhva/f3d86dc42d7a11ff85293c8804e304d4 to your computer and use it in GitHub Desktop.
Save alexanderguzhva/f3d86dc42d7a11ff85293c8804e304d4 to your computer and use it in GitHub Desktop.
raft version without compose
cvt.u32.u64 %r6, %rd5;
cvt.u32.u64 %r7, %rd4;
rem.u32 %r8, %r7, %r6;
cvt.u64.u32 %rd38, %r8;
$L__BB141_4:
add.s64 %rd9, %rd4, 1;
and.b64 %rd27, %rd9, -4294967296;
setp.eq.s64 %p3, %rd27, 0;
@%p3 bra $L__BB141_6;
rem.u64 %rd39, %rd9, %rd5;
bra.uni $L__BB141_7;
$L__BB141_6:
cvt.u32.u64 %r9, %rd5;
cvt.u32.u64 %r10, %rd9;
rem.u32 %r11, %r10, %r9;
cvt.u64.u32 %rd39, %r11;
$L__BB141_7:
cvt.u32.u64 %r12, %rd39;
cvt.u32.u64 %r13, %rd38;
shl.b64 %rd28, %rd4, 2;
add.s64 %rd29, %rd1, %rd28;
st.global.v2.u32 [%rd29], {%r13, %r12};
$L__BB141_8:
setp.le.u64 %p4, %rd3, %rd2;
@%p4 bra $L__BB141_10;
cvt.u32.u64 %r14, %rd2;
rem.u32 %r15, %r14, %r1;
shl.b64 %rd30, %rd2, 2;
add.s64 %rd31, %rd1, %rd30;
st.global.u32 [%rd31], %r15;
$L__BB141_10:
sub.s64 %rd32, %rd18, %rd3;
and.b64 %rd33, %rd32, 1;
sub.s64 %rd34, %rd18, %rd33;
add.s64 %rd13, %rd34, %rd2;
setp.ge.u64 %p5, %rd13, %rd18;
@%p5 bra $L__BB141_15;
cvt.u64.u32 %rd14, %r1;
and.b64 %rd35, %rd13, -4294967296;
setp.eq.s64 %p6, %rd35, 0;
@%p6 bra $L__BB141_13;
rem.u64 %rd40, %rd13, %rd14;
bra.uni $L__BB141_14;
$L__BB141_13:
cvt.u32.u64 %r16, %rd14;
cvt.u32.u64 %r17, %rd13;
rem.u32 %r18, %r17, %r16;
cvt.u64.u32 %rd40, %r18;
$L__BB141_14:
shl.b64 %rd36, %rd13, 2;
add.s64 %rd37, %rd1, %rd36;
st.global.u32 [%rd37], %rd40;
$L__BB141_15:
ret;
}
// Entry symbol demangles to
//   raft::linalg::detail::map_kernel<1, false, float, unsigned long,
//                                    raft::cast_op<float>, float>
//       (float*, unsigned long, raft::cast_op<float>, const float*)
//
// Scalar copy kernel, one element per thread and no loop:
//   i = ctaid.x * ntid.x + tid.x
//   if (i < param_1)  f32 at (param_0 + 4*i) = f32 at (param_3 + 4*i)
// param_2 (the 1-byte op object) is never read in this body.
.visible .entry _ZN4raft6linalg6detail10map_kernelILi1ELb0EfmNS_7cast_opIfEEJfEEEvPT1_T2_T3_DpPKT4_(
.param .u64 _ZN4raft6linalg6detail10map_kernelILi1ELb0EfmNS_7cast_opIfEEJfEEEvPT1_T2_T3_DpPKT4__param_0,
.param .u64 _ZN4raft6linalg6detail10map_kernelILi1ELb0EfmNS_7cast_opIfEEJfEEEvPT1_T2_T3_DpPKT4__param_1,
.param .align 1 .b8 _ZN4raft6linalg6detail10map_kernelILi1ELb0EfmNS_7cast_opIfEEJfEEEvPT1_T2_T3_DpPKT4__param_2[1],
.param .u64 _ZN4raft6linalg6detail10map_kernelILi1ELb0EfmNS_7cast_opIfEEJfEEEvPT1_T2_T3_DpPKT4__param_3
)
{
.reg .pred %p<2>;
.reg .f32 %f<2>;
.reg .b32 %r<5>;
.reg .b64 %rd<10>;
// %rd2 = param_0 (destination), %rd4 = param_1 (bound), %rd3 = param_3 (source).
ld.param.u64 %rd2, [_ZN4raft6linalg6detail10map_kernelILi1ELb0EfmNS_7cast_opIfEEJfEEEvPT1_T2_T3_DpPKT4__param_0];
ld.param.u64 %rd4, [_ZN4raft6linalg6detail10map_kernelILi1ELb0EfmNS_7cast_opIfEEJfEEEvPT1_T2_T3_DpPKT4__param_1];
ld.param.u64 %rd3, [_ZN4raft6linalg6detail10map_kernelILi1ELb0EfmNS_7cast_opIfEEJfEEEvPT1_T2_T3_DpPKT4__param_3];
// Thread index is computed in 32 bits (mad.lo keeps the low 32 bits of the
// product) and then zero-extended to 64 bits for the comparison/addressing.
mov.u32 %r1, %ctaid.x;
mov.u32 %r2, %ntid.x;
mov.u32 %r3, %tid.x;
mad.lo.s32 %r4, %r1, %r2, %r3;
cvt.u64.u32 %rd1, %r4;
// Unsigned bounds check: threads with index >= param_1 do nothing.
setp.ge.u64 %p1, %rd1, %rd4;
@%p1 bra $L__BB142_2;
// Byte offset = index * 4; the same offset is applied to source and destination.
cvta.to.global.u64 %rd5, %rd3;
shl.b64 %rd6, %rd1, 2;
add.s64 %rd7, %rd5, %rd6;
ld.global.f32 %f1, [%rd7];
cvta.to.global.u64 %rd8, %rd2;
add.s64 %rd9, %rd8, %rd6;
st.global.f32 [%rd9], %f1;
$L__BB142_2:
ret;
}
// Entry symbol demangles to
//   raft::linalg::detail::map_kernel<4, false, float, unsigned long,
//                                    raft::cast_op<float>, float>
//       (float*, unsigned long, raft::cast_op<float>, const float*)
//
// 4-wide copy kernel. With gid = ctaid.x * ntid.x + tid.x, each thread runs up
// to three independent, loop-free steps:
//   1. vector step: one v4.f32 load/store at element index head + 4*gid
//   2. head step:   one scalar copy at index gid, when gid < head
//   3. tail step:   one scalar copy at index n - ((n - head) & 3) + gid
// where n = param_1 and head = number of 4-byte elements between param_0 and
// the next 16-byte boundary, clamped to n. param_2 is never read.
// NOTE(review): head is derived from param_0 only; the vector load from
// param_3 is 16-byte aligned only if both pointers share the same offset
// modulo 16 - confirm the launcher guarantees that.
.visible .entry _ZN4raft6linalg6detail10map_kernelILi4ELb0EfmNS_7cast_opIfEEJfEEEvPT1_T2_T3_DpPKT4_(
.param .u64 _ZN4raft6linalg6detail10map_kernelILi4ELb0EfmNS_7cast_opIfEEJfEEEvPT1_T2_T3_DpPKT4__param_0,
.param .u64 _ZN4raft6linalg6detail10map_kernelILi4ELb0EfmNS_7cast_opIfEEJfEEEvPT1_T2_T3_DpPKT4__param_1,
.param .align 1 .b8 _ZN4raft6linalg6detail10map_kernelILi4ELb0EfmNS_7cast_opIfEEJfEEEvPT1_T2_T3_DpPKT4__param_2[1],
.param .u64 _ZN4raft6linalg6detail10map_kernelILi4ELb0EfmNS_7cast_opIfEEJfEEEvPT1_T2_T3_DpPKT4__param_3
)
{
.reg .pred %p<4>;
.reg .f32 %f<7>;
.reg .b32 %r<5>;
.reg .b64 %rd<28>;
// %rd9 = param_0 (destination), %rd7 = param_1 (n), %rd8 = param_3 (source).
ld.param.u64 %rd9, [_ZN4raft6linalg6detail10map_kernelILi4ELb0EfmNS_7cast_opIfEEJfEEEvPT1_T2_T3_DpPKT4__param_0];
ld.param.u64 %rd7, [_ZN4raft6linalg6detail10map_kernelILi4ELb0EfmNS_7cast_opIfEEJfEEEvPT1_T2_T3_DpPKT4__param_1];
ld.param.u64 %rd8, [_ZN4raft6linalg6detail10map_kernelILi4ELb0EfmNS_7cast_opIfEEJfEEEvPT1_T2_T3_DpPKT4__param_3];
// %rd1 / %rd2 = source / destination converted to global-space addresses.
cvta.to.global.u64 %rd1, %rd8;
cvta.to.global.u64 %rd2, %rd9;
// %rd3 = gid, computed in 32 bits and zero-extended.
mov.u32 %r1, %ntid.x;
mov.u32 %r2, %ctaid.x;
mov.u32 %r3, %tid.x;
mad.lo.s32 %r4, %r2, %r1, %r3;
cvt.u64.u32 %rd3, %r4;
// %rd4 = head = min(((param_0 + 15) & ~15) - param_0) / 4, n).
add.s64 %rd10, %rd9, 15;
and.b64 %rd11, %rd10, -16;
sub.s64 %rd12, %rd11, %rd9;
shr.s64 %rd13, %rd12, 2;
min.u64 %rd4, %rd13, %rd7;
// %rd5 = head + 4*gid: first element of this thread's 4-element group.
mul.wide.u32 %rd14, %r4, 4;
add.s64 %rd5, %rd4, %rd14;
// Skip the vector step unless the whole group fits: (%rd5 + 4) <= n.
add.s64 %rd15, %rd5, 4;
setp.gt.u64 %p1, %rd15, %rd7;
@%p1 bra $L__BB143_2;
// Vector step. The load address is built from the raw parameter value %rd8,
// not from the cvta'd %rd1, and uses the non-coherent (.nc) load path.
shl.b64 %rd17, %rd5, 2;
add.s64 %rd16, %rd8, %rd17;
ld.global.nc.v4.f32 {%f1,%f2,%f3,%f4}, [%rd16];
add.s64 %rd18, %rd2, %rd17;
st.global.v4.f32 [%rd18], {%f1, %f2, %f3, %f4};
$L__BB143_2:
// Head step: only threads with gid < head copy element gid.
setp.le.u64 %p2, %rd4, %rd3;
@%p2 bra $L__BB143_4;
shl.b64 %rd19, %rd3, 2;
add.s64 %rd20, %rd1, %rd19;
ld.global.f32 %f5, [%rd20];
add.s64 %rd21, %rd2, %rd19;
st.global.f32 [%rd21], %f5;
$L__BB143_4:
// Tail step: %rd6 = n - ((n - head) & 3) + gid; copy it when %rd6 < n.
sub.s64 %rd22, %rd7, %rd4;
and.b64 %rd23, %rd22, 3;
sub.s64 %rd24, %rd7, %rd23;
add.s64 %rd6, %rd24, %rd3;
setp.ge.u64 %p3, %rd6, %rd7;
@%p3 bra $L__BB143_6;
shl.b64 %rd25, %rd6, 2;
add.s64 %rd26, %rd1, %rd25;
ld.global.f32 %f6, [%rd26];
add.s64 %rd27, %rd2, %rd25;
st.global.f32 [%rd27], %f6;
$L__BB143_6:
ret;
}
// Entry symbol demangles to
//   raft::linalg::detail::map_kernel<2, false, float, unsigned long,
//                                    raft::cast_op<float>, float>
//       (float*, unsigned long, raft::cast_op<float>, const float*)
//
// 2-wide copy kernel. With gid = ctaid.x * ntid.x + tid.x, each thread runs up
// to three independent, loop-free steps:
//   1. vector step: one v2.f32 load/store at element index head + 2*gid
//   2. head step:   one scalar copy at index gid, when gid < head
//   3. tail step:   one scalar copy at index n - ((n - head) & 1) + gid
// where n = param_1 and head = number of 4-byte elements between param_0 and
// the next 8-byte boundary, clamped to n. param_2 is never read.
// NOTE(review): head is derived from param_0 only; the vector load from
// param_3 is 8-byte aligned only if both pointers share the same offset
// modulo 8 - confirm the launcher guarantees that.
.visible .entry _ZN4raft6linalg6detail10map_kernelILi2ELb0EfmNS_7cast_opIfEEJfEEEvPT1_T2_T3_DpPKT4_(
.param .u64 _ZN4raft6linalg6detail10map_kernelILi2ELb0EfmNS_7cast_opIfEEJfEEEvPT1_T2_T3_DpPKT4__param_0,
.param .u64 _ZN4raft6linalg6detail10map_kernelILi2ELb0EfmNS_7cast_opIfEEJfEEEvPT1_T2_T3_DpPKT4__param_1,
.param .align 1 .b8 _ZN4raft6linalg6detail10map_kernelILi2ELb0EfmNS_7cast_opIfEEJfEEEvPT1_T2_T3_DpPKT4__param_2[1],
.param .u64 _ZN4raft6linalg6detail10map_kernelILi2ELb0EfmNS_7cast_opIfEEJfEEEvPT1_T2_T3_DpPKT4__param_3
)
{
.reg .pred %p<4>;
.reg .f32 %f<5>;
.reg .b32 %r<5>;
.reg .b64 %rd<28>;
// %rd9 = param_0 (destination), %rd7 = param_1 (n), %rd8 = param_3 (source).
ld.param.u64 %rd9, [_ZN4raft6linalg6detail10map_kernelILi2ELb0EfmNS_7cast_opIfEEJfEEEvPT1_T2_T3_DpPKT4__param_0];
ld.param.u64 %rd7, [_ZN4raft6linalg6detail10map_kernelILi2ELb0EfmNS_7cast_opIfEEJfEEEvPT1_T2_T3_DpPKT4__param_1];
ld.param.u64 %rd8, [_ZN4raft6linalg6detail10map_kernelILi2ELb0EfmNS_7cast_opIfEEJfEEEvPT1_T2_T3_DpPKT4__param_3];
// %rd1 / %rd2 = source / destination converted to global-space addresses.
cvta.to.global.u64 %rd1, %rd8;
cvta.to.global.u64 %rd2, %rd9;
// %rd3 = gid, computed in 32 bits and zero-extended.
mov.u32 %r1, %ntid.x;
mov.u32 %r2, %ctaid.x;
mov.u32 %r3, %tid.x;
mad.lo.s32 %r4, %r2, %r1, %r3;
cvt.u64.u32 %rd3, %r4;
// %rd4 = head = min((((param_0 + 7) & ~7) - param_0) / 4, n).
add.s64 %rd10, %rd9, 7;
and.b64 %rd11, %rd10, -8;
sub.s64 %rd12, %rd11, %rd9;
shr.s64 %rd13, %rd12, 2;
min.u64 %rd4, %rd13, %rd7;
// %rd5 = head + 2*gid: first element of this thread's 2-element group.
mul.wide.u32 %rd14, %r4, 2;
add.s64 %rd5, %rd4, %rd14;
// Skip the vector step unless the whole pair fits: (%rd5 + 2) <= n.
add.s64 %rd15, %rd5, 2;
setp.gt.u64 %p1, %rd15, %rd7;
@%p1 bra $L__BB144_2;
// Vector step. The load address is built from the raw parameter value %rd8,
// not from the cvta'd %rd1, and uses the non-coherent (.nc) load path.
shl.b64 %rd17, %rd5, 2;
add.s64 %rd16, %rd8, %rd17;
ld.global.nc.v2.f32 {%f1,%f2}, [%rd16];
add.s64 %rd18, %rd2, %rd17;
st.global.v2.f32 [%rd18], {%f1, %f2};
$L__BB144_2:
// Head step: only threads with gid < head copy element gid.
setp.le.u64 %p2, %rd4, %rd3;
@%p2 bra $L__BB144_4;
shl.b64 %rd19, %rd3, 2;
add.s64 %rd20, %rd1, %rd19;
ld.global.f32 %f3, [%rd20];
add.s64 %rd21, %rd2, %rd19;
st.global.f32 [%rd21], %f3;
$L__BB144_4:
// Tail step: %rd6 = n - ((n - head) & 1) + gid; copy it when %rd6 < n.
sub.s64 %rd22, %rd7, %rd4;
and.b64 %rd23, %rd22, 1;
sub.s64 %rd24, %rd7, %rd23;
add.s64 %rd6, %rd24, %rd3;
setp.ge.u64 %p3, %rd6, %rd7;
@%p3 bra $L__BB144_6;
shl.b64 %rd25, %rd6, 2;
add.s64 %rd26, %rd1, %rd25;
ld.global.f32 %f4, [%rd26];
add.s64 %rd27, %rd2, %rd25;
st.global.f32 [%rd27], %f4;
$L__BB144_6:
ret;
}
// Entry symbol demangles to
//   raft::linalg::detail::map_kernel<1, false, float, unsigned long,
//       raft::compose_op<raft::plug_const_op<float, raft::mul_op>,
//                        raft::sqrt_op, raft::cast_op<float>>, float>
//       (float*, unsigned long, compose_op<...>, const float*)
//
// Scalar kernel, one element per thread and no loop:
//   i = ctaid.x * ntid.x + tid.x
//   if (i < param_1)  out[i] = c * sqrt(in[i])
// with out = param_0, in = param_3 (both f32, 4-byte stride) and c = the f32
// stored at byte offset 4 of the 12-byte op object param_2.
// NOTE(review): c is presumably the constant held by plug_const_op - confirm
// against the struct layout.
.visible .entry _ZN4raft6linalg6detail10map_kernelILi1ELb0EfmNS_10compose_opIJNS_13plug_const_opIfNS_6mul_opEEENS_7sqrt_opENS_7cast_opIfEEEEEJfEEEvPT1_T2_T3_DpPKT4_(
.param .u64 _ZN4raft6linalg6detail10map_kernelILi1ELb0EfmNS_10compose_opIJNS_13plug_const_opIfNS_6mul_opEEENS_7sqrt_opENS_7cast_opIfEEEEEJfEEEvPT1_T2_T3_DpPKT4__param_0,
.param .u64 _ZN4raft6linalg6detail10map_kernelILi1ELb0EfmNS_10compose_opIJNS_13plug_const_opIfNS_6mul_opEEENS_7sqrt_opENS_7cast_opIfEEEEEJfEEEvPT1_T2_T3_DpPKT4__param_1,
.param .align 4 .b8 _ZN4raft6linalg6detail10map_kernelILi1ELb0EfmNS_10compose_opIJNS_13plug_const_opIfNS_6mul_opEEENS_7sqrt_opENS_7cast_opIfEEEEEJfEEEvPT1_T2_T3_DpPKT4__param_2[12],
.param .u64 _ZN4raft6linalg6detail10map_kernelILi1ELb0EfmNS_10compose_opIJNS_13plug_const_opIfNS_6mul_opEEENS_7sqrt_opENS_7cast_opIfEEEEEJfEEEvPT1_T2_T3_DpPKT4__param_3
)
{
.reg .pred %p<2>;
// The %rs range is declared, but no %rs register is referenced in this body.
.reg .b16 %rs<13>;
.reg .f32 %f<5>;
.reg .b32 %r<5>;
.reg .b64 %rd<10>;
// %rd2 = param_0 (destination), %rd4 = param_1 (bound), %rd3 = param_3 (source).
ld.param.u64 %rd2, [_ZN4raft6linalg6detail10map_kernelILi1ELb0EfmNS_10compose_opIJNS_13plug_const_opIfNS_6mul_opEEENS_7sqrt_opENS_7cast_opIfEEEEEJfEEEvPT1_T2_T3_DpPKT4__param_0];
ld.param.u64 %rd4, [_ZN4raft6linalg6detail10map_kernelILi1ELb0EfmNS_10compose_opIJNS_13plug_const_opIfNS_6mul_opEEENS_7sqrt_opENS_7cast_opIfEEEEEJfEEEvPT1_T2_T3_DpPKT4__param_1];
ld.param.u64 %rd3, [_ZN4raft6linalg6detail10map_kernelILi1ELb0EfmNS_10compose_opIJNS_13plug_const_opIfNS_6mul_opEEENS_7sqrt_opENS_7cast_opIfEEEEEJfEEEvPT1_T2_T3_DpPKT4__param_3];
// %f1 = c, the scalar multiplier read from inside the op object.
ld.param.f32 %f1, [_ZN4raft6linalg6detail10map_kernelILi1ELb0EfmNS_10compose_opIJNS_13plug_const_opIfNS_6mul_opEEENS_7sqrt_opENS_7cast_opIfEEEEEJfEEEvPT1_T2_T3_DpPKT4__param_2+4];
// Thread index computed in 32 bits, then zero-extended to 64 bits.
mov.u32 %r1, %ctaid.x;
mov.u32 %r2, %ntid.x;
mov.u32 %r3, %tid.x;
mad.lo.s32 %r4, %r1, %r2, %r3;
cvt.u64.u32 %rd1, %r4;
// Unsigned bounds check: threads with index >= param_1 do nothing.
setp.ge.u64 %p1, %rd1, %rd4;
@%p1 bra $L__BB145_2;
cvta.to.global.u64 %rd5, %rd3;
shl.b64 %rd6, %rd1, 2;
add.s64 %rd7, %rd5, %rd6;
ld.global.f32 %f2, [%rd7];
// Approximate square root first, then the multiply; both use the
// flush-to-zero (.ftz) variants.
sqrt.approx.ftz.f32 %f3, %f2;
mul.ftz.f32 %f4, %f1, %f3;
cvta.to.global.u64 %rd8, %rd2;
add.s64 %rd9, %rd8, %rd6;
st.global.f32 [%rd9], %f4;
$L__BB145_2:
ret;
}
// Entry symbol demangles to
//   raft::linalg::detail::map_kernel<4, false, float, unsigned long,
//       raft::compose_op<raft::plug_const_op<float, raft::mul_op>,
//                        raft::sqrt_op, raft::cast_op<float>>, float>
//       (float*, unsigned long, compose_op<...>, const float*)
//
// 4-wide kernel computing out[i] = c * sqrt(in[i]) with out = param_0,
// in = param_3, n = param_1 and c = the f32 at byte offset 4 of param_2.
// With gid = ctaid.x * ntid.x + tid.x, each thread runs up to three
// independent, loop-free steps:
//   1. vector step: four elements starting at index head + 4*gid
//   2. head step:   element gid, when gid < head
//   3. tail step:   element n - ((n - head) & 3) + gid, when that is < n
// head = number of 4-byte elements between param_0 and the next 16-byte
// boundary, clamped to n.
// NOTE(review): head is derived from param_0 only; the vector load from
// param_3 is 16-byte aligned only if both pointers share the same offset
// modulo 16 - confirm the launcher guarantees that.
.visible .entry _ZN4raft6linalg6detail10map_kernelILi4ELb0EfmNS_10compose_opIJNS_13plug_const_opIfNS_6mul_opEEENS_7sqrt_opENS_7cast_opIfEEEEEJfEEEvPT1_T2_T3_DpPKT4_(
.param .u64 _ZN4raft6linalg6detail10map_kernelILi4ELb0EfmNS_10compose_opIJNS_13plug_const_opIfNS_6mul_opEEENS_7sqrt_opENS_7cast_opIfEEEEEJfEEEvPT1_T2_T3_DpPKT4__param_0,
.param .u64 _ZN4raft6linalg6detail10map_kernelILi4ELb0EfmNS_10compose_opIJNS_13plug_const_opIfNS_6mul_opEEENS_7sqrt_opENS_7cast_opIfEEEEEJfEEEvPT1_T2_T3_DpPKT4__param_1,
.param .align 4 .b8 _ZN4raft6linalg6detail10map_kernelILi4ELb0EfmNS_10compose_opIJNS_13plug_const_opIfNS_6mul_opEEENS_7sqrt_opENS_7cast_opIfEEEEEJfEEEvPT1_T2_T3_DpPKT4__param_2[12],
.param .u64 _ZN4raft6linalg6detail10map_kernelILi4ELb0EfmNS_10compose_opIJNS_13plug_const_opIfNS_6mul_opEEENS_7sqrt_opENS_7cast_opIfEEEEEJfEEEvPT1_T2_T3_DpPKT4__param_3
)
{
.reg .pred %p<4>;
.reg .f32 %f<20>;
.reg .b32 %r<5>;
.reg .b64 %rd<28>;
// %rd9 = param_0 (destination), %rd7 = param_1 (n), %rd8 = param_3 (source).
ld.param.u64 %rd9, [_ZN4raft6linalg6detail10map_kernelILi4ELb0EfmNS_10compose_opIJNS_13plug_const_opIfNS_6mul_opEEENS_7sqrt_opENS_7cast_opIfEEEEEJfEEEvPT1_T2_T3_DpPKT4__param_0];
ld.param.u64 %rd7, [_ZN4raft6linalg6detail10map_kernelILi4ELb0EfmNS_10compose_opIJNS_13plug_const_opIfNS_6mul_opEEENS_7sqrt_opENS_7cast_opIfEEEEEJfEEEvPT1_T2_T3_DpPKT4__param_1];
ld.param.u64 %rd8, [_ZN4raft6linalg6detail10map_kernelILi4ELb0EfmNS_10compose_opIJNS_13plug_const_opIfNS_6mul_opEEENS_7sqrt_opENS_7cast_opIfEEEEEJfEEEvPT1_T2_T3_DpPKT4__param_3];
// %rd1 / %rd2 = source / destination converted to global-space addresses.
cvta.to.global.u64 %rd1, %rd8;
cvta.to.global.u64 %rd2, %rd9;
// %rd3 = gid, computed in 32 bits and zero-extended.
mov.u32 %r1, %ntid.x;
mov.u32 %r2, %ctaid.x;
mov.u32 %r3, %tid.x;
mad.lo.s32 %r4, %r2, %r1, %r3;
cvt.u64.u32 %rd3, %r4;
// %rd4 = head = min((((param_0 + 15) & ~15) - param_0) / 4, n).
add.s64 %rd10, %rd9, 15;
and.b64 %rd11, %rd10, -16;
sub.s64 %rd12, %rd11, %rd9;
shr.s64 %rd13, %rd12, 2;
min.u64 %rd4, %rd13, %rd7;
// %rd5 = head + 4*gid: first element of this thread's 4-element group.
mul.wide.u32 %rd14, %r4, 4;
add.s64 %rd5, %rd4, %rd14;
// %f1 = c; loaded once and reused by all three steps.
ld.param.f32 %f1, [_ZN4raft6linalg6detail10map_kernelILi4ELb0EfmNS_10compose_opIJNS_13plug_const_opIfNS_6mul_opEEENS_7sqrt_opENS_7cast_opIfEEEEEJfEEEvPT1_T2_T3_DpPKT4__param_2+4];
// Skip the vector step unless the whole group fits: (%rd5 + 4) <= n.
add.s64 %rd15, %rd5, 4;
setp.gt.u64 %p1, %rd15, %rd7;
@%p1 bra $L__BB146_2;
// Vector step. The load address is built from the raw parameter value %rd8,
// not from the cvta'd %rd1, and uses the non-coherent (.nc) load path.
shl.b64 %rd17, %rd5, 2;
add.s64 %rd16, %rd8, %rd17;
ld.global.nc.v4.f32 {%f2,%f3,%f4,%f5}, [%rd16];
sqrt.approx.ftz.f32 %f6, %f2;
sqrt.approx.ftz.f32 %f7, %f3;
sqrt.approx.ftz.f32 %f8, %f4;
sqrt.approx.ftz.f32 %f9, %f5;
// Products are numbered in reverse lane order (%f13 is lane 0, %f10 lane 3);
// the store operand list below restores memory order.
mul.ftz.f32 %f10, %f1, %f9;
mul.ftz.f32 %f11, %f1, %f8;
mul.ftz.f32 %f12, %f1, %f7;
mul.ftz.f32 %f13, %f1, %f6;
add.s64 %rd18, %rd2, %rd17;
st.global.v4.f32 [%rd18], {%f13, %f12, %f11, %f10};
$L__BB146_2:
// Head step: only threads with gid < head process element gid.
setp.le.u64 %p2, %rd4, %rd3;
@%p2 bra $L__BB146_4;
shl.b64 %rd19, %rd3, 2;
add.s64 %rd20, %rd1, %rd19;
ld.global.f32 %f14, [%rd20];
sqrt.approx.ftz.f32 %f15, %f14;
mul.ftz.f32 %f16, %f1, %f15;
add.s64 %rd21, %rd2, %rd19;
st.global.f32 [%rd21], %f16;
$L__BB146_4:
// Tail step: %rd6 = n - ((n - head) & 3) + gid; process it when %rd6 < n.
sub.s64 %rd22, %rd7, %rd4;
and.b64 %rd23, %rd22, 3;
sub.s64 %rd24, %rd7, %rd23;
add.s64 %rd6, %rd24, %rd3;
setp.ge.u64 %p3, %rd6, %rd7;
@%p3 bra $L__BB146_6;
shl.b64 %rd25, %rd6, 2;
add.s64 %rd26, %rd1, %rd25;
ld.global.f32 %f17, [%rd26];
sqrt.approx.ftz.f32 %f18, %f17;
mul.ftz.f32 %f19, %f1, %f18;
add.s64 %rd27, %rd2, %rd25;
st.global.f32 [%rd27], %f19;
$L__BB146_6:
ret;
}
// Entry symbol demangles to
//   raft::linalg::detail::map_kernel<2, false, float, unsigned long,
//       raft::compose_op<raft::plug_const_op<float, raft::mul_op>,
//                        raft::sqrt_op, raft::cast_op<float>>, float>
//       (float*, unsigned long, compose_op<...>, const float*)
//
// 2-wide kernel computing out[i] = c * sqrt(in[i]) with out = param_0,
// in = param_3, n = param_1 and c = the f32 at byte offset 4 of param_2.
// With gid = ctaid.x * ntid.x + tid.x, each thread runs up to three
// independent, loop-free steps:
//   1. vector step: two elements starting at index head + 2*gid
//   2. head step:   element gid, when gid < head
//   3. tail step:   element n - ((n - head) & 1) + gid, when that is < n
// head = number of 4-byte elements between param_0 and the next 8-byte
// boundary, clamped to n.
// NOTE(review): head is derived from param_0 only; the vector load from
// param_3 is 8-byte aligned only if both pointers share the same offset
// modulo 8 - confirm the launcher guarantees that.
.visible .entry _ZN4raft6linalg6detail10map_kernelILi2ELb0EfmNS_10compose_opIJNS_13plug_const_opIfNS_6mul_opEEENS_7sqrt_opENS_7cast_opIfEEEEEJfEEEvPT1_T2_T3_DpPKT4_(
.param .u64 _ZN4raft6linalg6detail10map_kernelILi2ELb0EfmNS_10compose_opIJNS_13plug_const_opIfNS_6mul_opEEENS_7sqrt_opENS_7cast_opIfEEEEEJfEEEvPT1_T2_T3_DpPKT4__param_0,
.param .u64 _ZN4raft6linalg6detail10map_kernelILi2ELb0EfmNS_10compose_opIJNS_13plug_const_opIfNS_6mul_opEEENS_7sqrt_opENS_7cast_opIfEEEEEJfEEEvPT1_T2_T3_DpPKT4__param_1,
.param .align 4 .b8 _ZN4raft6linalg6detail10map_kernelILi2ELb0EfmNS_10compose_opIJNS_13plug_const_opIfNS_6mul_opEEENS_7sqrt_opENS_7cast_opIfEEEEEJfEEEvPT1_T2_T3_DpPKT4__param_2[12],
.param .u64 _ZN4raft6linalg6detail10map_kernelILi2ELb0EfmNS_10compose_opIJNS_13plug_const_opIfNS_6mul_opEEENS_7sqrt_opENS_7cast_opIfEEEEEJfEEEvPT1_T2_T3_DpPKT4__param_3
)
{
.reg .pred %p<4>;
.reg .f32 %f<14>;
.reg .b32 %r<5>;
.reg .b64 %rd<28>;
// %rd9 = param_0 (destination), %rd7 = param_1 (n), %rd8 = param_3 (source).
ld.param.u64 %rd9, [_ZN4raft6linalg6detail10map_kernelILi2ELb0EfmNS_10compose_opIJNS_13plug_const_opIfNS_6mul_opEEENS_7sqrt_opENS_7cast_opIfEEEEEJfEEEvPT1_T2_T3_DpPKT4__param_0];
ld.param.u64 %rd7, [_ZN4raft6linalg6detail10map_kernelILi2ELb0EfmNS_10compose_opIJNS_13plug_const_opIfNS_6mul_opEEENS_7sqrt_opENS_7cast_opIfEEEEEJfEEEvPT1_T2_T3_DpPKT4__param_1];
ld.param.u64 %rd8, [_ZN4raft6linalg6detail10map_kernelILi2ELb0EfmNS_10compose_opIJNS_13plug_const_opIfNS_6mul_opEEENS_7sqrt_opENS_7cast_opIfEEEEEJfEEEvPT1_T2_T3_DpPKT4__param_3];
// %rd1 / %rd2 = source / destination converted to global-space addresses.
cvta.to.global.u64 %rd1, %rd8;
cvta.to.global.u64 %rd2, %rd9;
// %rd3 = gid, computed in 32 bits and zero-extended.
mov.u32 %r1, %ntid.x;
mov.u32 %r2, %ctaid.x;
mov.u32 %r3, %tid.x;
mad.lo.s32 %r4, %r2, %r1, %r3;
cvt.u64.u32 %rd3, %r4;
// %rd4 = head = min((((param_0 + 7) & ~7) - param_0) / 4, n).
add.s64 %rd10, %rd9, 7;
and.b64 %rd11, %rd10, -8;
sub.s64 %rd12, %rd11, %rd9;
shr.s64 %rd13, %rd12, 2;
min.u64 %rd4, %rd13, %rd7;
// %rd5 = head + 2*gid: first element of this thread's 2-element group.
mul.wide.u32 %rd14, %r4, 2;
add.s64 %rd5, %rd4, %rd14;
// %f1 = c; loaded once and reused by all three steps.
ld.param.f32 %f1, [_ZN4raft6linalg6detail10map_kernelILi2ELb0EfmNS_10compose_opIJNS_13plug_const_opIfNS_6mul_opEEENS_7sqrt_opENS_7cast_opIfEEEEEJfEEEvPT1_T2_T3_DpPKT4__param_2+4];
// Skip the vector step unless the whole pair fits: (%rd5 + 2) <= n.
add.s64 %rd15, %rd5, 2;
setp.gt.u64 %p1, %rd15, %rd7;
@%p1 bra $L__BB147_2;
// Vector step. The load address is built from the raw parameter value %rd8,
// not from the cvta'd %rd1, and uses the non-coherent (.nc) load path.
shl.b64 %rd17, %rd5, 2;
add.s64 %rd16, %rd8, %rd17;
ld.global.nc.v2.f32 {%f2,%f3}, [%rd16];
sqrt.approx.ftz.f32 %f4, %f2;
sqrt.approx.ftz.f32 %f5, %f3;
// %f7 is lane 0 and %f6 is lane 1; the store operand list restores memory order.
mul.ftz.f32 %f6, %f1, %f5;
mul.ftz.f32 %f7, %f1, %f4;
add.s64 %rd18, %rd2, %rd17;
st.global.v2.f32 [%rd18], {%f7, %f6};
$L__BB147_2:
// Head step: only threads with gid < head process element gid.
setp.le.u64 %p2, %rd4, %rd3;
@%p2 bra $L__BB147_4;
shl.b64 %rd19, %rd3, 2;
add.s64 %rd20, %rd1, %rd19;
ld.global.f32 %f8, [%rd20];
sqrt.approx.ftz.f32 %f9, %f8;
mul.ftz.f32 %f10, %f1, %f9;
add.s64 %rd21, %rd2, %rd19;
st.global.f32 [%rd21], %f10;
$L__BB147_4:
// Tail step: %rd6 = n - ((n - head) & 1) + gid; process it when %rd6 < n.
sub.s64 %rd22, %rd7, %rd4;
and.b64 %rd23, %rd22, 1;
sub.s64 %rd24, %rd7, %rd23;
add.s64 %rd6, %rd24, %rd3;
setp.ge.u64 %p3, %rd6, %rd7;
@%p3 bra $L__BB147_6;
shl.b64 %rd25, %rd6, 2;
add.s64 %rd26, %rd1, %rd25;
ld.global.f32 %f11, [%rd26];
sqrt.approx.ftz.f32 %f12, %f11;
mul.ftz.f32 %f13, %f1, %f12;
add.s64 %rd27, %rd2, %rd25;
st.global.f32 [%rd27], %f13;
$L__BB147_6:
ret;
}
// Entry symbol demangles to
//   raft::linalg::detail::map_kernel<1, false, float, unsigned long,
//       raft::compose_op<raft::plug_const_op<float, raft::mul_op>,
//                        raft::cast_op<float>>, float>
//       (float*, unsigned long, compose_op<...>, const float*)
//
// Scalar kernel, one element per thread and no loop:
//   i = ctaid.x * ntid.x + tid.x
//   if (i < param_1)  out[i] = c * in[i]
// with out = param_0, in = param_3 (both f32, 4-byte stride) and c = the f32
// stored at byte offset 4 of the 12-byte op object param_2.
// NOTE(review): c is presumably the constant held by plug_const_op - confirm
// against the struct layout.
.visible .entry _ZN4raft6linalg6detail10map_kernelILi1ELb0EfmNS_10compose_opIJNS_13plug_const_opIfNS_6mul_opEEENS_7cast_opIfEEEEEJfEEEvPT1_T2_T3_DpPKT4_(
.param .u64 _ZN4raft6linalg6detail10map_kernelILi1ELb0EfmNS_10compose_opIJNS_13plug_const_opIfNS_6mul_opEEENS_7cast_opIfEEEEEJfEEEvPT1_T2_T3_DpPKT4__param_0,
.param .u64 _ZN4raft6linalg6detail10map_kernelILi1ELb0EfmNS_10compose_opIJNS_13plug_const_opIfNS_6mul_opEEENS_7cast_opIfEEEEEJfEEEvPT1_T2_T3_DpPKT4__param_1,
.param .align 4 .b8 _ZN4raft6linalg6detail10map_kernelILi1ELb0EfmNS_10compose_opIJNS_13plug_const_opIfNS_6mul_opEEENS_7cast_opIfEEEEEJfEEEvPT1_T2_T3_DpPKT4__param_2[12],
.param .u64 _ZN4raft6linalg6detail10map_kernelILi1ELb0EfmNS_10compose_opIJNS_13plug_const_opIfNS_6mul_opEEENS_7cast_opIfEEEEEJfEEEvPT1_T2_T3_DpPKT4__param_3
)
{
.reg .pred %p<2>;
// The %rs range is declared, but no %rs register is referenced in this body.
.reg .b16 %rs<10>;
.reg .f32 %f<4>;
.reg .b32 %r<5>;
.reg .b64 %rd<10>;
// %rd2 = param_0 (destination), %rd4 = param_1 (bound), %rd3 = param_3 (source).
ld.param.u64 %rd2, [_ZN4raft6linalg6detail10map_kernelILi1ELb0EfmNS_10compose_opIJNS_13plug_const_opIfNS_6mul_opEEENS_7cast_opIfEEEEEJfEEEvPT1_T2_T3_DpPKT4__param_0];
ld.param.u64 %rd4, [_ZN4raft6linalg6detail10map_kernelILi1ELb0EfmNS_10compose_opIJNS_13plug_const_opIfNS_6mul_opEEENS_7cast_opIfEEEEEJfEEEvPT1_T2_T3_DpPKT4__param_1];
ld.param.u64 %rd3, [_ZN4raft6linalg6detail10map_kernelILi1ELb0EfmNS_10compose_opIJNS_13plug_const_opIfNS_6mul_opEEENS_7cast_opIfEEEEEJfEEEvPT1_T2_T3_DpPKT4__param_3];
// %f1 = c, the scalar multiplier read from inside the op object.
ld.param.f32 %f1, [_ZN4raft6linalg6detail10map_kernelILi1ELb0EfmNS_10compose_opIJNS_13plug_const_opIfNS_6mul_opEEENS_7cast_opIfEEEEEJfEEEvPT1_T2_T3_DpPKT4__param_2+4];
// Thread index computed in 32 bits, then zero-extended to 64 bits.
mov.u32 %r1, %ctaid.x;
mov.u32 %r2, %ntid.x;
mov.u32 %r3, %tid.x;
mad.lo.s32 %r4, %r1, %r2, %r3;
cvt.u64.u32 %rd1, %r4;
// Unsigned bounds check: threads with index >= param_1 do nothing.
setp.ge.u64 %p1, %rd1, %rd4;
@%p1 bra $L__BB148_2;
cvta.to.global.u64 %rd5, %rd3;
shl.b64 %rd6, %rd1, 2;
add.s64 %rd7, %rd5, %rd6;
ld.global.f32 %f2, [%rd7];
// Flush-to-zero multiply by the constant.
mul.ftz.f32 %f3, %f1, %f2;
cvta.to.global.u64 %rd8, %rd2;
add.s64 %rd9, %rd8, %rd6;
st.global.f32 [%rd9], %f3;
$L__BB148_2:
ret;
}
// Entry symbol demangles to
//   raft::linalg::detail::map_kernel<4, false, float, unsigned long,
//       raft::compose_op<raft::plug_const_op<float, raft::mul_op>,
//                        raft::cast_op<float>>, float>
//       (float*, unsigned long, compose_op<...>, const float*)
//
// 4-wide kernel computing out[i] = c * in[i] with out = param_0, in = param_3,
// n = param_1 and c = the f32 at byte offset 4 of param_2.
// With gid = ctaid.x * ntid.x + tid.x, each thread runs up to three
// independent, loop-free steps:
//   1. vector step: four elements starting at index head + 4*gid
//   2. head step:   element gid, when gid < head
//   3. tail step:   element n - ((n - head) & 3) + gid, when that is < n
// head = number of 4-byte elements between param_0 and the next 16-byte
// boundary, clamped to n.
// NOTE(review): head is derived from param_0 only; the vector load from
// param_3 is 16-byte aligned only if both pointers share the same offset
// modulo 16 - confirm the launcher guarantees that.
.visible .entry _ZN4raft6linalg6detail10map_kernelILi4ELb0EfmNS_10compose_opIJNS_13plug_const_opIfNS_6mul_opEEENS_7cast_opIfEEEEEJfEEEvPT1_T2_T3_DpPKT4_(
.param .u64 _ZN4raft6linalg6detail10map_kernelILi4ELb0EfmNS_10compose_opIJNS_13plug_const_opIfNS_6mul_opEEENS_7cast_opIfEEEEEJfEEEvPT1_T2_T3_DpPKT4__param_0,
.param .u64 _ZN4raft6linalg6detail10map_kernelILi4ELb0EfmNS_10compose_opIJNS_13plug_const_opIfNS_6mul_opEEENS_7cast_opIfEEEEEJfEEEvPT1_T2_T3_DpPKT4__param_1,
.param .align 4 .b8 _ZN4raft6linalg6detail10map_kernelILi4ELb0EfmNS_10compose_opIJNS_13plug_const_opIfNS_6mul_opEEENS_7cast_opIfEEEEEJfEEEvPT1_T2_T3_DpPKT4__param_2[12],
.param .u64 _ZN4raft6linalg6detail10map_kernelILi4ELb0EfmNS_10compose_opIJNS_13plug_const_opIfNS_6mul_opEEENS_7cast_opIfEEEEEJfEEEvPT1_T2_T3_DpPKT4__param_3
)
{
.reg .pred %p<4>;
.reg .f32 %f<14>;
.reg .b32 %r<5>;
.reg .b64 %rd<28>;
// %rd9 = param_0 (destination), %rd7 = param_1 (n), %rd8 = param_3 (source).
ld.param.u64 %rd9, [_ZN4raft6linalg6detail10map_kernelILi4ELb0EfmNS_10compose_opIJNS_13plug_const_opIfNS_6mul_opEEENS_7cast_opIfEEEEEJfEEEvPT1_T2_T3_DpPKT4__param_0];
ld.param.u64 %rd7, [_ZN4raft6linalg6detail10map_kernelILi4ELb0EfmNS_10compose_opIJNS_13plug_const_opIfNS_6mul_opEEENS_7cast_opIfEEEEEJfEEEvPT1_T2_T3_DpPKT4__param_1];
ld.param.u64 %rd8, [_ZN4raft6linalg6detail10map_kernelILi4ELb0EfmNS_10compose_opIJNS_13plug_const_opIfNS_6mul_opEEENS_7cast_opIfEEEEEJfEEEvPT1_T2_T3_DpPKT4__param_3];
// %rd1 / %rd2 = source / destination converted to global-space addresses.
cvta.to.global.u64 %rd1, %rd8;
cvta.to.global.u64 %rd2, %rd9;
// %rd3 = gid, computed in 32 bits and zero-extended.
mov.u32 %r1, %ntid.x;
mov.u32 %r2, %ctaid.x;
mov.u32 %r3, %tid.x;
mad.lo.s32 %r4, %r2, %r1, %r3;
cvt.u64.u32 %rd3, %r4;
// %rd4 = head = min((((param_0 + 15) & ~15) - param_0) / 4, n).
add.s64 %rd10, %rd9, 15;
and.b64 %rd11, %rd10, -16;
sub.s64 %rd12, %rd11, %rd9;
shr.s64 %rd13, %rd12, 2;
min.u64 %rd4, %rd13, %rd7;
// %rd5 = head + 4*gid: first element of this thread's 4-element group.
mul.wide.u32 %rd14, %r4, 4;
add.s64 %rd5, %rd4, %rd14;
// %f1 = c; loaded once and reused by all three steps.
ld.param.f32 %f1, [_ZN4raft6linalg6detail10map_kernelILi4ELb0EfmNS_10compose_opIJNS_13plug_const_opIfNS_6mul_opEEENS_7cast_opIfEEEEEJfEEEvPT1_T2_T3_DpPKT4__param_2+4];
// Skip the vector step unless the whole group fits: (%rd5 + 4) <= n.
add.s64 %rd15, %rd5, 4;
setp.gt.u64 %p1, %rd15, %rd7;
@%p1 bra $L__BB149_2;
// Vector step. The load address is built from the raw parameter value %rd8,
// not from the cvta'd %rd1, and uses the non-coherent (.nc) load path.
shl.b64 %rd17, %rd5, 2;
add.s64 %rd16, %rd8, %rd17;
ld.global.nc.v4.f32 {%f2,%f3,%f4,%f5}, [%rd16];
// Products are numbered in reverse lane order (%f9 is lane 0, %f6 lane 3);
// the store operand list below restores memory order.
mul.ftz.f32 %f6, %f1, %f5;
mul.ftz.f32 %f7, %f1, %f4;
mul.ftz.f32 %f8, %f1, %f3;
mul.ftz.f32 %f9, %f1, %f2;
add.s64 %rd18, %rd2, %rd17;
st.global.v4.f32 [%rd18], {%f9, %f8, %f7, %f6};
$L__BB149_2:
// Head step: only threads with gid < head process element gid.
setp.le.u64 %p2, %rd4, %rd3;
@%p2 bra $L__BB149_4;
shl.b64 %rd19, %rd3, 2;
add.s64 %rd20, %rd1, %rd19;
ld.global.f32 %f10, [%rd20];
mul.ftz.f32 %f11, %f1, %f10;
add.s64 %rd21, %rd2, %rd19;
st.global.f32 [%rd21], %f11;
$L__BB149_4:
// Tail step: %rd6 = n - ((n - head) & 3) + gid; process it when %rd6 < n.
sub.s64 %rd22, %rd7, %rd4;
and.b64 %rd23, %rd22, 3;
sub.s64 %rd24, %rd7, %rd23;
add.s64 %rd6, %rd24, %rd3;
setp.ge.u64 %p3, %rd6, %rd7;
@%p3 bra $L__BB149_6;
shl.b64 %rd25, %rd6, 2;
add.s64 %rd26, %rd1, %rd25;
ld.global.f32 %f12, [%rd26];
mul.ftz.f32 %f13, %f1, %f12;
add.s64 %rd27, %rd2, %rd25;
st.global.f32 [%rd27], %f13;
$L__BB149_6:
ret;
}
// Entry symbol demangles to
//   raft::linalg::detail::map_kernel<2, false, float, unsigned long,
//       raft::compose_op<raft::plug_const_op<float, raft::mul_op>,
//                        raft::cast_op<float>>, float>
//       (float*, unsigned long, compose_op<...>, const float*)
//
// 2-wide kernel computing out[i] = c * in[i] with out = param_0, in = param_3,
// n = param_1 and c = the f32 at byte offset 4 of param_2.
// With gid = ctaid.x * ntid.x + tid.x, each thread runs up to three
// independent, loop-free steps:
//   1. vector step: two elements starting at index head + 2*gid
//   2. head step:   element gid, when gid < head
//   3. tail step:   element n - ((n - head) & 1) + gid, when that is < n
// head = number of 4-byte elements between param_0 and the next 8-byte
// boundary, clamped to n.
// NOTE(review): head is derived from param_0 only; the vector load from
// param_3 is 8-byte aligned only if both pointers share the same offset
// modulo 8 - confirm the launcher guarantees that.
.visible .entry _ZN4raft6linalg6detail10map_kernelILi2ELb0EfmNS_10compose_opIJNS_13plug_const_opIfNS_6mul_opEEENS_7cast_opIfEEEEEJfEEEvPT1_T2_T3_DpPKT4_(
.param .u64 _ZN4raft6linalg6detail10map_kernelILi2ELb0EfmNS_10compose_opIJNS_13plug_const_opIfNS_6mul_opEEENS_7cast_opIfEEEEEJfEEEvPT1_T2_T3_DpPKT4__param_0,
.param .u64 _ZN4raft6linalg6detail10map_kernelILi2ELb0EfmNS_10compose_opIJNS_13plug_const_opIfNS_6mul_opEEENS_7cast_opIfEEEEEJfEEEvPT1_T2_T3_DpPKT4__param_1,
.param .align 4 .b8 _ZN4raft6linalg6detail10map_kernelILi2ELb0EfmNS_10compose_opIJNS_13plug_const_opIfNS_6mul_opEEENS_7cast_opIfEEEEEJfEEEvPT1_T2_T3_DpPKT4__param_2[12],
.param .u64 _ZN4raft6linalg6detail10map_kernelILi2ELb0EfmNS_10compose_opIJNS_13plug_const_opIfNS_6mul_opEEENS_7cast_opIfEEEEEJfEEEvPT1_T2_T3_DpPKT4__param_3
)
{
.reg .pred %p<4>;
.reg .f32 %f<10>;
.reg .b32 %r<5>;
.reg .b64 %rd<28>;
// %rd9 = param_0 (destination), %rd7 = param_1 (n), %rd8 = param_3 (source).
ld.param.u64 %rd9, [_ZN4raft6linalg6detail10map_kernelILi2ELb0EfmNS_10compose_opIJNS_13plug_const_opIfNS_6mul_opEEENS_7cast_opIfEEEEEJfEEEvPT1_T2_T3_DpPKT4__param_0];
ld.param.u64 %rd7, [_ZN4raft6linalg6detail10map_kernelILi2ELb0EfmNS_10compose_opIJNS_13plug_const_opIfNS_6mul_opEEENS_7cast_opIfEEEEEJfEEEvPT1_T2_T3_DpPKT4__param_1];
ld.param.u64 %rd8, [_ZN4raft6linalg6detail10map_kernelILi2ELb0EfmNS_10compose_opIJNS_13plug_const_opIfNS_6mul_opEEENS_7cast_opIfEEEEEJfEEEvPT1_T2_T3_DpPKT4__param_3];
// %rd1 / %rd2 = source / destination converted to global-space addresses.
cvta.to.global.u64 %rd1, %rd8;
cvta.to.global.u64 %rd2, %rd9;
// %rd3 = gid, computed in 32 bits and zero-extended.
mov.u32 %r1, %ntid.x;
mov.u32 %r2, %ctaid.x;
mov.u32 %r3, %tid.x;
mad.lo.s32 %r4, %r2, %r1, %r3;
cvt.u64.u32 %rd3, %r4;
// %rd4 = head = min((((param_0 + 7) & ~7) - param_0) / 4, n).
add.s64 %rd10, %rd9, 7;
and.b64 %rd11, %rd10, -8;
sub.s64 %rd12, %rd11, %rd9;
shr.s64 %rd13, %rd12, 2;
min.u64 %rd4, %rd13, %rd7;
// %rd5 = head + 2*gid: first element of this thread's 2-element group.
mul.wide.u32 %rd14, %r4, 2;
add.s64 %rd5, %rd4, %rd14;
// %f1 = c; loaded once and reused by all three steps.
ld.param.f32 %f1, [_ZN4raft6linalg6detail10map_kernelILi2ELb0EfmNS_10compose_opIJNS_13plug_const_opIfNS_6mul_opEEENS_7cast_opIfEEEEEJfEEEvPT1_T2_T3_DpPKT4__param_2+4];
// Skip the vector step unless the whole pair fits: (%rd5 + 2) <= n.
add.s64 %rd15, %rd5, 2;
setp.gt.u64 %p1, %rd15, %rd7;
@%p1 bra $L__BB150_2;
// Vector step. The load address is built from the raw parameter value %rd8,
// not from the cvta'd %rd1, and uses the non-coherent (.nc) load path.
shl.b64 %rd17, %rd5, 2;
add.s64 %rd16, %rd8, %rd17;
ld.global.nc.v2.f32 {%f2,%f3}, [%rd16];
// %f5 is lane 0 and %f4 is lane 1; the store operand list restores memory order.
mul.ftz.f32 %f4, %f1, %f3;
mul.ftz.f32 %f5, %f1, %f2;
add.s64 %rd18, %rd2, %rd17;
st.global.v2.f32 [%rd18], {%f5, %f4};
$L__BB150_2:
// Head step: only threads with gid < head process element gid.
setp.le.u64 %p2, %rd4, %rd3;
@%p2 bra $L__BB150_4;
shl.b64 %rd19, %rd3, 2;
add.s64 %rd20, %rd1, %rd19;
ld.global.f32 %f6, [%rd20];
mul.ftz.f32 %f7, %f1, %f6;
add.s64 %rd21, %rd2, %rd19;
st.global.f32 [%rd21], %f7;
$L__BB150_4:
// Tail step: %rd6 = n - ((n - head) & 1) + gid; process it when %rd6 < n.
sub.s64 %rd22, %rd7, %rd4;
and.b64 %rd23, %rd22, 1;
sub.s64 %rd24, %rd7, %rd23;
add.s64 %rd6, %rd24, %rd3;
setp.ge.u64 %p3, %rd6, %rd7;
@%p3 bra $L__BB150_6;
shl.b64 %rd25, %rd6, 2;
add.s64 %rd26, %rd1, %rd25;
ld.global.f32 %f8, [%rd26];
mul.ftz.f32 %f9, %f1, %f8;
add.s64 %rd27, %rd2, %rd25;
st.global.f32 [%rd27], %f9;
$L__BB150_6:
ret;
}
// Entry symbol demangles to
//   raft::linalg::detail::map_kernel<1, false, float, unsigned long,
//                                    raft::cast_op<float>, __half>
//       (float*, unsigned long, raft::cast_op<float>, const __half*)
//
// Scalar half -> float conversion kernel, one element per thread and no loop:
//   i = ctaid.x * ntid.x + tid.x
//   if (i < param_1)  f32 at (param_0 + 4*i) = (float) f16 at (param_3 + 2*i)
// param_2 (the 1-byte op object) is never read in this body.
.visible .entry _ZN4raft6linalg6detail10map_kernelILi1ELb0EfmNS_7cast_opIfEEJ6__halfEEEvPT1_T2_T3_DpPKT4_(
.param .u64 _ZN4raft6linalg6detail10map_kernelILi1ELb0EfmNS_7cast_opIfEEJ6__halfEEEvPT1_T2_T3_DpPKT4__param_0,
.param .u64 _ZN4raft6linalg6detail10map_kernelILi1ELb0EfmNS_7cast_opIfEEJ6__halfEEEvPT1_T2_T3_DpPKT4__param_1,
.param .align 1 .b8 _ZN4raft6linalg6detail10map_kernelILi1ELb0EfmNS_7cast_opIfEEJ6__halfEEEvPT1_T2_T3_DpPKT4__param_2[1],
.param .u64 _ZN4raft6linalg6detail10map_kernelILi1ELb0EfmNS_7cast_opIfEEJ6__halfEEEvPT1_T2_T3_DpPKT4__param_3
)
{
.reg .pred %p<2>;
.reg .b16 %rs<2>;
.reg .f32 %f<2>;
.reg .b32 %r<5>;
.reg .b64 %rd<11>;
// %rd2 = param_0 (destination), %rd4 = param_1 (bound), %rd3 = param_3 (source).
ld.param.u64 %rd2, [_ZN4raft6linalg6detail10map_kernelILi1ELb0EfmNS_7cast_opIfEEJ6__halfEEEvPT1_T2_T3_DpPKT4__param_0];
ld.param.u64 %rd4, [_ZN4raft6linalg6detail10map_kernelILi1ELb0EfmNS_7cast_opIfEEJ6__halfEEEvPT1_T2_T3_DpPKT4__param_1];
ld.param.u64 %rd3, [_ZN4raft6linalg6detail10map_kernelILi1ELb0EfmNS_7cast_opIfEEJ6__halfEEEvPT1_T2_T3_DpPKT4__param_3];
// Thread index computed in 32 bits, then zero-extended to 64 bits.
mov.u32 %r1, %ctaid.x;
mov.u32 %r2, %ntid.x;
mov.u32 %r3, %tid.x;
mad.lo.s32 %r4, %r1, %r2, %r3;
cvt.u64.u32 %rd1, %r4;
// Unsigned bounds check: threads with index >= param_1 do nothing.
setp.ge.u64 %p1, %rd1, %rd4;
@%p1 bra $L__BB151_2;
// Source stride is 2 bytes (index << 1): the raw 16-bit pattern is loaded
// into %rs1 and then converted f16 -> f32.
cvta.to.global.u64 %rd5, %rd3;
shl.b64 %rd6, %rd1, 1;
add.s64 %rd7, %rd5, %rd6;
ld.global.u16 %rs1, [%rd7];
{ cvt.f32.f16 %f1, %rs1;}
// Destination stride is 4 bytes (index << 2).
cvta.to.global.u64 %rd8, %rd2;
shl.b64 %rd9, %rd1, 2;
add.s64 %rd10, %rd8, %rd9;
st.global.f32 [%rd10], %f1;
$L__BB151_2:
ret;
}
// Entry symbol demangles to
//   raft::linalg::detail::map_kernel<4, false, float, unsigned long,
//                                    raft::cast_op<float>, __half>
//       (float*, unsigned long, raft::cast_op<float>, const __half*)
//
// 4-wide half -> float conversion kernel: out[i] = (float) in[i] with
// out = param_0 (f32, 4-byte stride), in = param_3 (f16, 2-byte stride) and
// n = param_1. param_2 is never read.
// With gid = ctaid.x * ntid.x + tid.x, each thread runs up to three
// independent, loop-free steps:
//   1. vector step: four elements starting at index head + 4*gid
//      (one 8-byte load, one 16-byte store)
//   2. head step:   element gid, when gid < head
//   3. tail step:   element n - ((n - head) & 3) + gid, when that is < n
// head = number of 4-byte output elements between param_0 and the next
// 16-byte boundary, clamped to n.
// NOTE(review): head is derived from the output pointer only; the 8-byte
// vector load at param_3 + 2*(head + 4*gid) is aligned only if the input
// pointer is suitably offset - confirm the launcher guarantees that.
.visible .entry _ZN4raft6linalg6detail10map_kernelILi4ELb0EfmNS_7cast_opIfEEJ6__halfEEEvPT1_T2_T3_DpPKT4_(
.param .u64 _ZN4raft6linalg6detail10map_kernelILi4ELb0EfmNS_7cast_opIfEEJ6__halfEEEvPT1_T2_T3_DpPKT4__param_0,
.param .u64 _ZN4raft6linalg6detail10map_kernelILi4ELb0EfmNS_7cast_opIfEEJ6__halfEEEvPT1_T2_T3_DpPKT4__param_1,
.param .align 1 .b8 _ZN4raft6linalg6detail10map_kernelILi4ELb0EfmNS_7cast_opIfEEJ6__halfEEEvPT1_T2_T3_DpPKT4__param_2[1],
.param .u64 _ZN4raft6linalg6detail10map_kernelILi4ELb0EfmNS_7cast_opIfEEJ6__halfEEEvPT1_T2_T3_DpPKT4__param_3
)
{
.reg .pred %p<4>;
.reg .b16 %rs<7>;
.reg .f32 %f<7>;
.reg .b32 %r<7>;
.reg .b64 %rd<31>;
// %rd9 = param_0 (destination), %rd7 = param_1 (n), %rd8 = param_3 (source).
ld.param.u64 %rd9, [_ZN4raft6linalg6detail10map_kernelILi4ELb0EfmNS_7cast_opIfEEJ6__halfEEEvPT1_T2_T3_DpPKT4__param_0];
ld.param.u64 %rd7, [_ZN4raft6linalg6detail10map_kernelILi4ELb0EfmNS_7cast_opIfEEJ6__halfEEEvPT1_T2_T3_DpPKT4__param_1];
ld.param.u64 %rd8, [_ZN4raft6linalg6detail10map_kernelILi4ELb0EfmNS_7cast_opIfEEJ6__halfEEEvPT1_T2_T3_DpPKT4__param_3];
// %rd1 / %rd2 = source / destination converted to global-space addresses.
cvta.to.global.u64 %rd1, %rd8;
cvta.to.global.u64 %rd2, %rd9;
// %rd3 = gid, computed in 32 bits and zero-extended.
mov.u32 %r1, %ntid.x;
mov.u32 %r2, %ctaid.x;
mov.u32 %r3, %tid.x;
mad.lo.s32 %r4, %r2, %r1, %r3;
cvt.u64.u32 %rd3, %r4;
// %rd4 = head = min((((param_0 + 15) & ~15) - param_0) / 4, n).
add.s64 %rd10, %rd9, 15;
and.b64 %rd11, %rd10, -16;
sub.s64 %rd12, %rd11, %rd9;
shr.s64 %rd13, %rd12, 2;
min.u64 %rd4, %rd13, %rd7;
// %rd5 = head + 4*gid: first element of this thread's 4-element group.
mul.wide.u32 %rd14, %r4, 4;
add.s64 %rd5, %rd4, %rd14;
// Skip the vector step unless the whole group fits: (%rd5 + 4) <= n.
add.s64 %rd15, %rd5, 4;
setp.gt.u64 %p1, %rd15, %rd7;
@%p1 bra $L__BB152_2;
// Vector step. Four f16 values arrive as two 32-bit words; the address is
// built from the raw parameter value %rd8 (not the cvta'd %rd1) and the load
// uses the non-coherent (.nc) path.
shl.b64 %rd17, %rd5, 1;
add.s64 %rd16, %rd8, %rd17;
ld.global.nc.v2.u32 {%r5,%r6}, [%rd16];
// Split each word into its low / high 16-bit halves: %rs1..%rs4 are the four
// source elements in memory order.
mov.b32 {%rs1, %rs2}, %r5;
mov.b32 {%rs3, %rs4}, %r6;
{ cvt.f32.f16 %f4, %rs4;}
{ cvt.f32.f16 %f3, %rs3;}
{ cvt.f32.f16 %f2, %rs2;}
{ cvt.f32.f16 %f1, %rs1;}
shl.b64 %rd18, %rd5, 2;
add.s64 %rd19, %rd2, %rd18;
st.global.v4.f32 [%rd19], {%f1, %f2, %f3, %f4};
$L__BB152_2:
// Head step: only threads with gid < head convert element gid.
setp.le.u64 %p2, %rd4, %rd3;
@%p2 bra $L__BB152_4;
shl.b64 %rd20, %rd3, 1;
add.s64 %rd21, %rd1, %rd20;
ld.global.u16 %rs5, [%rd21];
{ cvt.f32.f16 %f5, %rs5;}
shl.b64 %rd22, %rd3, 2;
add.s64 %rd23, %rd2, %rd22;
st.global.f32 [%rd23], %f5;
$L__BB152_4:
// Tail step: %rd6 = n - ((n - head) & 3) + gid; convert it when %rd6 < n.
sub.s64 %rd24, %rd7, %rd4;
and.b64 %rd25, %rd24, 3;
sub.s64 %rd26, %rd7, %rd25;
add.s64 %rd6, %rd26, %rd3;
setp.ge.u64 %p3, %rd6, %rd7;
@%p3 bra $L__BB152_6;
shl.b64 %rd27, %rd6, 1;
add.s64 %rd28, %rd1, %rd27;
ld.global.u16 %rs6, [%rd28];
{ cvt.f32.f16 %f6, %rs6;}
shl.b64 %rd29, %rd6, 2;
add.s64 %rd30, %rd2, %rd29;
st.global.f32 [%rd30], %f6;
$L__BB152_6:
ret;
}
// Entry symbol demangles to
//   raft::linalg::detail::map_kernel<2, false, float, unsigned long,
//                                    raft::cast_op<float>, __half>
//       (float*, unsigned long, raft::cast_op<float>, const __half*)
//
// 2-wide half -> float conversion kernel: out[i] = (float) in[i] with
// out = param_0 (f32, 4-byte stride), in = param_3 (f16, 2-byte stride) and
// n = param_1. param_2 is never read.
// With gid = ctaid.x * ntid.x + tid.x, each thread runs up to three
// independent, loop-free steps:
//   1. vector step: two elements starting at index head + 2*gid
//      (one 4-byte load, one 8-byte store)
//   2. head step:   element gid, when gid < head
//   3. tail step:   element n - ((n - head) & 1) + gid, when that is < n
// head = number of 4-byte output elements between param_0 and the next
// 8-byte boundary, clamped to n.
// NOTE(review): head is derived from the output pointer only; the 4-byte
// vector load at param_3 + 2*(head + 2*gid) is aligned only if the input
// pointer is suitably offset - confirm the launcher guarantees that.
.visible .entry _ZN4raft6linalg6detail10map_kernelILi2ELb0EfmNS_7cast_opIfEEJ6__halfEEEvPT1_T2_T3_DpPKT4_(
.param .u64 _ZN4raft6linalg6detail10map_kernelILi2ELb0EfmNS_7cast_opIfEEJ6__halfEEEvPT1_T2_T3_DpPKT4__param_0,
.param .u64 _ZN4raft6linalg6detail10map_kernelILi2ELb0EfmNS_7cast_opIfEEJ6__halfEEEvPT1_T2_T3_DpPKT4__param_1,
.param .align 1 .b8 _ZN4raft6linalg6detail10map_kernelILi2ELb0EfmNS_7cast_opIfEEJ6__halfEEEvPT1_T2_T3_DpPKT4__param_2[1],
.param .u64 _ZN4raft6linalg6detail10map_kernelILi2ELb0EfmNS_7cast_opIfEEJ6__halfEEEvPT1_T2_T3_DpPKT4__param_3
)
{
.reg .pred %p<4>;
.reg .b16 %rs<5>;
.reg .f32 %f<5>;
.reg .b32 %r<6>;
.reg .b64 %rd<31>;
// %rd9 = param_0 (destination), %rd7 = param_1 (n), %rd8 = param_3 (source).
ld.param.u64 %rd9, [_ZN4raft6linalg6detail10map_kernelILi2ELb0EfmNS_7cast_opIfEEJ6__halfEEEvPT1_T2_T3_DpPKT4__param_0];
ld.param.u64 %rd7, [_ZN4raft6linalg6detail10map_kernelILi2ELb0EfmNS_7cast_opIfEEJ6__halfEEEvPT1_T2_T3_DpPKT4__param_1];
ld.param.u64 %rd8, [_ZN4raft6linalg6detail10map_kernelILi2ELb0EfmNS_7cast_opIfEEJ6__halfEEEvPT1_T2_T3_DpPKT4__param_3];
// %rd1 / %rd2 = source / destination converted to global-space addresses.
cvta.to.global.u64 %rd1, %rd8;
cvta.to.global.u64 %rd2, %rd9;
// %rd3 = gid, computed in 32 bits and zero-extended.
mov.u32 %r1, %ntid.x;
mov.u32 %r2, %ctaid.x;
mov.u32 %r3, %tid.x;
mad.lo.s32 %r4, %r2, %r1, %r3;
cvt.u64.u32 %rd3, %r4;
// %rd4 = head = min((((param_0 + 7) & ~7) - param_0) / 4, n).
add.s64 %rd10, %rd9, 7;
and.b64 %rd11, %rd10, -8;
sub.s64 %rd12, %rd11, %rd9;
shr.s64 %rd13, %rd12, 2;
min.u64 %rd4, %rd13, %rd7;
// %rd5 = head + 2*gid: first element of this thread's 2-element group.
mul.wide.u32 %rd14, %r4, 2;
add.s64 %rd5, %rd4, %rd14;
// Skip the vector step unless the whole pair fits: (%rd5 + 2) <= n.
add.s64 %rd15, %rd5, 2;
setp.gt.u64 %p1, %rd15, %rd7;
@%p1 bra $L__BB153_2;
// Vector step. Two f16 values arrive as one 32-bit word; the address is
// built from the raw parameter value %rd8 (not the cvta'd %rd1) and the load
// uses the non-coherent (.nc) path.
shl.b64 %rd17, %rd5, 1;
add.s64 %rd16, %rd8, %rd17;
ld.global.nc.b32 %r5, [%rd16];
// Split the word into its low (%rs1) and high (%rs2) 16-bit halves.
mov.b32 {%rs1, %rs2}, %r5;
{ cvt.f32.f16 %f2, %rs2;}
{ cvt.f32.f16 %f1, %rs1;}
shl.b64 %rd18, %rd5, 2;
add.s64 %rd19, %rd2, %rd18;
st.global.v2.f32 [%rd19], {%f1, %f2};
$L__BB153_2:
// Head step: only threads with gid < head convert element gid.
setp.le.u64 %p2, %rd4, %rd3;
@%p2 bra $L__BB153_4;
shl.b64 %rd20, %rd3, 1;
add.s64 %rd21, %rd1, %rd20;
ld.global.u16 %rs3, [%rd21];
{ cvt.f32.f16 %f3, %rs3;}
shl.b64 %rd22, %rd3, 2;
add.s64 %rd23, %rd2, %rd22;
st.global.f32 [%rd23], %f3;
$L__BB153_4:
// Tail step: %rd6 = n - ((n - head) & 1) + gid; convert it when %rd6 < n.
sub.s64 %rd24, %rd7, %rd4;
and.b64 %rd25, %rd24, 1;
sub.s64 %rd26, %rd7, %rd25;
add.s64 %rd6, %rd26, %rd3;
setp.ge.u64 %p3, %rd6, %rd7;
@%p3 bra $L__BB153_6;
shl.b64 %rd27, %rd6, 1;
add.s64 %rd28, %rd1, %rd27;
ld.global.u16 %rs4, [%rd28];
{ cvt.f32.f16 %f4, %rs4;}
shl.b64 %rd29, %rd6, 2;
add.s64 %rd30, %rd2, %rd29;
st.global.f32 [%rd30], %f4;
$L__BB153_6:
ret;
}
// Entry symbol demangles to
//   raft::linalg::detail::map_kernel<1, false, float, unsigned long,
//       raft::compose_op<raft::plug_const_op<float, raft::mul_op>,
//                        raft::sqrt_op, raft::cast_op<float>>, __half>
//       (float*, unsigned long, compose_op<...>, const __half*)
//
// Scalar kernel, one element per thread and no loop:
//   i = ctaid.x * ntid.x + tid.x
//   if (i < param_1)  out[i] = c * sqrt((float) in[i])
// with out = param_0 (f32, 4-byte stride), in = param_3 (f16, 2-byte stride)
// and c = the f32 stored at byte offset 4 of the 12-byte op object param_2.
// NOTE(review): c is presumably the constant held by plug_const_op - confirm
// against the struct layout.
.visible .entry _ZN4raft6linalg6detail10map_kernelILi1ELb0EfmNS_10compose_opIJNS_13plug_const_opIfNS_6mul_opEEENS_7sqrt_opENS_7cast_opIfEEEEEJ6__halfEEEvPT1_T2_T3_DpPKT4_(
.param .u64 _ZN4raft6linalg6detail10map_kernelILi1ELb0EfmNS_10compose_opIJNS_13plug_const_opIfNS_6mul_opEEENS_7sqrt_opENS_7cast_opIfEEEEEJ6__halfEEEvPT1_T2_T3_DpPKT4__param_0,
.param .u64 _ZN4raft6linalg6detail10map_kernelILi1ELb0EfmNS_10compose_opIJNS_13plug_const_opIfNS_6mul_opEEENS_7sqrt_opENS_7cast_opIfEEEEEJ6__halfEEEvPT1_T2_T3_DpPKT4__param_1,
.param .align 4 .b8 _ZN4raft6linalg6detail10map_kernelILi1ELb0EfmNS_10compose_opIJNS_13plug_const_opIfNS_6mul_opEEENS_7sqrt_opENS_7cast_opIfEEEEEJ6__halfEEEvPT1_T2_T3_DpPKT4__param_2[12],
.param .u64 _ZN4raft6linalg6detail10map_kernelILi1ELb0EfmNS_10compose_opIJNS_13plug_const_opIfNS_6mul_opEEENS_7sqrt_opENS_7cast_opIfEEEEEJ6__halfEEEvPT1_T2_T3_DpPKT4__param_3
)
{
.reg .pred %p<2>;
// Of the declared %rs range, only %rs13 is referenced in this body.
.reg .b16 %rs<14>;
.reg .f32 %f<5>;
.reg .b32 %r<5>;
.reg .b64 %rd<11>;
// %rd2 = param_0 (destination), %rd4 = param_1 (bound), %rd3 = param_3 (source).
ld.param.u64 %rd2, [_ZN4raft6linalg6detail10map_kernelILi1ELb0EfmNS_10compose_opIJNS_13plug_const_opIfNS_6mul_opEEENS_7sqrt_opENS_7cast_opIfEEEEEJ6__halfEEEvPT1_T2_T3_DpPKT4__param_0];
ld.param.u64 %rd4, [_ZN4raft6linalg6detail10map_kernelILi1ELb0EfmNS_10compose_opIJNS_13plug_const_opIfNS_6mul_opEEENS_7sqrt_opENS_7cast_opIfEEEEEJ6__halfEEEvPT1_T2_T3_DpPKT4__param_1];
ld.param.u64 %rd3, [_ZN4raft6linalg6detail10map_kernelILi1ELb0EfmNS_10compose_opIJNS_13plug_const_opIfNS_6mul_opEEENS_7sqrt_opENS_7cast_opIfEEEEEJ6__halfEEEvPT1_T2_T3_DpPKT4__param_3];
// %f1 = c, the scalar multiplier read from inside the op object.
ld.param.f32 %f1, [_ZN4raft6linalg6detail10map_kernelILi1ELb0EfmNS_10compose_opIJNS_13plug_const_opIfNS_6mul_opEEENS_7sqrt_opENS_7cast_opIfEEEEEJ6__halfEEEvPT1_T2_T3_DpPKT4__param_2+4];
// Thread index computed in 32 bits, then zero-extended to 64 bits.
mov.u32 %r1, %ctaid.x;
mov.u32 %r2, %ntid.x;
mov.u32 %r3, %tid.x;
mad.lo.s32 %r4, %r1, %r2, %r3;
cvt.u64.u32 %rd1, %r4;
// Unsigned bounds check: threads with index >= param_1 do nothing.
setp.ge.u64 %p1, %rd1, %rd4;
@%p1 bra $L__BB154_2;
// Source stride is 2 bytes (index << 1); load the raw f16 bits and widen.
cvta.to.global.u64 %rd5, %rd3;
shl.b64 %rd6, %rd1, 1;
add.s64 %rd7, %rd5, %rd6;
ld.global.u16 %rs13, [%rd7];
{ cvt.f32.f16 %f2, %rs13;}
// Conversion first, then approximate sqrt, then the multiply; sqrt and mul
// use the flush-to-zero (.ftz) variants.
sqrt.approx.ftz.f32 %f3, %f2;
mul.ftz.f32 %f4, %f1, %f3;
// Destination stride is 4 bytes (index << 2).
cvta.to.global.u64 %rd8, %rd2;
shl.b64 %rd9, %rd1, 2;
add.s64 %rd10, %rd8, %rd9;
st.global.f32 [%rd10], %f4;
$L__BB154_2:
ret;
}
// raft::linalg::detail::map_kernel<4, false, float, unsigned long,
//     compose_op<plug_const_op<float, mul_op>, sqrt_op, cast_op<float>>, __half>
// (template arguments read off the mangled entry name).
//
// Per element: out[i] = %f1 * sqrt.approx.ftz(f32(in[i])).
//   param_0 : output address, written as f32 elements
//   param_1 : element count (u64)
//   param_2 : 12-byte op object, only the f32 at byte offset 4 is read (%f1)
//   param_3 : input address, read as 16-bit elements and widened with cvt.f32.f16
// There is no loop. Each thread does at most three things:
//   1. one 4-element vector chunk starting at the first 16-byte-aligned output slot,
//   2. one single element in front of that aligned slot,
//   3. one single element after the last whole 4-element chunk.
.visible .entry _ZN4raft6linalg6detail10map_kernelILi4ELb0EfmNS_10compose_opIJNS_13plug_const_opIfNS_6mul_opEEENS_7sqrt_opENS_7cast_opIfEEEEEJ6__halfEEEvPT1_T2_T3_DpPKT4_(
.param .u64 _ZN4raft6linalg6detail10map_kernelILi4ELb0EfmNS_10compose_opIJNS_13plug_const_opIfNS_6mul_opEEENS_7sqrt_opENS_7cast_opIfEEEEEJ6__halfEEEvPT1_T2_T3_DpPKT4__param_0,
.param .u64 _ZN4raft6linalg6detail10map_kernelILi4ELb0EfmNS_10compose_opIJNS_13plug_const_opIfNS_6mul_opEEENS_7sqrt_opENS_7cast_opIfEEEEEJ6__halfEEEvPT1_T2_T3_DpPKT4__param_1,
.param .align 4 .b8 _ZN4raft6linalg6detail10map_kernelILi4ELb0EfmNS_10compose_opIJNS_13plug_const_opIfNS_6mul_opEEENS_7sqrt_opENS_7cast_opIfEEEEEJ6__halfEEEvPT1_T2_T3_DpPKT4__param_2[12],
.param .u64 _ZN4raft6linalg6detail10map_kernelILi4ELb0EfmNS_10compose_opIJNS_13plug_const_opIfNS_6mul_opEEENS_7sqrt_opENS_7cast_opIfEEEEEJ6__halfEEEvPT1_T2_T3_DpPKT4__param_3
)
{
.reg .pred %p<4>;
.reg .b16 %rs<7>;
.reg .f32 %f<20>;
.reg .b32 %r<7>;
.reg .b64 %rd<31>;
// %rd9 = output address, %rd7 = element count, %rd8 = input address.
ld.param.u64 %rd9, [_ZN4raft6linalg6detail10map_kernelILi4ELb0EfmNS_10compose_opIJNS_13plug_const_opIfNS_6mul_opEEENS_7sqrt_opENS_7cast_opIfEEEEEJ6__halfEEEvPT1_T2_T3_DpPKT4__param_0];
ld.param.u64 %rd7, [_ZN4raft6linalg6detail10map_kernelILi4ELb0EfmNS_10compose_opIJNS_13plug_const_opIfNS_6mul_opEEENS_7sqrt_opENS_7cast_opIfEEEEEJ6__halfEEEvPT1_T2_T3_DpPKT4__param_1];
ld.param.u64 %rd8, [_ZN4raft6linalg6detail10map_kernelILi4ELb0EfmNS_10compose_opIJNS_13plug_const_opIfNS_6mul_opEEENS_7sqrt_opENS_7cast_opIfEEEEEJ6__halfEEEvPT1_T2_T3_DpPKT4__param_3];
// %rd1 / %rd2 = input / output as global-space addresses.
cvta.to.global.u64 %rd1, %rd8;
cvta.to.global.u64 %rd2, %rd9;
// %rd3 = ctaid.x * ntid.x + tid.x, computed in 32 bits and zero-extended.
mov.u32 %r1, %ntid.x;
mov.u32 %r2, %ctaid.x;
mov.u32 %r3, %tid.x;
mad.lo.s32 %r4, %r2, %r1, %r3;
cvt.u64.u32 %rd3, %r4;
// %rd13 = number of 4-byte slots between the output address and its next
// 16-byte boundary. %rd4 = min(%rd13, count) is the length of the unaligned head.
add.s64 %rd10, %rd9, 15;
and.b64 %rd11, %rd10, -16;
sub.s64 %rd12, %rd11, %rd9;
shr.s64 %rd13, %rd12, 2;
min.u64 %rd4, %rd13, %rd7;
// %rd5 = %rd4 + 4 * thread index: first element of this thread's vector chunk.
mul.wide.u32 %rd14, %r4, 4;
add.s64 %rd5, %rd4, %rd14;
ld.param.f32 %f1, [_ZN4raft6linalg6detail10map_kernelILi4ELb0EfmNS_10compose_opIJNS_13plug_const_opIfNS_6mul_opEEENS_7sqrt_opENS_7cast_opIfEEEEEJ6__halfEEEvPT1_T2_T3_DpPKT4__param_2+4];
// Skip the vector path when elements %rd5 .. %rd5+3 would run past the count.
add.s64 %rd15, %rd5, 4;
setp.gt.u64 %p1, %rd15, %rd7;
@%p1 bra $L__BB155_2;
// Vector path: one 8-byte non-coherent load fetches four 16-bit inputs.
// NOTE(review): the load address is built from the raw parameter value %rd8,
// not from the cvta result %rd1 -- presumably this comes from inline asm in the
// CUDA source, confirm there.
shl.b64 %rd17, %rd5, 1;
add.s64 %rd16, %rd8, %rd17;
ld.global.nc.v2.u32 {%r5,%r6}, [%rd16];
// Unpack the two 32-bit words into four 16-bit lanes, widen each to f32 and
// take the approximate square root.
mov.b32 {%rs1, %rs2}, %r5;
{ cvt.f32.f16 %f2, %rs1;}
sqrt.approx.ftz.f32 %f6, %f2;
{ cvt.f32.f16 %f3, %rs2;}
sqrt.approx.ftz.f32 %f7, %f3;
mov.b32 {%rs3, %rs4}, %r6;
{ cvt.f32.f16 %f4, %rs3;}
sqrt.approx.ftz.f32 %f8, %f4;
{ cvt.f32.f16 %f5, %rs4;}
sqrt.approx.ftz.f32 %f9, %f5;
// Scale all four lanes by %f1; %f13 is lane 0 and %f10 is lane 3.
mul.ftz.f32 %f10, %f1, %f9;
mul.ftz.f32 %f11, %f1, %f8;
mul.ftz.f32 %f12, %f1, %f7;
mul.ftz.f32 %f13, %f1, %f6;
// One 16-byte store of the four results at out + 4 * %rd5.
shl.b64 %rd18, %rd5, 2;
add.s64 %rd19, %rd2, %rd18;
st.global.v4.f32 [%rd19], {%f13, %f12, %f11, %f10};
$L__BB155_2:
// Head: threads with index below %rd4 process element %rd3 on its own.
setp.le.u64 %p2, %rd4, %rd3;
@%p2 bra $L__BB155_4;
shl.b64 %rd20, %rd3, 1;
add.s64 %rd21, %rd1, %rd20;
ld.global.u16 %rs5, [%rd21];
{ cvt.f32.f16 %f14, %rs5;}
sqrt.approx.ftz.f32 %f15, %f14;
mul.ftz.f32 %f16, %f1, %f15;
shl.b64 %rd22, %rd3, 2;
add.s64 %rd23, %rd2, %rd22;
st.global.f32 [%rd23], %f16;
$L__BB155_4:
// Tail: %rd25 = (count - head) & 3 elements are left after the whole chunks.
// Thread t handles element (count - %rd25) + t when that index is below count.
sub.s64 %rd24, %rd7, %rd4;
and.b64 %rd25, %rd24, 3;
sub.s64 %rd26, %rd7, %rd25;
add.s64 %rd6, %rd26, %rd3;
setp.ge.u64 %p3, %rd6, %rd7;
@%p3 bra $L__BB155_6;
shl.b64 %rd27, %rd6, 1;
add.s64 %rd28, %rd1, %rd27;
ld.global.u16 %rs6, [%rd28];
{ cvt.f32.f16 %f17, %rs6;}
sqrt.approx.ftz.f32 %f18, %f17;
mul.ftz.f32 %f19, %f1, %f18;
shl.b64 %rd29, %rd6, 2;
add.s64 %rd30, %rd2, %rd29;
st.global.f32 [%rd30], %f19;
$L__BB155_6:
ret;
}
// raft::linalg::detail::map_kernel<2, false, float, unsigned long,
//     compose_op<plug_const_op<float, mul_op>, sqrt_op, cast_op<float>>, __half>
// (template arguments read off the mangled entry name).
//
// Per element: out[i] = %f1 * sqrt.approx.ftz(f32(in[i])).
//   param_0 : output address, written as f32 elements
//   param_1 : element count (u64)
//   param_2 : 12-byte op object, only the f32 at byte offset 4 is read (%f1)
//   param_3 : input address, read as 16-bit elements and widened with cvt.f32.f16
// There is no loop. Each thread does at most one 2-element vector chunk, one
// single element in front of the first 8-byte-aligned output slot, and one
// single element after the last whole 2-element chunk.
.visible .entry _ZN4raft6linalg6detail10map_kernelILi2ELb0EfmNS_10compose_opIJNS_13plug_const_opIfNS_6mul_opEEENS_7sqrt_opENS_7cast_opIfEEEEEJ6__halfEEEvPT1_T2_T3_DpPKT4_(
.param .u64 _ZN4raft6linalg6detail10map_kernelILi2ELb0EfmNS_10compose_opIJNS_13plug_const_opIfNS_6mul_opEEENS_7sqrt_opENS_7cast_opIfEEEEEJ6__halfEEEvPT1_T2_T3_DpPKT4__param_0,
.param .u64 _ZN4raft6linalg6detail10map_kernelILi2ELb0EfmNS_10compose_opIJNS_13plug_const_opIfNS_6mul_opEEENS_7sqrt_opENS_7cast_opIfEEEEEJ6__halfEEEvPT1_T2_T3_DpPKT4__param_1,
.param .align 4 .b8 _ZN4raft6linalg6detail10map_kernelILi2ELb0EfmNS_10compose_opIJNS_13plug_const_opIfNS_6mul_opEEENS_7sqrt_opENS_7cast_opIfEEEEEJ6__halfEEEvPT1_T2_T3_DpPKT4__param_2[12],
.param .u64 _ZN4raft6linalg6detail10map_kernelILi2ELb0EfmNS_10compose_opIJNS_13plug_const_opIfNS_6mul_opEEENS_7sqrt_opENS_7cast_opIfEEEEEJ6__halfEEEvPT1_T2_T3_DpPKT4__param_3
)
{
.reg .pred %p<4>;
.reg .b16 %rs<5>;
.reg .f32 %f<14>;
.reg .b32 %r<6>;
.reg .b64 %rd<31>;
// %rd9 = output address, %rd7 = element count, %rd8 = input address.
ld.param.u64 %rd9, [_ZN4raft6linalg6detail10map_kernelILi2ELb0EfmNS_10compose_opIJNS_13plug_const_opIfNS_6mul_opEEENS_7sqrt_opENS_7cast_opIfEEEEEJ6__halfEEEvPT1_T2_T3_DpPKT4__param_0];
ld.param.u64 %rd7, [_ZN4raft6linalg6detail10map_kernelILi2ELb0EfmNS_10compose_opIJNS_13plug_const_opIfNS_6mul_opEEENS_7sqrt_opENS_7cast_opIfEEEEEJ6__halfEEEvPT1_T2_T3_DpPKT4__param_1];
ld.param.u64 %rd8, [_ZN4raft6linalg6detail10map_kernelILi2ELb0EfmNS_10compose_opIJNS_13plug_const_opIfNS_6mul_opEEENS_7sqrt_opENS_7cast_opIfEEEEEJ6__halfEEEvPT1_T2_T3_DpPKT4__param_3];
// %rd1 / %rd2 = input / output as global-space addresses.
cvta.to.global.u64 %rd1, %rd8;
cvta.to.global.u64 %rd2, %rd9;
// %rd3 = ctaid.x * ntid.x + tid.x, computed in 32 bits and zero-extended.
mov.u32 %r1, %ntid.x;
mov.u32 %r2, %ctaid.x;
mov.u32 %r3, %tid.x;
mad.lo.s32 %r4, %r2, %r1, %r3;
cvt.u64.u32 %rd3, %r4;
// %rd13 = number of 4-byte slots between the output address and its next
// 8-byte boundary. %rd4 = min(%rd13, count) is the length of the unaligned head.
add.s64 %rd10, %rd9, 7;
and.b64 %rd11, %rd10, -8;
sub.s64 %rd12, %rd11, %rd9;
shr.s64 %rd13, %rd12, 2;
min.u64 %rd4, %rd13, %rd7;
// %rd5 = %rd4 + 2 * thread index: first element of this thread's vector chunk.
mul.wide.u32 %rd14, %r4, 2;
add.s64 %rd5, %rd4, %rd14;
ld.param.f32 %f1, [_ZN4raft6linalg6detail10map_kernelILi2ELb0EfmNS_10compose_opIJNS_13plug_const_opIfNS_6mul_opEEENS_7sqrt_opENS_7cast_opIfEEEEEJ6__halfEEEvPT1_T2_T3_DpPKT4__param_2+4];
// Skip the vector path when elements %rd5 .. %rd5+1 would run past the count.
add.s64 %rd15, %rd5, 2;
setp.gt.u64 %p1, %rd15, %rd7;
@%p1 bra $L__BB156_2;
// Vector path: one 4-byte non-coherent load fetches two 16-bit inputs.
// NOTE(review): the load address is built from the raw parameter value %rd8,
// not from the cvta result %rd1 -- presumably this comes from inline asm in the
// CUDA source, confirm there.
shl.b64 %rd17, %rd5, 1;
add.s64 %rd16, %rd8, %rd17;
ld.global.nc.b32 %r5, [%rd16];
// Unpack into two 16-bit lanes, widen each to f32, take the approximate
// square root and scale by %f1; %f7 is lane 0 and %f6 is lane 1.
mov.b32 {%rs1, %rs2}, %r5;
{ cvt.f32.f16 %f2, %rs1;}
sqrt.approx.ftz.f32 %f4, %f2;
{ cvt.f32.f16 %f3, %rs2;}
sqrt.approx.ftz.f32 %f5, %f3;
mul.ftz.f32 %f6, %f1, %f5;
mul.ftz.f32 %f7, %f1, %f4;
// One 8-byte store of the two results at out + 4 * %rd5.
shl.b64 %rd18, %rd5, 2;
add.s64 %rd19, %rd2, %rd18;
st.global.v2.f32 [%rd19], {%f7, %f6};
$L__BB156_2:
// Head: threads with index below %rd4 process element %rd3 on its own.
setp.le.u64 %p2, %rd4, %rd3;
@%p2 bra $L__BB156_4;
shl.b64 %rd20, %rd3, 1;
add.s64 %rd21, %rd1, %rd20;
ld.global.u16 %rs3, [%rd21];
{ cvt.f32.f16 %f8, %rs3;}
sqrt.approx.ftz.f32 %f9, %f8;
mul.ftz.f32 %f10, %f1, %f9;
shl.b64 %rd22, %rd3, 2;
add.s64 %rd23, %rd2, %rd22;
st.global.f32 [%rd23], %f10;
$L__BB156_4:
// Tail: %rd25 = (count - head) & 1 elements are left after the whole chunks.
// Thread t handles element (count - %rd25) + t when that index is below count.
sub.s64 %rd24, %rd7, %rd4;
and.b64 %rd25, %rd24, 1;
sub.s64 %rd26, %rd7, %rd25;
add.s64 %rd6, %rd26, %rd3;
setp.ge.u64 %p3, %rd6, %rd7;
@%p3 bra $L__BB156_6;
shl.b64 %rd27, %rd6, 1;
add.s64 %rd28, %rd1, %rd27;
ld.global.u16 %rs4, [%rd28];
{ cvt.f32.f16 %f11, %rs4;}
sqrt.approx.ftz.f32 %f12, %f11;
mul.ftz.f32 %f13, %f1, %f12;
shl.b64 %rd29, %rd6, 2;
add.s64 %rd30, %rd2, %rd29;
st.global.f32 [%rd30], %f13;
$L__BB156_6:
ret;
}
// raft::linalg::detail::map_kernel<1, false, float, unsigned long,
//     compose_op<plug_const_op<float, mul_op>, cast_op<float>>, __half>
// (template arguments read off the mangled entry name).
//
// One element per thread, no vector path:
//     out[i] = k * f32(in[i])   with i = ctaid.x * ntid.x + tid.x, i < count
//   param_0 : output address, written as f32 elements
//   param_1 : element count (u64)
//   param_2 : 12-byte op object, only the f32 at byte offset 4 (k) is read
//   param_3 : input address, read as 16-bit elements and widened with cvt.f32.f16
.visible .entry _ZN4raft6linalg6detail10map_kernelILi1ELb0EfmNS_10compose_opIJNS_13plug_const_opIfNS_6mul_opEEENS_7cast_opIfEEEEEJ6__halfEEEvPT1_T2_T3_DpPKT4_(
.param .u64 _ZN4raft6linalg6detail10map_kernelILi1ELb0EfmNS_10compose_opIJNS_13plug_const_opIfNS_6mul_opEEENS_7cast_opIfEEEEEJ6__halfEEEvPT1_T2_T3_DpPKT4__param_0,
.param .u64 _ZN4raft6linalg6detail10map_kernelILi1ELb0EfmNS_10compose_opIJNS_13plug_const_opIfNS_6mul_opEEENS_7cast_opIfEEEEEJ6__halfEEEvPT1_T2_T3_DpPKT4__param_1,
.param .align 4 .b8 _ZN4raft6linalg6detail10map_kernelILi1ELb0EfmNS_10compose_opIJNS_13plug_const_opIfNS_6mul_opEEENS_7cast_opIfEEEEEJ6__halfEEEvPT1_T2_T3_DpPKT4__param_2[12],
.param .u64 _ZN4raft6linalg6detail10map_kernelILi1ELb0EfmNS_10compose_opIJNS_13plug_const_opIfNS_6mul_opEEENS_7cast_opIfEEEEEJ6__halfEEEvPT1_T2_T3_DpPKT4__param_3
)
{
.reg .pred %p_live;
.reg .b16 %h_src;
.reg .f32 %f_k, %f_src, %f_res;
.reg .b32 %r_cta, %r_ntid, %r_tid, %r_gid;
.reg .b64 %rd_out, %rd_count, %rd_in, %rd_gid;
.reg .b64 %rd_in_g, %rd_in_off, %rd_in_addr;
.reg .b64 %rd_out_g, %rd_out_off, %rd_out_addr;
// Kernel arguments.
ld.param.u64 %rd_out, [_ZN4raft6linalg6detail10map_kernelILi1ELb0EfmNS_10compose_opIJNS_13plug_const_opIfNS_6mul_opEEENS_7cast_opIfEEEEEJ6__halfEEEvPT1_T2_T3_DpPKT4__param_0];
ld.param.u64 %rd_count, [_ZN4raft6linalg6detail10map_kernelILi1ELb0EfmNS_10compose_opIJNS_13plug_const_opIfNS_6mul_opEEENS_7cast_opIfEEEEEJ6__halfEEEvPT1_T2_T3_DpPKT4__param_1];
ld.param.f32 %f_k, [_ZN4raft6linalg6detail10map_kernelILi1ELb0EfmNS_10compose_opIJNS_13plug_const_opIfNS_6mul_opEEENS_7cast_opIfEEEEEJ6__halfEEEvPT1_T2_T3_DpPKT4__param_2+4];
ld.param.u64 %rd_in, [_ZN4raft6linalg6detail10map_kernelILi1ELb0EfmNS_10compose_opIJNS_13plug_const_opIfNS_6mul_opEEENS_7cast_opIfEEEEEJ6__halfEEEvPT1_T2_T3_DpPKT4__param_3];
// Flat thread index in 32-bit arithmetic, zero-extended for the 64-bit bounds check.
mov.u32 %r_tid, %tid.x;
mov.u32 %r_ntid, %ntid.x;
mov.u32 %r_cta, %ctaid.x;
mad.lo.s32 %r_gid, %r_cta, %r_ntid, %r_tid;
cvt.u64.u32 %rd_gid, %r_gid;
setp.lt.u64 %p_live, %rd_gid, %rd_count;
@!%p_live bra $L__BB157_2;
// Fetch one 16-bit input element (2-byte stride), widen it to f32 and scale it.
cvta.to.global.u64 %rd_in_g, %rd_in;
mul.wide.u32 %rd_in_off, %r_gid, 2;
add.s64 %rd_in_addr, %rd_in_g, %rd_in_off;
ld.global.u16 %h_src, [%rd_in_addr];
cvt.f32.f16 %f_src, %h_src;
mul.ftz.f32 %f_res, %f_k, %f_src;
// Store the f32 result (4-byte stride).
cvta.to.global.u64 %rd_out_g, %rd_out;
mul.wide.u32 %rd_out_off, %r_gid, 4;
add.s64 %rd_out_addr, %rd_out_g, %rd_out_off;
st.global.f32 [%rd_out_addr], %f_res;
$L__BB157_2:
ret;
}
// raft::linalg::detail::map_kernel<4, false, float, unsigned long,
//     compose_op<plug_const_op<float, mul_op>, cast_op<float>>, __half>
// (template arguments read off the mangled entry name).
//
// Per element: out[i] = %f1 * f32(in[i]).
//   param_0 : output address, written as f32 elements
//   param_1 : element count (u64)
//   param_2 : 12-byte op object, only the f32 at byte offset 4 is read (%f1)
//   param_3 : input address, read as 16-bit elements and widened with cvt.f32.f16
// There is no loop. Each thread does at most one 4-element vector chunk, one
// single element in front of the first 16-byte-aligned output slot, and one
// single element after the last whole 4-element chunk.
.visible .entry _ZN4raft6linalg6detail10map_kernelILi4ELb0EfmNS_10compose_opIJNS_13plug_const_opIfNS_6mul_opEEENS_7cast_opIfEEEEEJ6__halfEEEvPT1_T2_T3_DpPKT4_(
.param .u64 _ZN4raft6linalg6detail10map_kernelILi4ELb0EfmNS_10compose_opIJNS_13plug_const_opIfNS_6mul_opEEENS_7cast_opIfEEEEEJ6__halfEEEvPT1_T2_T3_DpPKT4__param_0,
.param .u64 _ZN4raft6linalg6detail10map_kernelILi4ELb0EfmNS_10compose_opIJNS_13plug_const_opIfNS_6mul_opEEENS_7cast_opIfEEEEEJ6__halfEEEvPT1_T2_T3_DpPKT4__param_1,
.param .align 4 .b8 _ZN4raft6linalg6detail10map_kernelILi4ELb0EfmNS_10compose_opIJNS_13plug_const_opIfNS_6mul_opEEENS_7cast_opIfEEEEEJ6__halfEEEvPT1_T2_T3_DpPKT4__param_2[12],
.param .u64 _ZN4raft6linalg6detail10map_kernelILi4ELb0EfmNS_10compose_opIJNS_13plug_const_opIfNS_6mul_opEEENS_7cast_opIfEEEEEJ6__halfEEEvPT1_T2_T3_DpPKT4__param_3
)
{
.reg .pred %p<4>;
.reg .b16 %rs<7>;
.reg .f32 %f<14>;
.reg .b32 %r<7>;
.reg .b64 %rd<31>;
// %rd9 = output address, %rd7 = element count, %rd8 = input address.
ld.param.u64 %rd9, [_ZN4raft6linalg6detail10map_kernelILi4ELb0EfmNS_10compose_opIJNS_13plug_const_opIfNS_6mul_opEEENS_7cast_opIfEEEEEJ6__halfEEEvPT1_T2_T3_DpPKT4__param_0];
ld.param.u64 %rd7, [_ZN4raft6linalg6detail10map_kernelILi4ELb0EfmNS_10compose_opIJNS_13plug_const_opIfNS_6mul_opEEENS_7cast_opIfEEEEEJ6__halfEEEvPT1_T2_T3_DpPKT4__param_1];
ld.param.u64 %rd8, [_ZN4raft6linalg6detail10map_kernelILi4ELb0EfmNS_10compose_opIJNS_13plug_const_opIfNS_6mul_opEEENS_7cast_opIfEEEEEJ6__halfEEEvPT1_T2_T3_DpPKT4__param_3];
// %rd1 / %rd2 = input / output as global-space addresses.
cvta.to.global.u64 %rd1, %rd8;
cvta.to.global.u64 %rd2, %rd9;
// %rd3 = ctaid.x * ntid.x + tid.x, computed in 32 bits and zero-extended.
mov.u32 %r1, %ntid.x;
mov.u32 %r2, %ctaid.x;
mov.u32 %r3, %tid.x;
mad.lo.s32 %r4, %r2, %r1, %r3;
cvt.u64.u32 %rd3, %r4;
// %rd13 = number of 4-byte slots between the output address and its next
// 16-byte boundary. %rd4 = min(%rd13, count) is the length of the unaligned head.
add.s64 %rd10, %rd9, 15;
and.b64 %rd11, %rd10, -16;
sub.s64 %rd12, %rd11, %rd9;
shr.s64 %rd13, %rd12, 2;
min.u64 %rd4, %rd13, %rd7;
// %rd5 = %rd4 + 4 * thread index: first element of this thread's vector chunk.
mul.wide.u32 %rd14, %r4, 4;
add.s64 %rd5, %rd4, %rd14;
ld.param.f32 %f1, [_ZN4raft6linalg6detail10map_kernelILi4ELb0EfmNS_10compose_opIJNS_13plug_const_opIfNS_6mul_opEEENS_7cast_opIfEEEEEJ6__halfEEEvPT1_T2_T3_DpPKT4__param_2+4];
// Skip the vector path when elements %rd5 .. %rd5+3 would run past the count.
add.s64 %rd15, %rd5, 4;
setp.gt.u64 %p1, %rd15, %rd7;
@%p1 bra $L__BB158_2;
// Vector path: one 8-byte non-coherent load fetches four 16-bit inputs.
// NOTE(review): the load address is built from the raw parameter value %rd8,
// not from the cvta result %rd1 -- presumably this comes from inline asm in the
// CUDA source, confirm there.
shl.b64 %rd17, %rd5, 1;
add.s64 %rd16, %rd8, %rd17;
ld.global.nc.v2.u32 {%r5,%r6}, [%rd16];
// Unpack the two 32-bit words into four 16-bit lanes and widen each to f32.
mov.b32 {%rs1, %rs2}, %r5;
{ cvt.f32.f16 %f2, %rs1;}
{ cvt.f32.f16 %f3, %rs2;}
mov.b32 {%rs3, %rs4}, %r6;
{ cvt.f32.f16 %f4, %rs3;}
{ cvt.f32.f16 %f5, %rs4;}
// Scale all four lanes by %f1; %f9 is lane 0 and %f6 is lane 3.
mul.ftz.f32 %f6, %f1, %f5;
mul.ftz.f32 %f7, %f1, %f4;
mul.ftz.f32 %f8, %f1, %f3;
mul.ftz.f32 %f9, %f1, %f2;
// One 16-byte store of the four results at out + 4 * %rd5.
shl.b64 %rd18, %rd5, 2;
add.s64 %rd19, %rd2, %rd18;
st.global.v4.f32 [%rd19], {%f9, %f8, %f7, %f6};
$L__BB158_2:
// Head: threads with index below %rd4 process element %rd3 on its own.
setp.le.u64 %p2, %rd4, %rd3;
@%p2 bra $L__BB158_4;
shl.b64 %rd20, %rd3, 1;
add.s64 %rd21, %rd1, %rd20;
ld.global.u16 %rs5, [%rd21];
{ cvt.f32.f16 %f10, %rs5;}
mul.ftz.f32 %f11, %f1, %f10;
shl.b64 %rd22, %rd3, 2;
add.s64 %rd23, %rd2, %rd22;
st.global.f32 [%rd23], %f11;
$L__BB158_4:
// Tail: %rd25 = (count - head) & 3 elements are left after the whole chunks.
// Thread t handles element (count - %rd25) + t when that index is below count.
sub.s64 %rd24, %rd7, %rd4;
and.b64 %rd25, %rd24, 3;
sub.s64 %rd26, %rd7, %rd25;
add.s64 %rd6, %rd26, %rd3;
setp.ge.u64 %p3, %rd6, %rd7;
@%p3 bra $L__BB158_6;
shl.b64 %rd27, %rd6, 1;
add.s64 %rd28, %rd1, %rd27;
ld.global.u16 %rs6, [%rd28];
{ cvt.f32.f16 %f12, %rs6;}
mul.ftz.f32 %f13, %f1, %f12;
shl.b64 %rd29, %rd6, 2;
add.s64 %rd30, %rd2, %rd29;
st.global.f32 [%rd30], %f13;
$L__BB158_6:
ret;
}
// raft::linalg::detail::map_kernel<2, false, float, unsigned long,
//     compose_op<plug_const_op<float, mul_op>, cast_op<float>>, __half>
// (template arguments read off the mangled entry name).
//
// Per element: out[i] = %f1 * f32(in[i]).
//   param_0 : output address, written as f32 elements
//   param_1 : element count (u64)
//   param_2 : 12-byte op object, only the f32 at byte offset 4 is read (%f1)
//   param_3 : input address, read as 16-bit elements and widened with cvt.f32.f16
// There is no loop. Each thread does at most one 2-element vector chunk, one
// single element in front of the first 8-byte-aligned output slot, and one
// single element after the last whole 2-element chunk.
.visible .entry _ZN4raft6linalg6detail10map_kernelILi2ELb0EfmNS_10compose_opIJNS_13plug_const_opIfNS_6mul_opEEENS_7cast_opIfEEEEEJ6__halfEEEvPT1_T2_T3_DpPKT4_(
.param .u64 _ZN4raft6linalg6detail10map_kernelILi2ELb0EfmNS_10compose_opIJNS_13plug_const_opIfNS_6mul_opEEENS_7cast_opIfEEEEEJ6__halfEEEvPT1_T2_T3_DpPKT4__param_0,
.param .u64 _ZN4raft6linalg6detail10map_kernelILi2ELb0EfmNS_10compose_opIJNS_13plug_const_opIfNS_6mul_opEEENS_7cast_opIfEEEEEJ6__halfEEEvPT1_T2_T3_DpPKT4__param_1,
.param .align 4 .b8 _ZN4raft6linalg6detail10map_kernelILi2ELb0EfmNS_10compose_opIJNS_13plug_const_opIfNS_6mul_opEEENS_7cast_opIfEEEEEJ6__halfEEEvPT1_T2_T3_DpPKT4__param_2[12],
.param .u64 _ZN4raft6linalg6detail10map_kernelILi2ELb0EfmNS_10compose_opIJNS_13plug_const_opIfNS_6mul_opEEENS_7cast_opIfEEEEEJ6__halfEEEvPT1_T2_T3_DpPKT4__param_3
)
{
.reg .pred %p<4>;
.reg .b16 %rs<5>;
.reg .f32 %f<10>;
.reg .b32 %r<6>;
.reg .b64 %rd<31>;
// %rd9 = output address, %rd7 = element count, %rd8 = input address.
ld.param.u64 %rd9, [_ZN4raft6linalg6detail10map_kernelILi2ELb0EfmNS_10compose_opIJNS_13plug_const_opIfNS_6mul_opEEENS_7cast_opIfEEEEEJ6__halfEEEvPT1_T2_T3_DpPKT4__param_0];
ld.param.u64 %rd7, [_ZN4raft6linalg6detail10map_kernelILi2ELb0EfmNS_10compose_opIJNS_13plug_const_opIfNS_6mul_opEEENS_7cast_opIfEEEEEJ6__halfEEEvPT1_T2_T3_DpPKT4__param_1];
ld.param.u64 %rd8, [_ZN4raft6linalg6detail10map_kernelILi2ELb0EfmNS_10compose_opIJNS_13plug_const_opIfNS_6mul_opEEENS_7cast_opIfEEEEEJ6__halfEEEvPT1_T2_T3_DpPKT4__param_3];
// %rd1 / %rd2 = input / output as global-space addresses.
cvta.to.global.u64 %rd1, %rd8;
cvta.to.global.u64 %rd2, %rd9;
// %rd3 = ctaid.x * ntid.x + tid.x, computed in 32 bits and zero-extended.
mov.u32 %r1, %ntid.x;
mov.u32 %r2, %ctaid.x;
mov.u32 %r3, %tid.x;
mad.lo.s32 %r4, %r2, %r1, %r3;
cvt.u64.u32 %rd3, %r4;
// %rd13 = number of 4-byte slots between the output address and its next
// 8-byte boundary. %rd4 = min(%rd13, count) is the length of the unaligned head.
add.s64 %rd10, %rd9, 7;
and.b64 %rd11, %rd10, -8;
sub.s64 %rd12, %rd11, %rd9;
shr.s64 %rd13, %rd12, 2;
min.u64 %rd4, %rd13, %rd7;
// %rd5 = %rd4 + 2 * thread index: first element of this thread's vector chunk.
mul.wide.u32 %rd14, %r4, 2;
add.s64 %rd5, %rd4, %rd14;
ld.param.f32 %f1, [_ZN4raft6linalg6detail10map_kernelILi2ELb0EfmNS_10compose_opIJNS_13plug_const_opIfNS_6mul_opEEENS_7cast_opIfEEEEEJ6__halfEEEvPT1_T2_T3_DpPKT4__param_2+4];
// Skip the vector path when elements %rd5 .. %rd5+1 would run past the count.
add.s64 %rd15, %rd5, 2;
setp.gt.u64 %p1, %rd15, %rd7;
@%p1 bra $L__BB159_2;
// Vector path: one 4-byte non-coherent load fetches two 16-bit inputs.
// NOTE(review): the load address is built from the raw parameter value %rd8,
// not from the cvta result %rd1 -- presumably this comes from inline asm in the
// CUDA source, confirm there.
shl.b64 %rd17, %rd5, 1;
add.s64 %rd16, %rd8, %rd17;
ld.global.nc.b32 %r5, [%rd16];
// Unpack into two 16-bit lanes, widen each to f32 and scale by %f1;
// %f5 is lane 0 and %f4 is lane 1.
mov.b32 {%rs1, %rs2}, %r5;
{ cvt.f32.f16 %f2, %rs1;}
{ cvt.f32.f16 %f3, %rs2;}
mul.ftz.f32 %f4, %f1, %f3;
mul.ftz.f32 %f5, %f1, %f2;
// One 8-byte store of the two results at out + 4 * %rd5.
shl.b64 %rd18, %rd5, 2;
add.s64 %rd19, %rd2, %rd18;
st.global.v2.f32 [%rd19], {%f5, %f4};
$L__BB159_2:
// Head: threads with index below %rd4 process element %rd3 on its own.
setp.le.u64 %p2, %rd4, %rd3;
@%p2 bra $L__BB159_4;
shl.b64 %rd20, %rd3, 1;
add.s64 %rd21, %rd1, %rd20;
ld.global.u16 %rs3, [%rd21];
{ cvt.f32.f16 %f6, %rs3;}
mul.ftz.f32 %f7, %f1, %f6;
shl.b64 %rd22, %rd3, 2;
add.s64 %rd23, %rd2, %rd22;
st.global.f32 [%rd23], %f7;
$L__BB159_4:
// Tail: %rd25 = (count - head) & 1 elements are left after the whole chunks.
// Thread t handles element (count - %rd25) + t when that index is below count.
sub.s64 %rd24, %rd7, %rd4;
and.b64 %rd25, %rd24, 1;
sub.s64 %rd26, %rd7, %rd25;
add.s64 %rd6, %rd26, %rd3;
setp.ge.u64 %p3, %rd6, %rd7;
@%p3 bra $L__BB159_6;
shl.b64 %rd27, %rd6, 1;
add.s64 %rd28, %rd1, %rd27;
ld.global.u16 %rs4, [%rd28];
{ cvt.f32.f16 %f8, %rs4;}
mul.ftz.f32 %f9, %f1, %f8;
shl.b64 %rd29, %rd6, 2;
add.s64 %rd30, %rd2, %rd29;
st.global.f32 [%rd30], %f9;
$L__BB159_6:
ret;
}
// raft::linalg::detail::map_kernel<1, true, float, unsigned int, lambda(unsigned int)>
// where the lambda is defined inside raft::neighbors::ivf_pq::detail::select_clusters<float>
// (template arguments read off the mangled entry name).
//
// One element per thread, index i = ctaid.x * ntid.x + tid.x, i < count:
//     rem    = i % mod
//     out[i] = (rem < lim) ? src[(i / mod) * lim + rem] : fill
//   param_0 : output address, written as f32 elements
//   param_1 : element count (u32)
//   param_2 : 24-byte closure object:
//             +0  u64 src address (f32 elements)
//             +8  u32 lim
//             +12 u32 mod
//             +16 f32 fill
.visible .entry _ZN4raft6linalg6detail10map_kernelILi1ELb1EfjZNS_9neighbors6ivf_pq6detail15select_clustersIfEEvRKNS_16device_resourcesEPjPfjjjjjNS_8distance12DistanceTypeEPKT_PKfPN3rmm2mr22device_memory_resourceEEUljE_JEEEvPT1_T2_T3_DpPKT4_(
.param .u64 _ZN4raft6linalg6detail10map_kernelILi1ELb1EfjZNS_9neighbors6ivf_pq6detail15select_clustersIfEEvRKNS_16device_resourcesEPjPfjjjjjNS_8distance12DistanceTypeEPKT_PKfPN3rmm2mr22device_memory_resourceEEUljE_JEEEvPT1_T2_T3_DpPKT4__param_0,
.param .u32 _ZN4raft6linalg6detail10map_kernelILi1ELb1EfjZNS_9neighbors6ivf_pq6detail15select_clustersIfEEvRKNS_16device_resourcesEPjPfjjjjjNS_8distance12DistanceTypeEPKT_PKfPN3rmm2mr22device_memory_resourceEEUljE_JEEEvPT1_T2_T3_DpPKT4__param_1,
.param .align 8 .b8 _ZN4raft6linalg6detail10map_kernelILi1ELb1EfjZNS_9neighbors6ivf_pq6detail15select_clustersIfEEvRKNS_16device_resourcesEPjPfjjjjjNS_8distance12DistanceTypeEPKT_PKfPN3rmm2mr22device_memory_resourceEEUljE_JEEEvPT1_T2_T3_DpPKT4__param_2[24]
)
{
.reg .pred %p_live, %p_fill;
.reg .f32 %f_val;
.reg .b32 %r_count, %r_lim, %r_mod;
.reg .b32 %r_ntid, %r_cta, %r_tid, %r_gid;
.reg .b32 %r_rem, %r_quot, %r_src_idx;
.reg .b64 %rd_out, %rd_src;
.reg .b64 %rd_src_g, %rd_src_off, %rd_src_addr;
.reg .b64 %rd_out_g, %rd_out_off, %rd_out_addr;
// Kernel arguments; %f_val starts out as the fill value.
ld.param.u64 %rd_out, [_ZN4raft6linalg6detail10map_kernelILi1ELb1EfjZNS_9neighbors6ivf_pq6detail15select_clustersIfEEvRKNS_16device_resourcesEPjPfjjjjjNS_8distance12DistanceTypeEPKT_PKfPN3rmm2mr22device_memory_resourceEEUljE_JEEEvPT1_T2_T3_DpPKT4__param_0];
ld.param.u32 %r_count, [_ZN4raft6linalg6detail10map_kernelILi1ELb1EfjZNS_9neighbors6ivf_pq6detail15select_clustersIfEEvRKNS_16device_resourcesEPjPfjjjjjNS_8distance12DistanceTypeEPKT_PKfPN3rmm2mr22device_memory_resourceEEUljE_JEEEvPT1_T2_T3_DpPKT4__param_1];
ld.param.u64 %rd_src, [_ZN4raft6linalg6detail10map_kernelILi1ELb1EfjZNS_9neighbors6ivf_pq6detail15select_clustersIfEEvRKNS_16device_resourcesEPjPfjjjjjNS_8distance12DistanceTypeEPKT_PKfPN3rmm2mr22device_memory_resourceEEUljE_JEEEvPT1_T2_T3_DpPKT4__param_2];
ld.param.u32 %r_lim, [_ZN4raft6linalg6detail10map_kernelILi1ELb1EfjZNS_9neighbors6ivf_pq6detail15select_clustersIfEEvRKNS_16device_resourcesEPjPfjjjjjNS_8distance12DistanceTypeEPKT_PKfPN3rmm2mr22device_memory_resourceEEUljE_JEEEvPT1_T2_T3_DpPKT4__param_2+8];
ld.param.u32 %r_mod, [_ZN4raft6linalg6detail10map_kernelILi1ELb1EfjZNS_9neighbors6ivf_pq6detail15select_clustersIfEEvRKNS_16device_resourcesEPjPfjjjjjNS_8distance12DistanceTypeEPKT_PKfPN3rmm2mr22device_memory_resourceEEUljE_JEEEvPT1_T2_T3_DpPKT4__param_2+12];
ld.param.f32 %f_val, [_ZN4raft6linalg6detail10map_kernelILi1ELb1EfjZNS_9neighbors6ivf_pq6detail15select_clustersIfEEvRKNS_16device_resourcesEPjPfjjjjjNS_8distance12DistanceTypeEPKT_PKfPN3rmm2mr22device_memory_resourceEEUljE_JEEEvPT1_T2_T3_DpPKT4__param_2+16];
// Flat thread index and bounds check, both in 32 bits.
mov.u32 %r_tid, %tid.x;
mov.u32 %r_cta, %ctaid.x;
mov.u32 %r_ntid, %ntid.x;
mad.lo.s32 %r_gid, %r_cta, %r_ntid, %r_tid;
setp.lt.u32 %p_live, %r_gid, %r_count;
@!%p_live bra $L__BB160_4;
// Keep the fill value when the remainder is not below lim.
rem.u32 %r_rem, %r_gid, %r_mod;
setp.ge.u32 %p_fill, %r_rem, %r_lim;
@%p_fill bra $L__BB160_3;
// Otherwise replace it with src[(i / mod) * lim + rem].
cvta.to.global.u64 %rd_src_g, %rd_src;
div.u32 %r_quot, %r_gid, %r_mod;
mad.lo.s32 %r_src_idx, %r_quot, %r_lim, %r_rem;
mul.wide.u32 %rd_src_off, %r_src_idx, 4;
add.s64 %rd_src_addr, %rd_src_g, %rd_src_off;
ld.global.f32 %f_val, [%rd_src_addr];
$L__BB160_3:
// Store the selected value to out[i].
cvta.to.global.u64 %rd_out_g, %rd_out;
mul.wide.u32 %rd_out_off, %r_gid, 4;
add.s64 %rd_out_addr, %rd_out_g, %rd_out_off;
st.global.f32 [%rd_out_addr], %f_val;
$L__BB160_4:
ret;
}
// raft::linalg::detail::map_kernel<4, true, float, unsigned int, lambda(unsigned int)>
// where the lambda is defined inside raft::neighbors::ivf_pq::detail::select_clusters<float>
// (template arguments read off the mangled entry name).
//
// Value written for an index i (same formula in every path below):
//     rem    = i % %r24
//     out[i] = (rem < %r23) ? src[(i / %r24) * %r23 + rem] : fill
//   param_0 : output address, written as f32 elements
//   param_1 : element count (u32, %r20)
//   param_2 : 24-byte closure object:
//             +0  u64 src address (%rd3, f32 elements)
//             +8  u32 %r23
//             +12 u32 %r24
//             +16 f32 fill value (%f22)
// There is no loop. Each thread does at most one 4-element vector chunk, one
// single element in front of the first 16-byte-aligned output slot, and one
// single element after the last whole 4-element chunk.
// The .b16 %rs registers are declared but never referenced in this body.
.visible .entry _ZN4raft6linalg6detail10map_kernelILi4ELb1EfjZNS_9neighbors6ivf_pq6detail15select_clustersIfEEvRKNS_16device_resourcesEPjPfjjjjjNS_8distance12DistanceTypeEPKT_PKfPN3rmm2mr22device_memory_resourceEEUljE_JEEEvPT1_T2_T3_DpPKT4_(
.param .u64 _ZN4raft6linalg6detail10map_kernelILi4ELb1EfjZNS_9neighbors6ivf_pq6detail15select_clustersIfEEvRKNS_16device_resourcesEPjPfjjjjjNS_8distance12DistanceTypeEPKT_PKfPN3rmm2mr22device_memory_resourceEEUljE_JEEEvPT1_T2_T3_DpPKT4__param_0,
.param .u32 _ZN4raft6linalg6detail10map_kernelILi4ELb1EfjZNS_9neighbors6ivf_pq6detail15select_clustersIfEEvRKNS_16device_resourcesEPjPfjjjjjNS_8distance12DistanceTypeEPKT_PKfPN3rmm2mr22device_memory_resourceEEUljE_JEEEvPT1_T2_T3_DpPKT4__param_1,
.param .align 8 .b8 _ZN4raft6linalg6detail10map_kernelILi4ELb1EfjZNS_9neighbors6ivf_pq6detail15select_clustersIfEEvRKNS_16device_resourcesEPjPfjjjjjNS_8distance12DistanceTypeEPKT_PKfPN3rmm2mr22device_memory_resourceEEUljE_JEEEvPT1_T2_T3_DpPKT4__param_2[24]
)
{
.reg .pred %p<10>;
.reg .b16 %rs<9>;
.reg .f32 %f<23>;
.reg .b32 %r<47>;
.reg .b64 %rd<29>;
ld.param.u64 %rd4, [_ZN4raft6linalg6detail10map_kernelILi4ELb1EfjZNS_9neighbors6ivf_pq6detail15select_clustersIfEEvRKNS_16device_resourcesEPjPfjjjjjNS_8distance12DistanceTypeEPKT_PKfPN3rmm2mr22device_memory_resourceEEUljE_JEEEvPT1_T2_T3_DpPKT4__param_0];
ld.param.u32 %r20, [_ZN4raft6linalg6detail10map_kernelILi4ELb1EfjZNS_9neighbors6ivf_pq6detail15select_clustersIfEEvRKNS_16device_resourcesEPjPfjjjjjNS_8distance12DistanceTypeEPKT_PKfPN3rmm2mr22device_memory_resourceEEUljE_JEEEvPT1_T2_T3_DpPKT4__param_1];
ld.param.v2.u32 {%r23, %r24}, [_ZN4raft6linalg6detail10map_kernelILi4ELb1EfjZNS_9neighbors6ivf_pq6detail15select_clustersIfEEvRKNS_16device_resourcesEPjPfjjjjjNS_8distance12DistanceTypeEPKT_PKfPN3rmm2mr22device_memory_resourceEEUljE_JEEEvPT1_T2_T3_DpPKT4__param_2+8];
ld.param.f32 %f22, [_ZN4raft6linalg6detail10map_kernelILi4ELb1EfjZNS_9neighbors6ivf_pq6detail15select_clustersIfEEvRKNS_16device_resourcesEPjPfjjjjjNS_8distance12DistanceTypeEPKT_PKfPN3rmm2mr22device_memory_resourceEEUljE_JEEEvPT1_T2_T3_DpPKT4__param_2+16];
ld.param.u64 %rd3, [_ZN4raft6linalg6detail10map_kernelILi4ELb1EfjZNS_9neighbors6ivf_pq6detail15select_clustersIfEEvRKNS_16device_resourcesEPjPfjjjjjNS_8distance12DistanceTypeEPKT_PKfPN3rmm2mr22device_memory_resourceEEUljE_JEEEvPT1_T2_T3_DpPKT4__param_2];
// %rd1 = output as a global-space address; %r1 = ctaid.x * ntid.x + tid.x.
cvta.to.global.u64 %rd1, %rd4;
mov.u32 %r25, %ntid.x;
mov.u32 %r26, %ctaid.x;
mov.u32 %r27, %tid.x;
mad.lo.s32 %r1, %r26, %r25, %r27;
// Round the output address up to a multiple of 16. The mask 17179869168
// (0x3FFFFFFF0) also drops every bit above bit 33, which is enough because the
// difference is shifted right by 2 and then truncated to 32 bits.
// %r2 = min(4-byte slots up to the boundary, count) is the length of the head.
add.s64 %rd5, %rd4, 15;
and.b64 %rd6, %rd5, 17179869168;
sub.s64 %rd7, %rd6, %rd4;
shr.u64 %rd8, %rd7, 2;
cvt.u32.u64 %r28, %rd8;
min.u32 %r2, %r28, %r20;
// %r3 = %r2 + 4 * thread index: first element of this thread's vector chunk.
// The vector path is skipped when %r3 + 4 exceeds the count (32-bit arithmetic).
shl.b32 %r29, %r1, 2;
add.s32 %r3, %r2, %r29;
add.s32 %r30, %r3, 4;
setp.gt.u32 %p1, %r30, %r20;
@%p1 bra $L__BB161_10;
cvta.to.global.u64 %rd2, %rd3;
// Lane 0 (index %r3): %f17 starts as the fill value and is replaced by a load
// from src when the remainder is below %r23.
rem.u32 %r6, %r3, %r24;
setp.ge.u32 %p2, %r6, %r23;
mov.f32 %f17, %f22;
@%p2 bra $L__BB161_3;
div.u32 %r31, %r3, %r24;
mad.lo.s32 %r32, %r31, %r23, %r6;
mul.wide.u32 %rd9, %r32, 4;
add.s64 %rd10, %rd2, %rd9;
ld.global.f32 %f17, [%rd10];
$L__BB161_3:
// Lane 1 (index %r3 + 1) -> %f18.
add.s32 %r7, %r3, 1;
rem.u32 %r8, %r7, %r24;
setp.ge.u32 %p3, %r8, %r23;
mov.f32 %f18, %f22;
@%p3 bra $L__BB161_5;
div.u32 %r33, %r7, %r24;
mad.lo.s32 %r34, %r33, %r23, %r8;
mul.wide.u32 %rd11, %r34, 4;
add.s64 %rd12, %rd2, %rd11;
ld.global.f32 %f18, [%rd12];
$L__BB161_5:
// Lane 2 (index %r3 + 2) -> %f19.
add.s32 %r9, %r3, 2;
rem.u32 %r10, %r9, %r24;
setp.ge.u32 %p4, %r10, %r23;
mov.f32 %f19, %f22;
@%p4 bra $L__BB161_7;
div.u32 %r35, %r9, %r24;
mad.lo.s32 %r36, %r35, %r23, %r10;
mul.wide.u32 %rd13, %r36, 4;
add.s64 %rd14, %rd2, %rd13;
ld.global.f32 %f19, [%rd14];
$L__BB161_7:
// Lane 3 (index %r3 + 3) -> %f20. Here the quotient %r12 is computed first and
// the remainder %r11 is derived from it as index - quotient * %r24.
add.s32 %r37, %r3, 3;
div.u32 %r12, %r37, %r24;
mul.lo.s32 %r38, %r12, %r24;
sub.s32 %r11, %r37, %r38;
setp.ge.u32 %p5, %r11, %r23;
mov.f32 %f20, %f22;
@%p5 bra $L__BB161_9;
mad.lo.s32 %r39, %r12, %r23, %r11;
mul.wide.u32 %rd15, %r39, 4;
add.s64 %rd16, %rd2, %rd15;
ld.global.f32 %f20, [%rd16];
$L__BB161_9:
// One 16-byte store of the four lanes at out + 4 * %r3.
mul.wide.u32 %rd17, %r3, 4;
add.s64 %rd18, %rd1, %rd17;
st.global.v4.f32 [%rd18], {%f17, %f18, %f19, %f20};
$L__BB161_10:
// Head: threads with index below %r2 process element %r1 on its own.
setp.ge.u32 %p6, %r1, %r2;
@%p6 bra $L__BB161_14;
rem.u32 %r15, %r1, %r24;
setp.lt.u32 %p7, %r15, %r23;
mov.f32 %f21, %f22;
@%p7 bra $L__BB161_12;
bra.uni $L__BB161_13;
$L__BB161_12:
cvta.to.global.u64 %rd19, %rd3;
div.u32 %r40, %r1, %r24;
mad.lo.s32 %r41, %r40, %r23, %r15;
mul.wide.u32 %rd20, %r41, 4;
add.s64 %rd21, %rd19, %rd20;
ld.global.f32 %f21, [%rd21];
$L__BB161_13:
mul.wide.u32 %rd22, %r1, 4;
add.s64 %rd23, %rd1, %rd22;
st.global.f32 [%rd23], %f21;
$L__BB161_14:
// Tail: %r43 = (count - head) & 3 elements are left after the whole chunks.
// Thread t handles element (count - %r43) + t when that index is below count.
sub.s32 %r42, %r20, %r2;
and.b32 %r43, %r42, 3;
sub.s32 %r44, %r20, %r43;
add.s32 %r16, %r44, %r1;
setp.ge.u32 %p8, %r16, %r20;
@%p8 bra $L__BB161_18;
rem.u32 %r19, %r16, %r24;
setp.lt.u32 %p9, %r19, %r23;
@%p9 bra $L__BB161_16;
bra.uni $L__BB161_17;
$L__BB161_16:
// %f22 itself is overwritten here; it is not needed as the fill value afterwards.
cvta.to.global.u64 %rd24, %rd3;
div.u32 %r45, %r16, %r24;
mad.lo.s32 %r46, %r45, %r23, %r19;
mul.wide.u32 %rd25, %r46, 4;
add.s64 %rd26, %rd24, %rd25;
ld.global.f32 %f22, [%rd26];
$L__BB161_17:
mul.wide.u32 %rd27, %r16, 4;
add.s64 %rd28, %rd1, %rd27;
st.global.f32 [%rd28], %f22;
$L__BB161_18:
ret;
}
// raft::linalg::detail::map_kernel<2, true, float, unsigned int, lambda(unsigned int)>
// where the lambda is defined inside raft::neighbors::ivf_pq::detail::select_clusters<float>
// (template arguments read off the mangled entry name).
//
// Value written for an index i (same formula in every path below):
//     rem    = i % %r20
//     out[i] = (rem < %r19) ? src[(i / %r20) * %r19 + rem] : fill
//   param_0 : output address, written as f32 elements
//   param_1 : element count (u32, %r16)
//   param_2 : 24-byte closure object:
//             +0  u64 src address (%rd3, f32 elements)
//             +8  u32 %r19
//             +12 u32 %r20
//             +16 f32 fill value (%f16)
// There is no loop. Each thread does at most one 2-element vector chunk, one
// single element in front of the first 8-byte-aligned output slot, and one
// single element after the last whole 2-element chunk.
// The .b16 %rs registers are declared but never referenced in this body.
.visible .entry _ZN4raft6linalg6detail10map_kernelILi2ELb1EfjZNS_9neighbors6ivf_pq6detail15select_clustersIfEEvRKNS_16device_resourcesEPjPfjjjjjNS_8distance12DistanceTypeEPKT_PKfPN3rmm2mr22device_memory_resourceEEUljE_JEEEvPT1_T2_T3_DpPKT4_(
.param .u64 _ZN4raft6linalg6detail10map_kernelILi2ELb1EfjZNS_9neighbors6ivf_pq6detail15select_clustersIfEEvRKNS_16device_resourcesEPjPfjjjjjNS_8distance12DistanceTypeEPKT_PKfPN3rmm2mr22device_memory_resourceEEUljE_JEEEvPT1_T2_T3_DpPKT4__param_0,
.param .u32 _ZN4raft6linalg6detail10map_kernelILi2ELb1EfjZNS_9neighbors6ivf_pq6detail15select_clustersIfEEvRKNS_16device_resourcesEPjPfjjjjjNS_8distance12DistanceTypeEPKT_PKfPN3rmm2mr22device_memory_resourceEEUljE_JEEEvPT1_T2_T3_DpPKT4__param_1,
.param .align 8 .b8 _ZN4raft6linalg6detail10map_kernelILi2ELb1EfjZNS_9neighbors6ivf_pq6detail15select_clustersIfEEvRKNS_16device_resourcesEPjPfjjjjjNS_8distance12DistanceTypeEPKT_PKfPN3rmm2mr22device_memory_resourceEEUljE_JEEEvPT1_T2_T3_DpPKT4__param_2[24]
)
{
.reg .pred %p<8>;
.reg .b16 %rs<9>;
.reg .f32 %f<17>;
.reg .b32 %r<39>;
.reg .b64 %rd<25>;
ld.param.u64 %rd4, [_ZN4raft6linalg6detail10map_kernelILi2ELb1EfjZNS_9neighbors6ivf_pq6detail15select_clustersIfEEvRKNS_16device_resourcesEPjPfjjjjjNS_8distance12DistanceTypeEPKT_PKfPN3rmm2mr22device_memory_resourceEEUljE_JEEEvPT1_T2_T3_DpPKT4__param_0];
ld.param.u32 %r16, [_ZN4raft6linalg6detail10map_kernelILi2ELb1EfjZNS_9neighbors6ivf_pq6detail15select_clustersIfEEvRKNS_16device_resourcesEPjPfjjjjjNS_8distance12DistanceTypeEPKT_PKfPN3rmm2mr22device_memory_resourceEEUljE_JEEEvPT1_T2_T3_DpPKT4__param_1];
ld.param.v2.u32 {%r19, %r20}, [_ZN4raft6linalg6detail10map_kernelILi2ELb1EfjZNS_9neighbors6ivf_pq6detail15select_clustersIfEEvRKNS_16device_resourcesEPjPfjjjjjNS_8distance12DistanceTypeEPKT_PKfPN3rmm2mr22device_memory_resourceEEUljE_JEEEvPT1_T2_T3_DpPKT4__param_2+8];
ld.param.f32 %f16, [_ZN4raft6linalg6detail10map_kernelILi2ELb1EfjZNS_9neighbors6ivf_pq6detail15select_clustersIfEEvRKNS_16device_resourcesEPjPfjjjjjNS_8distance12DistanceTypeEPKT_PKfPN3rmm2mr22device_memory_resourceEEUljE_JEEEvPT1_T2_T3_DpPKT4__param_2+16];
ld.param.u64 %rd3, [_ZN4raft6linalg6detail10map_kernelILi2ELb1EfjZNS_9neighbors6ivf_pq6detail15select_clustersIfEEvRKNS_16device_resourcesEPjPfjjjjjNS_8distance12DistanceTypeEPKT_PKfPN3rmm2mr22device_memory_resourceEEUljE_JEEEvPT1_T2_T3_DpPKT4__param_2];
// %rd1 = output as a global-space address; %r1 = ctaid.x * ntid.x + tid.x.
cvta.to.global.u64 %rd1, %rd4;
mov.u32 %r21, %ntid.x;
mov.u32 %r22, %ctaid.x;
mov.u32 %r23, %tid.x;
mad.lo.s32 %r1, %r22, %r21, %r23;
// Round the output address up to a multiple of 8. The mask 17179869176
// (0x3FFFFFFF8) also drops every bit above bit 33, which is enough because the
// difference is shifted right by 2 and then truncated to 32 bits.
// %r2 = min(4-byte slots up to the boundary, count) is the length of the head.
add.s64 %rd5, %rd4, 7;
and.b64 %rd6, %rd5, 17179869176;
sub.s64 %rd7, %rd6, %rd4;
shr.u64 %rd8, %rd7, 2;
cvt.u32.u64 %r24, %rd8;
min.u32 %r2, %r24, %r16;
// %r3 = %r2 + 2 * thread index: first element of this thread's vector chunk.
// The vector path is skipped when %r3 + 2 exceeds the count (32-bit arithmetic).
shl.b32 %r25, %r1, 1;
add.s32 %r3, %r2, %r25;
add.s32 %r26, %r3, 2;
setp.gt.u32 %p1, %r26, %r16;
@%p1 bra $L__BB162_6;
cvta.to.global.u64 %rd2, %rd3;
// Lane 0 (index %r3): %f13 starts as the fill value and is replaced by a load
// from src when the remainder is below %r19.
rem.u32 %r6, %r3, %r20;
setp.ge.u32 %p2, %r6, %r19;
mov.f32 %f13, %f16;
@%p2 bra $L__BB162_3;
div.u32 %r27, %r3, %r20;
mad.lo.s32 %r28, %r27, %r19, %r6;
mul.wide.u32 %rd9, %r28, 4;
add.s64 %rd10, %rd2, %rd9;
ld.global.f32 %f13, [%rd10];
$L__BB162_3:
// Lane 1 (index %r3 + 1) -> %f14. Here the quotient %r8 is computed first and
// the remainder %r7 is derived from it as index - quotient * %r20.
add.s32 %r29, %r3, 1;
div.u32 %r8, %r29, %r20;
mul.lo.s32 %r30, %r8, %r20;
sub.s32 %r7, %r29, %r30;
setp.ge.u32 %p3, %r7, %r19;
mov.f32 %f14, %f16;
@%p3 bra $L__BB162_5;
mad.lo.s32 %r31, %r8, %r19, %r7;
mul.wide.u32 %rd11, %r31, 4;
add.s64 %rd12, %rd2, %rd11;
ld.global.f32 %f14, [%rd12];
$L__BB162_5:
// One 8-byte store of the two lanes at out + 4 * %r3.
mul.wide.u32 %rd13, %r3, 4;
add.s64 %rd14, %rd1, %rd13;
st.global.v2.f32 [%rd14], {%f13, %f14};
$L__BB162_6:
// Head: threads with index below %r2 process element %r1 on its own.
setp.ge.u32 %p4, %r1, %r2;
@%p4 bra $L__BB162_10;
rem.u32 %r11, %r1, %r20;
setp.lt.u32 %p5, %r11, %r19;
mov.f32 %f15, %f16;
@%p5 bra $L__BB162_8;
bra.uni $L__BB162_9;
$L__BB162_8:
cvta.to.global.u64 %rd15, %rd3;
div.u32 %r32, %r1, %r20;
mad.lo.s32 %r33, %r32, %r19, %r11;
mul.wide.u32 %rd16, %r33, 4;
add.s64 %rd17, %rd15, %rd16;
ld.global.f32 %f15, [%rd17];
$L__BB162_9:
mul.wide.u32 %rd18, %r1, 4;
add.s64 %rd19, %rd1, %rd18;
st.global.f32 [%rd19], %f15;
$L__BB162_10:
// Tail: %r35 = (count - head) & 1 elements are left after the whole chunks.
// Thread t handles element (count - %r35) + t when that index is below count.
sub.s32 %r34, %r16, %r2;
and.b32 %r35, %r34, 1;
sub.s32 %r36, %r16, %r35;
add.s32 %r12, %r36, %r1;
setp.ge.u32 %p6, %r12, %r16;
@%p6 bra $L__BB162_14;
rem.u32 %r15, %r12, %r20;
setp.lt.u32 %p7, %r15, %r19;
@%p7 bra $L__BB162_12;
bra.uni $L__BB162_13;
$L__BB162_12:
// %f16 itself is overwritten here; it is not needed as the fill value afterwards.
cvta.to.global.u64 %rd20, %rd3;
div.u32 %r37, %r12, %r20;
mad.lo.s32 %r38, %r37, %r19, %r15;
mul.wide.u32 %rd21, %r38, 4;
add.s64 %rd22, %rd20, %rd21;
ld.global.f32 %f16, [%rd22];
$L__BB162_13:
mul.wide.u32 %rd23, %r12, 4;
add.s64 %rd24, %rd1, %rd23;
st.global.f32 [%rd24], %f16;
$L__BB162_14:
ret;
}
// raft::linalg::detail::map_kernel<1, true, float, unsigned long, lambda(unsigned int)>
// where the lambda is defined inside raft::neighbors::ivf_pq::detail::select_clusters<float>
// (template arguments read off the mangled entry name).
//
// One element per thread, index i = ctaid.x * ntid.x + tid.x (32-bit), i < count:
//     rem    = i % mod
//     out[i] = (rem < lim) ? src[(i / mod) * lim + rem] : fill
//   param_0 : output address, written as f32 elements
//   param_1 : element count (u64)
//   param_2 : 24-byte closure object:
//             +0  u64 src address (f32 elements)
//             +8  u32 lim
//             +12 u32 mod
//             +16 f32 fill
.visible .entry _ZN4raft6linalg6detail10map_kernelILi1ELb1EfmZNS_9neighbors6ivf_pq6detail15select_clustersIfEEvRKNS_16device_resourcesEPjPfjjjjjNS_8distance12DistanceTypeEPKT_PKfPN3rmm2mr22device_memory_resourceEEUljE_JEEEvPT1_T2_T3_DpPKT4_(
.param .u64 _ZN4raft6linalg6detail10map_kernelILi1ELb1EfmZNS_9neighbors6ivf_pq6detail15select_clustersIfEEvRKNS_16device_resourcesEPjPfjjjjjNS_8distance12DistanceTypeEPKT_PKfPN3rmm2mr22device_memory_resourceEEUljE_JEEEvPT1_T2_T3_DpPKT4__param_0,
.param .u64 _ZN4raft6linalg6detail10map_kernelILi1ELb1EfmZNS_9neighbors6ivf_pq6detail15select_clustersIfEEvRKNS_16device_resourcesEPjPfjjjjjNS_8distance12DistanceTypeEPKT_PKfPN3rmm2mr22device_memory_resourceEEUljE_JEEEvPT1_T2_T3_DpPKT4__param_1,
.param .align 8 .b8 _ZN4raft6linalg6detail10map_kernelILi1ELb1EfmZNS_9neighbors6ivf_pq6detail15select_clustersIfEEvRKNS_16device_resourcesEPjPfjjjjjNS_8distance12DistanceTypeEPKT_PKfPN3rmm2mr22device_memory_resourceEEUljE_JEEEvPT1_T2_T3_DpPKT4__param_2[24]
)
{
.reg .pred %p_live, %p_fill;
.reg .f32 %f_val;
.reg .b32 %r_lim, %r_mod;
.reg .b32 %r_ntid, %r_cta, %r_tid, %r_gid;
.reg .b32 %r_rem, %r_quot, %r_src_idx;
.reg .b64 %rd_out, %rd_count, %rd_src, %rd_gid;
.reg .b64 %rd_src_g, %rd_src_off, %rd_src_addr;
.reg .b64 %rd_out_g, %rd_out_off, %rd_out_addr;
// Kernel arguments; %f_val starts out as the fill value.
ld.param.u64 %rd_out, [_ZN4raft6linalg6detail10map_kernelILi1ELb1EfmZNS_9neighbors6ivf_pq6detail15select_clustersIfEEvRKNS_16device_resourcesEPjPfjjjjjNS_8distance12DistanceTypeEPKT_PKfPN3rmm2mr22device_memory_resourceEEUljE_JEEEvPT1_T2_T3_DpPKT4__param_0];
ld.param.u64 %rd_count, [_ZN4raft6linalg6detail10map_kernelILi1ELb1EfmZNS_9neighbors6ivf_pq6detail15select_clustersIfEEvRKNS_16device_resourcesEPjPfjjjjjNS_8distance12DistanceTypeEPKT_PKfPN3rmm2mr22device_memory_resourceEEUljE_JEEEvPT1_T2_T3_DpPKT4__param_1];
ld.param.u64 %rd_src, [_ZN4raft6linalg6detail10map_kernelILi1ELb1EfmZNS_9neighbors6ivf_pq6detail15select_clustersIfEEvRKNS_16device_resourcesEPjPfjjjjjNS_8distance12DistanceTypeEPKT_PKfPN3rmm2mr22device_memory_resourceEEUljE_JEEEvPT1_T2_T3_DpPKT4__param_2];
ld.param.u32 %r_lim, [_ZN4raft6linalg6detail10map_kernelILi1ELb1EfmZNS_9neighbors6ivf_pq6detail15select_clustersIfEEvRKNS_16device_resourcesEPjPfjjjjjNS_8distance12DistanceTypeEPKT_PKfPN3rmm2mr22device_memory_resourceEEUljE_JEEEvPT1_T2_T3_DpPKT4__param_2+8];
ld.param.u32 %r_mod, [_ZN4raft6linalg6detail10map_kernelILi1ELb1EfmZNS_9neighbors6ivf_pq6detail15select_clustersIfEEvRKNS_16device_resourcesEPjPfjjjjjNS_8distance12DistanceTypeEPKT_PKfPN3rmm2mr22device_memory_resourceEEUljE_JEEEvPT1_T2_T3_DpPKT4__param_2+12];
ld.param.f32 %f_val, [_ZN4raft6linalg6detail10map_kernelILi1ELb1EfmZNS_9neighbors6ivf_pq6detail15select_clustersIfEEvRKNS_16device_resourcesEPjPfjjjjjNS_8distance12DistanceTypeEPKT_PKfPN3rmm2mr22device_memory_resourceEEUljE_JEEEvPT1_T2_T3_DpPKT4__param_2+16];
// Flat thread index in 32-bit arithmetic, zero-extended for the 64-bit bounds check.
mov.u32 %r_tid, %tid.x;
mov.u32 %r_cta, %ctaid.x;
mov.u32 %r_ntid, %ntid.x;
mad.lo.s32 %r_gid, %r_cta, %r_ntid, %r_tid;
cvt.u64.u32 %rd_gid, %r_gid;
setp.lt.u64 %p_live, %rd_gid, %rd_count;
@!%p_live bra $L__BB163_4;
// Keep the fill value when the remainder is not below lim. The 32-bit index is
// used directly; it equals the low half of the zero-extended 64-bit index.
rem.u32 %r_rem, %r_gid, %r_mod;
setp.ge.u32 %p_fill, %r_rem, %r_lim;
@%p_fill bra $L__BB163_3;
// Otherwise replace it with src[(i / mod) * lim + rem].
cvta.to.global.u64 %rd_src_g, %rd_src;
div.u32 %r_quot, %r_gid, %r_mod;
mad.lo.s32 %r_src_idx, %r_quot, %r_lim, %r_rem;
mul.wide.u32 %rd_src_off, %r_src_idx, 4;
add.s64 %rd_src_addr, %rd_src_g, %rd_src_off;
ld.global.f32 %f_val, [%rd_src_addr];
$L__BB163_3:
// Store the selected value to out[i].
cvta.to.global.u64 %rd_out_g, %rd_out;
mul.wide.u32 %rd_out_off, %r_gid, 4;
add.s64 %rd_out_addr, %rd_out_g, %rd_out_off;
st.global.f32 [%rd_out_addr], %f_val;
$L__BB163_4:
ret;
}
// Kernel entry. Per the mangled name this is an instantiation of
// raft::linalg::detail::map_kernel<4, true, float, unsigned long, Lambda>, where Lambda is a
// lambda taking an unsigned int that is defined inside
// raft::neighbors::ivf_pq::detail::select_clusters<float>.
//
// Parameters, as the body uses them:
//   param_0  u64  generic address of the f32 output array.
//   param_1  u64  element count; every comparison against it is unsigned.
//   param_2  24-byte by-value struct, read as
//              +0   u64  generic address of an f32 source array
//              +8   u32  called "width"  in these comments (%r19)
//              +12  u32  called "stride" in these comments (%r20)
//              +16  f32  fill value (%f22)
//
// Value produced for a 32-bit element index i (the same sequence is repeated inline per element):
//   c     = i % stride
//   value = (c < width) ? src[(i / stride) * width + c] : fill
// i is the low 32 bits of the 64-bit element index and the source index is formed with 32-bit
// wrapping arithmetic, while the output address is formed from the full 64-bit index.
// NOTE(review): stride == 0 is not guarded in this body; presumably the caller guarantees a
// non-zero stride - confirm.
//
// Each thread executes up to three straight-line phases (there is no loop):
//   1. vector phase: elements head + 4*gtid .. head + 4*gtid + 3, written with one v4 store,
//      only when all four indices are below the element count;
//   2. head phase:   element gtid, only when gtid < head;
//   3. tail phase:   element count - ((count - head) & 3) + gtid, only when that index < count.
// Here gtid = ctaid.x * ntid.x + tid.x and head = min(number of f32 elements in front of the
// first 16-byte-aligned output address, count).
.visible .entry _ZN4raft6linalg6detail10map_kernelILi4ELb1EfmZNS_9neighbors6ivf_pq6detail15select_clustersIfEEvRKNS_16device_resourcesEPjPfjjjjjNS_8distance12DistanceTypeEPKT_PKfPN3rmm2mr22device_memory_resourceEEUljE_JEEEvPT1_T2_T3_DpPKT4_(
.param .u64 _ZN4raft6linalg6detail10map_kernelILi4ELb1EfmZNS_9neighbors6ivf_pq6detail15select_clustersIfEEvRKNS_16device_resourcesEPjPfjjjjjNS_8distance12DistanceTypeEPKT_PKfPN3rmm2mr22device_memory_resourceEEUljE_JEEEvPT1_T2_T3_DpPKT4__param_0,
.param .u64 _ZN4raft6linalg6detail10map_kernelILi4ELb1EfmZNS_9neighbors6ivf_pq6detail15select_clustersIfEEvRKNS_16device_resourcesEPjPfjjjjjNS_8distance12DistanceTypeEPKT_PKfPN3rmm2mr22device_memory_resourceEEUljE_JEEEvPT1_T2_T3_DpPKT4__param_1,
.param .align 8 .b8 _ZN4raft6linalg6detail10map_kernelILi4ELb1EfmZNS_9neighbors6ivf_pq6detail15select_clustersIfEEvRKNS_16device_resourcesEPjPfjjjjjNS_8distance12DistanceTypeEPKT_PKfPN3rmm2mr22device_memory_resourceEEUljE_JEEEvPT1_T2_T3_DpPKT4__param_2[24]
)
{
// Virtual register pools. No %rs register is referenced anywhere in this body.
.reg .pred %p<10>;
.reg .b16 %rs<9>;
.reg .f32 %f<23>;
.reg .b32 %r<41>;
.reg .b64 %rd<39>;
// %rd9 = output (generic), %rd7 = element count.
ld.param.u64 %rd9, [_ZN4raft6linalg6detail10map_kernelILi4ELb1EfmZNS_9neighbors6ivf_pq6detail15select_clustersIfEEvRKNS_16device_resourcesEPjPfjjjjjNS_8distance12DistanceTypeEPKT_PKfPN3rmm2mr22device_memory_resourceEEUljE_JEEEvPT1_T2_T3_DpPKT4__param_0];
ld.param.u64 %rd7, [_ZN4raft6linalg6detail10map_kernelILi4ELb1EfmZNS_9neighbors6ivf_pq6detail15select_clustersIfEEvRKNS_16device_resourcesEPjPfjjjjjNS_8distance12DistanceTypeEPKT_PKfPN3rmm2mr22device_memory_resourceEEUljE_JEEEvPT1_T2_T3_DpPKT4__param_1];
// %r19 = width (struct +8), %r20 = stride (struct +12), fetched with one 8-byte vector load.
ld.param.v2.u32 {%r19, %r20}, [_ZN4raft6linalg6detail10map_kernelILi4ELb1EfmZNS_9neighbors6ivf_pq6detail15select_clustersIfEEvRKNS_16device_resourcesEPjPfjjjjjNS_8distance12DistanceTypeEPKT_PKfPN3rmm2mr22device_memory_resourceEEUljE_JEEEvPT1_T2_T3_DpPKT4__param_2+8];
// %f22 = fill value (struct +16). The tail phase later reuses %f22 as its result register.
ld.param.f32 %f22, [_ZN4raft6linalg6detail10map_kernelILi4ELb1EfmZNS_9neighbors6ivf_pq6detail15select_clustersIfEEvRKNS_16device_resourcesEPjPfjjjjjNS_8distance12DistanceTypeEPKT_PKfPN3rmm2mr22device_memory_resourceEEUljE_JEEEvPT1_T2_T3_DpPKT4__param_2+16];
// %rd8 = source array (generic address, struct +0).
ld.param.u64 %rd8, [_ZN4raft6linalg6detail10map_kernelILi4ELb1EfmZNS_9neighbors6ivf_pq6detail15select_clustersIfEEvRKNS_16device_resourcesEPjPfjjjjjNS_8distance12DistanceTypeEPKT_PKfPN3rmm2mr22device_memory_resourceEEUljE_JEEEvPT1_T2_T3_DpPKT4__param_2];
// %rd1 = output base as a global-space address (used by all three stores).
cvta.to.global.u64 %rd1, %rd9;
// %r24 = gtid = ctaid.x * ntid.x + tid.x, computed in 32 bits; %rd2 = gtid zero-extended.
mov.u32 %r21, %ntid.x;
mov.u32 %r22, %ctaid.x;
mov.u32 %r23, %tid.x;
mad.lo.s32 %r24, %r22, %r21, %r23;
cvt.u64.u32 %rd2, %r24;
// %rd13 = (round_up(out, 16) - out) / 4: f32 elements in front of the first 16-byte boundary.
add.s64 %rd10, %rd9, 15;
and.b64 %rd11, %rd10, -16;
sub.s64 %rd12, %rd11, %rd9;
shr.s64 %rd13, %rd12, 2;
// %rd3 = head = min(%rd13, count).
min.u64 %rd3, %rd13, %rd7;
// %rd4 = head + 4 * gtid: first element index of this thread's 4-wide group.
mul.wide.u32 %rd14, %r24, 4;
add.s64 %rd4, %rd3, %rd14;
// Skip the vector phase unless %rd4 + 4 <= count.
add.s64 %rd15, %rd4, 4;
setp.gt.u64 %p1, %rd15, %rd7;
@%p1 bra $L__BB164_10;
// ---- Phase 1: vector phase --------------------------------------------------------------
// %rd5 = source base in global space; %r2 = low 32 bits of the group's first element index.
cvta.to.global.u64 %rd5, %rd8;
cvt.u32.u64 %r2, %rd4;
// Element 0 (index %r2): %r4 = index % stride; %f17 starts as fill and is replaced by
// src[(index / stride) * width + %r4] when %r4 < width.
rem.u32 %r4, %r2, %r20;
setp.ge.u32 %p2, %r4, %r19;
mov.f32 %f17, %f22;
@%p2 bra $L__BB164_3;
div.u32 %r25, %r2, %r20;
mad.lo.s32 %r26, %r25, %r19, %r4;
mul.wide.u32 %rd16, %r26, 4;
add.s64 %rd17, %rd5, %rd16;
ld.global.f32 %f17, [%rd17];
$L__BB164_3:
// Element 1 (index %r2 + 1): same computation, result in %f18.
add.s32 %r5, %r2, 1;
rem.u32 %r6, %r5, %r20;
setp.ge.u32 %p3, %r6, %r19;
mov.f32 %f18, %f22;
@%p3 bra $L__BB164_5;
div.u32 %r27, %r5, %r20;
mad.lo.s32 %r28, %r27, %r19, %r6;
mul.wide.u32 %rd18, %r28, 4;
add.s64 %rd19, %rd5, %rd18;
ld.global.f32 %f18, [%rd19];
$L__BB164_5:
// Element 2 (index %r2 + 2): same computation, result in %f19.
add.s32 %r7, %r2, 2;
rem.u32 %r8, %r7, %r20;
setp.ge.u32 %p4, %r8, %r19;
mov.f32 %f19, %f22;
@%p4 bra $L__BB164_7;
div.u32 %r29, %r7, %r20;
mad.lo.s32 %r30, %r29, %r19, %r8;
mul.wide.u32 %rd20, %r30, 4;
add.s64 %rd21, %rd5, %rd20;
ld.global.f32 %f19, [%rd21];
$L__BB164_7:
// Element 3 (index %r2 + 3): the quotient %r10 is computed first and the remainder %r9 is
// derived from it as index - quotient * stride; result in %f20.
add.s32 %r31, %r2, 3;
div.u32 %r10, %r31, %r20;
mul.lo.s32 %r32, %r10, %r20;
sub.s32 %r9, %r31, %r32;
setp.ge.u32 %p5, %r9, %r19;
mov.f32 %f20, %f22;
@%p5 bra $L__BB164_9;
mad.lo.s32 %r33, %r10, %r19, %r9;
mul.wide.u32 %rd22, %r33, 4;
add.s64 %rd23, %rd5, %rd22;
ld.global.f32 %f20, [%rd23];
$L__BB164_9:
// One 16-byte vector store of the four results at out + %rd4 * 4.
shl.b64 %rd24, %rd4, 2;
add.s64 %rd25, %rd1, %rd24;
st.global.v4.f32 [%rd25], {%f17, %f18, %f19, %f20};
$L__BB164_10:
// ---- Phase 2: head phase ----------------------------------------------------------------
// Only threads with gtid < head write element gtid.
setp.le.u64 %p6, %rd3, %rd2;
@%p6 bra $L__BB164_14;
cvt.u32.u64 %r34, %rd2;
rem.u32 %r12, %r34, %r20;
setp.lt.u32 %p7, %r12, %r19;
// %f21 defaults to the fill value; the load below replaces it when remainder < width.
mov.f32 %f21, %f22;
@%p7 bra $L__BB164_12;
bra.uni $L__BB164_13;
$L__BB164_12:
cvta.to.global.u64 %rd26, %rd8;
div.u32 %r36, %r34, %r20;
mad.lo.s32 %r37, %r36, %r19, %r12;
mul.wide.u32 %rd27, %r37, 4;
add.s64 %rd28, %rd26, %rd27;
ld.global.f32 %f21, [%rd28];
$L__BB164_13:
// Scalar store at out + gtid * 4.
shl.b64 %rd29, %rd2, 2;
add.s64 %rd30, %rd1, %rd29;
st.global.f32 [%rd30], %f21;
$L__BB164_14:
// ---- Phase 3: tail phase ----------------------------------------------------------------
// %rd32 = (count - head) & 3: elements left over after whole groups of four.
// %rd6  = count - %rd32 + gtid: this thread's tail element index.
sub.s64 %rd31, %rd7, %rd3;
and.b64 %rd32, %rd31, 3;
sub.s64 %rd33, %rd7, %rd32;
add.s64 %rd6, %rd33, %rd2;
// Threads whose tail index is not below count are done.
setp.ge.u64 %p8, %rd6, %rd7;
@%p8 bra $L__BB164_18;
// %r16 = index / stride, %r15 = index - %r16 * stride (the remainder).
cvt.u32.u64 %r38, %rd6;
div.u32 %r16, %r38, %r20;
mul.lo.s32 %r39, %r16, %r20;
sub.s32 %r15, %r38, %r39;
setp.lt.u32 %p9, %r15, %r19;
@%p9 bra $L__BB164_16;
bra.uni $L__BB164_17;
$L__BB164_16:
// remainder < width: overwrite %f22 (until now the fill value) with the loaded source element.
cvta.to.global.u64 %rd34, %rd8;
mad.lo.s32 %r40, %r16, %r19, %r15;
mul.wide.u32 %rd35, %r40, 4;
add.s64 %rd36, %rd34, %rd35;
ld.global.f32 %f22, [%rd36];
$L__BB164_17:
// Scalar store at out + %rd6 * 4; %f22 holds either the fill value or the loaded element.
shl.b64 %rd37, %rd6, 2;
add.s64 %rd38, %rd1, %rd37;
st.global.f32 [%rd38], %f22;
$L__BB164_18:
ret;
}
// Kernel entry. Per the mangled name this is an instantiation of
// raft::linalg::detail::map_kernel<2, true, float, unsigned long, Lambda>, where Lambda is a
// lambda taking an unsigned int that is defined inside
// raft::neighbors::ivf_pq::detail::select_clusters<float>.
//
// Parameters, as the body uses them:
//   param_0  u64  generic address of the f32 output array.
//   param_1  u64  element count; every comparison against it is unsigned.
//   param_2  24-byte by-value struct, read as
//              +0   u64  generic address of an f32 source array
//              +8   u32  called "width"  in these comments (%r15)
//              +12  u32  called "stride" in these comments (%r16)
//              +16  f32  fill value (%f16)
//
// Value produced for a 32-bit element index i (the same sequence is repeated inline per element):
//   c     = i % stride
//   value = (c < width) ? src[(i / stride) * width + c] : fill
// i is the low 32 bits of the 64-bit element index and the source index is formed with 32-bit
// wrapping arithmetic, while the output address is formed from the full 64-bit index.
// NOTE(review): stride == 0 is not guarded in this body; presumably the caller guarantees a
// non-zero stride - confirm.
//
// Each thread executes up to three straight-line phases (there is no loop):
//   1. vector phase: elements head + 2*gtid and head + 2*gtid + 1, written with one v2 store,
//      only when both indices are below the element count;
//   2. head phase:   element gtid, only when gtid < head;
//   3. tail phase:   element count - ((count - head) & 1) + gtid, only when that index < count.
// Here gtid = ctaid.x * ntid.x + tid.x and head = min(number of f32 elements in front of the
// first 8-byte-aligned output address, count).
.visible .entry _ZN4raft6linalg6detail10map_kernelILi2ELb1EfmZNS_9neighbors6ivf_pq6detail15select_clustersIfEEvRKNS_16device_resourcesEPjPfjjjjjNS_8distance12DistanceTypeEPKT_PKfPN3rmm2mr22device_memory_resourceEEUljE_JEEEvPT1_T2_T3_DpPKT4_(
.param .u64 _ZN4raft6linalg6detail10map_kernelILi2ELb1EfmZNS_9neighbors6ivf_pq6detail15select_clustersIfEEvRKNS_16device_resourcesEPjPfjjjjjNS_8distance12DistanceTypeEPKT_PKfPN3rmm2mr22device_memory_resourceEEUljE_JEEEvPT1_T2_T3_DpPKT4__param_0,
.param .u64 _ZN4raft6linalg6detail10map_kernelILi2ELb1EfmZNS_9neighbors6ivf_pq6detail15select_clustersIfEEvRKNS_16device_resourcesEPjPfjjjjjNS_8distance12DistanceTypeEPKT_PKfPN3rmm2mr22device_memory_resourceEEUljE_JEEEvPT1_T2_T3_DpPKT4__param_1,
.param .align 8 .b8 _ZN4raft6linalg6detail10map_kernelILi2ELb1EfmZNS_9neighbors6ivf_pq6detail15select_clustersIfEEvRKNS_16device_resourcesEPjPfjjjjjNS_8distance12DistanceTypeEPKT_PKfPN3rmm2mr22device_memory_resourceEEUljE_JEEEvPT1_T2_T3_DpPKT4__param_2[24]
)
{
// Virtual register pools. No %rs register is referenced anywhere in this body.
.reg .pred %p<8>;
.reg .b16 %rs<9>;
.reg .f32 %f<17>;
.reg .b32 %r<33>;
.reg .b64 %rd<35>;
// %rd9 = output (generic), %rd7 = element count.
ld.param.u64 %rd9, [_ZN4raft6linalg6detail10map_kernelILi2ELb1EfmZNS_9neighbors6ivf_pq6detail15select_clustersIfEEvRKNS_16device_resourcesEPjPfjjjjjNS_8distance12DistanceTypeEPKT_PKfPN3rmm2mr22device_memory_resourceEEUljE_JEEEvPT1_T2_T3_DpPKT4__param_0];
ld.param.u64 %rd7, [_ZN4raft6linalg6detail10map_kernelILi2ELb1EfmZNS_9neighbors6ivf_pq6detail15select_clustersIfEEvRKNS_16device_resourcesEPjPfjjjjjNS_8distance12DistanceTypeEPKT_PKfPN3rmm2mr22device_memory_resourceEEUljE_JEEEvPT1_T2_T3_DpPKT4__param_1];
// %r15 = width (struct +8), %r16 = stride (struct +12), fetched with one 8-byte vector load.
ld.param.v2.u32 {%r15, %r16}, [_ZN4raft6linalg6detail10map_kernelILi2ELb1EfmZNS_9neighbors6ivf_pq6detail15select_clustersIfEEvRKNS_16device_resourcesEPjPfjjjjjNS_8distance12DistanceTypeEPKT_PKfPN3rmm2mr22device_memory_resourceEEUljE_JEEEvPT1_T2_T3_DpPKT4__param_2+8];
// %f16 = fill value (struct +16). The tail phase later reuses %f16 as its result register.
ld.param.f32 %f16, [_ZN4raft6linalg6detail10map_kernelILi2ELb1EfmZNS_9neighbors6ivf_pq6detail15select_clustersIfEEvRKNS_16device_resourcesEPjPfjjjjjNS_8distance12DistanceTypeEPKT_PKfPN3rmm2mr22device_memory_resourceEEUljE_JEEEvPT1_T2_T3_DpPKT4__param_2+16];
// %rd8 = source array (generic address, struct +0).
ld.param.u64 %rd8, [_ZN4raft6linalg6detail10map_kernelILi2ELb1EfmZNS_9neighbors6ivf_pq6detail15select_clustersIfEEvRKNS_16device_resourcesEPjPfjjjjjNS_8distance12DistanceTypeEPKT_PKfPN3rmm2mr22device_memory_resourceEEUljE_JEEEvPT1_T2_T3_DpPKT4__param_2];
// %rd1 = output base as a global-space address (used by all three stores).
cvta.to.global.u64 %rd1, %rd9;
// %r20 = gtid = ctaid.x * ntid.x + tid.x, computed in 32 bits; %rd2 = gtid zero-extended.
mov.u32 %r17, %ntid.x;
mov.u32 %r18, %ctaid.x;
mov.u32 %r19, %tid.x;
mad.lo.s32 %r20, %r18, %r17, %r19;
cvt.u64.u32 %rd2, %r20;
// %rd13 = (round_up(out, 8) - out) / 4: f32 elements in front of the first 8-byte boundary.
add.s64 %rd10, %rd9, 7;
and.b64 %rd11, %rd10, -8;
sub.s64 %rd12, %rd11, %rd9;
shr.s64 %rd13, %rd12, 2;
// %rd3 = head = min(%rd13, count).
min.u64 %rd3, %rd13, %rd7;
// %rd4 = head + 2 * gtid: first element index of this thread's 2-wide group.
mul.wide.u32 %rd14, %r20, 2;
add.s64 %rd4, %rd3, %rd14;
// Skip the vector phase unless %rd4 + 2 <= count.
add.s64 %rd15, %rd4, 2;
setp.gt.u64 %p1, %rd15, %rd7;
@%p1 bra $L__BB165_6;
// ---- Phase 1: vector phase --------------------------------------------------------------
// %rd5 = source base in global space; %r2 = low 32 bits of the group's first element index.
cvta.to.global.u64 %rd5, %rd8;
cvt.u32.u64 %r2, %rd4;
// Element 0 (index %r2): %r4 = index % stride; %f13 starts as fill and is replaced by
// src[(index / stride) * width + %r4] when %r4 < width.
rem.u32 %r4, %r2, %r16;
setp.ge.u32 %p2, %r4, %r15;
mov.f32 %f13, %f16;
@%p2 bra $L__BB165_3;
div.u32 %r21, %r2, %r16;
mad.lo.s32 %r22, %r21, %r15, %r4;
mul.wide.u32 %rd16, %r22, 4;
add.s64 %rd17, %rd5, %rd16;
ld.global.f32 %f13, [%rd17];
$L__BB165_3:
// Element 1 (index %r2 + 1): the quotient %r6 is computed first and the remainder %r5 is
// derived from it as index - quotient * stride; result in %f14.
add.s32 %r23, %r2, 1;
div.u32 %r6, %r23, %r16;
mul.lo.s32 %r24, %r6, %r16;
sub.s32 %r5, %r23, %r24;
setp.ge.u32 %p3, %r5, %r15;
mov.f32 %f14, %f16;
@%p3 bra $L__BB165_5;
mad.lo.s32 %r25, %r6, %r15, %r5;
mul.wide.u32 %rd18, %r25, 4;
add.s64 %rd19, %rd5, %rd18;
ld.global.f32 %f14, [%rd19];
$L__BB165_5:
// One 8-byte vector store of the two results at out + %rd4 * 4.
shl.b64 %rd20, %rd4, 2;
add.s64 %rd21, %rd1, %rd20;
st.global.v2.f32 [%rd21], {%f13, %f14};
$L__BB165_6:
// ---- Phase 2: head phase ----------------------------------------------------------------
// Only threads with gtid < head write element gtid.
setp.le.u64 %p4, %rd3, %rd2;
@%p4 bra $L__BB165_10;
cvt.u32.u64 %r26, %rd2;
rem.u32 %r8, %r26, %r16;
setp.lt.u32 %p5, %r8, %r15;
// %f15 defaults to the fill value; the load below replaces it when remainder < width.
mov.f32 %f15, %f16;
@%p5 bra $L__BB165_8;
bra.uni $L__BB165_9;
$L__BB165_8:
cvta.to.global.u64 %rd22, %rd8;
div.u32 %r28, %r26, %r16;
mad.lo.s32 %r29, %r28, %r15, %r8;
mul.wide.u32 %rd23, %r29, 4;
add.s64 %rd24, %rd22, %rd23;
ld.global.f32 %f15, [%rd24];
$L__BB165_9:
// Scalar store at out + gtid * 4.
shl.b64 %rd25, %rd2, 2;
add.s64 %rd26, %rd1, %rd25;
st.global.f32 [%rd26], %f15;
$L__BB165_10:
// ---- Phase 3: tail phase ----------------------------------------------------------------
// %rd28 = (count - head) & 1: elements left over after whole groups of two.
// %rd6  = count - %rd28 + gtid: this thread's tail element index.
sub.s64 %rd27, %rd7, %rd3;
and.b64 %rd28, %rd27, 1;
sub.s64 %rd29, %rd7, %rd28;
add.s64 %rd6, %rd29, %rd2;
// Threads whose tail index is not below count are done.
setp.ge.u64 %p6, %rd6, %rd7;
@%p6 bra $L__BB165_14;
// %r12 = index / stride, %r11 = index - %r12 * stride (the remainder).
cvt.u32.u64 %r30, %rd6;
div.u32 %r12, %r30, %r16;
mul.lo.s32 %r31, %r12, %r16;
sub.s32 %r11, %r30, %r31;
setp.lt.u32 %p7, %r11, %r15;
@%p7 bra $L__BB165_12;
bra.uni $L__BB165_13;
$L__BB165_12:
// remainder < width: overwrite %f16 (until now the fill value) with the loaded source element.
cvta.to.global.u64 %rd30, %rd8;
mad.lo.s32 %r32, %r12, %r15, %r11;
mul.wide.u32 %rd31, %r32, 4;
add.s64 %rd32, %rd30, %rd31;
ld.global.f32 %f16, [%rd32];
$L__BB165_13:
// Scalar store at out + %rd6 * 4; %f16 holds either the fill value or the loaded element.
shl.b64 %rd33, %rd6, 2;
add.s64 %rd34, %rd1, %rd33;
st.global.f32 [%rd34], %f16;
$L__BB165_14:
ret;
}
// Kernel entry. Per the mangled name this is an instantiation of
// raft::distance::detail::initKernel<float, raft::KeyValuePair<long, float>, long,
//                                    raft::distance::detail::MinAndDistanceReduceOpImpl<long, float>>.
//
// Parameters, as the body uses them:
//   param_0  u64  generic address of the output array; elements are addressed 16 bytes apart.
//   param_1  u64  element count, compared as a signed 64-bit value.
//   param_2  f32  value written at byte offset 8 of each element.
//   param_3  1-byte by-value object; never read by this body.
//
// Every thread whose global index gid = ctaid.x * ntid.x + tid.x is below the count writes a
// 64-bit zero at element offset 0 and param_2 at element offset 8. Other threads return at once.
.visible .entry _ZN4raft8distance6detail10initKernelIfNS_12KeyValuePairIlfEElNS1_26MinAndDistanceReduceOpImplIlfEEEEvPT0_T1_T_T2_(
.param .u64 _ZN4raft8distance6detail10initKernelIfNS_12KeyValuePairIlfEElNS1_26MinAndDistanceReduceOpImplIlfEEEEvPT0_T1_T_T2__param_0,
.param .u64 _ZN4raft8distance6detail10initKernelIfNS_12KeyValuePairIlfEElNS1_26MinAndDistanceReduceOpImplIlfEEEEvPT0_T1_T_T2__param_1,
.param .f32 _ZN4raft8distance6detail10initKernelIfNS_12KeyValuePairIlfEElNS1_26MinAndDistanceReduceOpImplIlfEEEEvPT0_T1_T_T2__param_2,
.param .align 1 .b8 _ZN4raft8distance6detail10initKernelIfNS_12KeyValuePairIlfEElNS1_26MinAndDistanceReduceOpImplIlfEEEEvPT0_T1_T_T2__param_3[1]
)
{
.reg .pred %past_end;
.reg .f32 %init_val;
.reg .b32 %bx, %bdim, %tx;
.reg .b64 %out_generic, %out_global, %count, %tx64, %gid, %slot, %zero_key;

// gid = ctaid.x * ntid.x + tid.x; the 32x32-bit product is widened to 64 bits before the add.
mov.u32 %tx, %tid.x;
cvt.u64.u32 %tx64, %tx;
mov.u32 %bdim, %ntid.x;
mov.u32 %bx, %ctaid.x;
mad.wide.u32 %gid, %bx, %bdim, %tx64;

// Guard: threads at or past the (signed) element count have nothing to write.
ld.param.u64 %count, [_ZN4raft8distance6detail10initKernelIfNS_12KeyValuePairIlfEElNS1_26MinAndDistanceReduceOpImplIlfEEEEvPT0_T1_T_T2__param_1];
setp.ge.s64 %past_end, %gid, %count;
@%past_end ret;

// slot = global(out) + gid * 16
ld.param.u64 %out_generic, [_ZN4raft8distance6detail10initKernelIfNS_12KeyValuePairIlfEElNS1_26MinAndDistanceReduceOpImplIlfEEEEvPT0_T1_T_T2__param_0];
cvta.to.global.u64 %out_global, %out_generic;
mad.lo.s64 %slot, %gid, 16, %out_global;

// Write the 64-bit zero at +0, then the f32 initial value at +8.
mov.u64 %zero_key, 0;
ld.param.f32 %init_val, [_ZN4raft8distance6detail10initKernelIfNS_12KeyValuePairIlfEElNS1_26MinAndDistanceReduceOpImplIlfEEEEvPT0_T1_T_T2__param_2];
st.global.u64 [%slot], %zero_key;
st.global.f32 [%slot+8], %init_val;
ret;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment