Skip to content

Instantly share code, notes, and snippets.

@ericcano
Last active July 15, 2021 15:36
Show Gist options
  • Save ericcano/26cca8a25e7d6d04b75956db244a2690 to your computer and use it in GitHub Desktop.
Save ericcano/26cca8a25e7d6d04b75956db244a2690 to your computer and use it in GitHub Desktop.
// eigenCrossProductSoA: each thread handles one element index i and computes
// the 3-component cross product r = a x b on column-strided double arrays.
//
// Parameters (types taken from the demangled signature on the .entry line):
//   param_0  double *        destination base, written with st.global
//   param_1  const double *  first operand base, read with ld.global.nc
//   param_2  const double *  second operand base, read with ld.global.nc
//   param_3  unsigned long   bound: threads with i >= param_3 exit immediately
//   param_4  unsigned long   distance, in doubles, between consecutive components
//
// Component k (k = 0..2) of element i is addressed as base[i + k * param_4].
//
// Index computation: i = %ctaid.x * %ntid.x + %tid.x is formed with a widening
// multiply-add (mad.wide.u32) so the full 64-bit value is compared against the
// 64-bit bound. A 32-bit mad.lo keeps only the low 32 bits, which makes i wrap
// once the launch covers 2^32 or more threads.
.entry <unnamed>::eigenCrossProductSoA(double *, const double *, const double *, unsigned long, unsigned long)(
.param .u64 _ZN71_GLOBAL__N__47_tmpxft_00003474_00000000_7_test_v9_cuda_cpp1_ii_e64742fe20eigenCrossProductSoAEPdPKdS2_mm_param_0,
.param .u64 _ZN71_GLOBAL__N__47_tmpxft_00003474_00000000_7_test_v9_cuda_cpp1_ii_e64742fe20eigenCrossProductSoAEPdPKdS2_mm_param_1,
.param .u64 _ZN71_GLOBAL__N__47_tmpxft_00003474_00000000_7_test_v9_cuda_cpp1_ii_e64742fe20eigenCrossProductSoAEPdPKdS2_mm_param_2,
.param .u64 _ZN71_GLOBAL__N__47_tmpxft_00003474_00000000_7_test_v9_cuda_cpp1_ii_e64742fe20eigenCrossProductSoAEPdPKdS2_mm_param_3,
.param .u64 _ZN71_GLOBAL__N__47_tmpxft_00003474_00000000_7_test_v9_cuda_cpp1_ii_e64742fe20eigenCrossProductSoAEPdPKdS2_mm_param_4
)
{
    .reg .pred %p<2>;
    .reg .b32 %r<4>;
    .reg .f64 %fd<16>;
    .reg .b64 %rd<25>;
$L__func_begin6:
    ld.param.u64 %rd2, [_ZN71_GLOBAL__N__47_tmpxft_00003474_00000000_7_test_v9_cuda_cpp1_ii_e64742fe20eigenCrossProductSoAEPdPKdS2_mm_param_0];   // r base
    ld.param.u64 %rd3, [_ZN71_GLOBAL__N__47_tmpxft_00003474_00000000_7_test_v9_cuda_cpp1_ii_e64742fe20eigenCrossProductSoAEPdPKdS2_mm_param_1];   // a base
    ld.param.u64 %rd4, [_ZN71_GLOBAL__N__47_tmpxft_00003474_00000000_7_test_v9_cuda_cpp1_ii_e64742fe20eigenCrossProductSoAEPdPKdS2_mm_param_2];   // b base
    ld.param.u64 %rd6, [_ZN71_GLOBAL__N__47_tmpxft_00003474_00000000_7_test_v9_cuda_cpp1_ii_e64742fe20eigenCrossProductSoAEPdPKdS2_mm_param_3];   // bound
    ld.param.u64 %rd5, [_ZN71_GLOBAL__N__47_tmpxft_00003474_00000000_7_test_v9_cuda_cpp1_ii_e64742fe20eigenCrossProductSoAEPdPKdS2_mm_param_4];   // stride (doubles)

    // i = ctaid.x * ntid.x + tid.x, evaluated in 64 bits.
    mov.u32 %r1, %ntid.x;
    mov.u32 %r2, %ctaid.x;
    mov.u32 %r3, %tid.x;
    cvt.u64.u32 %rd24, %r3;
    mad.wide.u32 %rd1, %r2, %r1, %rd24;
    setp.ge.u64 %p1, %rd1, %rd6;
    @%p1 bra $L__BB6_2;                     // i >= bound: nothing to do

    // Address setup. Offsets are in bytes (8 per double).
    cvta.to.global.u64 %rd7, %rd3;
    shl.b64 %rd8, %rd1, 3;                  // i * 8
    add.s64 %rd9, %rd7, %rd8;               // &a0 = a + i
    cvta.to.global.u64 %rd10, %rd2;
    add.s64 %rd11, %rd10, %rd8;             // &r0 = r + i
    shl.b64 %rd12, %rd5, 3;                 // stride * 8
    add.s64 %rd13, %rd9, %rd12;             // &a1 = &a0 + stride
    add.s64 %rd14, %rd1, %rd5;
    add.s64 %rd15, %rd14, %rd5;             // i + 2 * stride
    cvta.to.global.u64 %rd16, %rd4;
    shl.b64 %rd17, %rd15, 3;
    add.s64 %rd18, %rd16, %rd17;            // &b2 = b + i + 2 * stride

    // r0 = a1 * b2 - a2 * b1
    ld.global.nc.f64 %fd1, [%rd18];         // b2
    ld.global.nc.f64 %fd2, [%rd13];         // a1
    mul.f64 %fd3, %fd2, %fd1;
    add.s64 %rd19, %rd13, %rd12;            // &a2
    sub.s64 %rd20, %rd18, %rd12;            // &b1
    ld.global.nc.f64 %fd4, [%rd20];         // b1
    ld.global.nc.f64 %fd5, [%rd19];         // a2
    mul.f64 %fd6, %fd5, %fd4;
    sub.f64 %fd7, %fd3, %fd6;

    // r1 = a2 * b0 - a0 * b2
    sub.s64 %rd21, %rd20, %rd12;            // &b0
    ld.global.nc.f64 %fd8, [%rd21];         // b0
    mul.f64 %fd9, %fd5, %fd8;
    ld.global.nc.f64 %fd10, [%rd9];         // a0
    mul.f64 %fd11, %fd10, %fd1;
    sub.f64 %fd12, %fd9, %fd11;

    // r2 = a0 * b1 - a1 * b0
    mul.f64 %fd13, %fd10, %fd4;
    mul.f64 %fd14, %fd2, %fd8;
    sub.f64 %fd15, %fd13, %fd14;

    // Store the three result components, one stride apart.
    st.global.f64 [%rd11], %fd7;
    add.s64 %rd22, %rd11, %rd12;
    st.global.f64 [%rd22], %fd12;
    add.s64 %rd23, %rd22, %rd12;
    st.global.f64 [%rd23], %fd15;
$L__BB6_2:
    ret;
$L__func_end6:
}
// embeddedCrossProductSoA: each thread handles one element index i and computes
// a 3-component cross product r = a x b, with all bases and strides read from
// a single 128-byte by-value struct parameter (testSoA::SoA per the demangled
// signature). The struct's field names are not visible in this listing; the
// roles below are derived from how each offset is used.
//
//   param_0+8    bound: threads with i >= this value exit immediately
//   param_0+48   base pointer of operand a      param_0+56  its stride (doubles)
//   param_0+64   base pointer of operand b      param_0+72  its stride (doubles)
//   param_0+80   base pointer of result r       param_0+88  its stride (doubles)
//
// Component k (k = 0..2) of element i is addressed as base[i + k * stride].
//
// Index computation: i = %ctaid.x * %ntid.x + %tid.x is formed with a widening
// multiply-add (mad.wide.u32) so the full 64-bit value is compared against the
// 64-bit bound. A 32-bit mad.lo keeps only the low 32 bits, which makes i wrap
// once the launch covers 2^32 or more threads.
.entry <unnamed>::embeddedCrossProductSoA(testSoA::SoA)(
.param .align 8 .b8 _ZN71_GLOBAL__N__47_tmpxft_00003474_00000000_7_test_v9_cuda_cpp1_ii_e64742fe23embeddedCrossProductSoAEN7testSoA3SoAE_param_0[128]
)
{
    .reg .pred %p<2>;
    .reg .b32 %r<4>;
    .reg .f64 %fd<16>;
    .reg .b64 %rd<37>;
$L__func_begin7:
    ld.param.u64 %rd13, [_ZN71_GLOBAL__N__47_tmpxft_00003474_00000000_7_test_v9_cuda_cpp1_ii_e64742fe23embeddedCrossProductSoAEN7testSoA3SoAE_param_0+88];   // r stride
    ld.param.u64 %rd12, [_ZN71_GLOBAL__N__47_tmpxft_00003474_00000000_7_test_v9_cuda_cpp1_ii_e64742fe23embeddedCrossProductSoAEN7testSoA3SoAE_param_0+80];   // r base
    ld.param.u64 %rd11, [_ZN71_GLOBAL__N__47_tmpxft_00003474_00000000_7_test_v9_cuda_cpp1_ii_e64742fe23embeddedCrossProductSoAEN7testSoA3SoAE_param_0+72];   // b stride
    ld.param.u64 %rd10, [_ZN71_GLOBAL__N__47_tmpxft_00003474_00000000_7_test_v9_cuda_cpp1_ii_e64742fe23embeddedCrossProductSoAEN7testSoA3SoAE_param_0+64];   // b base
    ld.param.u64 %rd9, [_ZN71_GLOBAL__N__47_tmpxft_00003474_00000000_7_test_v9_cuda_cpp1_ii_e64742fe23embeddedCrossProductSoAEN7testSoA3SoAE_param_0+56];    // a stride
    ld.param.u64 %rd8, [_ZN71_GLOBAL__N__47_tmpxft_00003474_00000000_7_test_v9_cuda_cpp1_ii_e64742fe23embeddedCrossProductSoAEN7testSoA3SoAE_param_0+48];    // a base
    ld.param.u64 %rd3, [_ZN71_GLOBAL__N__47_tmpxft_00003474_00000000_7_test_v9_cuda_cpp1_ii_e64742fe23embeddedCrossProductSoAEN7testSoA3SoAE_param_0+8];     // bound

    // i = ctaid.x * ntid.x + tid.x, evaluated in 64 bits.
    mov.u32 %r1, %ntid.x;
    mov.u32 %r2, %ctaid.x;
    mov.u32 %r3, %tid.x;
    cvt.u64.u32 %rd36, %r3;
    mad.wide.u32 %rd1, %r2, %r1, %rd36;
    setp.le.u64 %p1, %rd3, %rd1;
    @%p1 bra $L__BB7_2;                     // bound <= i: nothing to do

    // Address setup. Offsets are in bytes (8 per double).
    cvta.to.global.u64 %rd18, %rd12;
    cvta.to.global.u64 %rd19, %rd10;
    cvta.to.global.u64 %rd20, %rd8;
    shl.b64 %rd21, %rd1, 3;                 // i * 8
    add.s64 %rd22, %rd20, %rd21;            // &a0
    add.s64 %rd23, %rd19, %rd21;            // &b0
    add.s64 %rd24, %rd18, %rd21;            // &r0
    shl.b64 %rd25, %rd9, 3;                 // a stride * 8
    add.s64 %rd26, %rd22, %rd25;            // &a1
    shl.b64 %rd27, %rd11, 4;                // 2 * b stride * 8
    add.s64 %rd28, %rd23, %rd27;            // &b2

    // r0 = a1 * b2 - a2 * b1
    ld.global.f64 %fd1, [%rd28];            // b2
    ld.global.f64 %fd2, [%rd26];            // a1
    mul.f64 %fd3, %fd2, %fd1;
    add.s64 %rd29, %rd26, %rd25;            // &a2
    shl.b64 %rd30, %rd11, 3;                // b stride * 8
    sub.s64 %rd31, %rd28, %rd30;            // &b1
    ld.global.f64 %fd4, [%rd31];            // b1
    ld.global.f64 %fd5, [%rd29];            // a2
    mul.f64 %fd6, %fd5, %fd4;
    sub.f64 %fd7, %fd3, %fd6;

    // r1 = a2 * b0 - a0 * b2
    sub.s64 %rd32, %rd31, %rd30;            // &b0 (recomputed by stepping back from &b1)
    ld.global.f64 %fd8, [%rd32];            // b0
    mul.f64 %fd9, %fd5, %fd8;
    ld.global.f64 %fd10, [%rd22];           // a0
    mul.f64 %fd11, %fd10, %fd1;
    sub.f64 %fd12, %fd9, %fd11;

    // r2 = a0 * b1 - a1 * b0
    mul.f64 %fd13, %fd10, %fd4;
    mul.f64 %fd14, %fd2, %fd8;
    sub.f64 %fd15, %fd13, %fd14;

    // Store the three result components, one r stride apart.
    st.global.f64 [%rd24], %fd7;
    shl.b64 %rd33, %rd13, 3;                // r stride * 8
    add.s64 %rd34, %rd24, %rd33;
    st.global.f64 [%rd34], %fd12;
    add.s64 %rd35, %rd34, %rd33;
    st.global.f64 [%rd35], %fd15;
$L__BB7_2:
    ret;
$L__func_end7:
}
// embeddedCrossProductLocalObjectSoA: each thread handles one element index i
// and computes a 3-component cross product inside a single byte buffer whose
// column layout is derived in-kernel from the element count.
//
// Parameters (types taken from the demangled signature on the .entry line):
//   param_0  std::byte *     buffer base (all loads and stores are relative to it)
//   param_1  unsigned long   element count n; threads with i >= n exit immediately
//
// Layout used by this kernel:
//   pitch = (n * 8 + 127) & -128      bytes per column, rounded up to 128
//   column c starts at base + c * pitch
//   columns 3, 4, 5    operand a, components 0..2 (read)
//   columns 6, 7, 8    operand b, components 0..2 (read)
//   columns 9, 10, 11  result r, components 0..2 (written)
// Columns 0..2 are not accessed here.
//
// Index computation: i = %ctaid.x * %ntid.x + %tid.x is formed with a widening
// multiply-add (mad.wide.u32) so the full 64-bit value is compared against the
// 64-bit count. A 32-bit mad.lo keeps only the low 32 bits, which makes i wrap
// once the launch covers 2^32 or more threads.
.entry <unnamed>::embeddedCrossProductLocalObjectSoA(std::byte *, unsigned long)(
.param .u64 _ZN71_GLOBAL__N__47_tmpxft_00003688_00000000_7_test_v9_cuda_cpp1_ii_e64742fe34embeddedCrossProductLocalObjectSoAEPSt4bytem_param_0,
.param .u64 _ZN71_GLOBAL__N__47_tmpxft_00003688_00000000_7_test_v9_cuda_cpp1_ii_e64742fe34embeddedCrossProductLocalObjectSoAEPSt4bytem_param_1
)
{
    .reg .pred %p<2>;
    .reg .b32 %r<4>;
    .reg .f64 %fd<16>;
    .reg .b64 %rd<26>;
$L__func_begin8:
    ld.param.u64 %rd2, [_ZN71_GLOBAL__N__47_tmpxft_00003688_00000000_7_test_v9_cuda_cpp1_ii_e64742fe34embeddedCrossProductLocalObjectSoAEPSt4bytem_param_0];   // buffer base
    ld.param.u64 %rd3, [_ZN71_GLOBAL__N__47_tmpxft_00003688_00000000_7_test_v9_cuda_cpp1_ii_e64742fe34embeddedCrossProductLocalObjectSoAEPSt4bytem_param_1];   // n

    // i = ctaid.x * ntid.x + tid.x, evaluated in 64 bits.
    mov.u32 %r1, %ntid.x;
    mov.u32 %r2, %ctaid.x;
    mov.u32 %r3, %tid.x;
    cvt.u64.u32 %rd25, %r3;
    mad.wide.u32 %rd1, %r2, %r1, %rd25;
    setp.ge.u64 %p1, %rd1, %rd3;
    @%p1 bra $L__BB8_2;                     // i >= n: nothing to do

    // Column pitch and the per-thread addresses of a0 and b0.
    cvta.to.global.u64 %rd4, %rd2;
    shl.b64 %rd5, %rd3, 3;                  // n * 8
    add.s64 %rd6, %rd5, 127;
    and.b64 %rd7, %rd6, -128;               // pitch
    mul.lo.s64 %rd8, %rd7, 3;
    add.s64 %rd9, %rd4, %rd8;               // start of column 3
    mul.lo.s64 %rd10, %rd7, 6;
    add.s64 %rd11, %rd4, %rd10;             // start of column 6
    shl.b64 %rd12, %rd1, 3;                 // i * 8
    add.s64 %rd13, %rd9, %rd12;             // &a0 (column 3)
    add.s64 %rd14, %rd11, %rd12;            // &b0 (column 6)
    add.s64 %rd15, %rd13, %rd7;             // &a1 (column 4)
    shl.b64 %rd16, %rd6, 1;
    and.b64 %rd17, %rd16, -256;             // 2 * pitch (rounding applied after doubling)
    add.s64 %rd18, %rd14, %rd17;            // &b2 (column 8)

    // r0 = a1 * b2 - a2 * b1
    ld.global.f64 %fd1, [%rd18];            // b2
    ld.global.f64 %fd2, [%rd15];            // a1
    mul.f64 %fd3, %fd2, %fd1;
    add.s64 %rd19, %rd15, %rd7;             // &a2 (column 5)
    add.s64 %rd20, %rd14, %rd7;             // &b1 (column 7)
    ld.global.f64 %fd4, [%rd20];            // b1
    ld.global.f64 %fd5, [%rd19];            // a2
    mul.f64 %fd6, %fd5, %fd4;
    sub.f64 %fd7, %fd3, %fd6;

    // r1 = a2 * b0 - a0 * b2
    add.s64 %rd21, %rd19, %rd7;             // column 5 + pitch = column 6, i.e. &b0
    ld.global.f64 %fd8, [%rd21];            // b0
    mul.f64 %fd9, %fd5, %fd8;
    ld.global.f64 %fd10, [%rd13];           // a0
    mul.f64 %fd11, %fd10, %fd1;
    sub.f64 %fd12, %fd9, %fd11;

    // r2 = a0 * b1 - a1 * b0
    mul.f64 %fd13, %fd10, %fd4;
    mul.f64 %fd14, %fd2, %fd8;
    sub.f64 %fd15, %fd13, %fd14;

    // Store into columns 9, 10 and 11.
    add.s64 %rd22, %rd18, %rd7;             // column 8 + pitch = column 9
    st.global.f64 [%rd22], %fd7;
    add.s64 %rd23, %rd22, %rd7;
    st.global.f64 [%rd23], %fd12;
    add.s64 %rd24, %rd23, %rd7;
    st.global.f64 [%rd24], %fd15;
$L__BB8_2:
    ret;
$L__func_end8:
}
// indirectCrossProductSoA: each thread handles one element index i and computes
// a 3-component cross product r = a x b, where every component has its own
// base pointer stored inside a 128-byte by-value struct (testSoA::SoA per the
// demangled signature). The struct's field names are not visible in this
// listing; the roles below are derived from how each offset is used.
//
//   param_0  result r:   component pointers at +24, +32, +40 (written, st.global)
//   param_1  operand a:  component pointers at +24, +32, +40 (read, ld.global.nc)
//   param_2  operand b:  component pointers at +24, +32, +40 (read, ld.global.nc)
//   param_3  unsigned long bound: threads with i >= param_3 exit immediately
//
// Component k of element i is addressed as ptr_k[i]; no stride is involved.
//
// Index computation: i = %ctaid.x * %ntid.x + %tid.x is formed with a widening
// multiply-add (mad.wide.u32) so the full 64-bit value is compared against the
// 64-bit bound. A 32-bit mad.lo keeps only the low 32 bits, which makes i wrap
// once the launch covers 2^32 or more threads.
.entry <unnamed>::indirectCrossProductSoA(testSoA::SoA, testSoA::SoA, testSoA::SoA, unsigned long)(
.param .align 8 .b8 _ZN71_GLOBAL__N__47_tmpxft_00003474_00000000_7_test_v9_cuda_cpp1_ii_e64742fe23indirectCrossProductSoAEN7testSoA3SoAES1_S1_m_param_0[128],
.param .align 8 .b8 _ZN71_GLOBAL__N__47_tmpxft_00003474_00000000_7_test_v9_cuda_cpp1_ii_e64742fe23indirectCrossProductSoAEN7testSoA3SoAES1_S1_m_param_1[128],
.param .align 8 .b8 _ZN71_GLOBAL__N__47_tmpxft_00003474_00000000_7_test_v9_cuda_cpp1_ii_e64742fe23indirectCrossProductSoAEN7testSoA3SoAES1_S1_m_param_2[128],
.param .u64 _ZN71_GLOBAL__N__47_tmpxft_00003474_00000000_7_test_v9_cuda_cpp1_ii_e64742fe23indirectCrossProductSoAEN7testSoA3SoAES1_S1_m_param_3
)
{
    .reg .pred %p<2>;
    .reg .b32 %r<4>;
    .reg .f64 %fd<16>;
    .reg .b64 %rd<71>;
$L__func_begin3:
    ld.param.u64 %rd50, [_ZN71_GLOBAL__N__47_tmpxft_00003474_00000000_7_test_v9_cuda_cpp1_ii_e64742fe23indirectCrossProductSoAEN7testSoA3SoAES1_S1_m_param_3];      // bound
    ld.param.u64 %rd39, [_ZN71_GLOBAL__N__47_tmpxft_00003474_00000000_7_test_v9_cuda_cpp1_ii_e64742fe23indirectCrossProductSoAEN7testSoA3SoAES1_S1_m_param_2+40];   // b2 ptr
    ld.param.u64 %rd38, [_ZN71_GLOBAL__N__47_tmpxft_00003474_00000000_7_test_v9_cuda_cpp1_ii_e64742fe23indirectCrossProductSoAEN7testSoA3SoAES1_S1_m_param_2+32];   // b1 ptr
    ld.param.u64 %rd37, [_ZN71_GLOBAL__N__47_tmpxft_00003474_00000000_7_test_v9_cuda_cpp1_ii_e64742fe23indirectCrossProductSoAEN7testSoA3SoAES1_S1_m_param_2+24];   // b0 ptr
    ld.param.u64 %rd23, [_ZN71_GLOBAL__N__47_tmpxft_00003474_00000000_7_test_v9_cuda_cpp1_ii_e64742fe23indirectCrossProductSoAEN7testSoA3SoAES1_S1_m_param_1+40];   // a2 ptr
    ld.param.u64 %rd22, [_ZN71_GLOBAL__N__47_tmpxft_00003474_00000000_7_test_v9_cuda_cpp1_ii_e64742fe23indirectCrossProductSoAEN7testSoA3SoAES1_S1_m_param_1+32];   // a1 ptr
    ld.param.u64 %rd21, [_ZN71_GLOBAL__N__47_tmpxft_00003474_00000000_7_test_v9_cuda_cpp1_ii_e64742fe23indirectCrossProductSoAEN7testSoA3SoAES1_S1_m_param_1+24];   // a0 ptr
    ld.param.u64 %rd7, [_ZN71_GLOBAL__N__47_tmpxft_00003474_00000000_7_test_v9_cuda_cpp1_ii_e64742fe23indirectCrossProductSoAEN7testSoA3SoAES1_S1_m_param_0+40];    // r2 ptr
    ld.param.u64 %rd6, [_ZN71_GLOBAL__N__47_tmpxft_00003474_00000000_7_test_v9_cuda_cpp1_ii_e64742fe23indirectCrossProductSoAEN7testSoA3SoAES1_S1_m_param_0+32];    // r1 ptr
    ld.param.u64 %rd5, [_ZN71_GLOBAL__N__47_tmpxft_00003474_00000000_7_test_v9_cuda_cpp1_ii_e64742fe23indirectCrossProductSoAEN7testSoA3SoAES1_S1_m_param_0+24];    // r0 ptr

    // i = ctaid.x * ntid.x + tid.x, evaluated in 64 bits.
    mov.u32 %r1, %ctaid.x;
    mov.u32 %r2, %ntid.x;
    mov.u32 %r3, %tid.x;
    cvt.u64.u32 %rd70, %r3;
    mad.wide.u32 %rd1, %r1, %r2, %rd70;
    setp.ge.u64 %p1, %rd1, %rd50;
    @%p1 bra $L__BB3_2;                     // i >= bound: nothing to do

    // Convert all nine component pointers to the global address space.
    cvta.to.global.u64 %rd51, %rd39;
    cvta.to.global.u64 %rd52, %rd38;
    cvta.to.global.u64 %rd53, %rd37;
    cvta.to.global.u64 %rd54, %rd23;
    cvta.to.global.u64 %rd55, %rd22;
    cvta.to.global.u64 %rd56, %rd21;
    cvta.to.global.u64 %rd57, %rd7;
    cvta.to.global.u64 %rd58, %rd6;
    cvta.to.global.u64 %rd59, %rd5;

    // Per-thread element addresses: ptr + i * 8.
    shl.b64 %rd60, %rd1, 3;
    add.s64 %rd61, %rd59, %rd60;            // &r0[i]
    add.s64 %rd62, %rd58, %rd60;            // &r1[i]
    add.s64 %rd63, %rd57, %rd60;            // &r2[i]
    add.s64 %rd64, %rd56, %rd60;            // &a0[i]
    add.s64 %rd65, %rd55, %rd60;            // &a1[i]
    add.s64 %rd66, %rd54, %rd60;            // &a2[i]
    add.s64 %rd67, %rd53, %rd60;            // &b0[i]
    add.s64 %rd68, %rd52, %rd60;            // &b1[i]
    add.s64 %rd69, %rd51, %rd60;            // &b2[i]

    // r0 = a1 * b2 - a2 * b1
    ld.global.nc.f64 %fd1, [%rd69];         // b2
    ld.global.nc.f64 %fd2, [%rd65];         // a1
    mul.f64 %fd3, %fd2, %fd1;
    ld.global.nc.f64 %fd4, [%rd68];         // b1
    ld.global.nc.f64 %fd5, [%rd66];         // a2
    mul.f64 %fd6, %fd5, %fd4;
    sub.f64 %fd7, %fd3, %fd6;
    st.global.f64 [%rd61], %fd7;

    // r1 = a2 * b0 - a0 * b2
    ld.global.nc.f64 %fd8, [%rd67];         // b0
    mul.f64 %fd9, %fd5, %fd8;
    ld.global.nc.f64 %fd10, [%rd64];        // a0
    mul.f64 %fd11, %fd10, %fd1;
    sub.f64 %fd12, %fd9, %fd11;
    st.global.f64 [%rd62], %fd12;

    // r2 = a0 * b1 - a1 * b0
    mul.f64 %fd13, %fd10, %fd4;
    mul.f64 %fd14, %fd2, %fd8;
    sub.f64 %fd15, %fd13, %fd14;
    st.global.f64 [%rd63], %fd15;
$L__BB3_2:
    ret;
$L__func_end3:
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment