Skip to content

Instantly share code, notes, and snippets.

@ericcano
Last active July 12, 2021 09:45
Show Gist options
  • Save ericcano/6a3265af706c7607e0448b609aa92cb9 to your computer and use it in GitHub Desktop.
Save ericcano/6a3265af706c7607e0448b609aa92cb9 to your computer and use it in GitHub Desktop.
PTX output (excerpt from compiling commit cc0114e4336d9bae130e9c2875b68ffd85a64823 of https://github.com/ericcano/soa)
// Kernel indirectCrossProductSoA (entry name shown demangled).
// Parameters:
//   param_0, param_1, param_2 : 80-byte, 8-byte-aligned by-value structs. Only the
//                               64-bit words at byte offsets 24, 32 and 40 of each
//                               are read; they are used as addresses of f64 arrays.
//   param_3                   : u64 upper bound on the per-thread element index.
// Notation used below: out0/out1/out2 = param_0 pointers at offsets 24/32/40,
// lhs0/lhs1/lhs2 = the same for param_1, rhs0/rhs1/rhs2 = the same for param_2.
// NOTE(review): presumably these are the x/y/z columns of testSoA::SoA -- confirm
// against the struct layout, which is not visible here.
// Per in-range thread i the kernel stores:
//   out0[i] = lhs1[i]*rhs2[i] - lhs2[i]*rhs1[i]
//   out1[i] = lhs2[i]*rhs0[i] - lhs0[i]*rhs2[i]
//   out2[i] = lhs0[i]*rhs1[i] - lhs1[i]*rhs0[i]
.entry <unnamed>::indirectCrossProductSoA(testSoA::SoA, testSoA::SoA, testSoA::SoA, unsigned long)(
.param .align 8 .b8 _ZN71_GLOBAL__N__47_tmpxft_00000638_00000000_7_test_v7_cuda_cpp1_ii_e64742fe23indirectCrossProductSoAEN7testSoA3SoAES1_S1_m_param_0[80],
.param .align 8 .b8 _ZN71_GLOBAL__N__47_tmpxft_00000638_00000000_7_test_v7_cuda_cpp1_ii_e64742fe23indirectCrossProductSoAEN7testSoA3SoAES1_S1_m_param_1[80],
.param .align 8 .b8 _ZN71_GLOBAL__N__47_tmpxft_00000638_00000000_7_test_v7_cuda_cpp1_ii_e64742fe23indirectCrossProductSoAEN7testSoA3SoAES1_S1_m_param_2[80],
.param .u64 _ZN71_GLOBAL__N__47_tmpxft_00000638_00000000_7_test_v7_cuda_cpp1_ii_e64742fe23indirectCrossProductSoAEN7testSoA3SoAES1_S1_m_param_3
)
{
// Virtual register declarations: predicates, 32-bit, f64 and 64-bit registers.
.reg .pred %p<2>;
.reg .b32 %r<5>;
.reg .f64 %fd<22>;
.reg .b64 %rd<52>;
$L__func_begin3:
// %rd32 = param_3 (element-count bound).
ld.param.u64 %rd32, [_ZN71_GLOBAL__N__47_tmpxft_00000638_00000000_7_test_v7_cuda_cpp1_ii_e64742fe23indirectCrossProductSoAEN7testSoA3SoAES1_S1_m_param_3];
// %rd27/%rd26/%rd25 = param_2 words at offsets 40/32/24 (rhs2, rhs1, rhs0).
ld.param.u64 %rd27, [_ZN71_GLOBAL__N__47_tmpxft_00000638_00000000_7_test_v7_cuda_cpp1_ii_e64742fe23indirectCrossProductSoAEN7testSoA3SoAES1_S1_m_param_2+40];
ld.param.u64 %rd26, [_ZN71_GLOBAL__N__47_tmpxft_00000638_00000000_7_test_v7_cuda_cpp1_ii_e64742fe23indirectCrossProductSoAEN7testSoA3SoAES1_S1_m_param_2+32];
ld.param.u64 %rd25, [_ZN71_GLOBAL__N__47_tmpxft_00000638_00000000_7_test_v7_cuda_cpp1_ii_e64742fe23indirectCrossProductSoAEN7testSoA3SoAES1_S1_m_param_2+24];
// %rd17/%rd16/%rd15 = param_1 words at offsets 40/32/24 (lhs2, lhs1, lhs0).
ld.param.u64 %rd17, [_ZN71_GLOBAL__N__47_tmpxft_00000638_00000000_7_test_v7_cuda_cpp1_ii_e64742fe23indirectCrossProductSoAEN7testSoA3SoAES1_S1_m_param_1+40];
ld.param.u64 %rd16, [_ZN71_GLOBAL__N__47_tmpxft_00000638_00000000_7_test_v7_cuda_cpp1_ii_e64742fe23indirectCrossProductSoAEN7testSoA3SoAES1_S1_m_param_1+32];
ld.param.u64 %rd15, [_ZN71_GLOBAL__N__47_tmpxft_00000638_00000000_7_test_v7_cuda_cpp1_ii_e64742fe23indirectCrossProductSoAEN7testSoA3SoAES1_S1_m_param_1+24];
// %rd7/%rd6/%rd5 = param_0 words at offsets 40/32/24 (out2, out1, out0).
ld.param.u64 %rd7, [_ZN71_GLOBAL__N__47_tmpxft_00000638_00000000_7_test_v7_cuda_cpp1_ii_e64742fe23indirectCrossProductSoAEN7testSoA3SoAES1_S1_m_param_0+40];
ld.param.u64 %rd6, [_ZN71_GLOBAL__N__47_tmpxft_00000638_00000000_7_test_v7_cuda_cpp1_ii_e64742fe23indirectCrossProductSoAEN7testSoA3SoAES1_S1_m_param_0+32];
ld.param.u64 %rd5, [_ZN71_GLOBAL__N__47_tmpxft_00000638_00000000_7_test_v7_cuda_cpp1_ii_e64742fe23indirectCrossProductSoAEN7testSoA3SoAES1_S1_m_param_0+24];
// Element index i = %ctaid.x * %ntid.x + %tid.x. The multiply-add keeps only the
// low 32 bits; the result is then zero-extended to 64 bits in %rd1.
mov.u32 %r1, %ctaid.x;
mov.u32 %r2, %ntid.x;
mov.u32 %r3, %tid.x;
mad.lo.s32 %r4, %r1, %r2, %r3;
cvt.u64.u32 %rd1, %r4;
// Bounds guard: threads with i >= param_3 (unsigned compare) jump to the return.
setp.ge.u64 %p1, %rd1, %rd32;
@%p1 bra $L__BB3_2;
// Convert the nine generic pointers to global-space addresses:
// %rd33..%rd35 = rhs2, rhs1, rhs0; %rd36..%rd38 = lhs2, lhs1, lhs0;
// %rd39..%rd41 = out2, out1, out0.
cvta.to.global.u64 %rd33, %rd27;
cvta.to.global.u64 %rd34, %rd26;
cvta.to.global.u64 %rd35, %rd25;
cvta.to.global.u64 %rd36, %rd17;
cvta.to.global.u64 %rd37, %rd16;
cvta.to.global.u64 %rd38, %rd15;
cvta.to.global.u64 %rd39, %rd7;
cvta.to.global.u64 %rd40, %rd6;
cvta.to.global.u64 %rd41, %rd5;
// %rd42 = i * 8, the byte offset of element i in an f64 array.
shl.b64 %rd42, %rd1, 3;
// Element addresses:
// %rd43 = &out0[i], %rd44 = &out1[i], %rd45 = &out2[i],
// %rd46 = &lhs0[i], %rd47 = &lhs1[i], %rd48 = &lhs2[i],
// %rd49 = &rhs0[i], %rd50 = &rhs1[i], %rd51 = &rhs2[i].
add.s64 %rd43, %rd41, %rd42;
add.s64 %rd44, %rd40, %rd42;
add.s64 %rd45, %rd39, %rd42;
add.s64 %rd46, %rd38, %rd42;
add.s64 %rd47, %rd37, %rd42;
add.s64 %rd48, %rd36, %rd42;
add.s64 %rd49, %rd35, %rd42;
add.s64 %rd50, %rd34, %rd42;
add.s64 %rd51, %rd33, %rd42;
// out0[i] = lhs1[i]*rhs2[i] - lhs2[i]*rhs1[i]
ld.global.f64 %fd1, [%rd51];
ld.global.f64 %fd2, [%rd47];
mul.f64 %fd3, %fd2, %fd1;
ld.global.f64 %fd4, [%rd50];
ld.global.f64 %fd5, [%rd48];
mul.f64 %fd6, %fd5, %fd4;
sub.f64 %fd7, %fd3, %fd6;
st.global.f64 [%rd43], %fd7;
// out1[i] = lhs2[i]*rhs0[i] - lhs0[i]*rhs2[i]
// lhs2[i] ([%rd48]) and rhs2[i] ([%rd51]) are loaded again here instead of
// reusing %fd5 / %fd1 from before the store.
// NOTE(review): presumably the compiler could not rule out the preceding store
// aliasing the inputs -- confirm against the kernel source.
ld.global.f64 %fd8, [%rd49];
ld.global.f64 %fd9, [%rd48];
mul.f64 %fd10, %fd9, %fd8;
ld.global.f64 %fd11, [%rd51];
ld.global.f64 %fd12, [%rd46];
mul.f64 %fd13, %fd12, %fd11;
sub.f64 %fd14, %fd10, %fd13;
st.global.f64 [%rd44], %fd14;
// out2[i] = lhs0[i]*rhs1[i] - lhs1[i]*rhs0[i]
// All four operands are again reloaded from memory after the previous store.
ld.global.f64 %fd15, [%rd50];
ld.global.f64 %fd16, [%rd46];
mul.f64 %fd17, %fd16, %fd15;
ld.global.f64 %fd18, [%rd49];
ld.global.f64 %fd19, [%rd47];
mul.f64 %fd20, %fd19, %fd18;
sub.f64 %fd21, %fd17, %fd20;
st.global.f64 [%rd45], %fd21;
// Common exit, also the target of the bounds guard.
$L__BB3_2:
ret;
$L__func_end3:
}
// Kernel eigenCrossProductSoA (entry name shown demangled).
// Parameters:
//   param_0 : address of the f64 output array (out).
//   param_1 : address of the first f64 input array (a).
//   param_2 : address of the second f64 input array (b).
//   param_3 : u64 upper bound on the per-thread element index.
//   param_4 : u64 stride s, in f64 elements, between the three components of one
//             element: component k of element i is addressed as base[i + k*s].
// Per in-range thread i the kernel stores:
//   out[i]       = a[i + s]  *b[i + 2*s] - a[i + 2*s]*b[i + s]
//   out[i + s]   = a[i + 2*s]*b[i]       - a[i]      *b[i + 2*s]
//   out[i + 2*s] = a[i]      *b[i + s]   - a[i + s]  *b[i]
// Each of the six inputs is loaded exactly once, and all three stores come after
// the last load.
.entry <unnamed>::eigenCrossProductSoA(double *, const double *, const double *, unsigned long, unsigned long)(
.param .u64 _ZN71_GLOBAL__N__47_tmpxft_00000638_00000000_7_test_v7_cuda_cpp1_ii_e64742fe20eigenCrossProductSoAEPdPKdS2_mm_param_0,
.param .u64 _ZN71_GLOBAL__N__47_tmpxft_00000638_00000000_7_test_v7_cuda_cpp1_ii_e64742fe20eigenCrossProductSoAEPdPKdS2_mm_param_1,
.param .u64 _ZN71_GLOBAL__N__47_tmpxft_00000638_00000000_7_test_v7_cuda_cpp1_ii_e64742fe20eigenCrossProductSoAEPdPKdS2_mm_param_2,
.param .u64 _ZN71_GLOBAL__N__47_tmpxft_00000638_00000000_7_test_v7_cuda_cpp1_ii_e64742fe20eigenCrossProductSoAEPdPKdS2_mm_param_3,
.param .u64 _ZN71_GLOBAL__N__47_tmpxft_00000638_00000000_7_test_v7_cuda_cpp1_ii_e64742fe20eigenCrossProductSoAEPdPKdS2_mm_param_4
)
{
// Virtual register declarations: predicates, 32-bit, f64 and 64-bit registers.
.reg .pred %p<2>;
.reg .b32 %r<5>;
.reg .f64 %fd<16>;
.reg .b64 %rd<24>;
$L__func_begin6:
// %rd2 = out, %rd3 = a, %rd4 = b, %rd6 = index bound (param_3), %rd5 = stride s (param_4).
ld.param.u64 %rd2, [_ZN71_GLOBAL__N__47_tmpxft_00000638_00000000_7_test_v7_cuda_cpp1_ii_e64742fe20eigenCrossProductSoAEPdPKdS2_mm_param_0];
ld.param.u64 %rd3, [_ZN71_GLOBAL__N__47_tmpxft_00000638_00000000_7_test_v7_cuda_cpp1_ii_e64742fe20eigenCrossProductSoAEPdPKdS2_mm_param_1];
ld.param.u64 %rd4, [_ZN71_GLOBAL__N__47_tmpxft_00000638_00000000_7_test_v7_cuda_cpp1_ii_e64742fe20eigenCrossProductSoAEPdPKdS2_mm_param_2];
ld.param.u64 %rd6, [_ZN71_GLOBAL__N__47_tmpxft_00000638_00000000_7_test_v7_cuda_cpp1_ii_e64742fe20eigenCrossProductSoAEPdPKdS2_mm_param_3];
ld.param.u64 %rd5, [_ZN71_GLOBAL__N__47_tmpxft_00000638_00000000_7_test_v7_cuda_cpp1_ii_e64742fe20eigenCrossProductSoAEPdPKdS2_mm_param_4];
// Element index i = %ctaid.x * %ntid.x + %tid.x. The multiply-add keeps only the
// low 32 bits; the result is then zero-extended to 64 bits in %rd1.
mov.u32 %r1, %ntid.x;
mov.u32 %r2, %ctaid.x;
mov.u32 %r3, %tid.x;
mad.lo.s32 %r4, %r2, %r1, %r3;
cvt.u64.u32 %rd1, %r4;
// Bounds guard: threads with i >= param_3 (unsigned compare) jump to the return.
setp.ge.u64 %p1, %rd1, %rd6;
@%p1 bra $L__BB6_2;
// %rd8 = i*8 (byte offset of element i); %rd9 = &a[i]; %rd11 = &out[i].
cvta.to.global.u64 %rd7, %rd3;
shl.b64 %rd8, %rd1, 3;
add.s64 %rd9, %rd7, %rd8;
cvta.to.global.u64 %rd10, %rd2;
add.s64 %rd11, %rd10, %rd8;
// %rd12 = s*8 (stride in bytes); %rd13 = &a[i + s].
shl.b64 %rd12, %rd5, 3;
add.s64 %rd13, %rd9, %rd12;
// %rd15 = i + 2*s; %rd18 = &b[i + 2*s].
add.s64 %rd14, %rd1, %rd5;
add.s64 %rd15, %rd14, %rd5;
cvta.to.global.u64 %rd16, %rd4;
shl.b64 %rd17, %rd15, 3;
add.s64 %rd18, %rd16, %rd17;
// %fd1 = b[i + 2*s], %fd2 = a[i + s]; %fd3 = a[i + s] * b[i + 2*s].
ld.global.f64 %fd1, [%rd18];
ld.global.f64 %fd2, [%rd13];
mul.f64 %fd3, %fd2, %fd1;
// %rd19 = &a[i + 2*s]; %rd20 = &b[i + s] (one stride back from %rd18).
add.s64 %rd19, %rd13, %rd12;
sub.s64 %rd20, %rd18, %rd12;
// %fd4 = b[i + s], %fd5 = a[i + 2*s];
// %fd7 = a[i + s]*b[i + 2*s] - a[i + 2*s]*b[i + s]  (value for out[i]).
ld.global.f64 %fd4, [%rd20];
ld.global.f64 %fd5, [%rd19];
mul.f64 %fd6, %fd5, %fd4;
sub.f64 %fd7, %fd3, %fd6;
// %rd21 = &b[i]; %fd8 = b[i]; %fd10 = a[i];
// %fd12 = a[i + 2*s]*b[i] - a[i]*b[i + 2*s]  (value for out[i + s]).
sub.s64 %rd21, %rd20, %rd12;
ld.global.f64 %fd8, [%rd21];
mul.f64 %fd9, %fd5, %fd8;
ld.global.f64 %fd10, [%rd9];
mul.f64 %fd11, %fd10, %fd1;
sub.f64 %fd12, %fd9, %fd11;
// %fd15 = a[i]*b[i + s] - a[i + s]*b[i]  (value for out[i + 2*s]).
mul.f64 %fd13, %fd10, %fd4;
mul.f64 %fd14, %fd2, %fd8;
sub.f64 %fd15, %fd13, %fd14;
// Store the three results at out[i], out[i + s] and out[i + 2*s].
st.global.f64 [%rd11], %fd7;
add.s64 %rd22, %rd11, %rd12;
st.global.f64 [%rd22], %fd12;
add.s64 %rd23, %rd22, %rd12;
st.global.f64 [%rd23], %fd15;
// Common exit, also the target of the bounds guard.
$L__BB6_2:
ret;
$L__func_end6:
}
@ericcano
Copy link
Author

This output was produced from the Makefile-generated file .tmp/test_v7_cuda.cu.ptx, with extra filtering applied:

cat .tmp/test_v7_cuda.cu.ptx  | grep -v // | grep -v .loc | grep -v ^.global | grep -v ^$ | grep -v L__tmp | cu++filt | less

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment