Skip to content

Instantly share code, notes, and snippets.

@ericcano
Created July 12, 2021 12:14
Show Gist options
  • Save ericcano/45b39038f8bcd83b87acc619af536db8 to your computer and use it in GitHub Desktop.
Save ericcano/45b39038f8bcd83b87acc619af536db8 to your computer and use it in GitHub Desktop.
PTX result (excerpt of the compilation output for commit ea7ec59cbe597bca22e89fa59d4d35fbfea39388 of https://github.com/ericcano/soa)
// Kernel eigenCrossProductSoA(double *, const double *, const double *, unsigned long, unsigned long).
//
// Each thread handles one element index i = ctaid.x * ntid.x + tid.x and computes a
// 3-component cross product r = a x b on doubles, where
//   r is addressed through param_0 (written), a through param_1 and b through param_2 (read).
// The three components of each operand are read/written at element indices
//   i, i + param_4 and i + 2 * param_4 of the same base pointer.
// param_3 is the upper bound on i: threads with i >= param_3 return without touching memory.
.entry <unnamed>::eigenCrossProductSoA(double *, const double *, const double *, unsigned long, unsigned long)(
.param .u64 _ZN71_GLOBAL__N__47_tmpxft_00001beb_00000000_7_test_v7_cuda_cpp1_ii_e64742fe20eigenCrossProductSoAEPdPKdS2_mm_param_0,
.param .u64 _ZN71_GLOBAL__N__47_tmpxft_00001beb_00000000_7_test_v7_cuda_cpp1_ii_e64742fe20eigenCrossProductSoAEPdPKdS2_mm_param_1,
.param .u64 _ZN71_GLOBAL__N__47_tmpxft_00001beb_00000000_7_test_v7_cuda_cpp1_ii_e64742fe20eigenCrossProductSoAEPdPKdS2_mm_param_2,
.param .u64 _ZN71_GLOBAL__N__47_tmpxft_00001beb_00000000_7_test_v7_cuda_cpp1_ii_e64742fe20eigenCrossProductSoAEPdPKdS2_mm_param_3,
.param .u64 _ZN71_GLOBAL__N__47_tmpxft_00001beb_00000000_7_test_v7_cuda_cpp1_ii_e64742fe20eigenCrossProductSoAEPdPKdS2_mm_param_4
)
{
// Virtual register declarations: predicates, 32-bit integers, doubles, 64-bit integers/addresses.
.reg .pred %p<2>;
.reg .b32 %r<5>;
.reg .f64 %fd<16>;
.reg .b64 %rd<24>;
$L__func_begin6:
// Load the five kernel arguments:
//   %rd2 = param_0 (result base), %rd3 = param_1 (a base), %rd4 = param_2 (b base),
//   %rd6 = param_3 (index bound),  %rd5 = param_4 (component stride, in elements).
ld.param.u64 %rd2, [_ZN71_GLOBAL__N__47_tmpxft_00001beb_00000000_7_test_v7_cuda_cpp1_ii_e64742fe20eigenCrossProductSoAEPdPKdS2_mm_param_0];
ld.param.u64 %rd3, [_ZN71_GLOBAL__N__47_tmpxft_00001beb_00000000_7_test_v7_cuda_cpp1_ii_e64742fe20eigenCrossProductSoAEPdPKdS2_mm_param_1];
ld.param.u64 %rd4, [_ZN71_GLOBAL__N__47_tmpxft_00001beb_00000000_7_test_v7_cuda_cpp1_ii_e64742fe20eigenCrossProductSoAEPdPKdS2_mm_param_2];
ld.param.u64 %rd6, [_ZN71_GLOBAL__N__47_tmpxft_00001beb_00000000_7_test_v7_cuda_cpp1_ii_e64742fe20eigenCrossProductSoAEPdPKdS2_mm_param_3];
ld.param.u64 %rd5, [_ZN71_GLOBAL__N__47_tmpxft_00001beb_00000000_7_test_v7_cuda_cpp1_ii_e64742fe20eigenCrossProductSoAEPdPKdS2_mm_param_4];
// i = ctaid.x * ntid.x + tid.x, computed in 32 bits (low half of the product),
// then zero-extended to 64 bits in %rd1.
mov.u32 %r1, %ntid.x;
mov.u32 %r2, %ctaid.x;
mov.u32 %r3, %tid.x;
mad.lo.s32 %r4, %r2, %r1, %r3;
cvt.u64.u32 %rd1, %r4;
// Bounds guard (unsigned compare): skip all loads/stores when i >= param_3.
setp.ge.u64 %p1, %rd1, %rd6;
@%p1 bra $L__BB6_2;
// Address setup. %rd8 = i * 8 (byte offset of a double), %rd12 = stride * 8 (bytes).
//   %rd9  = &a[i]            %rd11 = &r[i]
//   %rd13 = &a[i + stride]   %rd18 = &b[i + 2*stride]
cvta.to.global.u64 %rd7, %rd3;
shl.b64 %rd8, %rd1, 3;
add.s64 %rd9, %rd7, %rd8;
cvta.to.global.u64 %rd10, %rd2;
add.s64 %rd11, %rd10, %rd8;
shl.b64 %rd12, %rd5, 3;
add.s64 %rd13, %rd9, %rd12;
add.s64 %rd14, %rd1, %rd5;
add.s64 %rd15, %rd14, %rd5;
cvta.to.global.u64 %rd16, %rd4;
shl.b64 %rd17, %rd15, 3;
add.s64 %rd18, %rd16, %rd17;
// Component 0 of the result: a[i+s]*b[i+2s] - a[i+2s]*b[i+s].
// Inputs are read with ld.global.nc (non-coherent global loads).
ld.global.nc.f64 %fd1, [%rd18];
ld.global.nc.f64 %fd2, [%rd13];
mul.f64 %fd3, %fd2, %fd1;
// %rd19 = &a[i + 2*stride]; %rd20 = &b[i + stride] (stepping back one stride from %rd18).
add.s64 %rd19, %rd13, %rd12;
sub.s64 %rd20, %rd18, %rd12;
ld.global.nc.f64 %fd4, [%rd20];
ld.global.nc.f64 %fd5, [%rd19];
mul.f64 %fd6, %fd5, %fd4;
sub.f64 %fd7, %fd3, %fd6;
// Component 1 of the result: a[i+2s]*b[i] - a[i]*b[i+2s].
// %rd21 = &b[i] (one more stride back).
sub.s64 %rd21, %rd20, %rd12;
ld.global.nc.f64 %fd8, [%rd21];
mul.f64 %fd9, %fd5, %fd8;
ld.global.nc.f64 %fd10, [%rd9];
mul.f64 %fd11, %fd10, %fd1;
sub.f64 %fd12, %fd9, %fd11;
// Component 2 of the result: a[i]*b[i+s] - a[i+s]*b[i].
mul.f64 %fd13, %fd10, %fd4;
mul.f64 %fd14, %fd2, %fd8;
sub.f64 %fd15, %fd13, %fd14;
// All three stores are issued after every load: r[i], r[i+s], r[i+2s].
st.global.f64 [%rd11], %fd7;
add.s64 %rd22, %rd11, %rd12;
st.global.f64 [%rd22], %fd12;
add.s64 %rd23, %rd22, %rd12;
st.global.f64 [%rd23], %fd15;
// Common exit, also the target of the bounds guard.
$L__BB6_2:
ret;
$L__func_end6:
}
// Kernel indirectCrossProductSoA(testSoA::SoA, testSoA::SoA, testSoA::SoA, unsigned long).
//
// The three SoA arguments are passed by value as 80-byte, 8-byte-aligned parameter blobs.
// From each blob only the 64-bit values at byte offsets 24, 32 and 40 are read; they are
// used as base addresses of double arrays (presumably the three component column pointers
// of testSoA::SoA -- confirm against the struct definition, which is not shown here).
// Each thread handles element i = ctaid.x * ntid.x + tid.x and computes the cross product
// r = a x b with r taken from param_0 (written), a from param_1 and b from param_2 (read).
// param_3 is the upper bound on i: threads with i >= param_3 return without touching memory.
.entry <unnamed>::indirectCrossProductSoA(testSoA::SoA, testSoA::SoA, testSoA::SoA, unsigned long)(
.param .align 8 .b8 _ZN71_GLOBAL__N__47_tmpxft_00001beb_00000000_7_test_v7_cuda_cpp1_ii_e64742fe23indirectCrossProductSoAEN7testSoA3SoAES1_S1_m_param_0[80],
.param .align 8 .b8 _ZN71_GLOBAL__N__47_tmpxft_00001beb_00000000_7_test_v7_cuda_cpp1_ii_e64742fe23indirectCrossProductSoAEN7testSoA3SoAES1_S1_m_param_1[80],
.param .align 8 .b8 _ZN71_GLOBAL__N__47_tmpxft_00001beb_00000000_7_test_v7_cuda_cpp1_ii_e64742fe23indirectCrossProductSoAEN7testSoA3SoAES1_S1_m_param_2[80],
.param .u64 _ZN71_GLOBAL__N__47_tmpxft_00001beb_00000000_7_test_v7_cuda_cpp1_ii_e64742fe23indirectCrossProductSoAEN7testSoA3SoAES1_S1_m_param_3
)
{
// Virtual register declarations: predicates, 32-bit integers, doubles, 64-bit integers/addresses.
.reg .pred %p<2>;
.reg .b32 %r<5>;
.reg .f64 %fd<16>;
.reg .b64 %rd<52>;
$L__func_begin3:
// %rd32 = param_3 (index bound).
ld.param.u64 %rd32, [_ZN71_GLOBAL__N__47_tmpxft_00001beb_00000000_7_test_v7_cuda_cpp1_ii_e64742fe23indirectCrossProductSoAEN7testSoA3SoAES1_S1_m_param_3];
// b (param_2): pointers at offsets 40, 32, 24 -> %rd27, %rd26, %rd25.
ld.param.u64 %rd27, [_ZN71_GLOBAL__N__47_tmpxft_00001beb_00000000_7_test_v7_cuda_cpp1_ii_e64742fe23indirectCrossProductSoAEN7testSoA3SoAES1_S1_m_param_2+40];
ld.param.u64 %rd26, [_ZN71_GLOBAL__N__47_tmpxft_00001beb_00000000_7_test_v7_cuda_cpp1_ii_e64742fe23indirectCrossProductSoAEN7testSoA3SoAES1_S1_m_param_2+32];
ld.param.u64 %rd25, [_ZN71_GLOBAL__N__47_tmpxft_00001beb_00000000_7_test_v7_cuda_cpp1_ii_e64742fe23indirectCrossProductSoAEN7testSoA3SoAES1_S1_m_param_2+24];
// a (param_1): pointers at offsets 40, 32, 24 -> %rd17, %rd16, %rd15.
ld.param.u64 %rd17, [_ZN71_GLOBAL__N__47_tmpxft_00001beb_00000000_7_test_v7_cuda_cpp1_ii_e64742fe23indirectCrossProductSoAEN7testSoA3SoAES1_S1_m_param_1+40];
ld.param.u64 %rd16, [_ZN71_GLOBAL__N__47_tmpxft_00001beb_00000000_7_test_v7_cuda_cpp1_ii_e64742fe23indirectCrossProductSoAEN7testSoA3SoAES1_S1_m_param_1+32];
ld.param.u64 %rd15, [_ZN71_GLOBAL__N__47_tmpxft_00001beb_00000000_7_test_v7_cuda_cpp1_ii_e64742fe23indirectCrossProductSoAEN7testSoA3SoAES1_S1_m_param_1+24];
// r (param_0): pointers at offsets 40, 32, 24 -> %rd7, %rd6, %rd5.
ld.param.u64 %rd7, [_ZN71_GLOBAL__N__47_tmpxft_00001beb_00000000_7_test_v7_cuda_cpp1_ii_e64742fe23indirectCrossProductSoAEN7testSoA3SoAES1_S1_m_param_0+40];
ld.param.u64 %rd6, [_ZN71_GLOBAL__N__47_tmpxft_00001beb_00000000_7_test_v7_cuda_cpp1_ii_e64742fe23indirectCrossProductSoAEN7testSoA3SoAES1_S1_m_param_0+32];
ld.param.u64 %rd5, [_ZN71_GLOBAL__N__47_tmpxft_00001beb_00000000_7_test_v7_cuda_cpp1_ii_e64742fe23indirectCrossProductSoAEN7testSoA3SoAES1_S1_m_param_0+24];
// i = ctaid.x * ntid.x + tid.x, computed in 32 bits (low half of the product),
// then zero-extended to 64 bits in %rd1.
mov.u32 %r1, %ctaid.x;
mov.u32 %r2, %ntid.x;
mov.u32 %r3, %tid.x;
mad.lo.s32 %r4, %r1, %r2, %r3;
cvt.u64.u32 %rd1, %r4;
// Bounds guard (unsigned compare): skip all loads/stores when i >= param_3.
setp.ge.u64 %p1, %rd1, %rd32;
@%p1 bra $L__BB3_2;
// Convert the nine generic pointers to global-space addresses.
cvta.to.global.u64 %rd33, %rd27;
cvta.to.global.u64 %rd34, %rd26;
cvta.to.global.u64 %rd35, %rd25;
cvta.to.global.u64 %rd36, %rd17;
cvta.to.global.u64 %rd37, %rd16;
cvta.to.global.u64 %rd38, %rd15;
cvta.to.global.u64 %rd39, %rd7;
cvta.to.global.u64 %rd40, %rd6;
cvta.to.global.u64 %rd41, %rd5;
// %rd42 = i * 8 (byte offset of a double); add it to every base pointer.
//   r: %rd43 (off 24), %rd44 (off 32), %rd45 (off 40)
//   a: %rd46 (off 24), %rd47 (off 32), %rd48 (off 40)
//   b: %rd49 (off 24), %rd50 (off 32), %rd51 (off 40)
shl.b64 %rd42, %rd1, 3;
add.s64 %rd43, %rd41, %rd42;
add.s64 %rd44, %rd40, %rd42;
add.s64 %rd45, %rd39, %rd42;
add.s64 %rd46, %rd38, %rd42;
add.s64 %rd47, %rd37, %rd42;
add.s64 %rd48, %rd36, %rd42;
add.s64 %rd49, %rd35, %rd42;
add.s64 %rd50, %rd34, %rd42;
add.s64 %rd51, %rd33, %rd42;
// r@24[i] = a@32[i] * b@40[i] - a@40[i] * b@32[i].
// Inputs are read with ld.global.nc (non-coherent global loads).
ld.global.nc.f64 %fd1, [%rd51];
ld.global.nc.f64 %fd2, [%rd47];
mul.f64 %fd3, %fd2, %fd1;
ld.global.nc.f64 %fd4, [%rd50];
ld.global.nc.f64 %fd5, [%rd48];
mul.f64 %fd6, %fd5, %fd4;
sub.f64 %fd7, %fd3, %fd6;
// Here each result component is stored as soon as it is computed, so the
// remaining loads below are issued after this store.
st.global.f64 [%rd43], %fd7;
// r@32[i] = a@40[i] * b@24[i] - a@24[i] * b@40[i].
ld.global.nc.f64 %fd8, [%rd49];
mul.f64 %fd9, %fd5, %fd8;
ld.global.nc.f64 %fd10, [%rd46];
mul.f64 %fd11, %fd10, %fd1;
sub.f64 %fd12, %fd9, %fd11;
st.global.f64 [%rd44], %fd12;
// r@40[i] = a@24[i] * b@32[i] - a@32[i] * b@24[i] (all operands already in registers).
mul.f64 %fd13, %fd10, %fd4;
mul.f64 %fd14, %fd2, %fd8;
sub.f64 %fd15, %fd13, %fd14;
st.global.f64 [%rd45], %fd15;
// Common exit, also the target of the bounds guard.
$L__BB3_2:
ret;
$L__func_end3:
}
@ericcano
Copy link
Author

The eigenCrossProductSoA kernel compiled for architecture sm_35, so that it can be compared with the output from Compiler Explorer:

// Kernel eigenCrossProductSoA(double *, const double *, const double *, unsigned long, unsigned long).
//
// Each thread handles one element index i = ctaid.x * ntid.x + tid.x and computes a
// 3-component cross product r = a x b on doubles, where
//   r is addressed through param_0 (written), a through param_1 and b through param_2 (read).
// The three components of each operand are read/written at element indices
//   i, i + param_4 and i + 2 * param_4 of the same base pointer.
// param_3 is the upper bound on i: threads with i >= param_3 return without touching memory.
.entry <unnamed>::eigenCrossProductSoA(double *, const double *, const double *, unsigned long, unsigned long)(
        .param .u64 _ZN71_GLOBAL__N__47_tmpxft_000005d0_00000000_7_test_v7_cuda_cpp1_ii_e64742fe20eigenCrossProductSoAEPdPKdS2_mm_param_0,
        .param .u64 _ZN71_GLOBAL__N__47_tmpxft_000005d0_00000000_7_test_v7_cuda_cpp1_ii_e64742fe20eigenCrossProductSoAEPdPKdS2_mm_param_1,
        .param .u64 _ZN71_GLOBAL__N__47_tmpxft_000005d0_00000000_7_test_v7_cuda_cpp1_ii_e64742fe20eigenCrossProductSoAEPdPKdS2_mm_param_2,
        .param .u64 _ZN71_GLOBAL__N__47_tmpxft_000005d0_00000000_7_test_v7_cuda_cpp1_ii_e64742fe20eigenCrossProductSoAEPdPKdS2_mm_param_3,
        .param .u64 _ZN71_GLOBAL__N__47_tmpxft_000005d0_00000000_7_test_v7_cuda_cpp1_ii_e64742fe20eigenCrossProductSoAEPdPKdS2_mm_param_4
)
{
        // Virtual register declarations: predicates, 32-bit integers, doubles,
        // 64-bit integers/addresses.
        .reg .pred      %p<2>;
        .reg .b32       %r<5>;
        .reg .f64       %fd<16>;
        .reg .b64       %rd<24>;
$L__func_begin6:
        // Load the five kernel arguments:
        //   %rd2 = param_0 (result base), %rd3 = param_1 (a base), %rd4 = param_2 (b base),
        //   %rd6 = param_3 (index bound),  %rd5 = param_4 (component stride, in elements).
        ld.param.u64    %rd2, [_ZN71_GLOBAL__N__47_tmpxft_000005d0_00000000_7_test_v7_cuda_cpp1_ii_e64742fe20eigenCrossProductSoAEPdPKdS2_mm_param_0];
        ld.param.u64    %rd3, [_ZN71_GLOBAL__N__47_tmpxft_000005d0_00000000_7_test_v7_cuda_cpp1_ii_e64742fe20eigenCrossProductSoAEPdPKdS2_mm_param_1];
        ld.param.u64    %rd4, [_ZN71_GLOBAL__N__47_tmpxft_000005d0_00000000_7_test_v7_cuda_cpp1_ii_e64742fe20eigenCrossProductSoAEPdPKdS2_mm_param_2];
        ld.param.u64    %rd6, [_ZN71_GLOBAL__N__47_tmpxft_000005d0_00000000_7_test_v7_cuda_cpp1_ii_e64742fe20eigenCrossProductSoAEPdPKdS2_mm_param_3];
        ld.param.u64    %rd5, [_ZN71_GLOBAL__N__47_tmpxft_000005d0_00000000_7_test_v7_cuda_cpp1_ii_e64742fe20eigenCrossProductSoAEPdPKdS2_mm_param_4];
        // i = ctaid.x * ntid.x + tid.x, computed in 32 bits (low half of the product),
        // then zero-extended to 64 bits in %rd1.
        mov.u32         %r1, %ntid.x;
        mov.u32         %r2, %ctaid.x;
        mov.u32         %r3, %tid.x;
        mad.lo.s32      %r4, %r2, %r1, %r3;
        cvt.u64.u32     %rd1, %r4;
        // Bounds guard (unsigned compare): skip all loads/stores when i >= param_3.
        setp.ge.u64     %p1, %rd1, %rd6;
        @%p1 bra        $L__BB6_2;
        // Address setup. %rd8 = i * 8 (byte offset of a double), %rd12 = stride * 8 (bytes).
        //   %rd9  = &a[i]            %rd11 = &r[i]
        //   %rd13 = &a[i + stride]   %rd18 = &b[i + 2*stride]
        cvta.to.global.u64      %rd7, %rd3;
        shl.b64         %rd8, %rd1, 3;
        add.s64         %rd9, %rd7, %rd8;
        cvta.to.global.u64      %rd10, %rd2;
        add.s64         %rd11, %rd10, %rd8;
        shl.b64         %rd12, %rd5, 3;
        add.s64         %rd13, %rd9, %rd12;
        add.s64         %rd14, %rd1, %rd5;
        add.s64         %rd15, %rd14, %rd5;
        cvta.to.global.u64      %rd16, %rd4;
        shl.b64         %rd17, %rd15, 3;
        add.s64         %rd18, %rd16, %rd17;
        // Component 0 of the result: a[i+s]*b[i+2s] - a[i+2s]*b[i+s].
        // Inputs are read with ld.global.nc (non-coherent global loads).
        ld.global.nc.f64        %fd1, [%rd18];
        ld.global.nc.f64        %fd2, [%rd13];
        mul.f64         %fd3, %fd2, %fd1;
        // %rd19 = &a[i + 2*stride]; %rd20 = &b[i + stride] (one stride back from %rd18).
        add.s64         %rd19, %rd13, %rd12;
        sub.s64         %rd20, %rd18, %rd12;
        ld.global.nc.f64        %fd4, [%rd20];
        ld.global.nc.f64        %fd5, [%rd19];
        mul.f64         %fd6, %fd5, %fd4;
        sub.f64         %fd7, %fd3, %fd6;
        // Component 1 of the result: a[i+2s]*b[i] - a[i]*b[i+2s].
        // %rd21 = &b[i] (one more stride back).
        sub.s64         %rd21, %rd20, %rd12;
        ld.global.nc.f64        %fd8, [%rd21];
        mul.f64         %fd9, %fd5, %fd8;
        ld.global.nc.f64        %fd10, [%rd9];
        mul.f64         %fd11, %fd10, %fd1;
        sub.f64         %fd12, %fd9, %fd11;
        // Component 2 of the result: a[i]*b[i+s] - a[i+s]*b[i].
        mul.f64         %fd13, %fd10, %fd4;
        mul.f64         %fd14, %fd2, %fd8;
        sub.f64         %fd15, %fd13, %fd14;
        // All three stores are issued after every load: r[i], r[i+s], r[i+2s].
        st.global.f64   [%rd11], %fd7;
        add.s64         %rd22, %rd11, %rd12;
        st.global.f64   [%rd22], %fd12;
        add.s64         %rd23, %rd22, %rd12;
        st.global.f64   [%rd23], %fd15;
        // Common exit, also the target of the bounds guard.
$L__BB6_2:
        ret;
$L__func_end6:
}

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment