Created
July 12, 2021 12:14
-
-
Save ericcano/45b39038f8bcd83b87acc619af536db8 to your computer and use it in GitHub Desktop.
PTX result (excerpt for compilation result of ea7ec59cbe597bca22e89fa59d4d35fbfea39388 of https://github.com/ericcano/soa)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// Kernel eigenCrossProductSoA(double *, const double *, const double *, unsigned long, unsigned long)
//
// Each thread computes one 3-component cross product out = a x b on doubles,
// where the three components of every operand are stored param_4 elements apart.
//
//   param_0 : base address written by the three st.global.f64 (called "out" below)
//   param_1 : base address of the first operand, read-only here (called "a" below)
//   param_2 : base address of the second operand, read-only here (called "b" below)
//   param_3 : upper bound on the thread index; threads with index >= param_3 store nothing
//   param_4 : distance, in 8-byte elements, between consecutive components ("stride" below)
//
// Notation used in the comments: i = thread index, x0/x1/x2 = x[i], x[i+stride], x[i+2*stride].
// NOTE(review): the index is formed with a 32-bit mad.lo and then zero-extended, so only the
// low 32 bits of ctaid.x * ntid.x + tid.x are kept -- confirm this is acceptable for the
// element counts passed as param_3.
.entry <unnamed>::eigenCrossProductSoA(double *, const double *, const double *, unsigned long, unsigned long)( | |
.param .u64 _ZN71_GLOBAL__N__47_tmpxft_00001beb_00000000_7_test_v7_cuda_cpp1_ii_e64742fe20eigenCrossProductSoAEPdPKdS2_mm_param_0, | |
.param .u64 _ZN71_GLOBAL__N__47_tmpxft_00001beb_00000000_7_test_v7_cuda_cpp1_ii_e64742fe20eigenCrossProductSoAEPdPKdS2_mm_param_1, | |
.param .u64 _ZN71_GLOBAL__N__47_tmpxft_00001beb_00000000_7_test_v7_cuda_cpp1_ii_e64742fe20eigenCrossProductSoAEPdPKdS2_mm_param_2, | |
.param .u64 _ZN71_GLOBAL__N__47_tmpxft_00001beb_00000000_7_test_v7_cuda_cpp1_ii_e64742fe20eigenCrossProductSoAEPdPKdS2_mm_param_3, | |
.param .u64 _ZN71_GLOBAL__N__47_tmpxft_00001beb_00000000_7_test_v7_cuda_cpp1_ii_e64742fe20eigenCrossProductSoAEPdPKdS2_mm_param_4 | |
) | |
{ | |
.reg .pred %p<2>; | |
.reg .b32 %r<5>; | |
.reg .f64 %fd<16>; | |
.reg .b64 %rd<24>; | |
$L__func_begin6: | |
// Load the five kernel arguments: %rd2 = out, %rd3 = a, %rd4 = b, %rd6 = bound, %rd5 = stride.
ld.param.u64 %rd2, [_ZN71_GLOBAL__N__47_tmpxft_00001beb_00000000_7_test_v7_cuda_cpp1_ii_e64742fe20eigenCrossProductSoAEPdPKdS2_mm_param_0]; | |
ld.param.u64 %rd3, [_ZN71_GLOBAL__N__47_tmpxft_00001beb_00000000_7_test_v7_cuda_cpp1_ii_e64742fe20eigenCrossProductSoAEPdPKdS2_mm_param_1]; | |
ld.param.u64 %rd4, [_ZN71_GLOBAL__N__47_tmpxft_00001beb_00000000_7_test_v7_cuda_cpp1_ii_e64742fe20eigenCrossProductSoAEPdPKdS2_mm_param_2]; | |
ld.param.u64 %rd6, [_ZN71_GLOBAL__N__47_tmpxft_00001beb_00000000_7_test_v7_cuda_cpp1_ii_e64742fe20eigenCrossProductSoAEPdPKdS2_mm_param_3]; | |
ld.param.u64 %rd5, [_ZN71_GLOBAL__N__47_tmpxft_00001beb_00000000_7_test_v7_cuda_cpp1_ii_e64742fe20eigenCrossProductSoAEPdPKdS2_mm_param_4]; | |
// i = ctaid.x * ntid.x + tid.x (low 32 bits), zero-extended to 64 bits in %rd1.
mov.u32 %r1, %ntid.x; | |
mov.u32 %r2, %ctaid.x; | |
mov.u32 %r3, %tid.x; | |
mad.lo.s32 %r4, %r2, %r1, %r3; | |
cvt.u64.u32 %rd1, %r4; | |
// Bounds guard (unsigned compare): skip straight to ret when i >= param_3.
setp.ge.u64 %p1, %rd1, %rd6; | |
@%p1 bra $L__BB6_2; | |
// %rd9 = &a0, %rd11 = &out0 (both base + i*8).
cvta.to.global.u64 %rd7, %rd3; | |
shl.b64 %rd8, %rd1, 3; | |
add.s64 %rd9, %rd7, %rd8; | |
cvta.to.global.u64 %rd10, %rd2; | |
add.s64 %rd11, %rd10, %rd8; | |
// %rd12 = stride in bytes (stride * 8); %rd13 = &a1.
shl.b64 %rd12, %rd5, 3; | |
add.s64 %rd13, %rd9, %rd12; | |
// %rd18 = &b2, computed from the element index i + 2*stride.
add.s64 %rd14, %rd1, %rd5; | |
add.s64 %rd15, %rd14, %rd5; | |
cvta.to.global.u64 %rd16, %rd4; | |
shl.b64 %rd17, %rd15, 3; | |
add.s64 %rd18, %rd16, %rd17; | |
// All operand reads use the non-coherent global load form (ld.global.nc).
// %fd3 = a1 * b2
ld.global.nc.f64 %fd1, [%rd18]; | |
ld.global.nc.f64 %fd2, [%rd13]; | |
mul.f64 %fd3, %fd2, %fd1; | |
// %rd19 = &a2 (step forward from &a1), %rd20 = &b1 (step back from &b2).
add.s64 %rd19, %rd13, %rd12; | |
sub.s64 %rd20, %rd18, %rd12; | |
// %fd7 = a1*b2 - a2*b1  -> component 0 of the result
ld.global.nc.f64 %fd4, [%rd20]; | |
ld.global.nc.f64 %fd5, [%rd19]; | |
mul.f64 %fd6, %fd5, %fd4; | |
sub.f64 %fd7, %fd3, %fd6; | |
// %rd21 = &b0 (step back once more from &b1).
sub.s64 %rd21, %rd20, %rd12; | |
// %fd12 = a2*b0 - a0*b2 -> component 1 of the result
ld.global.nc.f64 %fd8, [%rd21]; | |
mul.f64 %fd9, %fd5, %fd8; | |
ld.global.nc.f64 %fd10, [%rd9]; | |
mul.f64 %fd11, %fd10, %fd1; | |
sub.f64 %fd12, %fd9, %fd11; | |
// %fd15 = a0*b1 - a1*b0 -> component 2 of the result
mul.f64 %fd13, %fd10, %fd4; | |
mul.f64 %fd14, %fd2, %fd8; | |
sub.f64 %fd15, %fd13, %fd14; | |
// Write out0, out1, out2, advancing the destination by one byte-stride per component.
st.global.f64 [%rd11], %fd7; | |
add.s64 %rd22, %rd11, %rd12; | |
st.global.f64 [%rd22], %fd12; | |
add.s64 %rd23, %rd22, %rd12; | |
st.global.f64 [%rd23], %fd15; | |
// Common exit, also the target of the bounds guard.
$L__BB6_2: | |
ret; | |
$L__func_end6: | |
} | |
// Kernel indirectCrossProductSoA(testSoA::SoA, testSoA::SoA, testSoA::SoA, unsigned long)
//
// Each thread computes one 3-component cross product out = a x b on doubles.
// Unlike a base-plus-stride layout, every component is addressed through its own
// 64-bit pointer read from the by-value struct arguments.
//
//   param_0 : 80-byte struct, 8-byte aligned; the u64 values at byte offsets +24, +32, +40
//             are used as the store addresses for result components 0, 1, 2
//   param_1 : same struct type; offsets +24, +32, +40 give the load addresses of a0, a1, a2
//   param_2 : same struct type; offsets +24, +32, +40 give the load addresses of b0, b1, b2
//   param_3 : upper bound on the thread index; threads with index >= param_3 store nothing
//
// NOTE(review): presumably +24/+32/+40 are the three per-component column pointers of
// testSoA::SoA -- the struct definition is not visible here, confirm against it. The
// remaining bytes of each 80-byte struct are not read by this kernel.
// NOTE(review): the index is formed with a 32-bit mad.lo and then zero-extended, so only the
// low 32 bits of ctaid.x * ntid.x + tid.x are kept -- confirm this is acceptable for the
// element counts passed as param_3.
.entry <unnamed>::indirectCrossProductSoA(testSoA::SoA, testSoA::SoA, testSoA::SoA, unsigned long)( | |
.param .align 8 .b8 _ZN71_GLOBAL__N__47_tmpxft_00001beb_00000000_7_test_v7_cuda_cpp1_ii_e64742fe23indirectCrossProductSoAEN7testSoA3SoAES1_S1_m_param_0[80], | |
.param .align 8 .b8 _ZN71_GLOBAL__N__47_tmpxft_00001beb_00000000_7_test_v7_cuda_cpp1_ii_e64742fe23indirectCrossProductSoAEN7testSoA3SoAES1_S1_m_param_1[80], | |
.param .align 8 .b8 _ZN71_GLOBAL__N__47_tmpxft_00001beb_00000000_7_test_v7_cuda_cpp1_ii_e64742fe23indirectCrossProductSoAEN7testSoA3SoAES1_S1_m_param_2[80], | |
.param .u64 _ZN71_GLOBAL__N__47_tmpxft_00001beb_00000000_7_test_v7_cuda_cpp1_ii_e64742fe23indirectCrossProductSoAEN7testSoA3SoAES1_S1_m_param_3 | |
) | |
{ | |
.reg .pred %p<2>; | |
.reg .b32 %r<5>; | |
.reg .f64 %fd<16>; | |
.reg .b64 %rd<52>; | |
$L__func_begin3: | |
// %rd32 = index bound (param_3).
ld.param.u64 %rd32, [_ZN71_GLOBAL__N__47_tmpxft_00001beb_00000000_7_test_v7_cuda_cpp1_ii_e64742fe23indirectCrossProductSoAEN7testSoA3SoAES1_S1_m_param_3]; | |
// Second operand b (param_2): %rd27 = pointer at +40, %rd26 = +32, %rd25 = +24.
ld.param.u64 %rd27, [_ZN71_GLOBAL__N__47_tmpxft_00001beb_00000000_7_test_v7_cuda_cpp1_ii_e64742fe23indirectCrossProductSoAEN7testSoA3SoAES1_S1_m_param_2+40]; | |
ld.param.u64 %rd26, [_ZN71_GLOBAL__N__47_tmpxft_00001beb_00000000_7_test_v7_cuda_cpp1_ii_e64742fe23indirectCrossProductSoAEN7testSoA3SoAES1_S1_m_param_2+32]; | |
ld.param.u64 %rd25, [_ZN71_GLOBAL__N__47_tmpxft_00001beb_00000000_7_test_v7_cuda_cpp1_ii_e64742fe23indirectCrossProductSoAEN7testSoA3SoAES1_S1_m_param_2+24]; | |
// First operand a (param_1): %rd17 = pointer at +40, %rd16 = +32, %rd15 = +24.
ld.param.u64 %rd17, [_ZN71_GLOBAL__N__47_tmpxft_00001beb_00000000_7_test_v7_cuda_cpp1_ii_e64742fe23indirectCrossProductSoAEN7testSoA3SoAES1_S1_m_param_1+40]; | |
ld.param.u64 %rd16, [_ZN71_GLOBAL__N__47_tmpxft_00001beb_00000000_7_test_v7_cuda_cpp1_ii_e64742fe23indirectCrossProductSoAEN7testSoA3SoAES1_S1_m_param_1+32]; | |
ld.param.u64 %rd15, [_ZN71_GLOBAL__N__47_tmpxft_00001beb_00000000_7_test_v7_cuda_cpp1_ii_e64742fe23indirectCrossProductSoAEN7testSoA3SoAES1_S1_m_param_1+24]; | |
// Result (param_0): %rd7 = pointer at +40, %rd6 = +32, %rd5 = +24.
ld.param.u64 %rd7, [_ZN71_GLOBAL__N__47_tmpxft_00001beb_00000000_7_test_v7_cuda_cpp1_ii_e64742fe23indirectCrossProductSoAEN7testSoA3SoAES1_S1_m_param_0+40]; | |
ld.param.u64 %rd6, [_ZN71_GLOBAL__N__47_tmpxft_00001beb_00000000_7_test_v7_cuda_cpp1_ii_e64742fe23indirectCrossProductSoAEN7testSoA3SoAES1_S1_m_param_0+32]; | |
ld.param.u64 %rd5, [_ZN71_GLOBAL__N__47_tmpxft_00001beb_00000000_7_test_v7_cuda_cpp1_ii_e64742fe23indirectCrossProductSoAEN7testSoA3SoAES1_S1_m_param_0+24]; | |
// i = ctaid.x * ntid.x + tid.x (low 32 bits), zero-extended to 64 bits in %rd1.
mov.u32 %r1, %ctaid.x; | |
mov.u32 %r2, %ntid.x; | |
mov.u32 %r3, %tid.x; | |
mad.lo.s32 %r4, %r1, %r2, %r3; | |
cvt.u64.u32 %rd1, %r4; | |
// Bounds guard (unsigned compare): skip straight to ret when i >= param_3.
setp.ge.u64 %p1, %rd1, %rd32; | |
@%p1 bra $L__BB3_2; | |
// Convert all nine pointers to global-space addresses.
cvta.to.global.u64 %rd33, %rd27; | |
cvta.to.global.u64 %rd34, %rd26; | |
cvta.to.global.u64 %rd35, %rd25; | |
cvta.to.global.u64 %rd36, %rd17; | |
cvta.to.global.u64 %rd37, %rd16; | |
cvta.to.global.u64 %rd38, %rd15; | |
cvta.to.global.u64 %rd39, %rd7; | |
cvta.to.global.u64 %rd40, %rd6; | |
cvta.to.global.u64 %rd41, %rd5; | |
// %rd42 = i * 8, the byte offset of element i; it is added to every pointer below.
shl.b64 %rd42, %rd1, 3; | |
// %rd43/%rd44/%rd45 = addresses of result components 0/1/2.
add.s64 %rd43, %rd41, %rd42; | |
add.s64 %rd44, %rd40, %rd42; | |
add.s64 %rd45, %rd39, %rd42; | |
// %rd46/%rd47/%rd48 = addresses of a0/a1/a2.
add.s64 %rd46, %rd38, %rd42; | |
add.s64 %rd47, %rd37, %rd42; | |
add.s64 %rd48, %rd36, %rd42; | |
// %rd49/%rd50/%rd51 = addresses of b0/b1/b2.
add.s64 %rd49, %rd35, %rd42; | |
add.s64 %rd50, %rd34, %rd42; | |
add.s64 %rd51, %rd33, %rd42; | |
// All operand reads use the non-coherent global load form (ld.global.nc).
// %fd7 = a1*b2 - a2*b1 -> result component 0
ld.global.nc.f64 %fd1, [%rd51]; | |
ld.global.nc.f64 %fd2, [%rd47]; | |
mul.f64 %fd3, %fd2, %fd1; | |
ld.global.nc.f64 %fd4, [%rd50]; | |
ld.global.nc.f64 %fd5, [%rd48]; | |
mul.f64 %fd6, %fd5, %fd4; | |
sub.f64 %fd7, %fd3, %fd6; | |
st.global.f64 [%rd43], %fd7; | |
// %fd12 = a2*b0 - a0*b2 -> result component 1
ld.global.nc.f64 %fd8, [%rd49]; | |
mul.f64 %fd9, %fd5, %fd8; | |
ld.global.nc.f64 %fd10, [%rd46]; | |
mul.f64 %fd11, %fd10, %fd1; | |
sub.f64 %fd12, %fd9, %fd11; | |
st.global.f64 [%rd44], %fd12; | |
// %fd15 = a0*b1 - a1*b0 -> result component 2
mul.f64 %fd13, %fd10, %fd4; | |
mul.f64 %fd14, %fd2, %fd8; | |
sub.f64 %fd15, %fd13, %fd14; | |
st.global.f64 [%rd45], %fd15; | |
// Common exit, also the target of the bounds guard.
$L__BB3_2: | |
ret; | |
$L__func_end3: | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
eigenCrossProductSoA
compiled for architecture sm_35
in order to compare with the result from compiler explorer.