Created March 23, 2012 13:01
Save dadeba/2170449 to your computer and use it in GitHub Desktop.
ASM file for gravity_v4v2.cl (https://gist.github.com/2167470)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
.file "/tmp/5d41e25b-e85e-4f10-836a-5b23eab3f6a7.TMP" | |
.text | |
.globl _Z12native_rsqrtDv8_f | |
.align 16, 0x90 | |
.type _Z12native_rsqrtDv8_f,@function | |
_Z12native_rsqrtDv8_f: # @_Z12native_rsqrtDv8_f | |
# BB#0: | |
vrsqrtps YMM0, YMM0 | |
ret | |
.Ltmp0: | |
.size _Z12native_rsqrtDv8_f, .Ltmp0-_Z12native_rsqrtDv8_f | |
#-----------------------------------------------------------------------
# sum — horizontal half-sum of an 8-float vector
# In:    YMM0 = v (8 packed floats)
# Out:   XMM0 = r where r[i] = v[i] + v[i+4]  (i = 0..3)
# Clobb: XMM1-XMM4
# Note:  compiler-emitted lane-by-lane form: each output lane is built
#        with a scalar VADDSS and re-inserted via VINSERTPS.
#-----------------------------------------------------------------------
	.globl	sum
	.align	16, 0x90
	.type	sum,@function
sum:                                    # @sum
# BB#0:
	vextractf128	XMM1, YMM0, 1       # xmm1 = upper half v[4..7]
	vaddss	XMM2, XMM0, XMM1            # lane 0: v[0]+v[4]
	vpshufd	XMM3, XMM1, 1               # xmm3 = xmm1[1,0,0,0]
	vpshufd	XMM4, XMM0, 1               # xmm4 = xmm0[1,0,0,0]
	vaddss	XMM3, XMM4, XMM3            # lane 1: v[1]+v[5]
	vinsertps	XMM2, XMM2, XMM3, 16    # xmm2 = xmm2[0],xmm3[0],xmm2[2,3]
	vmovhlps	XMM3, XMM1, XMM1        # xmm3 = xmm1[1,1] (quad 2 of high)
	vmovhlps	XMM4, XMM0, XMM0        # xmm4 = xmm0[1,1] (quad 2 of low)
	vaddss	XMM3, XMM4, XMM3            # lane 2: v[2]+v[6]
	vinsertps	XMM2, XMM2, XMM3, 32    # xmm2 = xmm2[0,1],xmm3[0],xmm2[3]
	vpshufd	XMM1, XMM1, 3               # xmm1 = xmm1[3,0,0,0]
	vpshufd	XMM0, XMM0, 3               # xmm0 = xmm0[3,0,0,0]
	vaddss	XMM0, XMM0, XMM1            # lane 3: v[3]+v[7]
	vinsertps	XMM0, XMM2, XMM0, 48    # xmm0 = xmm2[0,1,2],sum3
	ret
.Ltmp1:
	.size	sum, .Ltmp1-sum
#-----------------------------------------------------------------------
# grav3 — AVX N-body gravity kernel (compiler-emitted from gravity_v4v2.cl)
# ABI:   nonstandard (OpenCL runtime): ALL arguments are read from the
#        caller's stack through RBP (offsets +16..+144), not from the
#        SysV register args.
# Observed argument slots (from the loads below):
#   [RBP+16/24/32]  base pointers indexed by RBX (presumably x/y/z
#                   position arrays — TODO confirm against the .cl source)
#   [RBP+40]        fourth input array walked by the inner loop
#                   (presumably mass — TODO confirm)
#   [RBP+48/56/64/72] four 16-byte output slots per work-item
#   [RBP+80]        element count n (inner trip count = n/4, min 1)
#   [RBP+88]        scalar broadcast across YMM (softening eps^2 in the
#                   usual gravity formulation — NOTE(review): unverified)
#   [RBP+104/120/128/144] work-group geometry / id / loop-bound data
# Structure: outer loop .LBB2_1 per work-item; inner loop .LBB2_4
#   accumulates four YMM sums (YMM1-YMM4) using vrsqrtps; epilogue of the
#   outer loop horizontally reduces each accumulator (lane i + lane i+4)
#   and stores a float4 to each of the four output arrays.
# Stack: RSP aligned to 32 for the three 32-byte YMM spill slots
#   [RSP+0], [RSP+32], [RSP+64].
#-----------------------------------------------------------------------
	.globl	grav3
	.align	16, 0x90
	.type	grav3,@function
grav3:                                  # @grav3
# BB#0:                                 # %FirstBB
	push	RBP
	mov	RBP, RSP
	and	RSP, -32                        # 32-byte align for YMM spills
	push	R15
	push	R14
	push	R13
	push	R12
	push	RBX
	sub	RSP, 120                        # spill area + locals
	# Inner trip count: RCX = max(1, [RBP+80] / 4) via round-toward-zero
	# signed divide (sar/shr/add sequence) then cmova.
	mov	EAX, DWORD PTR [RBP + 80]
	mov	ECX, EAX
	sar	ECX, 31
	shr	ECX, 30
	add	ECX, EAX
	sar	ECX, 2                          # ECX = n/4 (signed)
	lea	EDX, DWORD PTR [RCX - 1]
	inc	RDX
	cmp	ECX, 1
	mov	ECX, 1
	cmova	RCX, RDX                        # RCX = max(1, n/4)
	mov	RDX, QWORD PTR [RBP + 128]
	add	RDX, 8
	add	EAX, 3                          # EAX = n+3, gates the ja below
	mov	RSI, -1                         # outer loop counter, pre-increment
	vbroadcastss	YMM0, DWORD PTR [RBP + 88]
	vmovaps	YMMWORD PTR [RSP + 32], YMM0 # 32-byte Spill (broadcast scalar)
	mov	RDI, QWORD PTR [RBP + 120]
	mov	R8, QWORD PTR [RBP + 32]
	mov	R9, QWORD PTR [RBP + 24]
	mov	R10, QWORD PTR [RBP + 16]
	.align	16, 0x90
.LBB2_1:                                # %SyncBB
                                        # =>This Loop Header: Depth=1
                                        #     Child Loop BB2_4 Depth 2
	# Compute flat work-item index: RBX = (gid1*stride + gid0) * 16
	mov	R11D, DWORD PTR [RDX - 8]
	mov	EBX, DWORD PTR [RDX]
	add	R11D, DWORD PTR [RDI]
	add	EBX, DWORD PTR [RDI + 8]
	mov	R14, QWORD PTR [RBP + 104]
	imul	EBX, DWORD PTR [R14 + 32]
	add	EBX, R11D
	shl	RBX, 4                          # *16: float4-sized elements
	# Splat this work-item's float4 from each of the three input arrays
	# across both YMM halves and spill for the inner loop.
	vmovaps	XMM0, XMMWORD PTR [R10 + RBX]
	vperm2f128	YMM0, YMM0, YMM0, 0
	vmovaps	YMMWORD PTR [RSP + 64], YMM0 # 32-byte Spill
	vmovaps	XMM0, XMMWORD PTR [R9 + RBX]
	vperm2f128	YMM0, YMM0, YMM0, 0
	vmovaps	YMMWORD PTR [RSP], YMM0     # 32-byte Spill
	vmovaps	XMM0, XMMWORD PTR [R8 + RBX]
	vperm2f128	YMM0, YMM0, YMM0, 0     # YMM0 stays live (3rd coordinate)
	cmp	EAX, 6
	ja	.LBB2_3                         # n large enough -> run inner loop
# BB#2:                                 # %SyncBB.._crit_edge_crit_edge
                                        #   in Loop: Header=BB2_1 Depth=1
	# Inner loop skipped: zero all four accumulators and fall through.
	vxorps	YMM1, YMM1, YMM1
	vmovaps	YMM2, YMM1
	vmovaps	YMM3, YMM1
	vmovaps	YMM4, YMM1
	jmp	.LBB2_5
.LBB2_3:                                # %SyncBB.bb.nph_crit_edge
                                        #   in Loop: Header=BB2_1 Depth=1
	# Set up inner loop: fresh copies of the array cursors and trip count.
	vxorps	YMM1, YMM1, YMM1
	mov	R11, R10
	mov	R14, R9
	mov	R15, R8
	mov	R12, QWORD PTR [RBP + 40]
	mov	R13, RCX                        # R13 = inner trip count
	vmovaps	YMM2, YMM1
	vmovaps	YMM3, YMM1
	vmovaps	YMM4, YMM1
	.align	16, 0x90
.LBB2_4:                                # %bb.nph
                                        #   Parent Loop BB2_1 Depth=1
                                        # =>  This Inner Loop Header: Depth=2
	# Each iteration handles one float4 of "other" bodies, splitting its
	# lanes 2,3 (first half of the block) and lanes 0,1 (second half)
	# across the two YMM halves; distance^2 + eps^2, vrsqrtps, then
	# r^-3-weighted accumulation into YMM1-YMM4.
	vmovaps	XMM5, XMMWORD PTR [R14]
	vmovhlps	XMM6, XMM5, XMM5        # xmm6 = xmm5[1,1] -> lane 2
	vpshufd	XMM6, XMM6, 0               # broadcast lane 2
	vpshufd	XMM7, XMM5, 3               # xmm7 = xmm5[3,0,0,0] -> lane 3
	vpshufd	XMM7, XMM7, 0               # broadcast lane 3
	vinsertf128	YMM6, YMM6, XMM7, 1     # YMM6 = [lane2 x4 | lane3 x4]
	vmovaps	YMM7, YMMWORD PTR [RSP]     # 32-byte Reload (own coord 2)
	vsubps	YMM6, YMM6, YMM7            # dy (component along R14 array)
	vmulps	YMM8, YMM6, YMM6            # dy^2
	vmovaps	XMM9, XMMWORD PTR [R11]
	vmovhlps	XMM10, XMM9, XMM9       # xmm10 = xmm9[1,1]
	vpshufd	XMM10, XMM10, 0
	vpshufd	XMM11, XMM9, 3
	vpshufd	XMM11, XMM11, 0
	vinsertf128	YMM10, YMM10, XMM11, 1
	vsubps	YMM10, YMM10, YMMWORD PTR [RSP + 64] # 32-byte Folded Reload, dx
	vmulps	YMM11, YMM10, YMM10         # dx^2
	vaddps	YMM8, YMM11, YMM8
	vmovaps	XMM11, XMMWORD PTR [R15]
	vmovhlps	XMM12, XMM11, XMM11     # xmm12 = xmm11[1,1]
	vpshufd	XMM12, XMM12, 0
	vpshufd	XMM13, XMM11, 3
	vpshufd	XMM13, XMM13, 0
	vinsertf128	YMM12, YMM12, XMM13, 1
	vsubps	YMM12, YMM12, YMM0          # dz
	vmulps	YMM13, YMM12, YMM12         # dz^2
	vaddps	YMM8, YMM8, YMM13
	vaddps	YMM8, YMM8, YMMWORD PTR [RSP + 32] # 32-byte Folded Reload, +eps^2
	vrsqrtps	YMM8, YMM8              # ~1/r
	vmovaps	XMM13, XMMWORD PTR [R12]    # 4th-array values (mass?)
	vmovhlps	XMM14, XMM13, XMM13
	vpshufd	XMM14, XMM14, 0
	vpshufd	XMM15, XMM13, 3
	vpshufd	XMM15, XMM15, 0
	vinsertf128	YMM14, YMM14, XMM15, 1
	vmulps	YMM14, YMM14, YMM8          # m/r
	vmulps	YMM8, YMM8, YMM8            # 1/r^2
	vmulps	YMM8, YMM14, YMM8           # m/r^3
	vmulps	YMM12, YMM12, YMM8          # dz * m/r^3
	# Second half of the block: lanes 0,1 of the same float4s.
	vpshufd	XMM15, XMM5, 0              # broadcast lane 0
	vpshufd	XMM5, XMM5, 1               # xmm5 = xmm5[1,0,0,0]
	vpshufd	XMM5, XMM5, 0               # broadcast lane 1
	vinsertf128	YMM5, YMM15, XMM5, 1    # YMM5 = [lane0 x4 | lane1 x4]
	vsubps	YMM5, YMM5, YMM7
	vmulps	YMM7, YMM5, YMM5
	vpshufd	XMM15, XMM9, 0
	vpshufd	XMM9, XMM9, 1
	vpshufd	XMM9, XMM9, 0
	vinsertf128	YMM9, YMM15, XMM9, 1
	vsubps	YMM9, YMM9, YMMWORD PTR [RSP + 64] # 32-byte Folded Reload
	vmulps	YMM15, YMM9, YMM9
	vaddps	YMM7, YMM15, YMM7
	vpshufd	XMM15, XMM11, 0
	vpshufd	XMM11, XMM11, 1
	vpshufd	XMM11, XMM11, 0
	vinsertf128	YMM11, YMM15, XMM11, 1
	vsubps	YMM11, YMM11, YMM0
	vmulps	YMM15, YMM11, YMM11
	vaddps	YMM7, YMM7, YMM15
	vaddps	YMM7, YMM7, YMMWORD PTR [RSP + 32] # 32-byte Folded Reload
	vrsqrtps	YMM7, YMM7              # ~1/r for lanes 0,1
	vpshufd	XMM15, XMM13, 0
	vpshufd	XMM13, XMM13, 1
	vpshufd	XMM13, XMM13, 0
	vinsertf128	YMM13, YMM15, XMM13, 1
	vmulps	YMM13, YMM13, YMM7          # m/r
	vmulps	YMM7, YMM7, YMM7            # 1/r^2
	vmulps	YMM7, YMM13, YMM7           # m/r^3
	vmulps	YMM11, YMM11, YMM7
	# Accumulate both halves into the four running sums.
	vaddps	YMM2, YMM2, YMM11
	vaddps	YMM2, YMM2, YMM12           # YMM2 += dz-terms
	vmulps	YMM6, YMM6, YMM8
	vmulps	YMM5, YMM5, YMM7
	vaddps	YMM3, YMM3, YMM5
	vaddps	YMM3, YMM3, YMM6            # YMM3 += dy-terms
	vmulps	YMM5, YMM10, YMM8
	vmulps	YMM6, YMM9, YMM7
	vaddps	YMM4, YMM4, YMM6
	vaddps	YMM4, YMM4, YMM5            # YMM4 += dx-terms
	vsubps	YMM1, YMM1, YMM13
	vsubps	YMM1, YMM1, YMM14           # YMM1 -= m/r (potential-like sum)
	add	R11, 16
	add	R14, 16
	add	R15, 16
	add	R12, 16
	dec	R13
	jne	.LBB2_4
.LBB2_5:                                # %._crit_edge
                                        #   in Loop: Header=BB2_1 Depth=1
	# Horizontal reduce YMM4 (r[i] = lane i + lane i+4) and store float4.
	vextractf128	XMM0, YMM4, 1
	vaddss	XMM5, XMM4, XMM0
	vpshufd	XMM6, XMM0, 1               # xmm6 = xmm0[1,0,0,0]
	vpshufd	XMM7, XMM4, 1               # xmm7 = xmm4[1,0,0,0]
	vaddss	XMM6, XMM7, XMM6
	vinsertps	XMM5, XMM5, XMM6, 16    # xmm5 = xmm5[0],xmm6[0],xmm5[2,3]
	vmovhlps	XMM6, XMM0, XMM0        # xmm6 = xmm0[1,1]
	vmovhlps	XMM7, XMM4, XMM4        # xmm7 = xmm4[1,1]
	vaddss	XMM6, XMM7, XMM6
	vinsertps	XMM5, XMM5, XMM6, 32    # xmm5 = xmm5[0,1],xmm6[0],xmm5[3]
	vpshufd	XMM0, XMM0, 3               # xmm0 = xmm0[3,0,0,0]
	vpshufd	XMM4, XMM4, 3               # xmm4 = xmm4[3,0,0,0]
	vaddss	XMM0, XMM4, XMM0
	vinsertps	XMM0, XMM5, XMM0, 48    # xmm0 = xmm5[0,1,2],xmm0[0]
	mov	R11, QWORD PTR [RBP + 48]
	vmovdqa	XMMWORD PTR [R11 + RBX], XMM0
	# Same reduction for YMM3 -> output [RBP+56].
	vextractf128	XMM0, YMM3, 1
	vaddss	XMM4, XMM3, XMM0
	vpshufd	XMM5, XMM0, 1               # xmm5 = xmm0[1,0,0,0]
	vpshufd	XMM6, XMM3, 1               # xmm6 = xmm3[1,0,0,0]
	vaddss	XMM5, XMM6, XMM5
	vinsertps	XMM4, XMM4, XMM5, 16    # xmm4 = xmm4[0],xmm5[0],xmm4[2,3]
	vmovhlps	XMM5, XMM0, XMM0        # xmm5 = xmm0[1,1]
	vmovhlps	XMM6, XMM3, XMM3        # xmm6 = xmm3[1,1]
	vaddss	XMM5, XMM6, XMM5
	vinsertps	XMM4, XMM4, XMM5, 32    # xmm4 = xmm4[0,1],xmm5[0],xmm4[3]
	vpshufd	XMM0, XMM0, 3               # xmm0 = xmm0[3,0,0,0]
	vpshufd	XMM3, XMM3, 3               # xmm3 = xmm3[3,0,0,0]
	vaddss	XMM0, XMM3, XMM0
	vinsertps	XMM0, XMM4, XMM0, 48    # xmm0 = xmm4[0,1,2],xmm0[0]
	mov	R11, QWORD PTR [RBP + 56]
	vmovdqa	XMMWORD PTR [R11 + RBX], XMM0
	# Same reduction for YMM2 -> output [RBP+64].
	vextractf128	XMM0, YMM2, 1
	vaddss	XMM3, XMM2, XMM0
	vpshufd	XMM4, XMM0, 1               # xmm4 = xmm0[1,0,0,0]
	vpshufd	XMM5, XMM2, 1               # xmm5 = xmm2[1,0,0,0]
	vaddss	XMM4, XMM5, XMM4
	vinsertps	XMM3, XMM3, XMM4, 16    # xmm3 = xmm3[0],xmm4[0],xmm3[2,3]
	vmovhlps	XMM4, XMM0, XMM0        # xmm4 = xmm0[1,1]
	vmovhlps	XMM5, XMM2, XMM2        # xmm5 = xmm2[1,1]
	vaddss	XMM4, XMM5, XMM4
	vinsertps	XMM3, XMM3, XMM4, 32    # xmm3 = xmm3[0,1],xmm4[0],xmm3[3]
	vpshufd	XMM0, XMM0, 3               # xmm0 = xmm0[3,0,0,0]
	vpshufd	XMM2, XMM2, 3               # xmm2 = xmm2[3,0,0,0]
	vaddss	XMM0, XMM2, XMM0
	vinsertps	XMM0, XMM3, XMM0, 48    # xmm0 = xmm3[0,1,2],xmm0[0]
	mov	R11, QWORD PTR [RBP + 64]
	vmovdqa	XMMWORD PTR [R11 + RBX], XMM0
	# Same reduction for YMM1 -> output [RBP+72].
	vextractf128	XMM0, YMM1, 1
	vaddss	XMM2, XMM1, XMM0
	vpshufd	XMM3, XMM0, 1               # xmm3 = xmm0[1,0,0,0]
	vpshufd	XMM4, XMM1, 1               # xmm4 = xmm1[1,0,0,0]
	vaddss	XMM3, XMM4, XMM3
	vinsertps	XMM2, XMM2, XMM3, 16    # xmm2 = xmm2[0],xmm3[0],xmm2[2,3]
	vmovhlps	XMM3, XMM0, XMM0        # xmm3 = xmm0[1,1]
	vmovhlps	XMM4, XMM1, XMM1        # xmm4 = xmm1[1,1]
	vaddss	XMM3, XMM4, XMM3
	vinsertps	XMM2, XMM2, XMM3, 32    # xmm2 = xmm2[0,1],xmm3[0],xmm2[3]
	vpshufd	XMM0, XMM0, 3               # xmm0 = xmm0[3,0,0,0]
	vpshufd	XMM1, XMM1, 3               # xmm1 = xmm1[3,0,0,0]
	vaddss	XMM0, XMM1, XMM0
	vinsertps	XMM0, XMM2, XMM0, 48    # xmm0 = xmm2[0,1,2],xmm0[0]
	mov	R11, QWORD PTR [RBP + 72]
	vmovdqa	XMMWORD PTR [R11 + RBX], XMM0
	add	RDX, 32                         # next work-item id record
	inc	RSI
	cmp	RSI, QWORD PTR [RBP + 144]      # outer loop bound
	jb	.LBB2_1
# BB#6:                                 # %SyncBB19
	add	RSP, 120
	pop	RBX
	pop	R12
	pop	R13
	pop	R14
	pop	R15
	mov	RSP, RBP
	pop	RBP
	ret
.Ltmp2:
	.size	grav3, .Ltmp2-grav3
	.section	.note.GNU-stack,"",@progbits
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment.