Skip to content

Instantly share code, notes, and snippets.

@dadeba
Created March 23, 2012 13:01
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save dadeba/2170449 to your computer and use it in GitHub Desktop.
ASM file for gravity_v4v2.cl (https://gist.github.com/2167470)
.file "/tmp/5d41e25b-e85e-4f10-836a-5b23eab3f6a7.TMP"
.text
.globl _Z12native_rsqrtDv8_f
.align 16, 0x90
.type _Z12native_rsqrtDv8_f,@function
#-----------------------------------------------------------------------
# float8 native_rsqrt(float8 x)  (Itanium C++ mangling: _Z12native_rsqrtDv8_f)
# In:   YMM0 = 8 packed single-precision floats
# Out:  YMM0 = approximate 1/sqrt(x) per element (hardware VRSQRTPS estimate)
# NOTE(review): no vzeroupper before ret — assumes callers are also AVX
# code sharing this vector ABI (compiler-internal); confirm if this is
# ever called from legacy-SSE code.
#-----------------------------------------------------------------------
_Z12native_rsqrtDv8_f: # @_Z12native_rsqrtDv8_f
# BB#0:
vrsqrtps YMM0, YMM0 # elementwise reciprocal-sqrt estimate, all 8 lanes
ret
.Ltmp0:
.size _Z12native_rsqrtDv8_f, .Ltmp0-_Z12native_rsqrtDv8_f
.globl sum
.align 16, 0x90
.type sum,@function
#-----------------------------------------------------------------------
# float4 sum(float8 v) — elementwise add of the two 128-bit halves:
#   result[i] = v[i] + v[i+4], for i = 0..3
# In:  YMM0 = 8 packed floats.  Out: XMM0 = 4 packed floats.
# The compiler emitted four scalar vaddss + vinsertps instead of a single
# vaddps; kept byte-identical (compiler output — do not hand-reschedule).
#-----------------------------------------------------------------------
sum: # @sum
# BB#0:
vextractf128 XMM1, YMM0, 1 # xmm1 = high half v[4..7]; xmm0 keeps v[0..3]
vaddss XMM2, XMM0, XMM1 # lane 0: v[0]+v[4]
vpshufd XMM3, XMM1, 1 # xmm3 = xmm1[1,0,0,0]
vpshufd XMM4, XMM0, 1 # xmm4 = xmm0[1,0,0,0]
vaddss XMM3, XMM4, XMM3 # lane 1: v[1]+v[5]
vinsertps XMM2, XMM2, XMM3, 16 # xmm2 = xmm2[0],xmm3[0],xmm2[2,3]
vmovhlps XMM3, XMM1, XMM1 # xmm1 = xmm1[1,1]
vmovhlps XMM4, XMM0, XMM0 # xmm0 = xmm0[1,1]
vaddss XMM3, XMM4, XMM3 # lane 2: v[2]+v[6]
vinsertps XMM2, XMM2, XMM3, 32 # xmm2 = xmm2[0,1],xmm3[0],xmm2[3]
vpshufd XMM1, XMM1, 3 # xmm1 = xmm1[3,0,0,0]
vpshufd XMM0, XMM0, 3 # xmm0 = xmm0[3,0,0,0]
vaddss XMM0, XMM0, XMM1 # lane 3: v[3]+v[7]
vinsertps XMM0, XMM2, XMM0, 48 # xmm2 = xmm2[0,1,2],xmm0[0]
ret
.Ltmp1:
.size sum, .Ltmp1-sum
.globl grav3
.align 16, 0x90
.type grav3,@function
#-----------------------------------------------------------------------
# grav3 — AVX N-body gravity kernel (compiler output for gravity_v4v2.cl).
# Calling convention is non-standard: every argument is read from fixed
# [RBP + off] stack slots (looks like the OpenCL CPU-runtime kernel ABI —
# TODO confirm against the runtime's dispatch code):
#   [RBP+16/24/32]    input arrays, one float4 per entry (16-byte stride)
#   [RBP+40]          fourth input array (presumably masses — verify vs .cl)
#   [RBP+48/56/64/72] output arrays, one float4 per work-item
#   [RBP+80]          body count n (signed 32-bit)
#   [RBP+88]          scalar splatted over all lanes (presumably eps^2
#                     softening — verify against the .cl source)
#   [RBP+104]         struct whose dword at offset 32 is a row stride
#   [RBP+120/128]     work-item offset/id tables
#   [RBP+144]         outer-loop trip count (number of work-items)
# Callee-saved RBX/R12-R15/RBP are preserved; all YMM and the remaining
# GPRs are clobbered. No vzeroupper before ret — AVX-only callers assumed.
#-----------------------------------------------------------------------
grav3: # @grav3
# BB#0: # %FirstBB
push RBP
mov RBP, RSP
and RSP, -32 # 32-byte-align RSP: YMM spill slots below use aligned vmovaps
push R15
push R14
push R13
push R12
push RBX
sub RSP, 120 # spill area; [RSP], [RSP+32], [RSP+64] hold 32-byte YMM spills
mov EAX, DWORD PTR [RBP + 80] # eax = n (body count)
mov ECX, EAX
sar ECX, 31 # ecx = n<0 ? -1 : 0
shr ECX, 30 # ecx = n<0 ? 3 : 0 (rounding bias)
add ECX, EAX
sar ECX, 2 # ecx = n/4 (signed divide by 4, truncating toward zero)
lea EDX, DWORD PTR [RCX - 1]
inc RDX # rdx = n/4 zero-extended to 64-bit (via -1 then +1)
cmp ECX, 1
mov ECX, 1
cmova RCX, RDX # rcx = max(n/4, 1): inner-loop trip count (groups of 4)
mov RDX, QWORD PTR [RBP + 128]
add RDX, 8 # rdx = &table[1]; the loop reads [rdx-8] and [rdx]
add EAX, 3 # eax = n+3; tested against 6 below => inner loop iff n >= 4
mov RSI, -1 # rsi = outer counter, starts at -1 (inc before the compare)
vbroadcastss YMM0, DWORD PTR [RBP + 88] # splat scalar param to 8 lanes
vmovaps YMMWORD PTR [RSP + 32], YMM0 # 32-byte Spill
mov RDI, QWORD PTR [RBP + 120]
mov R8, QWORD PTR [RBP + 32]
mov R9, QWORD PTR [RBP + 24]
mov R10, QWORD PTR [RBP + 16]
.align 16, 0x90
.LBB2_1: # %SyncBB
# =>This Loop Header: Depth=1
# Child Loop BB2_4 Depth 2
# Outer loop: one iteration per work-item. Build this item's linear
# index from the id tables, scale by sizeof(float4)=16 into RBX.
mov R11D, DWORD PTR [RDX - 8]
mov EBX, DWORD PTR [RDX]
add R11D, DWORD PTR [RDI] # r11d = table[2i] + base offset
add EBX, DWORD PTR [RDI + 8]
mov R14, QWORD PTR [RBP + 104]
imul EBX, DWORD PTR [R14 + 32] # row index * row stride
add EBX, R11D
shl RBX, 4 # byte offset = index * 16
# Load this work-item's three float4s; vperm2f128 imm 0 duplicates the
# low 128-bit lane into both YMM lanes.
vmovaps XMM0, XMMWORD PTR [R10 + RBX]
vperm2f128 YMM0, YMM0, YMM0, 0
vmovaps YMMWORD PTR [RSP + 64], YMM0 # 32-byte Spill
vmovaps XMM0, XMMWORD PTR [R9 + RBX]
vperm2f128 YMM0, YMM0, YMM0, 0
vmovaps YMMWORD PTR [RSP], YMM0 # 32-byte Spill
vmovaps XMM0, XMMWORD PTR [R8 + RBX]
vperm2f128 YMM0, YMM0, YMM0, 0 # stays live in YMM0 across the inner loop
cmp EAX, 6
ja .LBB2_3 # n+3 > 6 (unsigned), i.e. n >= 4: run the inner loop
# BB#2: # %SyncBB.._crit_edge_crit_edge
# in Loop: Header=BB2_1 Depth=1
# n < 4: skip the body loop; all four accumulators stay zero.
vxorps YMM1, YMM1, YMM1
vmovaps YMM2, YMM1
vmovaps YMM3, YMM1
vmovaps YMM4, YMM1
jmp .LBB2_5
.LBB2_3: # %SyncBB.bb.nph_crit_edge
# in Loop: Header=BB2_1 Depth=1
# Inner-loop setup: zero accumulators YMM1-YMM4, copy array cursors to
# callee-saved regs, r13 = trip count (one iteration per 4 bodies).
vxorps YMM1, YMM1, YMM1
mov R11, R10
mov R14, R9
mov R15, R8
mov R12, QWORD PTR [RBP + 40]
mov R13, RCX
vmovaps YMM2, YMM1
vmovaps YMM3, YMM1
vmovaps YMM4, YMM1
.align 16, 0x90
.LBB2_4: # %bb.nph
# Parent Loop BB2_1 Depth=1
# => This Inner Loop Header: Depth=2
# Each iteration consumes one float4 from each of the four arrays and
# runs TWO interleaved 8-wide rsqrt evaluations to hide latency:
#   pass A: elements [2] and [3] of each float4 (one per 128-bit lane)
#   pass B: elements [0] and [1]
# The vmovhlps/vpshufd pairs broadcast one element across a lane.
vmovaps XMM5, XMMWORD PTR [R14] # 4 values from array #2
vmovhlps XMM6, XMM5, XMM5 # xmm5 = xmm5[1,1]
vpshufd XMM6, XMM6, 0 # xmm6 = xmm6[0,0,0,0]
vpshufd XMM7, XMM5, 3 # xmm7 = xmm5[3,0,0,0]
vpshufd XMM7, XMM7, 0 # xmm7 = xmm7[0,0,0,0]
vinsertf128 YMM6, YMM6, XMM7, 1 # ymm6 = { s5[2] x4 | s5[3] x4 }
vmovaps YMM7, YMMWORD PTR [RSP] # 32-byte Reload
vsubps YMM6, YMM6, YMM7 # delta vs this work-item's value
vmulps YMM8, YMM6, YMM6 # delta^2
vmovaps XMM9, XMMWORD PTR [R11] # 4 values from array #1
vmovhlps XMM10, XMM9, XMM9 # xmm9 = xmm9[1,1]
vpshufd XMM10, XMM10, 0 # xmm10 = xmm10[0,0,0,0]
vpshufd XMM11, XMM9, 3 # xmm11 = xmm9[3,0,0,0]
vpshufd XMM11, XMM11, 0 # xmm11 = xmm11[0,0,0,0]
vinsertf128 YMM10, YMM10, XMM11, 1
vsubps YMM10, YMM10, YMMWORD PTR [RSP + 64] # 32-byte Folded Reload
vmulps YMM11, YMM10, YMM10
vaddps YMM8, YMM11, YMM8 # running distance^2
vmovaps XMM11, XMMWORD PTR [R15] # 4 values from array #3
vmovhlps XMM12, XMM11, XMM11 # xmm11 = xmm11[1,1]
vpshufd XMM12, XMM12, 0 # xmm12 = xmm12[0,0,0,0]
vpshufd XMM13, XMM11, 3 # xmm13 = xmm11[3,0,0,0]
vpshufd XMM13, XMM13, 0 # xmm13 = xmm13[0,0,0,0]
vinsertf128 YMM12, YMM12, XMM13, 1
vsubps YMM12, YMM12, YMM0
vmulps YMM13, YMM12, YMM12
vaddps YMM8, YMM8, YMM13
vaddps YMM8, YMM8, YMMWORD PTR [RSP + 32] # 32-byte Folded Reload: + splatted scalar (softening?)
vrsqrtps YMM8, YMM8 # ~1/r
vmovaps XMM13, XMMWORD PTR [R12] # 4 values from array #4 (masses?)
vmovhlps XMM14, XMM13, XMM13 # xmm13 = xmm13[1,1]
vpshufd XMM14, XMM14, 0 # xmm14 = xmm14[0,0,0,0]
vpshufd XMM15, XMM13, 3 # xmm15 = xmm13[3,0,0,0]
vpshufd XMM15, XMM15, 0 # xmm15 = xmm15[0,0,0,0]
vinsertf128 YMM14, YMM14, XMM15, 1
vmulps YMM14, YMM14, YMM8 # m/r
vmulps YMM8, YMM8, YMM8 # 1/r^2
vmulps YMM8, YMM14, YMM8 # m/r^3
vmulps YMM12, YMM12, YMM8 # pass-A array-#3 delta * m/r^3
# --- pass B: same computation for elements [0] and [1] ---
vpshufd XMM15, XMM5, 0 # xmm15 = xmm5[0,0,0,0]
vpshufd XMM5, XMM5, 1 # xmm5 = xmm5[1,0,0,0]
vpshufd XMM5, XMM5, 0 # xmm5 = xmm5[0,0,0,0]
vinsertf128 YMM5, YMM15, XMM5, 1 # ymm5 = { s5[0] x4 | s5[1] x4 }
vsubps YMM5, YMM5, YMM7
vmulps YMM7, YMM5, YMM5
vpshufd XMM15, XMM9, 0 # xmm15 = xmm9[0,0,0,0]
vpshufd XMM9, XMM9, 1 # xmm9 = xmm9[1,0,0,0]
vpshufd XMM9, XMM9, 0 # xmm9 = xmm9[0,0,0,0]
vinsertf128 YMM9, YMM15, XMM9, 1
vsubps YMM9, YMM9, YMMWORD PTR [RSP + 64] # 32-byte Folded Reload
vmulps YMM15, YMM9, YMM9
vaddps YMM7, YMM15, YMM7
vpshufd XMM15, XMM11, 0 # xmm15 = xmm11[0,0,0,0]
vpshufd XMM11, XMM11, 1 # xmm11 = xmm11[1,0,0,0]
vpshufd XMM11, XMM11, 0 # xmm11 = xmm11[0,0,0,0]
vinsertf128 YMM11, YMM15, XMM11, 1
vsubps YMM11, YMM11, YMM0
vmulps YMM15, YMM11, YMM11
vaddps YMM7, YMM7, YMM15
vaddps YMM7, YMM7, YMMWORD PTR [RSP + 32] # 32-byte Folded Reload
vrsqrtps YMM7, YMM7 # ~1/r for pass B
vpshufd XMM15, XMM13, 0 # xmm15 = xmm13[0,0,0,0]
vpshufd XMM13, XMM13, 1 # xmm13 = xmm13[1,0,0,0]
vpshufd XMM13, XMM13, 0 # xmm13 = xmm13[0,0,0,0]
vinsertf128 YMM13, YMM15, XMM13, 1
vmulps YMM13, YMM13, YMM7 # m/r
vmulps YMM7, YMM7, YMM7 # 1/r^2
vmulps YMM7, YMM13, YMM7 # m/r^3
vmulps YMM11, YMM11, YMM7
# --- fold both passes into the four accumulators ---
vaddps YMM2, YMM2, YMM11
vaddps YMM2, YMM2, YMM12 # acc2 += array-#3 contributions (A and B)
vmulps YMM6, YMM6, YMM8
vmulps YMM5, YMM5, YMM7
vaddps YMM3, YMM3, YMM5
vaddps YMM3, YMM3, YMM6 # acc3 += array-#2 contributions
vmulps YMM5, YMM10, YMM8
vmulps YMM6, YMM9, YMM7
vaddps YMM4, YMM4, YMM6
vaddps YMM4, YMM4, YMM5 # acc4 += array-#1 contributions
vsubps YMM1, YMM1, YMM13
vsubps YMM1, YMM1, YMM14 # acc1 -= m/r terms (potential-like sum?)
add R11, 16
add R14, 16
add R15, 16
add R12, 16 # advance all four cursors by one float4
dec R13
jne .LBB2_4
.LBB2_5: # %._crit_edge
# in Loop: Header=BB2_1 Depth=1
# Horizontal reduction: for each accumulator, add high and low 128-bit
# lanes elementwise (same pattern as sum() above) and store the float4
# into the matching output array at [ptr + RBX].
vextractf128 XMM0, YMM4, 1
vaddss XMM5, XMM4, XMM0
vpshufd XMM6, XMM0, 1 # xmm6 = xmm0[1,0,0,0]
vpshufd XMM7, XMM4, 1 # xmm7 = xmm4[1,0,0,0]
vaddss XMM6, XMM7, XMM6
vinsertps XMM5, XMM5, XMM6, 16 # xmm5 = xmm5[0],xmm6[0],xmm5[2,3]
vmovhlps XMM6, XMM0, XMM0 # xmm0 = xmm0[1,1]
vmovhlps XMM7, XMM4, XMM4 # xmm4 = xmm4[1,1]
vaddss XMM6, XMM7, XMM6
vinsertps XMM5, XMM5, XMM6, 32 # xmm5 = xmm5[0,1],xmm6[0],xmm5[3]
vpshufd XMM0, XMM0, 3 # xmm0 = xmm0[3,0,0,0]
vpshufd XMM4, XMM4, 3 # xmm4 = xmm4[3,0,0,0]
vaddss XMM0, XMM4, XMM0
vinsertps XMM0, XMM5, XMM0, 48 # xmm5 = xmm5[0,1,2],xmm0[0]
mov R11, QWORD PTR [RBP + 48]
vmovdqa XMMWORD PTR [R11 + RBX], XMM0 # reduced acc4 -> output #1
vextractf128 XMM0, YMM3, 1
vaddss XMM4, XMM3, XMM0
vpshufd XMM5, XMM0, 1 # xmm5 = xmm0[1,0,0,0]
vpshufd XMM6, XMM3, 1 # xmm6 = xmm3[1,0,0,0]
vaddss XMM5, XMM6, XMM5
vinsertps XMM4, XMM4, XMM5, 16 # xmm4 = xmm4[0],xmm5[0],xmm4[2,3]
vmovhlps XMM5, XMM0, XMM0 # xmm0 = xmm0[1,1]
vmovhlps XMM6, XMM3, XMM3 # xmm3 = xmm3[1,1]
vaddss XMM5, XMM6, XMM5
vinsertps XMM4, XMM4, XMM5, 32 # xmm4 = xmm4[0,1],xmm5[0],xmm4[3]
vpshufd XMM0, XMM0, 3 # xmm0 = xmm0[3,0,0,0]
vpshufd XMM3, XMM3, 3 # xmm3 = xmm3[3,0,0,0]
vaddss XMM0, XMM3, XMM0
vinsertps XMM0, XMM4, XMM0, 48 # xmm4 = xmm4[0,1,2],xmm0[0]
mov R11, QWORD PTR [RBP + 56]
vmovdqa XMMWORD PTR [R11 + RBX], XMM0 # reduced acc3 -> output #2
vextractf128 XMM0, YMM2, 1
vaddss XMM3, XMM2, XMM0
vpshufd XMM4, XMM0, 1 # xmm4 = xmm0[1,0,0,0]
vpshufd XMM5, XMM2, 1 # xmm5 = xmm2[1,0,0,0]
vaddss XMM4, XMM5, XMM4
vinsertps XMM3, XMM3, XMM4, 16 # xmm3 = xmm3[0],xmm4[0],xmm3[2,3]
vmovhlps XMM4, XMM0, XMM0 # xmm0 = xmm0[1,1]
vmovhlps XMM5, XMM2, XMM2 # xmm2 = xmm2[1,1]
vaddss XMM4, XMM5, XMM4
vinsertps XMM3, XMM3, XMM4, 32 # xmm3 = xmm3[0,1],xmm4[0],xmm3[3]
vpshufd XMM0, XMM0, 3 # xmm0 = xmm0[3,0,0,0]
vpshufd XMM2, XMM2, 3 # xmm2 = xmm2[3,0,0,0]
vaddss XMM0, XMM2, XMM0
vinsertps XMM0, XMM3, XMM0, 48 # xmm3 = xmm3[0,1,2],xmm0[0]
mov R11, QWORD PTR [RBP + 64]
vmovdqa XMMWORD PTR [R11 + RBX], XMM0 # reduced acc2 -> output #3
vextractf128 XMM0, YMM1, 1
vaddss XMM2, XMM1, XMM0
vpshufd XMM3, XMM0, 1 # xmm3 = xmm0[1,0,0,0]
vpshufd XMM4, XMM1, 1 # xmm4 = xmm1[1,0,0,0]
vaddss XMM3, XMM4, XMM3
vinsertps XMM2, XMM2, XMM3, 16 # xmm2 = xmm2[0],xmm3[0],xmm2[2,3]
vmovhlps XMM3, XMM0, XMM0 # xmm0 = xmm0[1,1]
vmovhlps XMM4, XMM1, XMM1 # xmm1 = xmm1[1,1]
vaddss XMM3, XMM4, XMM3
vinsertps XMM2, XMM2, XMM3, 32 # xmm2 = xmm2[0,1],xmm3[0],xmm2[3]
vpshufd XMM0, XMM0, 3 # xmm0 = xmm0[3,0,0,0]
vpshufd XMM1, XMM1, 3 # xmm1 = xmm1[3,0,0,0]
vaddss XMM0, XMM1, XMM0
vinsertps XMM0, XMM2, XMM0, 48 # xmm2 = xmm2[0,1,2],xmm0[0]
mov R11, QWORD PTR [RBP + 72]
vmovdqa XMMWORD PTR [R11 + RBX], XMM0 # reduced acc1 -> output #4
add RDX, 32 # next work-item's table entries (4 qwords ahead)
inc RSI
cmp RSI, QWORD PTR [RBP + 144]
jb .LBB2_1 # unsigned: loop while ++rsi < work-item count
# BB#6: # %SyncBB19
add RSP, 120
pop RBX
pop R12
pop R13
pop R14
pop R15
mov RSP, RBP
pop RBP
ret # NOTE(review): no vzeroupper — acceptable only for AVX-only callers
.Ltmp2:
.size grav3, .Ltmp2-grav3
.section .note.GNU-stack,"",@progbits
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment