Skip to content

Instantly share code, notes, and snippets.

@zeux

zeux/nbody.s

Last active Jan 2, 2020
Embed
What would you like to do?
luaujit: nbody.lua compiled with the experimental Luau JIT engine. All assembly snippets show only the inner loop body. Variants: scalar — uses type info, records, and a basic-block compiler to generate a much more efficient inner loop; vector — same as scalar, but with the scalar components replaced by a first-class 3-component vector type.
# ---------------------------------------------------------------------------
# Scalar variant: one iteration of the nbody inner loop, as emitted by the
# experimental Luau JIT. Intel syntax, disassembly listing — branch targets
# are out-of-line guard-failure / exit stubs.
# Register roles (as established by the loads below):
#   rdi = Lua stack frame base; values live in 16-byte slots
#         (8-byte payload, dword type tag at slot offset +12 —
#          presumably 3=number, 6=table, 10=record; TODO confirm
#          against Luau's type-tag enum)
#   rbx = runtime/VM state pointer (used only by the interrupt check)
# ---------------------------------------------------------------------------
# table type guard (memory safety): slot must actually hold a table (tag 6)
cmp dword ptr [rdi + 12], 6
jne 1072 <.text+0x5ea>
# load array index and convert to integer (+ exactness check):
# the double -> int -> double round trip must be lossless, else bail to
# the slow path (index was not an exact integer)
movsd xmm0, qword ptr [rdi + 256]
cvttsd2si eax, xmm0
cvtsi2sd xmm1, eax
ucomisd xmm1, xmm0
jne 1046 <.text+0x5ea>
# indices are 1-based; could remove this one with runtime changes
dec eax
# load table and do a bounds check on the lookup
mov rcx, qword ptr [rdi]                # rcx = table object
mov rdx, qword ptr [rcx + 32]           # rdx = table's array-part data pointer
cmp dword ptr [rcx + 20], eax           # unsigned compare: index < array size
jbe 1028 <.text+0x5ea>
# actual array lookup: elements are 16-byte tagged values, so index * 16
shl eax, 4
movups xmm0, xmmword ptr [rdx + rax]
movups xmmword ptr [rdi + 272], xmm0    # copy payload + tag into a stack slot
# record type guard (memory safety): fetched value must be a record (tag 10)
cmp dword ptr [rdi + 284], 10
jne 1030 <.text+0x607>
# check that record has enough fields (memory safety)
mov rcx, qword ptr [rdi + 272]          # rcx = record object pointer
cmp dword ptr [rcx + 12], 6             # field count must be > 6
jle 1013 <.text+0x607>
# basic block compiler, load phase (loads from records and stack).
# xmm12-14 = three accumulators from stack slots +176/+192/+208;
# xmm1-3   = record fields +96/+112/+128 (presumably the other body's
#            position components — TODO confirm field layout);
# xmm15, xmm11, xmm8, xmm9 = record fields +32/+48/+64/+80
vmovsd xmm12, qword ptr [rdi + 176]
vmovsd xmm13, qword ptr [rdi + 192]
vmovsd xmm14, qword ptr [rdi + 208]
vmovsd xmm1, qword ptr [rcx + 96]
vmovsd xmm2, qword ptr [rcx + 112]
vmovsd xmm3, qword ptr [rcx + 128]
vmovsd xmm15, qword ptr [rcx + 32]
vmovsd xmm11, qword ptr [rcx + 48]
vmovsd xmm8, qword ptr [rcx + 64]
vmovsd xmm9, qword ptr [rcx + 80]
vmovsd xmm0, qword ptr [rdi + 112]
# basic block compiler, arith phase (mostly devoid of memory access)
vsubsd xmm4, xmm0, xmm1                 # dx = component difference
vmovsd xmm0, qword ptr [rdi + 128]
vsubsd xmm5, xmm0, xmm2                 # dy
vmovsd xmm0, qword ptr [rdi + 144]
vsubsd xmm6, xmm0, xmm3                 # dz
vmulsd xmm2, xmm4, xmm4                 # dx*dx
vmulsd xmm1, xmm5, xmm5                 # dy*dy
vaddsd xmm3, xmm2, xmm1
vmulsd xmm2, xmm6, xmm6                 # dz*dz
vaddsd xmm7, xmm3, xmm2                 # d2 = dx*dx + dy*dy + dz*dz
vsqrtsd xmm10, xmm10, xmm7              # dist = sqrt(d2)
vmulsd xmm2, xmm10, xmm10
vmulsd xmm3, xmm2, xmm10                # dist^3
vmovsd xmm0, qword ptr [rdi + 32]
vdivsd xmm7, xmm0, xmm3                 # mag = k / dist^3 (k from stack +32;
                                        # presumably the timestep — confirm)
vmulsd xmm3, xmm7, qword ptr [rdi + 160]  # s1 = mag * scalar at +160
vmulsd xmm2, xmm15, xmm7                  # s2 = mag * record field (xmm15)
vmulsd xmm1, xmm4, xmm2
vsubsd xmm12, xmm12, xmm1               # acc.x -= dx * s2
vmulsd xmm1, xmm5, xmm2
vsubsd xmm13, xmm13, xmm1               # acc.y -= dy * s2
vmulsd xmm1, xmm6, xmm2
vsubsd xmm14, xmm14, xmm1               # acc.z -= dz * s2
vmulsd xmm1, xmm4, xmm3
vaddsd xmm11, xmm11, xmm1               # other.vx += dx * s1
vmulsd xmm1, xmm5, xmm3
vaddsd xmm8, xmm8, xmm1                 # other.vy += dy * s1
vmulsd xmm1, xmm6, xmm3
vaddsd xmm9, xmm9, xmm1                 # other.vz += dz * s1
# basic block compiler, store phase (note, stores type tags for memory safety:
# every 8-byte payload store is paired with re-writing the slot's dword tag)
vmovsd qword ptr [rdi + 176], xmm12
mov dword ptr [rdi + 188], 3            # tag 3 — presumably "number"; confirm
vmovsd qword ptr [rdi + 192], xmm13
mov dword ptr [rdi + 204], 3
vmovsd qword ptr [rdi + 208], xmm14
mov dword ptr [rdi + 220], 3
vmovsd qword ptr [rcx + 48], xmm11
mov dword ptr [rcx + 60], 3
vmovsd qword ptr [rcx + 64], xmm8
mov dword ptr [rcx + 76], 3
vmovsd qword ptr [rcx + 80], xmm9
mov dword ptr [rcx + 92], 3
# loop interrupt check, necessary to solve halting problem:
# re-read the runtime's interrupt callback pointer each iteration and
# leave the loop through an out-of-line stub if one has been installed
mov rax, qword ptr [rbx + 32]
mov rax, qword ptr [rax + 176]
test rax, rax
jne 860 <.text+0x6a2>
# loop back edge: i += step (numeric for-loop state in stack slots:
# +256 = i, +224 = limit, +240 = step); continue while limit >= i
movsd xmm0, qword ptr [rdi + 256]
movsd xmm1, qword ptr [rdi + 224]
addsd xmm0, qword ptr [rdi + 240]
movsd qword ptr [rdi + 256], xmm0
ucomisd xmm1, xmm0
jae -448 <.text+0x1b0>                  # jae (CF=0): taken iff limit >= i
                                        # and compare is ordered (no NaN)
# ---------------------------------------------------------------------------
# Vector variant: same inner loop, but the three scalar components are a
# first-class 3-component (single-precision) vector, so the guard/lookup
# prologue is identical in shape while the arithmetic uses packed-single
# ops with single<->double conversions around sqrt/div.
# Register roles: rdi = Lua stack frame base (16-byte tagged slots,
# dword tag at slot offset +12); rbx = runtime/VM state pointer.
# ---------------------------------------------------------------------------
# table type guard (memory safety): slot must hold a table (tag 6)
cmp dword ptr [rdi + 12], 6
jne 802 <.text+0x49c>
# load array index and convert to integer (+ exactness check):
# bail out unless the double round-trips through int32 losslessly
movsd xmm0, qword ptr [rdi + 192]
cvttsd2si eax, xmm0
cvtsi2sd xmm1, eax
ucomisd xmm1, xmm0
jne 776 <.text+0x49c>
# indices are 1-based; could remove this one with runtime changes
dec eax
# load table and do a bounds check on the lookup
mov rcx, qword ptr [rdi]                # rcx = table object
mov rdx, qword ptr [rcx + 32]           # rdx = array-part data pointer
cmp dword ptr [rcx + 20], eax           # unsigned: index < array size
jbe 758 <.text+0x49c>
# actual array lookup: 16-byte tagged values, index * 16
shl eax, 4
movups xmm0, xmmword ptr [rdx + rax]
movups xmmword ptr [rdi + 208], xmm0    # copy payload + tag into stack slot
# record type guard (memory safety): fetched value must be a record (tag 10)
cmp dword ptr [rdi + 220], 10
jne 760 <.text+0x4b9>
# check that record has enough fields (memory safety) — only 3 fields
# needed here since position/velocity collapsed into single vector fields
mov rcx, qword ptr [rdi + 208]          # rcx = record object pointer
cmp dword ptr [rcx + 12], 2             # field count must be > 2
jle 743 <.text+0x4b9>
# basic block compiler, load phase (loads from records and stack):
# xmm8 = accumulator vector (stack +128), xmm2/xmm3 = record vector
# fields (+48/+64), xmm7 = record double field (+32)
vmovups xmm8, xmmword ptr [rdi + 128]
vmovups xmm2, xmmword ptr [rcx + 48]
vmovups xmm3, xmmword ptr [rcx + 64]
vmovsd xmm7, qword ptr [rcx + 32]
# basic block compiler, arith phase (mostly devoid of memory access)
vmovups xmm0, xmmword ptr [rdi + 112]
vsubps xmm1, xmm0, xmm2                 # d = v_i - v_j (3 float lanes)
vdpps xmm0, xmm1, xmm1, 119             # imm 0x77: dot of lanes 0-2 of d
                                        # with itself -> d2, broadcast to
                                        # lanes 0-2
vcvtss2sd xmm2, xmm0, xmm0              # widen d2 to double for precision
vsqrtsd xmm4, xmm4, xmm2                # dist = sqrt(d2), in double
vmulsd xmm6, xmm4, xmm4
vmulsd xmm5, xmm6, xmm4                 # dist^3
vmovsd xmm0, qword ptr [rdi + 32]
vdivsd xmm2, xmm0, xmm5                 # mag = k / dist^3 (k from stack +32)
vmulsd xmm5, xmm2, qword ptr [rdi + 144]  # s1 = mag * scalar at +144
vmulsd xmm6, xmm7, xmm2                   # s2 = mag * record scalar (xmm7)
vcvtsd2ss xmm0, xmm0, xmm6              # narrow s2 back to float
vshufps xmm0, xmm0, xmm0, 0             # splat s2 across all 4 lanes
vmulps xmm2, xmm0, xmm1
vsubps xmm8, xmm8, xmm2                 # acc -= d * s2 (one packed op)
vcvtsd2ss xmm0, xmm0, xmm5              # narrow s1 to float
vshufps xmm0, xmm0, xmm0, 0             # splat s1
vmulps xmm2, xmm0, xmm1
vaddps xmm3, xmm3, xmm2                 # other.v += d * s1
# basic block compiler, store phase (note, stores type tags for memory
# safety; full 16-byte vector stores still re-write the dword tag)
vmovups xmmword ptr [rdi + 128], xmm8
mov dword ptr [rdi + 140], 4            # tag 4 — presumably "vector"; confirm
vmovups xmmword ptr [rcx + 64], xmm3
mov dword ptr [rcx + 76], 4
# loop interrupt check, necessary to solve halting problem:
# re-read the runtime's interrupt callback each iteration, exit if set
mov rax, qword ptr [rbx + 32]
mov rax, qword ptr [rax + 176]
test rax, rax
jne 699 <.text+0x52c>
# loop back edge: i += step (stack slots: +192 = i, +160 = limit,
# +176 = step); continue while limit >= i
movsd xmm0, qword ptr [rdi + 192]
movsd xmm1, qword ptr [rdi + 160]
addsd xmm0, qword ptr [rdi + 176]
movsd qword ptr [rdi + 192], xmm0
ucomisd xmm1, xmm0
jae -299 <.text+0x170>                  # jae (CF=0): limit >= i, ordered
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment