luaujit: nbody.lua compiled with the experimental Luau JIT engine. All assembly snippets show only the inner loop body. Variants: scalar - uses type info, records, and the basic block compiler to generate a much more efficient inner loop; vector - same as the scalar variant, but with the three scalar coordinates replaced by a first-class 3-component vector.
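For orientation, here is a minimal sketch of the kind of inner-loop body being compiled, modelled on the classic nbody benchmark's advance step. The names, field layout, and exact expression structure are assumptions for illustration and may not match the actual nbody.lua used for these listings; the comments map source operations to the comment groups in the scalar listing below.

```lua
local function advance(bodies, nbodies, dt)
    for i = 1, nbodies do
        local bi = bodies[i]
        local bix, biy, biz, bimass = bi.x, bi.y, bi.z, bi.mass
        local bivx, bivy, bivz = bi.vx, bi.vy, bi.vz
        for j = i + 1, nbodies do
            -- table type guard, index->integer + exactness check, bounds check,
            -- array load, record type guard, field-count check
            local bj = bodies[j]
            -- load phase: record fields and stack locals into registers
            local dx, dy, dz = bix - bj.x, biy - bj.y, biz - bj.z
            -- arith phase: distance, magnitude, velocity updates
            local dist = math.sqrt(dx * dx + dy * dy + dz * dz)
            local mag = dt / (dist * dist * dist)
            local bjm = bj.mass * mag
            bivx = bivx - dx * bjm
            bivy = bivy - dy * bjm
            bivz = bivz - dz * bjm
            local bim = bimass * mag
            -- store phase: each slot gets both the value and a number type tag
            bj.vx = bj.vx + dx * bim
            bj.vy = bj.vy + dy * bim
            bj.vz = bj.vz + dz * bim
        end
        bi.vx, bi.vy, bi.vz = bivx, bivy, bivz
    end
end
```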
# table type guard (memory safety)
cmp dword ptr [rdi + 12], 6
jne 1072 <.text+0x5ea>
# load array index and convert to integer (+ exactness check)
movsd xmm0, qword ptr [rdi + 256]
cvttsd2si eax, xmm0
cvtsi2sd xmm1, eax
ucomisd xmm1, xmm0
jne 1046 <.text+0x5ea>
# indices are 1-based; could remove this one with runtime changes
dec eax
# load table and do a bounds check on the lookup
mov rcx, qword ptr [rdi]
mov rdx, qword ptr [rcx + 32]
cmp dword ptr [rcx + 20], eax
jbe 1028 <.text+0x5ea>
# actual array lookup
shl eax, 4
movups xmm0, xmmword ptr [rdx + rax]
movups xmmword ptr [rdi + 272], xmm0
# record type guard (memory safety)
cmp dword ptr [rdi + 284], 10
jne 1030 <.text+0x607>
# check that record has enough fields (memory safety)
mov rcx, qword ptr [rdi + 272]
cmp dword ptr [rcx + 12], 6
jle 1013 <.text+0x607>
# basic block compiler, load phase (loads from records and stack)
vmovsd xmm12, qword ptr [rdi + 176]
vmovsd xmm13, qword ptr [rdi + 192]
vmovsd xmm14, qword ptr [rdi + 208]
vmovsd xmm1, qword ptr [rcx + 96]
vmovsd xmm2, qword ptr [rcx + 112]
vmovsd xmm3, qword ptr [rcx + 128]
vmovsd xmm15, qword ptr [rcx + 32]
vmovsd xmm11, qword ptr [rcx + 48]
vmovsd xmm8, qword ptr [rcx + 64]
vmovsd xmm9, qword ptr [rcx + 80]
vmovsd xmm0, qword ptr [rdi + 112]
# basic block compiler, arith phase (mostly devoid of memory access)
vsubsd xmm4, xmm0, xmm1
vmovsd xmm0, qword ptr [rdi + 128]
vsubsd xmm5, xmm0, xmm2
vmovsd xmm0, qword ptr [rdi + 144]
vsubsd xmm6, xmm0, xmm3
vmulsd xmm2, xmm4, xmm4
vmulsd xmm1, xmm5, xmm5
vaddsd xmm3, xmm2, xmm1
vmulsd xmm2, xmm6, xmm6
vaddsd xmm7, xmm3, xmm2
vsqrtsd xmm10, xmm10, xmm7
vmulsd xmm2, xmm10, xmm10
vmulsd xmm3, xmm2, xmm10
vmovsd xmm0, qword ptr [rdi + 32]
vdivsd xmm7, xmm0, xmm3
vmulsd xmm3, xmm7, qword ptr [rdi + 160]
vmulsd xmm2, xmm15, xmm7
vmulsd xmm1, xmm4, xmm2
vsubsd xmm12, xmm12, xmm1
vmulsd xmm1, xmm5, xmm2
vsubsd xmm13, xmm13, xmm1
vmulsd xmm1, xmm6, xmm2
vsubsd xmm14, xmm14, xmm1
vmulsd xmm1, xmm4, xmm3
vaddsd xmm11, xmm11, xmm1
vmulsd xmm1, xmm5, xmm3
vaddsd xmm8, xmm8, xmm1
vmulsd xmm1, xmm6, xmm3
vaddsd xmm9, xmm9, xmm1
# basic block compiler, store phase (note, stores type tags for memory safety)
vmovsd qword ptr [rdi + 176], xmm12
mov dword ptr [rdi + 188], 3
vmovsd qword ptr [rdi + 192], xmm13
mov dword ptr [rdi + 204], 3
vmovsd qword ptr [rdi + 208], xmm14
mov dword ptr [rdi + 220], 3
vmovsd qword ptr [rcx + 48], xmm11
mov dword ptr [rcx + 60], 3
vmovsd qword ptr [rcx + 64], xmm8
mov dword ptr [rcx + 76], 3
vmovsd qword ptr [rcx + 80], xmm9
mov dword ptr [rcx + 92], 3
# loop interrupt check, necessary to solve halting problem
mov rax, qword ptr [rbx + 32]
mov rax, qword ptr [rax + 176]
test rax, rax
jne 860 <.text+0x6a2>
# loop back edge
movsd xmm0, qword ptr [rdi + 256]
movsd xmm1, qword ptr [rdi + 224]
addsd xmm0, qword ptr [rdi + 240]
movsd qword ptr [rdi + 256], xmm0
ucomisd xmm1, xmm0
jae -448 <.text+0x1b0>
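The second listing below is the vector variant. Here is a hedged sketch of what the source might look like with positions and velocities stored as first-class 3-component vectors; the dot() helper and the .x/.y/.z component access are stand-ins for whatever built-ins the experimental vector type actually exposes. Note that the packed arithmetic (vsubps, vmulps) runs in single precision, so the scalar sqrt/divide in the middle is bracketed by vcvtss2sd/vcvtsd2ss conversions, and vshufps with immediate 0 broadcasts the converted scalar magnitude across all lanes before the packed multiply.

```lua
-- dot() stands in for the 3-lane dot product that vdpps (imm8 = 0x77) implements;
-- component access on the vector type is likewise an assumption about the host setup.
local function dot(a, b)
    return a.x * b.x + a.y * b.y + a.z * b.z
end

local function advance(bodies, nbodies, dt)
    for i = 1, nbodies do
        local bi = bodies[i]
        local bipos, bimass = bi.pos, bi.mass
        local bivel = bi.vel
        for j = i + 1, nbodies do
            local bj = bodies[j]                  -- same guard sequence as the scalar variant
            local d = bipos - bj.pos              -- one packed subtract instead of three scalar ones
            local dist = math.sqrt(dot(d, d))     -- dot product via vdpps, sqrt in double precision
            local mag = dt / (dist * dist * dist)
            bivel = bivel - d * (bj.mass * mag)   -- scalar magnitude broadcast, then packed multiply
            bj.vel = bj.vel + d * (bimass * mag)
        end
        bi.vel = bivel
    end
end
```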
# table type guard (memory safety)
cmp dword ptr [rdi + 12], 6
jne 802 <.text+0x49c>
# load array index and convert to integer (+ exactness check)
movsd xmm0, qword ptr [rdi + 192]
cvttsd2si eax, xmm0
cvtsi2sd xmm1, eax
ucomisd xmm1, xmm0
jne 776 <.text+0x49c>
# indices are 1-based; could remove this one with runtime changes
dec eax
# load table and do a bounds check on the lookup
mov rcx, qword ptr [rdi]
mov rdx, qword ptr [rcx + 32]
cmp dword ptr [rcx + 20], eax
jbe 758 <.text+0x49c>
# actual array lookup
shl eax, 4
movups xmm0, xmmword ptr [rdx + rax]
movups xmmword ptr [rdi + 208], xmm0
# record type guard (memory safety)
cmp dword ptr [rdi + 220], 10
jne 760 <.text+0x4b9>
# check that record has enough fields (memory safety)
mov rcx, qword ptr [rdi + 208]
cmp dword ptr [rcx + 12], 2
jle 743 <.text+0x4b9>
# basic block compiler, load phase (loads from records and stack)
vmovups xmm8, xmmword ptr [rdi + 128]
vmovups xmm2, xmmword ptr [rcx + 48]
vmovups xmm3, xmmword ptr [rcx + 64]
vmovsd xmm7, qword ptr [rcx + 32]
# basic block compiler, arith phase (mostly devoid of memory access)
vmovups xmm0, xmmword ptr [rdi + 112]
vsubps xmm1, xmm0, xmm2
vdpps xmm0, xmm1, xmm1, 119
vcvtss2sd xmm2, xmm0, xmm0
vsqrtsd xmm4, xmm4, xmm2
vmulsd xmm6, xmm4, xmm4
vmulsd xmm5, xmm6, xmm4
vmovsd xmm0, qword ptr [rdi + 32]
vdivsd xmm2, xmm0, xmm5
vmulsd xmm5, xmm2, qword ptr [rdi + 144]
vmulsd xmm6, xmm7, xmm2
vcvtsd2ss xmm0, xmm0, xmm6
vshufps xmm0, xmm0, xmm0, 0
vmulps xmm2, xmm0, xmm1
vsubps xmm8, xmm8, xmm2
vcvtsd2ss xmm0, xmm0, xmm5
vshufps xmm0, xmm0, xmm0, 0
vmulps xmm2, xmm0, xmm1
vaddps xmm3, xmm3, xmm2
# basic block compiler, store phase (note, stores type tags for memory safety)
vmovups xmmword ptr [rdi + 128], xmm8
mov dword ptr [rdi + 140], 4
vmovups xmmword ptr [rcx + 64], xmm3
mov dword ptr [rcx + 76], 4
# loop interrupt check, necessary to solve halting problem
mov rax, qword ptr [rbx + 32]
mov rax, qword ptr [rax + 176]
test rax, rax
jne 699 <.text+0x52c>
# loop back edge
movsd xmm0, qword ptr [rdi + 192]
movsd xmm1, qword ptr [rdi + 160]
addsd xmm0, qword ptr [rdi + 176]
movsd qword ptr [rdi + 192], xmm0
ucomisd xmm1, xmm0
jae -299 <.text+0x170>
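Both listings end with the same loop machinery: an interrupt check, where the VM loads a callback pointer and takes the slow path only if one is installed (this is what lets the runtime break out of otherwise unbounded loops), followed by the numeric for-loop back edge, which adds the step to the loop index on the stack, writes it back, and compares it against the limit. A skeleton of the pairwise iteration that produces this, assuming the usual nbody loop structure:

```lua
local nbodies = 5
for i = 1, nbodies do
    for j = i + 1, nbodies do
        -- inner loop body shown in the listings above; the back edge adds the
        -- step, stores the updated index back to its stack slot, and compares
        -- it against the limit before jumping to the loop head
    end
end
```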