Skip to content

Instantly share code, notes, and snippets.

@ArchRobison
Created January 19, 2015 15:19
Show Gist options
  • Save ArchRobison/1442861e2e062ff10309 to your computer and use it in GitHub Desktop.
Save ArchRobison/1442861e2e062ff10309 to your computer and use it in GitHub Desktop.
Demo of Julia vectorizing a sqrt loop (`@fastmath @inbounds @simd`) into AVX code using LLVM 3.5.0
julia> function f(x,y,z) # writes sqrt(x[i]^2 + y[i]^2) into z[i] for every index of z
@fastmath @inbounds @simd for i=1:length(z) # trip count comes from z only; @inbounds drops bounds checks, so x and y must hold at least length(z) elements
z[i] = sqrt(x[i]^2+y[i]^2) # Euclidean norm of the pair (x[i], y[i])
end
end
f (generic function with 1 method)
julia> code_native(f,(Vector{Float32},Vector{Float32},Vector{Float32}))
.text
Filename: none
Source line: 0
; Native code for f(x, y, z) specialized on three Vector{Float32} (Intel syntax, x86-64, AVX).
; The three array arguments arrive in rdi, rsi, rdx. The store target below is derived from
; rdx, so rdx is z; rdi and rsi are presumably x and y in that order -- TODO confirm.
; As used here, [array + 8] is the element data pointer and [array + 16] the element count.
; Frame-pointer prologue; no stack locals are allocated.
push rbp
mov rbp, rsp
Source line: 49
; r11 = max(length(z), 0): rax is copied into r11 only when it is non-negative.
mov rax, qword ptr [rdx + 16]
xor r11d, r11d
test rax, rax
cmovns r11, rax
; Signed-overflow checks on r11 - 1 and then + 1 (presumably the length computation of the
; range 1:length(z)); either overflow branches to the throw path at L343.
dec r11
jo L343
inc r11
jo L343
Source line: 50
; Trip count <= 0: nothing to do, go straight to the epilogue.
test r11, r11
jle L333
Source line: 3
; r8, r9, r10 = data pointers of the arrays passed in rdi, rsi, rdx.
mov r8, qword ptr [rdi + 8]
mov r9, qword ptr [rsi + 8]
mov r10, qword ptr [rdx + 8]
; rsi is reused from here on as the number of elements already processed (starts at 0).
xor esi, esi
; Redundant re-test: r11 > 0 is already guaranteed by the jle above, so this je is never taken.
test r11, r11
je L263
xor esi, esi
Source line: 56
; rdi = trip count rounded down to a multiple of 32 = elements handled by the vector loop.
; If that is zero, skip the vector loop (rsi is still 0).
mov rdi, r11
and rdi, -32
je L258
; Pointers biased by +96 bytes so the four 32-byte accesses per iteration can use
; displacements -96, -64, -32 and 0.
; rcx -> destination (from r10); rsi and rdx -> the two sources (from r9 and r8).
lea rcx, qword ptr [r10 + 96]
lea rsi, qword ptr [r9 + 96]
lea rdx, qword ptr [r8 + 96]
; rax = elements remaining for the vector loop; counts down by 32 per iteration.
mov rax, r11
and rax, -32
; Multi-byte nop; presumably padding so that the loop head L112 is aligned.
nop word ptr cs:[rax + rax]
Source line: 3
; Vector loop, unrolled 4x: 4 ymm registers x 8 Float32 lanes = 32 elements per iteration.
; Load 32 elements of the first source ...
L112: vmovups ymm0, ymmword ptr [rdx - 96]
vmovups ymm1, ymmword ptr [rdx - 64]
vmovups ymm2, ymmword ptr [rdx - 32]
vmovups ymm3, ymmword ptr [rdx]
; ... and square them.
vmulps ymm0, ymm0, ymm0
vmulps ymm1, ymm1, ymm1
vmulps ymm2, ymm2, ymm2
vmulps ymm3, ymm3, ymm3
; Load 32 elements of the second source ...
vmovups ymm4, ymmword ptr [rsi - 96]
vmovups ymm5, ymmword ptr [rsi - 64]
vmovups ymm6, ymmword ptr [rsi - 32]
vmovups ymm7, ymmword ptr [rsi]
; ... and square them.
vmulps ymm4, ymm4, ymm4
vmulps ymm5, ymm5, ymm5
vmulps ymm6, ymm6, ymm6
vmulps ymm7, ymm7, ymm7
; Sum of squares.
vaddps ymm0, ymm0, ymm4
vaddps ymm1, ymm1, ymm5
vaddps ymm2, ymm2, ymm6
vaddps ymm3, ymm3, ymm7
; Packed square root: this is the vectorized sqrt the demo is about.
vsqrtps ymm0, ymm0
vsqrtps ymm1, ymm1
vsqrtps ymm2, ymm2
vsqrtps ymm3, ymm3
; Store the 32 results to the destination (unaligned stores).
vmovups ymmword ptr [rcx - 96], ymm0
vmovups ymmword ptr [rcx - 64], ymm1
vmovups ymmword ptr [rcx - 32], ymm2
vmovups ymmword ptr [rcx], ymm3
Source line: 56
; Advance all three pointers by 128 bytes (32 floats). "sub reg, -128" is used instead of
; "add reg, 128" because -128 fits a sign-extended 8-bit immediate and +128 does not.
sub rcx, -128
sub rsi, -128
sub rdx, -128
; 32 fewer elements remain; repeat until the vector portion is exhausted.
add rax, -32
jne L112
; rsi = number of elements the vector loop processed.
mov rsi, rdi
; If the vector loop covered every element, skip the scalar remainder loop.
L258: cmp r11, rsi
je L333
; Scalar remainder setup: rax -> destination, rcx/rdx -> sources, each starting at element
; index rsi (4 bytes per Float32); r11 = number of elements still to process.
L263: lea rax, qword ptr [r10 + 4*rsi]
lea rcx, qword ptr [r9 + 4*rsi]
lea rdx, qword ptr [r8 + 4*rsi]
sub r11, rsi
; Multi-byte nop; presumably padding so that the loop head L288 is aligned.
nop word ptr cs:[rax + rax]
Source line: 3
; Scalar loop: one Float32 per iteration, same computation as the vector loop.
L288: vmovss xmm0, dword ptr [rdx]
vmulss xmm0, xmm0, xmm0
vmovss xmm1, dword ptr [rcx]
vmulss xmm1, xmm1, xmm1
vaddss xmm0, xmm0, xmm1
vsqrtss xmm0, xmm0, xmm0
vmovss dword ptr [rax], xmm0
Source line: 57
; Step all three pointers to the next element and count down the remainder.
add rax, 4
add rcx, 4
add rdx, 4
dec r11
jne L288
; Return value: a fixed constant in eax; presumably the address of Julia's `nothing`
; (the value of the for loop) in this session -- TODO confirm.
L333: mov eax, 12394448
Source line: 64
; Epilogue: restore rbp and clear the upper halves of the ymm registers before returning.
pop rbp
vzeroupper
ret
Source line: 49
; Overflow path: rdi = exception object loaded from jl_overflow_exception, esi = 49 (matches
; the "Source line: 49" marker), then an indirect call through rax. No code follows the call
; in this listing, so it presumably does not return.
L343: movabs rax, jl_overflow_exception
mov rdi, qword ptr [rax]
movabs rax, jl_throw_with_superfluous_argument
mov esi, 49
call rax
julia>
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment