Created
January 19, 2015 15:19
-
-
Save ArchRobison/1442861e2e062ff10309 to your computer and use it in GitHub Desktop.
Demo of vectorizing sqrt using LLVM 3.5.0
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# f(x, y, z): for every i in 1:length(z), store sqrt(x[i]^2 + y[i]^2) into z[i].
# The loop is the last expression of the function, so f returns `nothing`.
# Only length(z) drives the trip count and @inbounds removes the bounds checks,
# so the caller must make sure x and y hold at least length(z) elements.
# @simd and @fastmath give the compiler permission to reorder and vectorize the
# floating-point work in the loop body.
julia> function f(x,y,z) | |
@fastmath @inbounds @simd for i=1:length(z) | |
z[i] = sqrt(x[i]^2+y[i]^2) | |
end | |
end | |
f (generic function with 1 method) | |
julia> code_native(f,(Vector{Float32},Vector{Float32},Vector{Float32})) | |
; Native code printed by code_native for f(::Vector{Float32}, ::Vector{Float32},
; ::Vector{Float32}); x86-64, Intel operand order (dst, src), AVX instructions.
; This is compiler output reproduced verbatim; only these ';' comments were added.
; Labels of the form Lnnn and the "Source line:" markers come from the listing itself.
;
; Register roles as used below:
;   rdi, rsi, rdx on entry : the three incoming arguments
;                            (presumably x, y, z in SysV order -- TODO confirm)
;   r8, r9                 : base addresses of the two input streams
;   r10                    : base address of the output stream
;   r11                    : trip count n, later the scalar-loop remainder
;   ymm0-ymm7              : 4 x 8 single-precision lanes = 32 elements per vector iteration
.text | |
Filename: none | |
Source line: 0 | |
; Standard frame-pointer prologue.
push rbp | |
mov rbp, rsp | |
Source line: 49 | |
; rax = qword at [third argument + 16]; presumably length(z) from the array
; header -- the field layout is not shown here, confirm against the runtime.
mov rax, qword ptr [rdx + 16] | |
; r11 = max(rax, 0): start from zero and take rax only when it is non-negative.
xor r11d, r11d | |
test rax, rax | |
cmovns r11, rax | |
; Two overflow-checked steps (n - 1, then + 1). r11 is unchanged when neither
; overflows; on signed overflow control goes to the throw path at L343.
dec r11 | |
jo L343 | |
inc r11 | |
jo L343 | |
Source line: 50 | |
; Nothing to do for n <= 0: go straight to the epilogue.
test r11, r11 | |
jle L333 | |
Source line: 3 | |
; Load the qword at offset 8 of each argument. These values are used below as
; the base addresses for the float loads (r8, r9) and the float stores (r10).
mov r8, qword ptr [rdi + 8] | |
mov r9, qword ptr [rsi + 8] | |
mov r10, qword ptr [rdx + 8] | |
; rsi = 0 elements processed so far. r11 is already known to be > 0 at this
; point (jle above), so the je below is never taken and the second xor is redundant.
xor esi, esi | |
test r11, r11 | |
je L263 | |
xor esi, esi | |
Source line: 56 | |
; rdi = n rounded down to a multiple of 32 = elements the vector loop handles.
; If that is zero, skip the vector loop (rsi is still 0 at L258).
mov rdi, r11 | |
and rdi, -32 | |
je L258 | |
; Stream pointers are biased by +96 bytes so the four 32-byte accesses per
; stream can use displacements -96, -64, -32 and 0.
; rcx = output stream, rsi = second input stream, rdx = first input stream.
lea rcx, qword ptr [r10 + 96] | |
lea rsi, qword ptr [r9 + 96] | |
lea rdx, qword ptr [r8 + 96] | |
; rax = down-counter of elements left for the vector loop (same value as rdi).
mov rax, r11 | |
and rax, -32 | |
; Multi-byte no-op; presumably padding to align the loop head -- confirm.
nop word ptr cs:[rax + rax] | |
Source line: 3 | |
; ---- Vector loop: 32 floats per iteration, unrolled 4x over ymm registers ----
; Load 4 x 8 floats from the first input stream and square them.
L112: vmovups ymm0, ymmword ptr [rdx - 96] | |
vmovups ymm1, ymmword ptr [rdx - 64] | |
vmovups ymm2, ymmword ptr [rdx - 32] | |
vmovups ymm3, ymmword ptr [rdx] | |
vmulps ymm0, ymm0, ymm0 | |
vmulps ymm1, ymm1, ymm1 | |
vmulps ymm2, ymm2, ymm2 | |
vmulps ymm3, ymm3, ymm3 | |
; Load 4 x 8 floats from the second input stream and square them.
vmovups ymm4, ymmword ptr [rsi - 96] | |
vmovups ymm5, ymmword ptr [rsi - 64] | |
vmovups ymm6, ymmword ptr [rsi - 32] | |
vmovups ymm7, ymmword ptr [rsi] | |
vmulps ymm4, ymm4, ymm4 | |
vmulps ymm5, ymm5, ymm5 | |
vmulps ymm6, ymm6, ymm6 | |
vmulps ymm7, ymm7, ymm7 | |
; Sum of squares, lane by lane.
vaddps ymm0, ymm0, ymm4 | |
vaddps ymm1, ymm1, ymm5 | |
vaddps ymm2, ymm2, ymm6 | |
vaddps ymm3, ymm3, ymm7 | |
; Packed square root of each sum.
vsqrtps ymm0, ymm0 | |
vsqrtps ymm1, ymm1 | |
vsqrtps ymm2, ymm2 | |
vsqrtps ymm3, ymm3 | |
; Store the 32 results to the output stream (unaligned stores).
vmovups ymmword ptr [rcx - 96], ymm0 | |
vmovups ymmword ptr [rcx - 64], ymm1 | |
vmovups ymmword ptr [rcx - 32], ymm2 | |
vmovups ymmword ptr [rcx], ymm3 | |
Source line: 56 | |
; Advance all three pointers by 128 bytes (32 floats). "sub reg, -128" is used
; because -128 fits a sign-extended 8-bit immediate while +128 does not.
sub rcx, -128 | |
sub rsi, -128 | |
sub rdx, -128 | |
; 32 fewer elements to go; repeat until the counter reaches zero.
add rax, -32 | |
jne L112 | |
; rsi = number of elements the vector loop processed.
mov rsi, rdi | |
; Reached with rsi = 0 (vector loop skipped) or rsi = n & -32.
; If that already equals n there is no remainder.
L258: cmp r11, rsi | |
je L333 | |
; ---- Scalar remainder loop: one float per iteration ----
; Point at element rsi of each stream (4 bytes per element):
; rax = output, rcx = second input, rdx = first input.
L263: lea rax, qword ptr [r10 + 4*rsi] | |
lea rcx, qword ptr [r9 + 4*rsi] | |
lea rdx, qword ptr [r8 + 4*rsi] | |
; r11 = elements still to process.
sub r11, rsi | |
; Multi-byte no-op; presumably loop-head alignment padding -- confirm.
nop word ptr cs:[rax + rax] | |
Source line: 3 | |
; out = sqrt(a*a + b*b) for a single element.
L288: vmovss xmm0, dword ptr [rdx] | |
vmulss xmm0, xmm0, xmm0 | |
vmovss xmm1, dword ptr [rcx] | |
vmulss xmm1, xmm1, xmm1 | |
vaddss xmm0, xmm0, xmm1 | |
vsqrtss xmm0, xmm0, xmm0 | |
vmovss dword ptr [rax], xmm0 | |
Source line: 57 | |
; Step every pointer by one float and count down the remainder.
add rax, 4 | |
add rcx, 4 | |
add rdx, 4 | |
dec r11 | |
jne L288 | |
; ---- Epilogue ----
; rax = constant 12394448 (the 32-bit move zero-extends into rax). Presumably the
; address of the value f returns (`nothing`) in this particular session -- TODO confirm.
L333: mov eax, 12394448 | |
Source line: 64 | |
pop rbp | |
; Zero the upper halves of all ymm registers before returning, so the caller
; does not pay AVX/SSE transition penalties.
vzeroupper | |
ret | |
Source line: 49 | |
; ---- Overflow path, reached from the jo checks on the trip count ----
; rdi = value stored at jl_overflow_exception, esi = 49 (matches the
; "Source line: 49" marker), then an indirect call to
; jl_throw_with_superfluous_argument. No instruction follows the call in this
; listing; presumably the callee does not return -- confirm.
L343: movabs rax, jl_overflow_exception | |
mov rdi, qword ptr [rax] | |
movabs rax, jl_throw_with_superfluous_argument | |
mov esi, 49 | |
call rax | |
julia> |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment