Skip to content

Instantly share code, notes, and snippets.

@ArchRobison
Created October 2, 2015 16:35
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save ArchRobison/4fe680d43059e7356963 to your computer and use it in GitHub Desktop.
Save ArchRobison/4fe680d43059e7356963 to your computer and use it in GitHub Desktop.
# Four-lane single-precision tuple type used by every kernel in this script.
# Declared `const` so that code reading `T` at run time (e.g. `zero(T)` inside
# a function) sees a binding of known type rather than an untyped global.
const T = NTuple{4,Float32}

"""
    zero(::Type{T})

Return the all-zero four-lane tuple `(0.0f0, 0.0f0, 0.0f0, 0.0f0)`.
"""
zero(::Type{T}) = (0.0f0, 0.0f0, 0.0f0, 0.0f0)
"""
    add(a::T, b::T)

Return the lane-by-lane sum of the four-element tuples `a` and `b`.
"""
function add(a::T, b::T)
    a1, a2, a3, a4 = a
    b1, b2, b3, b4 = b
    return (a1 + b1, a2 + b2, a3 + b3, a4 + b4)
end
"""
    mul(a::T, b::T)

Return the lane-by-lane product of the four-element tuples `a` and `b`.
"""
function mul(a::T, b::T)
    p1, p2, p3, p4 = a
    q1, q2, q3, q4 = b
    return (p1 * q1, p2 * q2, p3 * q3, p4 * q4)
end
"""
    muladd(a::T, b::T, c::T)

Return `add(a, mul(b, c))`, i.e. `a + b * c` computed lane by lane.
"""
# NOTE(review): `@fastmath` rewrites operators that appear literally in the
# annotated expression; `add` and `mul` are ordinary calls here, so the macro
# probably has no effect on the arithmetic inside them - confirm.
muladd(a::T, b::T, c::T) = @fastmath add(a, mul(b, c))
"""
    vec(a::T, x::Vector{T}, y::Vector{T})

Update `y` in place so that `y[i] = muladd(y[i], a, x[i])` for every index `i`
of `x`, then return `y`. Elements of `y` beyond `length(x)` are left untouched.

# Throws
- `DimensionMismatch` if `y` has fewer elements than `x`.
"""
function vec(a::T, x::Vector{T}, y::Vector{T})
    # The loop below runs with bounds checking disabled, so make sure up front
    # that every index of `x` is also a valid index of `y`.
    length(y) >= length(x) || throw(DimensionMismatch(
        "y has $(length(y)) elements but x has $(length(x))"))
    @inbounds for i in eachindex(x)
        y[i] = muladd(y[i], a, x[i])
    end
    return y
end
"""
    f()

Benchmark driver: build two 1000-element vectors of zero tuples and time
100000 calls of `vec` on them. `@time` prints the elapsed time and allocation
summary; nothing is returned.
"""
function f()
    u = fill((0.0f0, 0.0f0, 0.0f0, 0.0f0), 1000)
    v = fill((0.0f0, 0.0f0, 0.0f0, 0.0f0), 1000)
    # Loop-invariant scale factor: evaluate it once, outside the timed region,
    # so the lookup of the global `T` and the `zero` call are not repeated (and
    # not measured) on every iteration.
    a = zero(T)
    @time for i in 1:100000
        vec(a, v, u)
    end
    return nothing
end
# Print the native code generated for `vec` with these argument types.
code_native(vec, (T, Vector{T}, Vector{T}))

# Run the timed benchmark three times; the first run may include one-off
# compilation overhead, so the later runs are the ones to compare.
for _ in 1:3
    f()
end
L64:L176:L196: .text
pushq %rax
movq %fs:40, %rax
movq %rax, (%rsp)
movq 8(%rdi), %rax
cmpq $1, %rax
jl L176
movq (%rsi), %rcx
movq (%rdi), %rdx
vmovshdup %xmm0, %xmm8 # xmm8 = xmm0[1,1,3,3]
vpermilpd $1, %xmm0, %xmm9 # xmm9 = xmm0[1,0]
vpermilps $231, %xmm0, %xmm3 # xmm3 = xmm0[3,1,2,3]
nopw %cs:(%rax,%rax)
vmovups (%rcx), %xmm4
vmovups (%rdx), %xmm5
vmulss %xmm5, %xmm0, %xmm6
vmovshdup %xmm5, %xmm7 # xmm7 = xmm5[1,1,3,3]
vmulss %xmm7, %xmm8, %xmm7
vpermilpd $1, %xmm5, %xmm1 # xmm1 = xmm5[1,0]
vmulss %xmm1, %xmm9, %xmm1
vpermilps $231, %xmm5, %xmm5 # xmm5 = xmm5[3,1,2,3]
vmulss %xmm5, %xmm3, %xmm5
vaddss %xmm6, %xmm4, %xmm6
vmovshdup %xmm4, %xmm2 # xmm2 = xmm4[1,1,3,3]
vaddss %xmm7, %xmm2, %xmm2
vpermilpd $1, %xmm4, %xmm7 # xmm7 = xmm4[1,0]
vaddss %xmm1, %xmm7, %xmm1
vpermilps $231, %xmm4, %xmm4 # xmm4 = xmm4[3,1,2,3]
vaddss %xmm5, %xmm4, %xmm4
vinsertps $16, %xmm2, %xmm6, %xmm2 # xmm2 = xmm6[0],xmm2[0],xmm6[2,3]
vinsertps $32, %xmm1, %xmm2, %xmm1 # xmm1 = xmm2[0,1],xmm1[0],xmm2[3]
vinsertps $48, %xmm4, %xmm1, %xmm1 # xmm1 = xmm1[0,1,2],xmm4[0]
vmovups %xmm1, (%rcx)
addq $-1, %rax
addq $16, %rdx
addq $16, %rcx
cmpq $0, %rax
jne L64
movq %fs:40, %rax
cmpq (%rsp), %rax
jne L196
movq %rsi, %rax
popq %rdx
retq
movabsq $__stack_chk_fail, %rax
callq *%rax
pushq %rax
movq (%rsi), %rax
vmovaps (%rax), %xmm0
movq 8(%rsi), %rdi
movq 16(%rsi), %rsi
movabsq $vec, %rax
callq *%rax
popq %rdx
retq
0.317651 seconds (100.03 k allocations: 3.054 MB)
0.297012 seconds (100.00 k allocations: 3.052 MB)
0.297075 seconds (100.00 k allocations: 3.052 MB)
L32:L62:L82: .text
pushq %rax
movq %fs:40, %rax
movq %rax, (%rsp)
movq 8(%rdi), %rax
cmpq $1, %rax
jl L62
movq (%rsi), %rcx
movq (%rdi), %rdx
nop
vmulps (%rdx), %xmm0, %xmm1
vaddps (%rcx), %xmm1, %xmm1
vmovups %xmm1, (%rcx)
addq $-1, %rax
addq $16, %rdx
addq $16, %rcx
cmpq $0, %rax
jne L32
movq %fs:40, %rax
cmpq (%rsp), %rax
jne L82
movq %rsi, %rax
popq %rdx
retq
movabsq $__stack_chk_fail, %rax
callq *%rax
nop
pushq %rax
movq (%rsi), %rax
vmovaps (%rax), %xmm0
movq 8(%rsi), %rdi
movq 16(%rsi), %rsi
movabsq $vec, %rax
callq *%rax
popq %rdx
retq
0.111175 seconds (100.03 k allocations: 3.054 MB)
0.084908 seconds (100.00 k allocations: 3.052 MB)
0.087529 seconds (100.00 k allocations: 3.052 MB)
L48:L123: .text
movq 8(%rsi), %rax
cmpq $1, %rax
jl L123
movq (%rdx), %rcx
movq (%rsi), %rsi
vmovss (%rdi), %xmm0 # xmm0 = mem[0],zero,zero,zero
vmovss 4(%rdi), %xmm1 # xmm1 = mem[0],zero,zero,zero
vmovss 8(%rdi), %xmm2 # xmm2 = mem[0],zero,zero,zero
vmovss 12(%rdi), %xmm3 # xmm3 = mem[0],zero,zero,zero
nopw %cs:(%rax,%rax)
vmulss (%rsi), %xmm0, %xmm4
vmulss 4(%rsi), %xmm1, %xmm5
vmulss 8(%rsi), %xmm2, %xmm6
vmulss 12(%rsi), %xmm3, %xmm7
vaddss (%rcx), %xmm4, %xmm4
vaddss 4(%rcx), %xmm5, %xmm5
vaddss 8(%rcx), %xmm6, %xmm6
vaddss 12(%rcx), %xmm7, %xmm7
vmovss %xmm7, 12(%rcx)
vmovss %xmm6, 8(%rcx)
vmovss %xmm5, 4(%rcx)
vmovss %xmm4, (%rcx)
addq $-1, %rax
addq $16, %rsi
addq $16, %rcx
cmpq $0, %rax
jne L48
movq %rdx, %rax
retq
nop
pushq %rax
movq (%rsi), %rdi
movq 8(%rsi), %rax
movq 16(%rsi), %rdx
movabsq $vec, %rcx
movq %rax, %rsi
callq *%rcx
popq %rdx
retq
0.210977 seconds (100.03 k allocations: 3.054 MB)
0.189340 seconds (100.00 k allocations: 3.052 MB)
0.188498 seconds (100.00 k allocations: 3.052 MB)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment