Created
October 2, 2015 16:35
-
-
Save ArchRobison/4fe680d43059e7356963 to your computer and use it in GitHub Desktop.
Impact of https://github.com/ArchRobison/julia/tree/adr/simdtuple changes
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# 4-lane single-precision tuple type used by every function in this script.
# Declared `const` so that code reading `T` from inside a function (see `f`)
# refers to a binding of known type instead of an untyped global.
const T = NTuple{4,Float32}

# All-zero value of `T`.
# NOTE(review): no `import Base.zero` is visible in this script, so this
# presumably defines a script-local `zero` rather than extending Base's --
# confirm.
zero(::Type{T}) = (0.0f0, 0.0f0, 0.0f0, 0.0f0)
"""
    add(a::T, b::T)

Return the lane-wise sum of the two 4-tuples `a` and `b`.
"""
function add( a::T, b::T )
    a1, a2, a3, a4 = a
    b1, b2, b3, b4 = b
    return (a1 + b1, a2 + b2, a3 + b3, a4 + b4)
end
"""
    mul(a::T, b::T)

Return the lane-wise product of the two 4-tuples `a` and `b`.
"""
function mul( a::T, b::T )
    a1, a2, a3, a4 = a
    b1, b2, b3, b4 = b
    return (a1 * b1, a2 * b2, a3 * b3, a4 * b4)
end
"""
    muladd(a::T, b::T, c::T)

Return `add(a, mul(b, c))`, i.e. `a + b*c` computed lane by lane.
"""
function muladd( a::T, b::T, c::T )
    product = mul(b, c)
    # NOTE(review): `@fastmath` rewrites the arithmetic operators written inside
    # the annotated expression; `add` is an ordinary call here, so the macro
    # likely does not relax the `+`/`*` inside `add`/`mul` -- confirm.
    return @fastmath add(a, product)
end
"""
    vec(a::T, x::Vector{T}, y::Vector{T})

Update `y` in place so that `y[i] = muladd(y[i], a, x[i])` for every index `i`
of `x`, and return `y`. Elements of `y` beyond `length(x)` are left untouched.

# Throws
- `DimensionMismatch` if `y` has fewer elements than `x`.
"""
function vec( a::T, x::Vector{T}, y::Vector{T} )
    # The loop below disables bounds checks while indexing both vectors, so the
    # lengths are validated once up front; a shorter `y` would otherwise be
    # read and written out of bounds.
    if length(y) < length(x)
        throw(DimensionMismatch(
            "y has $(length(y)) elements but x has $(length(x))"))
    end
    @inbounds for i in eachindex(x)
        y[i] = muladd(y[i], a, x[i])
    end
    return y
end
"""
    f()

Benchmark driver: build two 1000-element vectors of all-zero 4-tuples and time
100000 calls of `vec` on them with `@time`, which prints the measurement.
"""
function f()
    lanes = (0.0f0, 0.0f0, 0.0f0, 0.0f0)
    u = fill(lanes, 1000)
    v = fill(lanes, 1000)
    # The scale factor is loop-invariant; compute it once, outside the timed
    # region, so that `@time` measures only the `vec` calls.
    a = zero(T)
    @time for i = 1:100000
        vec(a, v, u)
    end
end
# Dump the machine code generated for `vec` on 4-lane Float32 tuples.
code_native(vec, (T, Vector{T}, Vector{T}))

# Run the benchmark three times; the first run also pays for compiling `f`,
# so the later runs are the representative timings.
for trial = 1:3
    f()
end
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
L64:L176:L196: .text | |
pushq %rax | |
movq %fs:40, %rax | |
movq %rax, (%rsp) | |
movq 8(%rdi), %rax | |
cmpq $1, %rax | |
jl L176 | |
movq (%rsi), %rcx | |
movq (%rdi), %rdx | |
vmovshdup %xmm0, %xmm8 # xmm8 = xmm0[1,1,3,3] | |
vpermilpd $1, %xmm0, %xmm9 # xmm9 = xmm0[1,0] | |
vpermilps $231, %xmm0, %xmm3 # xmm3 = xmm0[3,1,2,3] | |
nopw %cs:(%rax,%rax) | |
vmovups (%rcx), %xmm4 | |
vmovups (%rdx), %xmm5 | |
vmulss %xmm5, %xmm0, %xmm6 | |
vmovshdup %xmm5, %xmm7 # xmm7 = xmm5[1,1,3,3] | |
vmulss %xmm7, %xmm8, %xmm7 | |
vpermilpd $1, %xmm5, %xmm1 # xmm1 = xmm5[1,0] | |
vmulss %xmm1, %xmm9, %xmm1 | |
vpermilps $231, %xmm5, %xmm5 # xmm5 = xmm5[3,1,2,3] | |
vmulss %xmm5, %xmm3, %xmm5 | |
vaddss %xmm6, %xmm4, %xmm6 | |
vmovshdup %xmm4, %xmm2 # xmm2 = xmm4[1,1,3,3] | |
vaddss %xmm7, %xmm2, %xmm2 | |
vpermilpd $1, %xmm4, %xmm7 # xmm7 = xmm4[1,0] | |
vaddss %xmm1, %xmm7, %xmm1 | |
vpermilps $231, %xmm4, %xmm4 # xmm4 = xmm4[3,1,2,3] | |
vaddss %xmm5, %xmm4, %xmm4 | |
vinsertps $16, %xmm2, %xmm6, %xmm2 # xmm2 = xmm6[0],xmm2[0],xmm6[2,3] | |
vinsertps $32, %xmm1, %xmm2, %xmm1 # xmm1 = xmm2[0,1],xmm1[0],xmm2[3] | |
vinsertps $48, %xmm4, %xmm1, %xmm1 # xmm1 = xmm1[0,1,2],xmm4[0] | |
vmovups %xmm1, (%rcx) | |
addq $-1, %rax | |
addq $16, %rdx | |
addq $16, %rcx | |
cmpq $0, %rax | |
jne L64 | |
movq %fs:40, %rax | |
cmpq (%rsp), %rax | |
jne L196 | |
movq %rsi, %rax | |
popq %rdx | |
retq | |
movabsq $__stack_chk_fail, %rax | |
callq *%rax | |
pushq %rax | |
movq (%rsi), %rax | |
vmovaps (%rax), %xmm0 | |
movq 8(%rsi), %rdi | |
movq 16(%rsi), %rsi | |
movabsq $vec, %rax | |
callq *%rax | |
popq %rdx | |
retq | |
0.317651 seconds (100.03 k allocations: 3.054 MB) | |
0.297012 seconds (100.00 k allocations: 3.052 MB) | |
0.297075 seconds (100.00 k allocations: 3.052 MB) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
L32:L62:L82: .text | |
pushq %rax | |
movq %fs:40, %rax | |
movq %rax, (%rsp) | |
movq 8(%rdi), %rax | |
cmpq $1, %rax | |
jl L62 | |
movq (%rsi), %rcx | |
movq (%rdi), %rdx | |
nop | |
vmulps (%rdx), %xmm0, %xmm1 | |
vaddps (%rcx), %xmm1, %xmm1 | |
vmovups %xmm1, (%rcx) | |
addq $-1, %rax | |
addq $16, %rdx | |
addq $16, %rcx | |
cmpq $0, %rax | |
jne L32 | |
movq %fs:40, %rax | |
cmpq (%rsp), %rax | |
jne L82 | |
movq %rsi, %rax | |
popq %rdx | |
retq | |
movabsq $__stack_chk_fail, %rax | |
callq *%rax | |
nop | |
pushq %rax | |
movq (%rsi), %rax | |
vmovaps (%rax), %xmm0 | |
movq 8(%rsi), %rdi | |
movq 16(%rsi), %rsi | |
movabsq $vec, %rax | |
callq *%rax | |
popq %rdx | |
retq | |
0.111175 seconds (100.03 k allocations: 3.054 MB) | |
0.084908 seconds (100.00 k allocations: 3.052 MB) | |
0.087529 seconds (100.00 k allocations: 3.052 MB) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
L48:L123: .text | |
movq 8(%rsi), %rax | |
cmpq $1, %rax | |
jl L123 | |
movq (%rdx), %rcx | |
movq (%rsi), %rsi | |
vmovss (%rdi), %xmm0 # xmm0 = mem[0],zero,zero,zero | |
vmovss 4(%rdi), %xmm1 # xmm1 = mem[0],zero,zero,zero | |
vmovss 8(%rdi), %xmm2 # xmm2 = mem[0],zero,zero,zero | |
vmovss 12(%rdi), %xmm3 # xmm3 = mem[0],zero,zero,zero | |
nopw %cs:(%rax,%rax) | |
vmulss (%rsi), %xmm0, %xmm4 | |
vmulss 4(%rsi), %xmm1, %xmm5 | |
vmulss 8(%rsi), %xmm2, %xmm6 | |
vmulss 12(%rsi), %xmm3, %xmm7 | |
vaddss (%rcx), %xmm4, %xmm4 | |
vaddss 4(%rcx), %xmm5, %xmm5 | |
vaddss 8(%rcx), %xmm6, %xmm6 | |
vaddss 12(%rcx), %xmm7, %xmm7 | |
vmovss %xmm7, 12(%rcx) | |
vmovss %xmm6, 8(%rcx) | |
vmovss %xmm5, 4(%rcx) | |
vmovss %xmm4, (%rcx) | |
addq $-1, %rax | |
addq $16, %rsi | |
addq $16, %rcx | |
cmpq $0, %rax | |
jne L48 | |
movq %rdx, %rax | |
retq | |
nop | |
pushq %rax | |
movq (%rsi), %rdi | |
movq 8(%rsi), %rax | |
movq 16(%rsi), %rdx | |
movabsq $vec, %rcx | |
movq %rax, %rsi | |
callq *%rcx | |
popq %rdx | |
retq | |
0.210977 seconds (100.03 k allocations: 3.054 MB) | |
0.189340 seconds (100.00 k allocations: 3.052 MB) | |
0.188498 seconds (100.00 k allocations: 3.052 MB) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment