Skip to content

Instantly share code, notes, and snippets.

@haampie
Last active May 23, 2020 10:04
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save haampie/e968a9317b0f787a90d0338e27cae092 to your computer and use it in GitHub Desktop.
Save haampie/e968a9317b0f787a90d0338e27cae092 to your computer and use it in GitHub Desktop.
fast_transpose.jl
using SIMDPirates
# Peel `SubArray` wrappers off one layer at a time (views of views included) until the
# array the view was built on is reached. Any other value is handed back untouched.
canonicalize(A::SubArray) = canonicalize(parent(A))
canonicalize(A) = A
"""
    version_1!(B, A) -> B

Store the transpose of `A` in `B` and return `B`.

Complete 4×4 tiles are moved with SIMD loads and stores through raw pointers; a tile
is transposed by gathering lane `k` of the four loaded columns into the `k`-th output
vector. Rows and columns of `A` that do not fill a complete tile are copied one
element at a time, so sizes that are not multiples of 4 are handled as well.

# Arguments
- `B`: destination of size `(size(A, 2), size(A, 1))` with unit stride along its
  first dimension. It is assumed not to share memory with `A` (not checked).
- `A`: source with unit stride along its first dimension.

# Throws
- `DimensionMismatch` if `B` does not have the shape of the transpose of `A`.
- `ArgumentError` if `A` or `B` is not contiguous along its first dimension.
"""
function version_1!(B::AbstractMatrix{Float64}, A::AbstractMatrix{Float64})
    m, n = size(A)
    if size(B) != (n, m)
        throw(DimensionMismatch(
            "B has size $(size(B)), but the transpose of A requires $((n, m))"))
    end
    # The vector loads/stores below touch 4 consecutive Float64s of one column.
    if stride(A, 1) != 1 || stride(B, 1) != 1
        throw(ArgumentError("A and B must have unit stride along their first dimension"))
    end

    # Column strides in elements. They are taken from the arguments themselves (not
    # from a view's parent) so that views with a stepped column range are addressed
    # consistently with `pointer(A)` / `pointer(B)`.
    sA = stride(A, 2)
    sB = stride(B, 2)
    bytes = sizeof(Float64)
    mtiles = m ÷ 4
    ntiles = n ÷ 4
    mfull = 4 * mtiles    # rows of A covered by complete tiles
    nfull = 4 * ntiles    # columns of A covered by complete tiles

    # Keep both arrays rooted while raw pointers into them are in use.
    GC.@preserve A B begin
        baseA = pointer(A)
        baseB = pointer(B)

        for i = Base.OneTo(mtiles)
            # Tile row i: starts at A[4(i-1)+1, 1] and is written to B[1, 4(i-1)+1].
            pA = baseA + 4(i - 1) * bytes
            pB = baseB + 4(i - 1) * sB * bytes
            for _ = Base.OneTo(ntiles)
                v1 = vload(SVec{4,Float64}, pA + 0 * sA * bytes)
                v2 = vload(SVec{4,Float64}, pA + 1 * sA * bytes)
                v3 = vload(SVec{4,Float64}, pA + 2 * sA * bytes)
                v4 = vload(SVec{4,Float64}, pA + 3 * sA * bytes)
                # Lane k of every loaded column forms column k of the transposed tile.
                w1 = SVec{4,Float64}(v1[1], v2[1], v3[1], v4[1])
                w2 = SVec{4,Float64}(v1[2], v2[2], v3[2], v4[2])
                w3 = SVec{4,Float64}(v1[3], v2[3], v3[3], v4[3])
                w4 = SVec{4,Float64}(v1[4], v2[4], v3[4], v4[4])
                vstore!(pB + 0 * sB * bytes, w1)
                vstore!(pB + 1 * sB * bytes, w2)
                vstore!(pB + 2 * sB * bytes, w3)
                vstore!(pB + 3 * sB * bytes, w4)
                pA += 4 * sA * bytes    # four columns to the right in A
                pB += 4 * bytes         # four rows down in B
            end
        end

        # Leftover rows of A (below the last complete tile), all columns.
        for j = 1:n, i = (mfull + 1):m
            unsafe_store!(baseB, unsafe_load(baseA, i + (j - 1) * sA), j + (i - 1) * sB)
        end
        # Leftover columns of A (right of the last complete tile), tiled rows only.
        for j = (nfull + 1):n, i = 1:mfull
            unsafe_store!(baseB, unsafe_load(baseA, i + (j - 1) * sA), j + (i - 1) * sB)
        end
    end
    return B
end
"""
    version_2!(B, A) -> B

Store the transpose of `A` in `B` and return `B`.

Complete 4×4 tiles are moved with SIMD loads and stores through raw pointers and
transposed in registers with two rounds of `shufflevector`: the first round
interleaves lanes of column pairs, the second combines the halves into transposed
columns. Rows and columns of `A` that do not fill a complete tile are copied one
element at a time, so sizes that are not multiples of 4 are handled as well.

# Arguments
- `B`: destination of size `(size(A, 2), size(A, 1))` with unit stride along its
  first dimension. It is assumed not to share memory with `A` (not checked).
- `A`: source with unit stride along its first dimension.

# Throws
- `DimensionMismatch` if `B` does not have the shape of the transpose of `A`.
- `ArgumentError` if `A` or `B` is not contiguous along its first dimension.
"""
function version_2!(B::AbstractMatrix{Float64}, A::AbstractMatrix{Float64})
    m, n = size(A)
    if size(B) != (n, m)
        throw(DimensionMismatch(
            "B has size $(size(B)), but the transpose of A requires $((n, m))"))
    end
    # The vector loads/stores below touch 4 consecutive Float64s of one column.
    if stride(A, 1) != 1 || stride(B, 1) != 1
        throw(ArgumentError("A and B must have unit stride along their first dimension"))
    end

    # Column strides in elements. They are taken from the arguments themselves (not
    # from a view's parent) so that views with a stepped column range are addressed
    # consistently with `pointer(A)` / `pointer(B)`.
    sA = stride(A, 2)
    sB = stride(B, 2)
    bytes = sizeof(Float64)
    mtiles = m ÷ 4
    ntiles = n ÷ 4
    mfull = 4 * mtiles    # rows of A covered by complete tiles
    nfull = 4 * ntiles    # columns of A covered by complete tiles

    # Keep both arrays rooted while raw pointers into them are in use.
    GC.@preserve A B begin
        baseA = pointer(A)
        baseB = pointer(B)

        for i = Base.OneTo(mtiles)
            # Tile row i: starts at A[4(i-1)+1, 1] and is written to B[1, 4(i-1)+1].
            pA = baseA + 4(i - 1) * bytes
            pB = baseB + 4(i - 1) * sB * bytes
            for _ = Base.OneTo(ntiles)
                v00 = vload(SVec{4,Float64}, pA + 0 * sA * bytes)
                v01 = vload(SVec{4,Float64}, pA + 1 * sA * bytes)
                v02 = vload(SVec{4,Float64}, pA + 2 * sA * bytes)
                v03 = vload(SVec{4,Float64}, pA + 3 * sA * bytes)
                # Round 1: interleave even lanes / odd lanes of each column pair.
                v04 = SIMDPirates.shufflevector(v00, v01, Val{(0, 4, 2, 6)}())
                v05 = SIMDPirates.shufflevector(v00, v01, Val{(1, 5, 3, 7)}())
                v06 = SIMDPirates.shufflevector(v02, v03, Val{(0, 4, 2, 6)}())
                v07 = SIMDPirates.shufflevector(v02, v03, Val{(1, 5, 3, 7)}())
                # Round 2: join low halves / high halves into transposed columns.
                v08 = SIMDPirates.shufflevector(v04, v06, Val{(0, 1, 4, 5)}())
                v09 = SIMDPirates.shufflevector(v05, v07, Val{(0, 1, 4, 5)}())
                v10 = SIMDPirates.shufflevector(v04, v06, Val{(2, 3, 6, 7)}())
                v11 = SIMDPirates.shufflevector(v05, v07, Val{(2, 3, 6, 7)}())
                vstore!(pB + 0 * sB * bytes, v08)
                vstore!(pB + 1 * sB * bytes, v09)
                vstore!(pB + 2 * sB * bytes, v10)
                vstore!(pB + 3 * sB * bytes, v11)
                pA += 4 * sA * bytes    # four columns to the right in A
                pB += 4 * bytes         # four rows down in B
            end
        end

        # Leftover rows of A (below the last complete tile), all columns.
        for j = 1:n, i = (mfull + 1):m
            unsafe_store!(baseB, unsafe_load(baseA, i + (j - 1) * sA), j + (i - 1) * sB)
        end
        # Leftover columns of A (right of the last complete tile), tiled rows only.
        for j = (nfull + 1):n, i = 1:mfull
            unsafe_store!(baseB, unsafe_load(baseA, i + (j - 1) * sA), j + (i - 1) * sB)
        end
    end
    return B
end
using Test, LinearAlgebra, BenchmarkTools
"""
    do_tests()

Check `version_1!` and `version_2!` on fresh 1024×1024 random matrices.

Each kernel is called as `kernel(A, B)`, and the adjoint of `A` is then required to
equal `B` exactly. Returns the two `@test` results as a tuple, `version_1!` first.
"""
function do_tests()
    outcomes = map((version_1!, version_2!)) do transpose_kernel!
        A = rand(1024, 1024)
        B = rand(1024, 1024)
        transpose_kernel!(A, B)
        @test norm(A' - B) == 0
    end
    return outcomes
end
"""
    benchmark(ns = 2 .^ (5:10))

Time `version_1!` and `version_2!` with `@belapsed` on `n`×`n` random matrices for
every `n` in `ns`; the `setup` expression creates fresh inputs for the measurement.

Returns `(ns, timing_1, timing_2)`, where `timing_k[i]` is the `@belapsed` result of
`version_k!` for the `i`-th size.
"""
function benchmark(ns = 2 .^ (5:10))
    timing_1, timing_2 = Float64[], Float64[]
    for n in ns
        push!(timing_1, @belapsed version_1!(A, B) setup=(A=rand($n, $n);B=rand($n, $n)))
        push!(timing_2, @belapsed version_2!(A, B) setup=(A=rand($n, $n);B=rand($n, $n)))
    end
    return ns, timing_1, timing_2
end
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment