Skip to content

Instantly share code, notes, and snippets.

@haampie
Last active September 27, 2018 12:32
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save haampie/def0a6dbaf3d9ab3980ee8876c3d7be0 to your computer and use it in GitHub Desktop.
Save haampie/def0a6dbaf3d9ab3980ee8876c3d7be0 to your computer and use it in GitHub Desktop.
fusing_perf.jl
using BenchmarkTools
using LinearAlgebra
using LinearAlgebra: givensAlgorithm
"""
I want to apply 4 'fused' Givens rotations to the 4 columns of a matrix Q. Here Q
is an n x 4 matrix. In the benchmarks I compare the number of GFLOP/s when the
rotations are applied to Q directly (vertical) versus when Q is first
transposed (horizontal).
In the 'vertical' case: the access pattern is not contiguous.
In the 'horizontal' case: the access pattern is perfectly contiguous.
However, the generated code for the contiguous case does not use AVX operations.
My benchmark results in GFLOP/s, ordered as (horizontal, vertical, peak):
(12.02936857562408, 39.85624546661375, 50.41209223009482)
So the non-contiguous example is 3.3x faster and comes to 79% of GEMM performance.
"""
function bench_panel(n = 256)
    # One set of rotation coefficients is drawn up front and shared by both timings.
    rotations = random_fused_rotations()

    # Restrict BLAS to a single thread before measuring the reference rate.
    BLAS.set_num_threads(1)
    peak_gflops = peakflops() / 1e9

    # 24 flop per row of Q
    flops_per_call = 24 * n

    # `setup` builds the random input: 4 x n for the horizontal layout,
    # n x 4 for the vertical layout.
    t_horizontal = @belapsed apply_fused_packed_horizontal!(Q, $rotations) setup = (Q = rand(4, $n))
    t_vertical = @belapsed apply_fused_packed_vertical!(Q, $rotations) setup = (Q = rand($n, 4))

    # Convert an elapsed time in seconds into GFLOP/s for one call.
    gflops(seconds) = flops_per_call / seconds / 1e9

    return gflops(t_horizontal), gflops(t_vertical), peak_gflops
end
"""
    Fused2x2{C,S}

Immutable bundle of four coefficient pairs `(c1, s1)` through `(c4, s4)`.

Every `c` field has type `C` and every `s` field has type `S`. The fields are
stored, and accepted by the default constructor, in the order
`c1, s1, c2, s2, c3, s3, c4, s4`.
"""
struct Fused2x2{C,S}
    # Coefficients of the first pair.
    c1::C; s1::S
    # Coefficients of the second pair.
    c2::C; s2::S
    # Coefficients of the third pair.
    c3::C; s3::S
    # Coefficients of the fourth pair.
    c4::C; s4::S
end
"""
    generate_rotation()

Return the first two values produced by `givensAlgorithm` for two numbers drawn
with `rand()`, as a 2-tuple `(c, s)`. The third value is discarded.
"""
function generate_rotation()
    c, s, _ = givensAlgorithm(rand(), rand())
    return (c, s)
end
"""
    random_fused_rotations()

Build a `Fused2x2` from four independent calls to `generate_rotation()`. The
k-th call supplies the `(ck, sk)` pair.
"""
function random_fused_rotations()
    c1, s1 = generate_rotation()
    c2, s2 = generate_rotation()
    c3, s3 = generate_rotation()
    c4, s4 = generate_rotation()
    return Fused2x2(c1, s1, c2, s2, c3, s3, c4, s4)
end
# Rotate the pair (x, y) with coefficients (c, s) and return the rotated pair.
@inline _rotate_pair(x, y, c, s) = (muladd(x, c, y * s'), muladd(-x, s, y * c))

"""
    kernel(a0, a1, a2, a3, G::Fused2x2)

Apply the four rotations stored in `G` to the values `a0, a1, a2, a3` and return
the four updated values as a tuple, in the same order as the inputs.

The rotations are chained: rotation 1 acts on `(a1, a2)`, rotation 2 on the
updated second value and `a3`, rotation 3 on `a0` and the updated first value,
and rotation 4 on the two values in the middle once more.
"""
@inline function kernel(a0, a1, a2, a3, G::Fused2x2)
    # Rotation 1 mixes the two middle inputs.
    mid1, mid2 = _rotate_pair(a1, a2, G.c1, G.s1)

    # Rotation 2 mixes the updated third value with the last input; the last
    # value is final after this step.
    mid2, out3 = _rotate_pair(mid2, a3, G.c2, G.s2)

    # Rotation 3 mixes the first input with the updated second value; the first
    # value is final after this step.
    out0, mid1 = _rotate_pair(a0, mid1, G.c3, G.s3)

    # Rotation 4 mixes the two middle values a second time.
    out1, out2 = _rotate_pair(mid1, mid2, G.c4, G.s4)

    return out0, out1, out2, out3
end
"""
    apply_fused_packed_horizontal!(Q, G)

Apply the fused rotations `G` in place to every column of `Q`.

For each column index `j`, the entries `Q[1, j]`, `Q[2, j]`, `Q[3, j]` and
`Q[4, j]` are passed through `kernel` together with `G` and overwritten with the
four values it returns. Any rows other than `1:4` are left untouched.

Returns `nothing`.

# Throws
- `BoundsError` if rows `1:4` are not valid row indices of `Q`.
"""
function apply_fused_packed_horizontal!(Q, G)
    # The loop runs under `@inbounds` with hard-coded row indices, so they are
    # validated once here; otherwise a too-short `Q` would be accessed out of
    # bounds without any error.
    checkbounds(Q, 1:4, axes(Q, 2))

    @inbounds for j in axes(Q, 2)
        a0 = Q[1, j]
        a1 = Q[2, j]
        a2 = Q[3, j]
        a3 = Q[4, j]

        b0, b1, b2, b3 = kernel(a0, a1, a2, a3, G)

        Q[1, j] = b0
        Q[2, j] = b1
        Q[3, j] = b2
        Q[4, j] = b3
    end
    return nothing
end
"""
    apply_fused_packed_vertical!(Q, G)

Apply the fused rotations `G` in place to every row of `Q`.

For each row index `j`, the entries `Q[j, 1]`, `Q[j, 2]`, `Q[j, 3]` and
`Q[j, 4]` are passed through `kernel` together with `G` and overwritten with the
four values it returns. Any columns other than `1:4` are left untouched.

Returns `nothing`.

# Throws
- `BoundsError` if columns `1:4` are not valid column indices of `Q`.
"""
function apply_fused_packed_vertical!(Q, G)
    # The loop runs under `@inbounds` with hard-coded column indices, so they are
    # validated once here; otherwise a too-narrow `Q` would be accessed out of
    # bounds without any error.
    checkbounds(Q, axes(Q, 1), 1:4)

    @inbounds for j in axes(Q, 1)
        a0 = Q[j, 1]
        a1 = Q[j, 2]
        a2 = Q[j, 3]
        a3 = Q[j, 4]

        b0, b1, b2, b3 = kernel(a0, a1, a2, a3, G)

        Q[j, 1] = b0
        Q[j, 2] = b1
        Q[j, 3] = b2
        Q[j, 4] = b3
    end
    return nothing
end
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment