# haampie/fusing_perf.jl

## fusing_perf.jl
using BenchmarkTools
using LinearAlgebra
using LinearAlgebra: givensAlgorithm

"""
I want to apply 4 'fused' Givens rotations to 4 columns of matrix Q. Here Q
is an n x 4 matrix. In the benchmarks I compare the number of GFLOP/s when the
rotations are applied to Q directly (vertical) versus when Q is first
transposed (horizontal).

In the 'vertical' case: the access pattern is not contiguous.
In the 'horizontal' case: the access pattern is perfectly contiguous.

However, the generated code for the contiguous case does not use AVX operations.
My benchmark results, as (horizontal, vertical, peak) in GFLOP/s:
(12.02936857562408, 39.85624546661375, 50.41209223009482)
So the non-contiguous example is 3.3x faster and comes to 79% of GEMM performance.
"""
function bench_panel(n = 256)
    rotations = random_fused_rotations()

    # Pin BLAS to a single thread before measuring `peakflops`, the reference figure.
    BLAS.set_num_threads(1)
    peak_gflops = peakflops() / 1e9

    # Each row of Q costs 24 flop: 4 rotations, each 4 multiplications + 2 additions.
    work = 24 * n
    gflops(seconds) = work / seconds / 1e9

    # Horizontal layout: Q is 4 x n. Vertical layout: Q is n x 4.
    # `setup` builds the random Q that the timed call operates on.
    seconds_horizontal = @belapsed apply_fused_packed_horizontal!(Q, $rotations) setup = (Q = rand(4, $n))
    seconds_vertical = @belapsed apply_fused_packed_vertical!(Q, $rotations) setup = (Q = rand($n, 4))

    # Throughput of the horizontal kernel, the vertical kernel, and the peak reference.
    return gflops(seconds_horizontal), gflops(seconds_vertical), peak_gflops
end

"""
    Fused2x2{Tc,Ts}

Coefficients of four rotations that are applied together as one fused update.
Rotation `i` is described by the pair `(ci, si)`; all `c` fields share the type
`Tc` and all `s` fields share the type `Ts`.
"""
struct Fused2x2{Tc,Ts}
    c1::Tc; s1::Ts  # rotation 1
    c2::Tc; s2::Ts  # rotation 2
    c3::Tc; s3::Ts  # rotation 3
    c4::Tc; s4::Ts  # rotation 4
end

"""
    generate_rotation()

Draw two uniform random numbers, pass them to `givensAlgorithm`, and return the
first two of its outputs as the tuple `(c, s)`.
"""
function generate_rotation()
    f = rand()
    g = rand()
    c, s, _ = givensAlgorithm(f, g)
    return (c, s)
end
"""
    random_fused_rotations()

Build a `Fused2x2` from four rotations, each obtained from its own call to
`generate_rotation`.
"""
function random_fused_rotations()
    c1, s1 = generate_rotation()
    c2, s2 = generate_rotation()
    c3, s3 = generate_rotation()
    c4, s4 = generate_rotation()
    return Fused2x2(c1, s1, c2, s2, c3, s3, c4, s4)
end

"""
    kernel(a0, a1, a2, a3, G::Fused2x2)

Apply the four rotations stored in `G` to the values `a0, a1, a2, a3` and return
the four updated values as a tuple, in the same order.

The rotations act on neighbouring pairs in this sequence:

1. `(c1, s1)` on `(a1, a2)`
2. `(c2, s2)` on `(a2, a3)`
3. `(c3, s3)` on `(a0, a1)`
4. `(c4, s4)` on `(a1, a2)`

A rotation `(c, s)` maps a pair `(x, y)` to `(x * c + y * s', -x * s + y * c)`.
"""
@inline function kernel(a0, a1, a2, a3, G::Fused2x2)
    c1, s1 = G.c1, G.s1
    c2, s2 = G.c2, G.s2
    c3, s3 = G.c3, G.s3
    c4, s4 = G.c4, G.s4

    # Rotation 1 mixes a1 with a2.
    a1_r1 = muladd(a1, c1, a2 * s1')
    a2_r1 = muladd(-a1, s1, a2 * c1)

    # Rotation 2 mixes the updated a2 with a3; a3 is final afterwards.
    a2_r2 = muladd(a2_r1, c2, a3 * s2')
    a3_r2 = muladd(-a2_r1, s2, a3 * c2)

    # Rotation 3 mixes a0 with the a1 from rotation 1; a0 is final afterwards.
    a0_r3 = muladd(a0, c3, a1_r1 * s3')
    a1_r3 = muladd(-a0, s3, a1_r1 * c3)

    # Rotation 4 mixes the latest a1 and a2; both are final afterwards.
    a1_r4 = muladd(a1_r3, c4, a2_r2 * s4')
    a2_r4 = muladd(-a1_r3, s4, a2_r2 * c4)

    return a0_r3, a1_r4, a2_r4, a3_r2
end

"""
    apply_fused_packed_horizontal!(Q, G)

Apply the fused rotations `G` in place to every column of `Q`.

For each column `j`, the entries `Q[1, j]`, `Q[2, j]`, `Q[3, j]`, `Q[4, j]` are
passed through `kernel` and overwritten with its four results. Only rows 1 to 4
are touched. Returns `nothing`.

# Throws
- `BoundsError` if `Q` has at least one column and `1:4` are not all valid row
  indices of `Q`.
"""
function apply_fused_packed_horizontal!(Q, G)
    cols = axes(Q, 2)

    # The loop hard-codes rows 1:4 under `@inbounds`, so validate them once up front;
    # otherwise a `Q` with too few rows would be read and written out of bounds.
    # With no columns the loop never runs, so there is nothing to validate.
    isempty(cols) || checkbounds(Q, 1:4, cols)

    @inbounds for j in cols
        a0 = Q[1, j]
        a1 = Q[2, j]
        a2 = Q[3, j]
        a3 = Q[4, j]

        b0, b1, b2, b3 = kernel(a0, a1, a2, a3, G)

        Q[1, j] = b0
        Q[2, j] = b1
        Q[3, j] = b2
        Q[4, j] = b3
    end

    return nothing
end

"""
    apply_fused_packed_vertical!(Q, G)

Apply the fused rotations `G` in place to every row of `Q`.

For each row `j`, the entries `Q[j, 1]`, `Q[j, 2]`, `Q[j, 3]`, `Q[j, 4]` are
passed through `kernel` and overwritten with its four results. Only columns 1 to
4 are touched. Returns `nothing`.

# Throws
- `BoundsError` if `Q` has at least one row and `1:4` are not all valid column
  indices of `Q`.
"""
function apply_fused_packed_vertical!(Q, G)
    rows = axes(Q, 1)

    # The loop hard-codes columns 1:4 under `@inbounds`, so validate them once up
    # front; otherwise a `Q` with too few columns would be read and written out of
    # bounds. With no rows the loop never runs, so there is nothing to validate.
    isempty(rows) || checkbounds(Q, rows, 1:4)

    @inbounds for j in rows
        a0 = Q[j, 1]
        a1 = Q[j, 2]
        a2 = Q[j, 3]
        a3 = Q[j, 4]

        b0, b1, b2, b3 = kernel(a0, a1, a2, a3, G)

        Q[j, 1] = b0
        Q[j, 2] = b1
        Q[j, 3] = b2
        Q[j, 4] = b3
    end

    return nothing
end
	using BenchmarkTools
	using LinearAlgebra
	using LinearAlgebra: givensAlgorithm

	"""
	I want to apply 4 'fused' Givens rotations to 4 columns of matrix Q. Here Q
	is an n x 4 matrix. In the benchmarks I compare the number of GFLOP/s when the
	rotations are applied to Q directly (vertical) versus when Q is first
	transposed (horizontal).

	In the 'vertical' case: the access pattern is not contiguous.
	In the 'horizontal' case: the access pattern is perfectly contiguous.

	However, the generated code for the contiguous case does not use AVX operations.
	My benchmark results, as (horizontal, vertical, peak) in GFLOP/s:
	(12.02936857562408, 39.85624546661375, 50.41209223009482)
	So the non-contiguous example is 3.3x faster and comes to 79% of GEMM performance.
	"""
	function bench_panel(n = 256)
	    rotations = random_fused_rotations()

	    # Pin BLAS to a single thread before measuring `peakflops`, the reference figure.
	    BLAS.set_num_threads(1)
	    peak_gflops = peakflops() / 1e9

	    # Each row of Q costs 24 flop: 4 rotations, each 4 multiplications + 2 additions.
	    work = 24 * n
	    gflops(seconds) = work / seconds / 1e9

	    # Horizontal layout: Q is 4 x n. Vertical layout: Q is n x 4.
	    # `setup` builds the random Q that the timed call operates on.
	    seconds_horizontal = @belapsed apply_fused_packed_horizontal!(Q, $rotations) setup = (Q = rand(4, $n))
	    seconds_vertical = @belapsed apply_fused_packed_vertical!(Q, $rotations) setup = (Q = rand($n, 4))

	    # Throughput of the horizontal kernel, the vertical kernel, and the peak reference.
	    return gflops(seconds_horizontal), gflops(seconds_vertical), peak_gflops
	end

	"""
	    Fused2x2{Tc,Ts}

	Coefficients of four rotations that are applied together as one fused update.
	Rotation `i` is described by the pair `(ci, si)`; all `c` fields share the type
	`Tc` and all `s` fields share the type `Ts`.
	"""
	struct Fused2x2{Tc,Ts}
	    c1::Tc; s1::Ts  # rotation 1
	    c2::Tc; s2::Ts  # rotation 2
	    c3::Tc; s3::Ts  # rotation 3
	    c4::Tc; s4::Ts  # rotation 4
	end

	"""
	    generate_rotation()

	Draw two uniform random numbers, pass them to `givensAlgorithm`, and return the
	first two of its outputs as the tuple `(c, s)`.
	"""
	function generate_rotation()
	    f = rand()
	    g = rand()
	    c, s, _ = givensAlgorithm(f, g)
	    return (c, s)
	end
	"""
	    random_fused_rotations()

	Build a `Fused2x2` from four rotations, each obtained from its own call to
	`generate_rotation`.
	"""
	function random_fused_rotations()
	    c1, s1 = generate_rotation()
	    c2, s2 = generate_rotation()
	    c3, s3 = generate_rotation()
	    c4, s4 = generate_rotation()
	    return Fused2x2(c1, s1, c2, s2, c3, s3, c4, s4)
	end

	"""
	    kernel(a0, a1, a2, a3, G::Fused2x2)

	Apply the four rotations stored in `G` to the values `a0, a1, a2, a3` and return
	the four updated values as a tuple, in the same order.

	The rotations act on neighbouring pairs in this sequence:

	1. `(c1, s1)` on `(a1, a2)`
	2. `(c2, s2)` on `(a2, a3)`
	3. `(c3, s3)` on `(a0, a1)`
	4. `(c4, s4)` on `(a1, a2)`

	A rotation `(c, s)` maps a pair `(x, y)` to `(x * c + y * s', -x * s + y * c)`.
	"""
	@inline function kernel(a0, a1, a2, a3, G::Fused2x2)
	    c1, s1 = G.c1, G.s1
	    c2, s2 = G.c2, G.s2
	    c3, s3 = G.c3, G.s3
	    c4, s4 = G.c4, G.s4

	    # Rotation 1 mixes a1 with a2.
	    a1_r1 = muladd(a1, c1, a2 * s1')
	    a2_r1 = muladd(-a1, s1, a2 * c1)

	    # Rotation 2 mixes the updated a2 with a3; a3 is final afterwards.
	    a2_r2 = muladd(a2_r1, c2, a3 * s2')
	    a3_r2 = muladd(-a2_r1, s2, a3 * c2)

	    # Rotation 3 mixes a0 with the a1 from rotation 1; a0 is final afterwards.
	    a0_r3 = muladd(a0, c3, a1_r1 * s3')
	    a1_r3 = muladd(-a0, s3, a1_r1 * c3)

	    # Rotation 4 mixes the latest a1 and a2; both are final afterwards.
	    a1_r4 = muladd(a1_r3, c4, a2_r2 * s4')
	    a2_r4 = muladd(-a1_r3, s4, a2_r2 * c4)

	    return a0_r3, a1_r4, a2_r4, a3_r2
	end

	"""
	    apply_fused_packed_horizontal!(Q, G)

	Apply the fused rotations `G` in place to every column of `Q`.

	For each column `j`, the entries `Q[1, j]`, `Q[2, j]`, `Q[3, j]`, `Q[4, j]` are
	passed through `kernel` and overwritten with its four results. Only rows 1 to 4
	are touched. Returns `nothing`.

	# Throws
	- `BoundsError` if `Q` has at least one column and `1:4` are not all valid row
	  indices of `Q`.
	"""
	function apply_fused_packed_horizontal!(Q, G)
	    cols = axes(Q, 2)

	    # The loop hard-codes rows 1:4 under `@inbounds`, so validate them once up front;
	    # otherwise a `Q` with too few rows would be read and written out of bounds.
	    # With no columns the loop never runs, so there is nothing to validate.
	    isempty(cols) || checkbounds(Q, 1:4, cols)

	    @inbounds for j in cols
	        a0 = Q[1, j]
	        a1 = Q[2, j]
	        a2 = Q[3, j]
	        a3 = Q[4, j]

	        b0, b1, b2, b3 = kernel(a0, a1, a2, a3, G)

	        Q[1, j] = b0
	        Q[2, j] = b1
	        Q[3, j] = b2
	        Q[4, j] = b3
	    end

	    return nothing
	end

	"""
	    apply_fused_packed_vertical!(Q, G)

	Apply the fused rotations `G` in place to every row of `Q`.

	For each row `j`, the entries `Q[j, 1]`, `Q[j, 2]`, `Q[j, 3]`, `Q[j, 4]` are
	passed through `kernel` and overwritten with its four results. Only columns 1 to
	4 are touched. Returns `nothing`.

	# Throws
	- `BoundsError` if `Q` has at least one row and `1:4` are not all valid column
	  indices of `Q`.
	"""
	function apply_fused_packed_vertical!(Q, G)
	    rows = axes(Q, 1)

	    # The loop hard-codes columns 1:4 under `@inbounds`, so validate them once up
	    # front; otherwise a `Q` with too few columns would be read and written out of
	    # bounds. With no rows the loop never runs, so there is nothing to validate.
	    isempty(rows) || checkbounds(Q, rows, 1:4)

	    @inbounds for j in rows
	        a0 = Q[j, 1]
	        a1 = Q[j, 2]
	        a2 = Q[j, 3]
	        a3 = Q[j, 4]

	        b0, b1, b2, b3 = kernel(a0, a1, a2, a3, G)

	        Q[j, 1] = b0
	        Q[j, 2] = b1
	        Q[j, 3] = b2
	        Q[j, 4] = b3
	    end

	    return nothing
	end