haampie/micro.cc

## micro.cc
#include <cstdint>
#include <cmath>

// g++ -Wall -O3 -std=c++14 -march=native -fPIC -shared -o givenlib.so micro.cc

extern "C" {
void fused_horizontal(double * __restrict__ A, int64_t cols, double c1, double s1, double c2, double s2, double c3, double s3, double c4, double s4)
{
    for (int64_t col = 0; col < cols; ++col, A += 4)
    {
        double a0 = *(A + 0);
        double a1 = *(A + 1);
        double a2 = *(A + 2);
        double a3 = *(A + 3);

        // Apply rotation 1
        double a1_ = std::fma( a1, c1, a2 * s1);
        double a2_ = std::fma(-a1, s1, a2 * c1);

        // Apply rotation 2
        double a2__ = std::fma( a2_, c2, a3 * s2);
        double a3__ = std::fma(-a2_, s2, a3 * c2);

        // Apply rotation 3
        double a0___ = std::fma( a0, c3, a1_ * s3);
        double a1___ = std::fma(-a0, s3, a1_ * c3);

        // Apply rotation 4
        double a1____ = std::fma( a1___, c4, a2__ * s4);
        double a2____ = std::fma(-a1___, s4, a2__ * c4);

        *(A + 0) = a0___;
        *(A + 1) = a1____;
        *(A + 2) = a2____;
        *(A + 3) = a3__;
    }
}

void fused_vertical(double * __restrict__ A, int64_t rows, double c1, double s1, double c2, double s2, double c3, double s3, double c4, double s4)
{
    for (int64_t row = 0; row < rows; ++row, A += 1)
    {
        double a0 = *(A + 0 * rows);
        double a1 = *(A + 1 * rows);
        double a2 = *(A + 2 * rows);
        double a3 = *(A + 3 * rows);

        // Apply rotation 1
        double a1_ = std::fma( a1, c1, a2 * s1);
        double a2_ = std::fma(-a1, s1, a2 * c1);

        // Apply rotation 2
        double a2__ = std::fma( a2_, c2, a3 * s2);
        double a3__ = std::fma(-a2_, s2, a3 * c2);

        // Apply rotation 3
        double a0___ = std::fma( a0, c3, a1_ * s3);
        double a1___ = std::fma(-a0, s3, a1_ * c3);

        // Apply rotation 4
        double a1____ = std::fma( a1___, c4, a2__ * s4);
        double a2____ = std::fma(-a1___, s4, a2__ * c4);

        *(A + 0 * rows) = a0___;
        *(A + 1 * rows) = a1____;
        *(A + 2 * rows) = a2____;
        *(A + 3 * rows) = a3__;
    }
}
}

## micro.jl
using BenchmarkTools
using LinearAlgebra
using LinearAlgebra: givensAlgorithm

"""
I want to apply 4 'fused' Givens rotations to 4 columns of matrix Q. Here Q
is a n x 4 matrix. In the benchmarks I compare the number of GFLOP/s when the
rotations are applied to Q directly (vertical) versus when Q is first
transposed (horizontal).

In the 'vertical' case: the access pattern is not contiguous.
In the 'horizontal' case: the access pattern is perfectly contiguous.

However, the generated code for the contiguous case does not use AVX operations.
My benchmark results:
"""
function bench_panel(n = 256)
    G = random_fused_rotations()

    BLAS.set_num_threads(1)
    maxflops = peakflops() / 1e9

    flop = 24 * n # 24 flop per row of Q
    t1 = @belapsed apply_fused_packed_horizontal!(Q, $G) setup = (Q = rand(4, $n))
    t2 = @belapsed apply_fused_packed_vertical!(Q, $G) setup = (Q = rand($n, 4))
    t3 = @belapsed apply_fused_packed_horizontal_cpp!(Q, $G) setup = (Q = rand(4, $n))
    t4 = @belapsed apply_fused_packed_vertical_cpp!(Q, $G) setup = (Q = rand($n, 4))

    return (
        flop / t1 / 1e9,
        flop / t2 / 1e9,
        flop / t3 / 1e9,
        flop / t4 / 1e9,
        maxflops
    )
end

function test_equal_impl(n = 256)
    G = random_fused_rotations()

    Q1 = rand(4, n)
    Q2 = copy(Q1)

    apply_fused_packed_horizontal!(Q1, G)
    apply_fused_packed_horizontal_cpp!(Q2, G)

    @show norm(Q1 - Q2)

    Q3 = rand(n, 4)
    Q4 = copy(Q3)

    apply_fused_packed_vertical!(Q3, G)
    apply_fused_packed_vertical_cpp!(Q4, G)

    @show norm(Q3 - Q4)
end

struct Fused2x2{Tc,Ts}
    c1::Tc
    s1::Ts
    c2::Tc
    s2::Ts
    c3::Tc
    s3::Ts
    c4::Tc
    s4::Ts
end

generate_rotation() = givensAlgorithm(rand(), rand())[1:2]
random_fused_rotations() = Fused2x2(generate_rotation()...,generate_rotation()...,generate_rotation()...,generate_rotation()...)

@inline function kernel(a0, a1, a2, a3, G::Fused2x2)
    # Apply rotation 1
    a1′ = muladd( a1, G.c1, a2 * G.s1')
    a2′ = muladd(-a1, G.s1, a2 * G.c1 )

    # Apply rotation 2
    a2′′ = muladd( a2′, G.c2, a3 * G.s2')
    a3′′ = muladd(-a2′, G.s2, a3 * G.c2 )

    # Apply rotation 3
    a0′′′ = muladd( a0, G.c3, a1′ * G.s3')
    a1′′′ = muladd(-a0, G.s3, a1′ * G.c3 )

    # Apply rotation 4
    a1′′′′ = muladd( a1′′′, G.c4, a2′′ * G.s4')
    a2′′′′ = muladd(-a1′′′, G.s4, a2′′ * G.c4 )

    return a0′′′, a1′′′′, a2′′′′, a3′′
end

function apply_fused_packed_horizontal!(Q, G)
    @inbounds for j = axes(Q, 2)
        a0 = Q[1, j]
        a1 = Q[2, j]
        a2 = Q[3, j]
        a3 = Q[4, j]

        a0′′′, a1′′′′, a2′′′′, a3′′ = kernel(a0, a1, a2, a3, G)

        Q[1, j] = a0′′′
        Q[2, j] = a1′′′′
        Q[3, j] = a2′′′′
        Q[4, j] = a3′′
    end
end

function apply_fused_packed_vertical!(Q, G)
    @inbounds for j = axes(Q, 1)
        a0 = Q[j, 1]
        a1 = Q[j, 2]
        a2 = Q[j, 3]
        a3 = Q[j, 4]

        a0′′′, a1′′′′, a2′′′′, a3′′ = kernel(a0, a1, a2, a3, G)

        Q[j, 1] = a0′′′
        Q[j, 2] = a1′′′′
        Q[j, 3] = a2′′′′
        Q[j, 4] = a3′′
    end
end

function apply_fused_packed_horizontal_cpp!(Q, G)
    ccall((:fused_horizontal, "./givenlib.so"), Cvoid,
    (Ptr{Float64}, Int64, Float64, Float64, Float64, Float64, Float64, Float64, Float64, Float64),
    pointer(Q), size(Q,2), G.c1, G.s1, G.c2, G.s2, G.c3, G.s3, G.c4, G.s4)
end

function apply_fused_packed_vertical_cpp!(Q, G)
    ccall((:fused_vertical, "./givenlib.so"), Cvoid,
    (Ptr{Float64}, Int64, Float64, Float64, Float64, Float64, Float64, Float64, Float64, Float64),
    pointer(Q), size(Q,1), G.c1, G.s1, G.c2, G.s2, G.c3, G.s3, G.c4, G.s4)
end
	#include <cstdint>
	#include <cmath>

	// g++ -Wall -O3 -std=c++14 -march=native -fPIC -shared -o givenlib.so micro.cc

	extern "C" {
	void fused_horizontal(double * __restrict__ A, int64_t cols, double c1, double s1, double c2, double s2, double c3, double s3, double c4, double s4)
	{
	for (int64_t col = 0; col < cols; ++col, A += 4)
	{
	double a0 = *(A + 0);
	double a1 = *(A + 1);
	double a2 = *(A + 2);
	double a3 = *(A + 3);

	// Apply rotation 1
	double a1_ = std::fma( a1, c1, a2 * s1);
	double a2_ = std::fma(-a1, s1, a2 * c1);

	// Apply rotation 2
	double a2__ = std::fma( a2_, c2, a3 * s2);
	double a3__ = std::fma(-a2_, s2, a3 * c2);

	// Apply rotation 3
	double a0___ = std::fma( a0, c3, a1_ * s3);
	double a1___ = std::fma(-a0, s3, a1_ * c3);

	// Apply rotation 4
	double a1____ = std::fma( a1___, c4, a2__ * s4);
	double a2____ = std::fma(-a1___, s4, a2__ * c4);

	*(A + 0) = a0___;
	*(A + 1) = a1____;
	*(A + 2) = a2____;
	*(A + 3) = a3__;
	}
	}

	void fused_vertical(double * __restrict__ A, int64_t rows, double c1, double s1, double c2, double s2, double c3, double s3, double c4, double s4)
	{
	for (int64_t row = 0; row < rows; ++row, A += 1)
	{
	double a0 = (A + 0 rows);
	double a1 = (A + 1 rows);
	double a2 = (A + 2 rows);
	double a3 = (A + 3 rows);

	// Apply rotation 1
	double a1_ = std::fma( a1, c1, a2 * s1);
	double a2_ = std::fma(-a1, s1, a2 * c1);

	// Apply rotation 2
	double a2__ = std::fma( a2_, c2, a3 * s2);
	double a3__ = std::fma(-a2_, s2, a3 * c2);

	// Apply rotation 3
	double a0___ = std::fma( a0, c3, a1_ * s3);
	double a1___ = std::fma(-a0, s3, a1_ * c3);

	// Apply rotation 4
	double a1____ = std::fma( a1___, c4, a2__ * s4);
	double a2____ = std::fma(-a1___, s4, a2__ * c4);

	(A + 0 rows) = a0___;
	(A + 1 rows) = a1____;
	(A + 2 rows) = a2____;
	(A + 3 rows) = a3__;
	}
	}
	}
	using BenchmarkTools
	using LinearAlgebra
	using LinearAlgebra: givensAlgorithm

	"""
	I want to apply 4 'fused' Givens rotations to 4 columns of matrix Q. Here Q
	is a n x 4 matrix. In the benchmarks I compare the number of GFLOP/s when the
	rotations are applied to Q directly (vertical) versus when Q is first
	transposed (horizontal).

	In the 'vertical' case: the access pattern is not contiguous.
	In the 'horizontal' case: the access pattern is perfectly contiguous.

	However, the generated code for the contiguous case does not use AVX operations.
	My benchmark results:
	"""
	function bench_panel(n = 256)
	G = random_fused_rotations()

	BLAS.set_num_threads(1)
	maxflops = peakflops() / 1e9

	flop = 24 * n # 24 flop per row of Q
	t1 = @belapsed apply_fused_packed_horizontal!(Q, $G) setup = (Q = rand(4, $n))
	t2 = @belapsed apply_fused_packed_vertical!(Q, $G) setup = (Q = rand($n, 4))
	t3 = @belapsed apply_fused_packed_horizontal_cpp!(Q, $G) setup = (Q = rand(4, $n))
	t4 = @belapsed apply_fused_packed_vertical_cpp!(Q, $G) setup = (Q = rand($n, 4))

	return (
	flop / t1 / 1e9,
	flop / t2 / 1e9,
	flop / t3 / 1e9,
	flop / t4 / 1e9,
	maxflops
	)
	end

	function test_equal_impl(n = 256)
	G = random_fused_rotations()

	Q1 = rand(4, n)
	Q2 = copy(Q1)

	apply_fused_packed_horizontal!(Q1, G)
	apply_fused_packed_horizontal_cpp!(Q2, G)

	@show norm(Q1 - Q2)

	Q3 = rand(n, 4)
	Q4 = copy(Q3)

	apply_fused_packed_vertical!(Q3, G)
	apply_fused_packed_vertical_cpp!(Q4, G)

	@show norm(Q3 - Q4)
	end

	struct Fused2x2{Tc,Ts}
	c1::Tc
	s1::Ts
	c2::Tc
	s2::Ts
	c3::Tc
	s3::Ts
	c4::Tc
	s4::Ts
	end

	generate_rotation() = givensAlgorithm(rand(), rand())[1:2]
	random_fused_rotations() = Fused2x2(generate_rotation()...,generate_rotation()...,generate_rotation()...,generate_rotation()...)

	@inline function kernel(a0, a1, a2, a3, G::Fused2x2)
	# Apply rotation 1
	a1′ = muladd( a1, G.c1, a2 * G.s1')
	a2′ = muladd(-a1, G.s1, a2 * G.c1 )

	# Apply rotation 2
	a2′′ = muladd( a2′, G.c2, a3 * G.s2')
	a3′′ = muladd(-a2′, G.s2, a3 * G.c2 )

	# Apply rotation 3
	a0′′′ = muladd( a0, G.c3, a1′ * G.s3')
	a1′′′ = muladd(-a0, G.s3, a1′ * G.c3 )

	# Apply rotation 4
	a1′′′′ = muladd( a1′′′, G.c4, a2′′ * G.s4')
	a2′′′′ = muladd(-a1′′′, G.s4, a2′′ * G.c4 )

	return a0′′′, a1′′′′, a2′′′′, a3′′
	end

	function apply_fused_packed_horizontal!(Q, G)
	@inbounds for j = axes(Q, 2)
	a0 = Q[1, j]
	a1 = Q[2, j]
	a2 = Q[3, j]
	a3 = Q[4, j]

	a0′′′, a1′′′′, a2′′′′, a3′′ = kernel(a0, a1, a2, a3, G)

	Q[1, j] = a0′′′
	Q[2, j] = a1′′′′
	Q[3, j] = a2′′′′
	Q[4, j] = a3′′
	end
	end

	function apply_fused_packed_vertical!(Q, G)
	@inbounds for j = axes(Q, 1)
	a0 = Q[j, 1]
	a1 = Q[j, 2]
	a2 = Q[j, 3]
	a3 = Q[j, 4]

	a0′′′, a1′′′′, a2′′′′, a3′′ = kernel(a0, a1, a2, a3, G)

	Q[j, 1] = a0′′′
	Q[j, 2] = a1′′′′
	Q[j, 3] = a2′′′′
	Q[j, 4] = a3′′
	end
	end

	function apply_fused_packed_horizontal_cpp!(Q, G)
	ccall((:fused_horizontal, "./givenlib.so"), Cvoid,
	(Ptr{Float64}, Int64, Float64, Float64, Float64, Float64, Float64, Float64, Float64, Float64),
	pointer(Q), size(Q,2), G.c1, G.s1, G.c2, G.s2, G.c3, G.s3, G.c4, G.s4)
	end

	function apply_fused_packed_vertical_cpp!(Q, G)
	ccall((:fused_vertical, "./givenlib.so"), Cvoid,
	(Ptr{Float64}, Int64, Float64, Float64, Float64, Float64, Float64, Float64, Float64, Float64),
	pointer(Q), size(Q,1), G.c1, G.s1, G.c2, G.s2, G.c3, G.s3, G.c4, G.s4)
	end