Skip to content

Instantly share code, notes, and snippets.

@haampie
Created September 27, 2018 14:25
Show Gist options
  • Save haampie/ccf43d1eff2f385bb9dc90f7560953b1 to your computer and use it in GitHub Desktop.
Save haampie/ccf43d1eff2f385bb9dc90f7560953b1 to your computer and use it in GitHub Desktop.
cpp_vs_julia
#include <cstdint>
#include <cmath>
// g++ -Wall -O3 -std=c++14 -march=native -fPIC -shared -o givenlib.so micro.cc
extern "C" {
void fused_horizontal(double * __restrict__ A, int64_t cols, double c1, double s1, double c2, double s2, double c3, double s3, double c4, double s4)
{
for (int64_t col = 0; col < cols; ++col, A += 4)
{
double a0 = *(A + 0);
double a1 = *(A + 1);
double a2 = *(A + 2);
double a3 = *(A + 3);
// Apply rotation 1
double a1_ = std::fma( a1, c1, a2 * s1);
double a2_ = std::fma(-a1, s1, a2 * c1);
// Apply rotation 2
double a2__ = std::fma( a2_, c2, a3 * s2);
double a3__ = std::fma(-a2_, s2, a3 * c2);
// Apply rotation 3
double a0___ = std::fma( a0, c3, a1_ * s3);
double a1___ = std::fma(-a0, s3, a1_ * c3);
// Apply rotation 4
double a1____ = std::fma( a1___, c4, a2__ * s4);
double a2____ = std::fma(-a1___, s4, a2__ * c4);
*(A + 0) = a0___;
*(A + 1) = a1____;
*(A + 2) = a2____;
*(A + 3) = a3__;
}
}
void fused_vertical(double * __restrict__ A, int64_t rows, double c1, double s1, double c2, double s2, double c3, double s3, double c4, double s4)
{
for (int64_t row = 0; row < rows; ++row, A += 1)
{
double a0 = *(A + 0 * rows);
double a1 = *(A + 1 * rows);
double a2 = *(A + 2 * rows);
double a3 = *(A + 3 * rows);
// Apply rotation 1
double a1_ = std::fma( a1, c1, a2 * s1);
double a2_ = std::fma(-a1, s1, a2 * c1);
// Apply rotation 2
double a2__ = std::fma( a2_, c2, a3 * s2);
double a3__ = std::fma(-a2_, s2, a3 * c2);
// Apply rotation 3
double a0___ = std::fma( a0, c3, a1_ * s3);
double a1___ = std::fma(-a0, s3, a1_ * c3);
// Apply rotation 4
double a1____ = std::fma( a1___, c4, a2__ * s4);
double a2____ = std::fma(-a1___, s4, a2__ * c4);
*(A + 0 * rows) = a0___;
*(A + 1 * rows) = a1____;
*(A + 2 * rows) = a2____;
*(A + 3 * rows) = a3__;
}
}
}
using BenchmarkTools
using LinearAlgebra
using LinearAlgebra: givensAlgorithm
"""
I want to apply 4 'fused' Givens rotations to 4 columns of matrix Q. Here Q
is a n x 4 matrix. In the benchmarks I compare the number of GFLOP/s when the
rotations are applied to Q directly (vertical) versus when Q is first
transposed (horizontal).
In the 'vertical' case: the access pattern is not contiguous.
In the 'horizontal' case: the access pattern is perfectly contiguous.
However, the generated code for the contiguous case does not use AVX operations.
My benchmark results:
"""
function bench_panel(n = 256)
G = random_fused_rotations()
BLAS.set_num_threads(1)
maxflops = peakflops() / 1e9
flop = 24 * n # 24 flop per row of Q
t1 = @belapsed apply_fused_packed_horizontal!(Q, $G) setup = (Q = rand(4, $n))
t2 = @belapsed apply_fused_packed_vertical!(Q, $G) setup = (Q = rand($n, 4))
t3 = @belapsed apply_fused_packed_horizontal_cpp!(Q, $G) setup = (Q = rand(4, $n))
t4 = @belapsed apply_fused_packed_vertical_cpp!(Q, $G) setup = (Q = rand($n, 4))
return (
flop / t1 / 1e9,
flop / t2 / 1e9,
flop / t3 / 1e9,
flop / t4 / 1e9,
maxflops
)
end
function test_equal_impl(n = 256)
G = random_fused_rotations()
Q1 = rand(4, n)
Q2 = copy(Q1)
apply_fused_packed_horizontal!(Q1, G)
apply_fused_packed_horizontal_cpp!(Q2, G)
@show norm(Q1 - Q2)
Q3 = rand(n, 4)
Q4 = copy(Q3)
apply_fused_packed_vertical!(Q3, G)
apply_fused_packed_vertical_cpp!(Q4, G)
@show norm(Q3 - Q4)
end
struct Fused2x2{Tc,Ts}
c1::Tc
s1::Ts
c2::Tc
s2::Ts
c3::Tc
s3::Ts
c4::Tc
s4::Ts
end
generate_rotation() = givensAlgorithm(rand(), rand())[1:2]
random_fused_rotations() = Fused2x2(generate_rotation()...,generate_rotation()...,generate_rotation()...,generate_rotation()...)
@inline function kernel(a0, a1, a2, a3, G::Fused2x2)
# Apply rotation 1
a1′ = muladd( a1, G.c1, a2 * G.s1')
a2′ = muladd(-a1, G.s1, a2 * G.c1 )
# Apply rotation 2
a2′′ = muladd( a2′, G.c2, a3 * G.s2')
a3′′ = muladd(-a2′, G.s2, a3 * G.c2 )
# Apply rotation 3
a0′′′ = muladd( a0, G.c3, a1′ * G.s3')
a1′′′ = muladd(-a0, G.s3, a1′ * G.c3 )
# Apply rotation 4
a1′′′′ = muladd( a1′′′, G.c4, a2′′ * G.s4')
a2′′′′ = muladd(-a1′′′, G.s4, a2′′ * G.c4 )
return a0′′′, a1′′′′, a2′′′′, a3′′
end
function apply_fused_packed_horizontal!(Q, G)
@inbounds for j = axes(Q, 2)
a0 = Q[1, j]
a1 = Q[2, j]
a2 = Q[3, j]
a3 = Q[4, j]
a0′′′, a1′′′′, a2′′′′, a3′′ = kernel(a0, a1, a2, a3, G)
Q[1, j] = a0′′′
Q[2, j] = a1′′′′
Q[3, j] = a2′′′′
Q[4, j] = a3′′
end
end
function apply_fused_packed_vertical!(Q, G)
@inbounds for j = axes(Q, 1)
a0 = Q[j, 1]
a1 = Q[j, 2]
a2 = Q[j, 3]
a3 = Q[j, 4]
a0′′′, a1′′′′, a2′′′′, a3′′ = kernel(a0, a1, a2, a3, G)
Q[j, 1] = a0′′′
Q[j, 2] = a1′′′′
Q[j, 3] = a2′′′′
Q[j, 4] = a3′′
end
end
function apply_fused_packed_horizontal_cpp!(Q, G)
ccall((:fused_horizontal, "./givenlib.so"), Cvoid,
(Ptr{Float64}, Int64, Float64, Float64, Float64, Float64, Float64, Float64, Float64, Float64),
pointer(Q), size(Q,2), G.c1, G.s1, G.c2, G.s2, G.c3, G.s3, G.c4, G.s4)
end
function apply_fused_packed_vertical_cpp!(Q, G)
ccall((:fused_vertical, "./givenlib.so"), Cvoid,
(Ptr{Float64}, Int64, Float64, Float64, Float64, Float64, Float64, Float64, Float64, Float64),
pointer(Q), size(Q,1), G.c1, G.s1, G.c2, G.s2, G.c3, G.s3, G.c4, G.s4)
end
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment