Created
September 27, 2018 14:25
-
-
Save haampie/ccf43d1eff2f385bb9dc90f7560953b1 to your computer and use it in GitHub Desktop.
cpp_vs_julia
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <cstdint> | |
#include <cmath> | |
// g++ -Wall -O3 -std=c++14 -march=native -fPIC -shared -o givenlib.so micro.cc | |
extern "C" { | |
void fused_horizontal(double * __restrict__ A, int64_t cols, double c1, double s1, double c2, double s2, double c3, double s3, double c4, double s4) | |
{ | |
for (int64_t col = 0; col < cols; ++col, A += 4) | |
{ | |
double a0 = *(A + 0); | |
double a1 = *(A + 1); | |
double a2 = *(A + 2); | |
double a3 = *(A + 3); | |
// Apply rotation 1 | |
double a1_ = std::fma( a1, c1, a2 * s1); | |
double a2_ = std::fma(-a1, s1, a2 * c1); | |
// Apply rotation 2 | |
double a2__ = std::fma( a2_, c2, a3 * s2); | |
double a3__ = std::fma(-a2_, s2, a3 * c2); | |
// Apply rotation 3 | |
double a0___ = std::fma( a0, c3, a1_ * s3); | |
double a1___ = std::fma(-a0, s3, a1_ * c3); | |
// Apply rotation 4 | |
double a1____ = std::fma( a1___, c4, a2__ * s4); | |
double a2____ = std::fma(-a1___, s4, a2__ * c4); | |
*(A + 0) = a0___; | |
*(A + 1) = a1____; | |
*(A + 2) = a2____; | |
*(A + 3) = a3__; | |
} | |
} | |
void fused_vertical(double * __restrict__ A, int64_t rows, double c1, double s1, double c2, double s2, double c3, double s3, double c4, double s4) | |
{ | |
for (int64_t row = 0; row < rows; ++row, A += 1) | |
{ | |
double a0 = *(A + 0 * rows); | |
double a1 = *(A + 1 * rows); | |
double a2 = *(A + 2 * rows); | |
double a3 = *(A + 3 * rows); | |
// Apply rotation 1 | |
double a1_ = std::fma( a1, c1, a2 * s1); | |
double a2_ = std::fma(-a1, s1, a2 * c1); | |
// Apply rotation 2 | |
double a2__ = std::fma( a2_, c2, a3 * s2); | |
double a3__ = std::fma(-a2_, s2, a3 * c2); | |
// Apply rotation 3 | |
double a0___ = std::fma( a0, c3, a1_ * s3); | |
double a1___ = std::fma(-a0, s3, a1_ * c3); | |
// Apply rotation 4 | |
double a1____ = std::fma( a1___, c4, a2__ * s4); | |
double a2____ = std::fma(-a1___, s4, a2__ * c4); | |
*(A + 0 * rows) = a0___; | |
*(A + 1 * rows) = a1____; | |
*(A + 2 * rows) = a2____; | |
*(A + 3 * rows) = a3__; | |
} | |
} | |
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
using BenchmarkTools | |
using LinearAlgebra | |
using LinearAlgebra: givensAlgorithm | |
""" | |
I want to apply 4 'fused' Givens rotations to 4 columns of matrix Q. Here Q | |
is a n x 4 matrix. In the benchmarks I compare the number of GFLOP/s when the | |
rotations are applied to Q directly (vertical) versus when Q is first | |
transposed (horizontal). | |
In the 'vertical' case: the access pattern is not contiguous. | |
In the 'horizontal' case: the access pattern is perfectly contiguous. | |
However, the generated code for the contiguous case does not use AVX operations. | |
My benchmark results: | |
""" | |
function bench_panel(n = 256) | |
G = random_fused_rotations() | |
BLAS.set_num_threads(1) | |
maxflops = peakflops() / 1e9 | |
flop = 24 * n # 24 flop per row of Q | |
t1 = @belapsed apply_fused_packed_horizontal!(Q, $G) setup = (Q = rand(4, $n)) | |
t2 = @belapsed apply_fused_packed_vertical!(Q, $G) setup = (Q = rand($n, 4)) | |
t3 = @belapsed apply_fused_packed_horizontal_cpp!(Q, $G) setup = (Q = rand(4, $n)) | |
t4 = @belapsed apply_fused_packed_vertical_cpp!(Q, $G) setup = (Q = rand($n, 4)) | |
return ( | |
flop / t1 / 1e9, | |
flop / t2 / 1e9, | |
flop / t3 / 1e9, | |
flop / t4 / 1e9, | |
maxflops | |
) | |
end | |
function test_equal_impl(n = 256) | |
G = random_fused_rotations() | |
Q1 = rand(4, n) | |
Q2 = copy(Q1) | |
apply_fused_packed_horizontal!(Q1, G) | |
apply_fused_packed_horizontal_cpp!(Q2, G) | |
@show norm(Q1 - Q2) | |
Q3 = rand(n, 4) | |
Q4 = copy(Q3) | |
apply_fused_packed_vertical!(Q3, G) | |
apply_fused_packed_vertical_cpp!(Q4, G) | |
@show norm(Q3 - Q4) | |
end | |
struct Fused2x2{Tc,Ts} | |
c1::Tc | |
s1::Ts | |
c2::Tc | |
s2::Ts | |
c3::Tc | |
s3::Ts | |
c4::Tc | |
s4::Ts | |
end | |
generate_rotation() = givensAlgorithm(rand(), rand())[1:2] | |
random_fused_rotations() = Fused2x2(generate_rotation()...,generate_rotation()...,generate_rotation()...,generate_rotation()...) | |
@inline function kernel(a0, a1, a2, a3, G::Fused2x2) | |
# Apply rotation 1 | |
a1′ = muladd( a1, G.c1, a2 * G.s1') | |
a2′ = muladd(-a1, G.s1, a2 * G.c1 ) | |
# Apply rotation 2 | |
a2′′ = muladd( a2′, G.c2, a3 * G.s2') | |
a3′′ = muladd(-a2′, G.s2, a3 * G.c2 ) | |
# Apply rotation 3 | |
a0′′′ = muladd( a0, G.c3, a1′ * G.s3') | |
a1′′′ = muladd(-a0, G.s3, a1′ * G.c3 ) | |
# Apply rotation 4 | |
a1′′′′ = muladd( a1′′′, G.c4, a2′′ * G.s4') | |
a2′′′′ = muladd(-a1′′′, G.s4, a2′′ * G.c4 ) | |
return a0′′′, a1′′′′, a2′′′′, a3′′ | |
end | |
function apply_fused_packed_horizontal!(Q, G) | |
@inbounds for j = axes(Q, 2) | |
a0 = Q[1, j] | |
a1 = Q[2, j] | |
a2 = Q[3, j] | |
a3 = Q[4, j] | |
a0′′′, a1′′′′, a2′′′′, a3′′ = kernel(a0, a1, a2, a3, G) | |
Q[1, j] = a0′′′ | |
Q[2, j] = a1′′′′ | |
Q[3, j] = a2′′′′ | |
Q[4, j] = a3′′ | |
end | |
end | |
function apply_fused_packed_vertical!(Q, G) | |
@inbounds for j = axes(Q, 1) | |
a0 = Q[j, 1] | |
a1 = Q[j, 2] | |
a2 = Q[j, 3] | |
a3 = Q[j, 4] | |
a0′′′, a1′′′′, a2′′′′, a3′′ = kernel(a0, a1, a2, a3, G) | |
Q[j, 1] = a0′′′ | |
Q[j, 2] = a1′′′′ | |
Q[j, 3] = a2′′′′ | |
Q[j, 4] = a3′′ | |
end | |
end | |
function apply_fused_packed_horizontal_cpp!(Q, G) | |
ccall((:fused_horizontal, "./givenlib.so"), Cvoid, | |
(Ptr{Float64}, Int64, Float64, Float64, Float64, Float64, Float64, Float64, Float64, Float64), | |
pointer(Q), size(Q,2), G.c1, G.s1, G.c2, G.s2, G.c3, G.s3, G.c4, G.s4) | |
end | |
function apply_fused_packed_vertical_cpp!(Q, G) | |
ccall((:fused_vertical, "./givenlib.so"), Cvoid, | |
(Ptr{Float64}, Int64, Float64, Float64, Float64, Float64, Float64, Float64, Float64, Float64), | |
pointer(Q), size(Q,1), G.c1, G.s1, G.c2, G.s2, G.c3, G.s3, G.c4, G.s4) | |
end |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment