Skip to content

Instantly share code, notes, and snippets.

@andreasnoack
Created May 15, 2015 21:25
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save andreasnoack/914e9d0102347cdd0766 to your computer and use it in GitHub Desktop.
Save andreasnoack/914e9d0102347cdd0766 to your computer and use it in GitHub Desktop.
module LC
const MC = 384
const KC = 384
const NC = 4096
const MR = 4
const NR = 4
const _A = Array(Float64, MC*KC)
const _B = Array(Float64, KC*NC)
const _C = Array(Float64, MR*NR)
const _AB = Array(Float64, MR*NR)
const pAB = pointer(_AB)
p_A = pointer(_A)
p_B = pointer(_B)
p_C = pointer(_C)
function pack_MRxk(k::Integer, A::Array{Float64}, Aoffset::Integer, incRowA::Integer,
incColA::Integer, buffer::Array{Float64}, boffset::Integer)
@inbounds begin
for j = 1:k
@simd for i = 1:MR
buffer[boffset + i] = A[Aoffset + (i - 1)*incRowA + 1]
end
boffset += MR
Aoffset += incColA
end
end
end
function pack_A(mc::Integer, kc::Integer, A::Array{Float64}, Aoffset::Integer,
incRowA::Integer, incColA::Integer, buffer::Array{Float64})
@inbounds begin
mp = div(mc, MR)
_mr = mc % MR
boffset = 0
for i = 1:mp
pack_MRxk(kc, A, Aoffset, incRowA, incColA, buffer, boffset)
boffset += kc*MR
Aoffset += MR*incRowA
end
if _mr > 0
for j = 1:kc
for i = 1:_mr
buffer[boffset + i] = A[Aoffset + (i - 1)*incRowA + 1]
end
for i = _mr:MR
buffer[boffset + i] = 0.0
end
boffset += MR
Aoffset += incColA
end
end
end
end
function pack_kxNR(k::Integer, B::Array{Float64}, Boffset::Integer, incRowB::Integer,
incColB::Integer, buffer::Array{Float64}, boffset::Integer)
@inbounds begin
for i = 1:k
for j = 1:NR
buffer[boffset + j] = B[Boffset + (j - 1)*incColB + 1]
end
boffset += NR
Boffset += incRowB
end
end
end
function pack_B(kc::Integer, nc::Integer, B::Array{Float64}, Boffset::Integer, incRowB::Integer,
incColB::Integer, buffer::Array{Float64})
@inbounds begin
np = div(nc, NR)
_nr = nc % NR
boffset = 0
for j = 1:np
pack_kxNR(kc, B, Boffset, incRowB, incColB, buffer, boffset)
boffset += kc*NR
Boffset += NR*incColB
end
if _nr > 0
for i = 1:kc
for j = 1:_nr
buffer[boffset + j] = B[Boffset + (j - 1)*incColB + 1]
end
for j = _nr + 1:NR
buffer[boffset + j] = 0.0
end
boffset += NR
Boffset += incRowB
end
end
end
end
const cl = "~{esi},~{rax},~{rbx},~{rcx}"*string([",~{xmm$i}" for i = 0:15]...)
const asms =
"\"movl \$0, %esi \n\t"*
"movq \$1, %rax \n\t"*
"movq \$2, %rbx \n\t"*
"movq \$3, %rcx \n\t"*
"movapd (%rax), %xmm0 \n\t"*
"movapd 16(%rax), %xmm1 \n\t"*
"movapd (%rbx), %xmm2 \n\t"*
"xorpd %xmm8, %xmm8 \n\t"*
"xorpd %xmm9, %xmm9 \n\t"*
"xorpd %xmm10, %xmm10 \n\t"*
"xorpd %xmm11, %xmm11 \n\t"*
"xorpd %xmm12, %xmm12 \n\t"*
"xorpd %xmm13, %xmm13 \n\t"*
"xorpd %xmm14, %xmm14 \n\t"*
"xorpd %xmm15, %xmm15 \n\t"*
"xorpd %xmm3, %xmm3 \n\t"*
"xorpd %xmm4, %xmm4 \n\t"*
"xorpd %xmm5, %xmm5 \n\t"*
"xorpd %xmm6, %xmm6 \n\t"*
"xorpd %xmm7, %xmm7 \n\t"*
"testl %esi, %esi \n\t"*
"je .DWRITEBACK \n\t"*
".DLOOP: \n\t"*
"addpd %xmm3, %xmm12 \n\t"*
"movapd 16(%rbx), %xmm3 \n\t"*
"addpd %xmm6, %xmm13 \n\t"*
"movapd %xmm2, %xmm6 \n\t"*
"pshufd \$\$78,%xmm2,%xmm4 \n\t"*
"mulpd %xmm0, %xmm2 \n\t"*
"mulpd %xmm1, %xmm6 \n\t"*
"addpd %xmm5, %xmm14 \n\t"*
"addpd %xmm7, %xmm15 \n\t"*
"movapd %xmm4, %xmm7 \n\t"*
"mulpd %xmm0, %xmm4 \n\t"*
"mulpd %xmm1, %xmm7 \n\t"*
"addpd %xmm2, %xmm8 \n\t"*
"movapd 32(%rbx), %xmm2 \n\t"*
"addpd %xmm6, %xmm9 \n\t"*
"movapd %xmm3, %xmm6 \n\t"*
"pshufd \$\$78,%xmm3, %xmm5 \n\t"*
"mulpd %xmm0, %xmm3 \n\t"*
"mulpd %xmm1, %xmm6 \n\t"*
"addpd %xmm4, %xmm10 \n\t"*
"addpd %xmm7, %xmm11 \n\t"*
"movapd %xmm5, %xmm7 \n\t"*
"mulpd %xmm0, %xmm5 \n\t"*
"movapd 32(%rax), %xmm0 \n\t"*
"mulpd %xmm1, %xmm7 \n\t"*
"movapd 48(%rax), %xmm1 \n\t"*
"addq \$\$32, %rax \n\t"*
"addq \$\$32, %rbx \n\t"*
"decl %esi \n\t"*
"jne .DLOOP \n\t"*
"addpd %xmm3, %xmm12 \n\t"*
"addpd %xmm6, %xmm13 \n\t"*
"addpd %xmm5, %xmm14 \n\t"*
"addpd %xmm7, %xmm15 \n\t"*
".DWRITEBACK: \n\t"*
"movlpd %xmm8, (%rcx) \n\t"*
"movhpd %xmm10, 8(%rcx) \n\t"*
"movlpd %xmm9, 16(%rcx) \n\t"*
"movhpd %xmm11, 24(%rcx) \n\t"*
"addq \$\$32, %rcx \n\t"*
"movlpd %xmm10, (%rcx) \n\t"*
"movhpd %xmm8, 8(%rcx) \n\t"*
"movlpd %xmm11, 16(%rcx) \n\t"*
"movhpd %xmm9, 24(%rcx) \n\t"*
"addq \$\$32, %rcx \n\t"*
"movlpd %xmm12, (%rcx) \n\t"*
"movhpd %xmm14, 8(%rcx) \n\t"*
"movlpd %xmm13, 16(%rcx) \n\t"*
"movhpd %xmm15, 24(%rcx) \n\t"*
"addq \$\$32, %rcx \n\t"*
"movlpd %xmm14, (%rcx) \n\t"*
"movhpd %xmm12, 8(%rcx) \n\t"*
"movlpd %xmm15, 16(%rcx) \n\t"*
"movhpd %xmm13, 24(%rcx) \n\t\", \"r,r,r,r,$cl\"(i32 %0, double* %1, double* %2, double* %3)"
@inline function gemm_micro_kernel(kc::Integer, α::Float64, A::Array{Float64}, Aoffset::Integer,
B::Array{Float64}, Boffset::Integer, β::Float64, C::Array{Float64}, Coffset::Integer,
incRowC::Integer, incColC::Integer)
ccall(:jl_breakpoint, Any, (), )
Base.llvmcall("""
call void asm $asms
ret void""",
Void,
Tuple{Cint, Ptr{Float64}, Ptr{Float64}, Ptr{Float64}},
Cint(kc), pointer(_A, Aoffset + 1), pointer(_B, Boffset + 1), pointer(_AB))
# Update C <- beta*C
if β == 0.0
for j = 1:NR
for i =1:MR
C[Coffset + (i - 1)*incRowC + (j - 1)*incColC + 1] = 0.0
end
end
elseif β != 1.0
for j = 1:NR
for i = 1:MR
C[Coffset + (i - 1)*incRowC + (j - 1)*incColC + 1] *= β
end
end
end
# Update C <- C + alpha*AB (note: the case alpha==0.0 was already treated in
# the above layer dgemm_nn)
if α == 1.0
for j = 1:NR
for i = 1:MR
C[Coffset + (i - 1)*incRowC + (j - 1)*incColC + 1] += _AB[i + (j - 1)*MR]
end
end
else
for j = 1:NR
for i = 1:MR
C[Coffset + (i - 1)*incRowC + (j - 1)*incColC + 1] += α*_AB[i + (j - 1)*MR]
end
end
end
end
function geaxpy(m::Integer, n::Integer, α::Float64, X::Array{Float64}, incRowX::Integer,
incColX::Integer, Y::Array{Float64}, Yoffset::Integer, incRowY::Integer, incColY::Integer)
if α != 1.0
for j = 1:n
for i = 1:m
Y[Yoffset + (i - 1)*incRowY + (j - 1)*incColY + 1] = Y[(i - 1)*incRowY + (j - 1)*incColY + 1] + α*X[(i - 1)*incRowX + (j - 1)*incColX + 1]
end
end
else
for j = 1:n
for i = 1:m
Y[Yoffset + (i - 1)*incRowY + (j - 1)*incColY + 1] = Y[(i - 1)*incRowY + (j - 1)*incColY + 1] + X[(i - 1)*incRowX + (j - 1)*incColX + 1]
end
end
end
end
function gescal(m::Integer,
n::Integer,
α::Float64,
X::Array{Float64},
Xoffset::Integer,
incRowX::Integer,
incColX::Integer)
if α != 0.0
for j = 1:n
for i = 1:m
X[Xoffset + (i - 1)*incRowX + (j - 1)*incColX + 1] *= α
end
end
else
for j = 1:n
for i = 1:m
X[Xoffset + (i - 1)*incRowX + (j - 1)*incColX + 1] = 0.0
end
end
end
end
function gemm_macro_kernel(mc::Integer, nc::Integer, kc::Integer, α::Float64,
β::Float64, C::Array{Float64}, Coffset::Integer, incRowC::Integer, incColC::Integer)
mp = div(mc + MR - 1, MR)
np = div(nc + NR - 1, NR)
_mr = mc % MR
_nr = nc % NR
for j = 1:np
nr = (j != np || _nr == 0) ? NR : _nr
for i = 1:mp
mr = (i != mp || _mr == 0) ? MR : _mr
if mr == MR && nr == NR
gemm_micro_kernel(kc,
α,
_A,
(i - 1)*kc*MR,
_B,
(j - 1)*kc*NR,
β,
C,
Coffset + (i - 1)*MR*incRowC + (j - 1)*NR*incColC,
incRowC,
incColC)
else
gemm_micro_kernel(kc,
α,
_A,
(i - 1)*kc*MR,
_B,
(j - 1)*kc*NR,
0.0,
C,
Coffset,
1,
MR)
gescal(mr, nr, β, C, Coffset + (i - 1)*MR*incRowC + (j - 1)*NR*incColC, incRowC, incColC)
geaxpy(mr, nr, 1.0, _C, 1, MR, C, Coffset + (i - 1)*MR*incRowC + (j - 1)*NR*incColC, incRowC, incColC)
end
end
end
return C
end
function gemm_nn(m::Integer,
n::Integer,
k::Integer,
α::Float64,
A::Matrix{Float64},
incRowA::Integer,
incColA::Integer,
B::Matrix{Float64},
incRowB::Integer,
incColB::Integer,
β::Float64,
C::Matrix{Float64},
incRowC::Integer,
incColC::Integer)
mb = div(m + MC - 1, MC)
nb = div(n + NC - 1, NC)
kb = div(k + KC - 1, KC)
_mc = m % MC
_nc = n % NC
_kc = k % KC
if α == 0.0 || k == 0
gescal(m, n, β, C, incRowC, incColC)
return C
end
for j = 1:nb
nc = (j != nb || _nc == 0) ? NC : _nc
for l = 1:kb
kc = (l != kb || _kc == 0) ? KC : _kc
_β = l == 1 ? β : 1.0
pack_B(kc, nc, B, (l - 1)*KC*incRowB + (j - 1)*NC*incColB, incRowB, incColB, _B)
for i = 1:mb
mc = (i != mb || _mc == 0) ? MC : _mc
pack_A(mc, kc, A, (i - 1)*MC*incRowA + (l - 1)*KC*incColA, incRowA, incColA, _A)
gemm_macro_kernel(mc, nc, kc, α, _β, C, (i - 1)*MC*incRowC + (j - 1)*NC*incColC, incRowC, incColC)
end
end
end
return C
end
end
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment