Created
May 27, 2013 21:22
-
-
Save timholy/5659152 to your computer and use it in GitHub Desktop.
Cache-friendly transpose comparison
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
using Winston | |
n = [ntuple(length(sizes), i->prod(sizes[i]))...] | |
for i = 1:length(Ts) | |
p = FramedPlot() | |
setattr(p.x1, "log", true) | |
r = results[:,2,i] ./ results[:,:,i] | |
r = r[:,[1,3,4]] | |
ccopy = Curve(n, r[:,1], "color", "green") | |
setattr(ccopy, "label", "copy") | |
cfftw = Curve(n, r[:,2], "color", "black") | |
setattr(cfftw, "label", "FFTW") | |
ccache = Curve(n, r[:,3], "color", "red") | |
setattr(ccache, "label", "cache-friendly") | |
add(p, ccopy) | |
add(p, cfftw) | |
add(p, ccache) | |
cref = Curve(n, ones(length(n)), "color", "black", "type", "dashed") | |
add(p, cref) | |
l = Legend(0.1, 0.9, {ccopy, cfftw, ccache}) | |
add(p, l) | |
setattr(p, "title", string(Ts[i])) | |
setattr(p, "ylabel", "Fold advantage") | |
setattr(p, "xlabel", "# array elements") | |
setattr(p, "yrange", (0,7)) | |
file(p, string(Ts[i])*".png") | |
end |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
sizes = [(2,2), (3,4), (8,6), (12,20), (50,25), (60,100), (110,200), (400,400), (759,821), (1543,1781), (3314,3145), (6500,6400)] | |
Ts = (Uint8, Uint16, Uint32, Uint64, Float32, Float64, Complex64, Complex128) | |
results = fill(NaN, length(sizes), 4, length(Ts)) | |
for isz = 1:length(sizes) | |
sz = sizes[isz] | |
niter = iceil(10^7/prod(sz)) | |
niter = min(niter, 10^4) | |
println("Running ", niter, " iterations of size ", sz) | |
for jT = 1:length(Ts) | |
T = Ts[jT] | |
m, n = sz | |
Atmp = reshape(1:m*n, m, n) | |
A = convert(Matrix{T}, Atmp) | |
# For reference, let's see what a direct copy takes | |
B = similar(A) | |
TestTranspose.copy_linear!(B, A) | |
results[isz,1,jT] = (@elapsed (for i = 1:niter TestTranspose.copy_linear!(B, A) end))/niter | |
# Now the transpose algorithms | |
B = A' | |
TestTranspose.transpose_linear!(B, A) | |
@assert B == A' | |
results[isz,2,jT] = (@elapsed (for i = 1:niter TestTranspose.transpose_linear!(B, A) end))/niter | |
if T <: FloatingPoint || T <: Complex | |
TestTranspose.transposefftw!(B, A) | |
@assert B == A' | |
results[isz,3,jT] = (@elapsed (for i = 1:niter TestTranspose.transposefftw!(B, A) end))/niter | |
end | |
TestTranspose.transposecf!(B, A) | |
@assert B == A' | |
results[isz,4,jT] = (@elapsed (for i = 1:niter TestTranspose.transposecf!(B, A) end))/niter | |
end | |
end |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
module TestTranspose | |
# Simple algorithm using linear indexing | |
function transpose_linear!(B::Matrix, A::Matrix) | |
(m,n) = size(A) | |
for i2 = 1:n | |
j = i2 | |
offset = (j-1)*m | |
for i = offset+1:offset+m | |
B[j] = A[i] | |
j += n | |
end | |
end | |
B | |
end | |
# cache-friendly transpose | |
const szcacheline = 16 | |
const szcache = 2^14 | |
const szblock = ifloor(sqrt((szcache/szcacheline)./[1:16])) | |
function transposecf!{T<:Number}(B::Matrix{T}, A::Matrix{T}) | |
m, n = size(A) | |
s = sizeof(T) | |
blocksize = (s <= length(szblock)) ? szblock[s] : ifloor(sqrt(szcache/szcacheline/sizeof(T))) | |
if m*n <= 30*blocksize*blocksize | |
# Use the simple algorithm | |
for i2 = 1:n | |
j = i2 | |
offset = (j-1)*m | |
for i = offset+1:offset+m | |
B[j] = A[i] | |
j += n | |
end | |
end | |
return B | |
end | |
for outer2 = 1:blocksize:size(A, 2) | |
for outer1 = 1:blocksize:size(A, 1) | |
for inner2 = outer2:min(n,outer2+blocksize) | |
i = (inner2-1)*m + outer1 | |
j = inner2 + (outer1-1)*n | |
for inner1 = outer1:min(m,outer1+blocksize) | |
B[j] = A[i] | |
i += 1 | |
j += n | |
end | |
end | |
end | |
end | |
B | |
end | |
function copy_linear!(B::Matrix, A::Matrix) | |
for i = 1:length(A) | |
B[i] = A[i] | |
end | |
end | |
for (libname, fname, elty) in ((:(FFTW.libfftw) ,"fftw_plan_guru64_r2r",:Float64), | |
(:(FFTW.libfftwf),"fftwf_plan_guru64_r2r",:Float32)) | |
@eval begin | |
function transposefftw!(P::Matrix{$elty}, X::Matrix{$elty}) | |
(n2, n1) = size(X) | |
plan = ccall(($fname,$libname), Ptr{Void}, | |
(Int32, Ptr{Int}, Int32, Ptr{Int}, Ptr{$elty}, Ptr{$elty}, Ptr{Int32}, Uint32), | |
0, C_NULL, 2, [n1,n2,1,n2,1,n1], X, P, [FFTW.HC2R], FFTW.ESTIMATE | FFTW.PRESERVE_INPUT) | |
FFTW.execute($elty, plan) | |
FFTW.destroy_plan($elty, plan) | |
return P | |
end | |
end | |
end | |
for (libname, fname, celty) in ((:(FFTW.libfftw) ,"fftw_plan_guru64_dft",:Complex128), | |
(:(FFTW.libfftwf),"fftwf_plan_guru64_dft",:Complex64)) | |
@eval begin | |
function transposefftw!(P::Matrix{$celty}, X::Matrix{$celty}) | |
(n2, n1) = size(X) | |
plan = ccall(($fname,$libname), Ptr{Void}, | |
(Int32, Ptr{Int}, Int32, Ptr{Int}, Ptr{$celty}, Ptr{$celty}, Int32, Uint32), | |
0, C_NULL, 2, [n1,n2,1,n2,1,n1], X, P, FFTW.FORWARD, FFTW.ESTIMATE | FFTW.PRESERVE_INPUT) | |
FFTW.execute($celty, plan) | |
FFTW.destroy_plan($celty, plan) | |
return P | |
end | |
end | |
end | |
end | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment