Skip to content

Instantly share code, notes, and snippets.

@timholy
Created May 27, 2013 21:22
Show Gist options
  • Save timholy/5659152 to your computer and use it in GitHub Desktop.
Save timholy/5659152 to your computer and use it in GitHub Desktop.
Cache-friendly transpose comparison
using Winston
n = [ntuple(length(sizes), i->prod(sizes[i]))...]
for i = 1:length(Ts)
p = FramedPlot()
setattr(p.x1, "log", true)
r = results[:,2,i] ./ results[:,:,i]
r = r[:,[1,3,4]]
ccopy = Curve(n, r[:,1], "color", "green")
setattr(ccopy, "label", "copy")
cfftw = Curve(n, r[:,2], "color", "black")
setattr(cfftw, "label", "FFTW")
ccache = Curve(n, r[:,3], "color", "red")
setattr(ccache, "label", "cache-friendly")
add(p, ccopy)
add(p, cfftw)
add(p, ccache)
cref = Curve(n, ones(length(n)), "color", "black", "type", "dashed")
add(p, cref)
l = Legend(0.1, 0.9, {ccopy, cfftw, ccache})
add(p, l)
setattr(p, "title", string(Ts[i]))
setattr(p, "ylabel", "Fold advantage")
setattr(p, "xlabel", "# array elements")
setattr(p, "yrange", (0,7))
file(p, string(Ts[i])*".png")
end
sizes = [(2,2), (3,4), (8,6), (12,20), (50,25), (60,100), (110,200), (400,400), (759,821), (1543,1781), (3314,3145), (6500,6400)]
Ts = (Uint8, Uint16, Uint32, Uint64, Float32, Float64, Complex64, Complex128)
results = fill(NaN, length(sizes), 4, length(Ts))
for isz = 1:length(sizes)
sz = sizes[isz]
niter = iceil(10^7/prod(sz))
niter = min(niter, 10^4)
println("Running ", niter, " iterations of size ", sz)
for jT = 1:length(Ts)
T = Ts[jT]
m, n = sz
Atmp = reshape(1:m*n, m, n)
A = convert(Matrix{T}, Atmp)
# For reference, let's see what a direct copy takes
B = similar(A)
TestTranspose.copy_linear!(B, A)
results[isz,1,jT] = (@elapsed (for i = 1:niter TestTranspose.copy_linear!(B, A) end))/niter
# Now the transpose algorithms
B = A'
TestTranspose.transpose_linear!(B, A)
@assert B == A'
results[isz,2,jT] = (@elapsed (for i = 1:niter TestTranspose.transpose_linear!(B, A) end))/niter
if T <: FloatingPoint || T <: Complex
TestTranspose.transposefftw!(B, A)
@assert B == A'
results[isz,3,jT] = (@elapsed (for i = 1:niter TestTranspose.transposefftw!(B, A) end))/niter
end
TestTranspose.transposecf!(B, A)
@assert B == A'
results[isz,4,jT] = (@elapsed (for i = 1:niter TestTranspose.transposecf!(B, A) end))/niter
end
end
module TestTranspose
# Simple algorithm using linear indexing
function transpose_linear!(B::Matrix, A::Matrix)
(m,n) = size(A)
for i2 = 1:n
j = i2
offset = (j-1)*m
for i = offset+1:offset+m
B[j] = A[i]
j += n
end
end
B
end
# cache-friendly transpose
const szcacheline = 16
const szcache = 2^14
const szblock = ifloor(sqrt((szcache/szcacheline)./[1:16]))
function transposecf!{T<:Number}(B::Matrix{T}, A::Matrix{T})
m, n = size(A)
s = sizeof(T)
blocksize = (s <= length(szblock)) ? szblock[s] : ifloor(sqrt(szcache/szcacheline/sizeof(T)))
if m*n <= 30*blocksize*blocksize
# Use the simple algorithm
for i2 = 1:n
j = i2
offset = (j-1)*m
for i = offset+1:offset+m
B[j] = A[i]
j += n
end
end
return B
end
for outer2 = 1:blocksize:size(A, 2)
for outer1 = 1:blocksize:size(A, 1)
for inner2 = outer2:min(n,outer2+blocksize)
i = (inner2-1)*m + outer1
j = inner2 + (outer1-1)*n
for inner1 = outer1:min(m,outer1+blocksize)
B[j] = A[i]
i += 1
j += n
end
end
end
end
B
end
function copy_linear!(B::Matrix, A::Matrix)
for i = 1:length(A)
B[i] = A[i]
end
end
for (libname, fname, elty) in ((:(FFTW.libfftw) ,"fftw_plan_guru64_r2r",:Float64),
(:(FFTW.libfftwf),"fftwf_plan_guru64_r2r",:Float32))
@eval begin
function transposefftw!(P::Matrix{$elty}, X::Matrix{$elty})
(n2, n1) = size(X)
plan = ccall(($fname,$libname), Ptr{Void},
(Int32, Ptr{Int}, Int32, Ptr{Int}, Ptr{$elty}, Ptr{$elty}, Ptr{Int32}, Uint32),
0, C_NULL, 2, [n1,n2,1,n2,1,n1], X, P, [FFTW.HC2R], FFTW.ESTIMATE | FFTW.PRESERVE_INPUT)
FFTW.execute($elty, plan)
FFTW.destroy_plan($elty, plan)
return P
end
end
end
for (libname, fname, celty) in ((:(FFTW.libfftw) ,"fftw_plan_guru64_dft",:Complex128),
(:(FFTW.libfftwf),"fftwf_plan_guru64_dft",:Complex64))
@eval begin
function transposefftw!(P::Matrix{$celty}, X::Matrix{$celty})
(n2, n1) = size(X)
plan = ccall(($fname,$libname), Ptr{Void},
(Int32, Ptr{Int}, Int32, Ptr{Int}, Ptr{$celty}, Ptr{$celty}, Int32, Uint32),
0, C_NULL, 2, [n1,n2,1,n2,1,n1], X, P, FFTW.FORWARD, FFTW.ESTIMATE | FFTW.PRESERVE_INPUT)
FFTW.execute($celty, plan)
FFTW.destroy_plan($celty, plan)
return P
end
end
end
end
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment