Created
August 4, 2015 05:50
-
-
Save jiahao/ad788965335d25972e2d to your computer and use it in GitHub Desktop.
Tall and skinny QR in Julia
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#################################################################
# A simple implementation of TSQR (Tall and Skinny QR) in Julia #
# Jiahao Chen <jiahao@mit.edu>, 2015-08-03                      #
#################################################################
#
# TSQR is a communication-avoiding QR algorithm designed for
# distributed computation on tall, skinny matrices, which are
# typical of data matrices in Big Data applications.
#
# The primary reference is
# Communication-optimal parallel and sequential QR and LU factorizations,
# by James W. Demmel, Laura Grigori, Mark Frederick Hoemmen and Julien Langou,
# UCB/EECS-2008-89, August 4, 2008. LAPACK Working Note LAWNS 204,
# http://www.netlib.org/lapack/lawnspdf/lawn204.pdf
#
# More recent literature has focused on how communication-avoiding QR
# can be implemented easily using mapreduce, e.g. David Gleich's presentation
#
# http://www.slideshare.net/dgleich/whatyoucandowithtsqrfactorization
#
# This file contains a very simple _serial_ implementation of TSQR to test
# how it fares with the ordinary dense QR algorithm using Householder reflectors.
##############################
# TSQR Version 1: Use fancy QR
# LAPACK wrapper to wrap the fancy QR from LAPACK 3.4
import Base.LinAlg.BlasInt | |
#dtpqrt (M, N, L, NB, A, LDA, B, LDB, T, LDT, WORK, INFO) | |
function tpqrt!(l::Int, nb::Int, A::StridedMatrix{Float64},
                B::StridedMatrix{Float64})
    #Compute the QR factorization of the "triangular-pentagonal" matrix [A; B]
    #in place by calling LAPACK's DTPQRT (new in LAPACK 3.4).
    #
    # l  - LAPACK's L parameter: rows of the trapezoidal part of B
    #      (l = 0 means B is a full rectangular block) — see DTPQRT docs
    # nb - internal block size of the blocked algorithm, 1 ≤ nb ≤ n
    #
    #Returns (UpperTriangular(A), B, T): A is overwritten with the R factor,
    #B with the Householder reflector vectors, and T holds the nb-by-n
    #triangular block-reflector factors.
    #chkstride1(A)
    m, n = size(A)
    #Validate the block size BEFORE allocating any workspace
    n ≥ nb ≥ 1 || throw(ArgumentError("block size nb=$nb must satisfy 1 ≤ nb ≤ n=$n"))
    info = Array(BlasInt, 1)
    T = Array(Float64, nb, n)
    work = Array(Float64, nb*n)   #DTPQRT requires an NB*N workspace
    #TODO replace ccall symbols with global BLAS settings
    ccall((:dtpqrt_64_, :libopenblas), Void,
        (Ptr{BlasInt}, Ptr{BlasInt}, Ptr{BlasInt}, Ptr{BlasInt},
         Ptr{Float64}, Ptr{BlasInt}, Ptr{Float64}, Ptr{BlasInt},
         Ptr{Float64}, Ptr{BlasInt}, Ptr{Float64}, Ptr{BlasInt}),
        &m, &n, &l, &nb, A, &max(1,stride(A,2)), B, &max(1,stride(B,2)),
        T, &max(1,stride(T,2)), work, info)
    #Surface LAPACK failures instead of silently ignoring INFO
    info[1] == 0 || error("LAPACK dtpqrt returned error code $(info[1])")
    #XXX The UpperTriangular annotation is correct only for square A.
    # In general, A will be an upper trapezoidal matrix, but we don't have a type for this yet
    UpperTriangular(A), B, T
end
# Convenience method: factorize a stacked pair of square upper triangular
# matrices, defaulting the block size to the full width of the first one.
function tpqrt!{T}(R1::UpperTriangular{T}, R2::UpperTriangular{T}, nb::Int=size(R1, 2))
    tpqrt!(size(R1, 1), nb, R1.data, R2.data)
end
#Map step: run an ordinary in-place dense QR, keep only the R factor,
#and tag it with the UpperTriangular annotation.
function qrl!(M)
    F = qrfact!(M)
    return UpperTriangular(F[:R])
end
#Reduce step: the fancy QR specialized for "triangular-pentagonal" matrices.
#Here the reduction only ever sees stacked pairs that look like [◥; ◥].
#vc = "vcat"
vc{T}(P::UpperTriangular{T}, Q::UpperTriangular{T}) = tpqrt!(P, Q)[1]
#Fallback: wrap plain matrices in the UpperTriangular annotation first
vc(P, Q) = vc(UpperTriangular(P), UpperTriangular(Q))
#TSQR driver, version 1: map qrl! over row blocks of at most bsize rows,
#then reduce the per-block R factors with the DTPQRT-based vc.
function tsqr!(M, bsize::Int)
    nrows = size(M, 1)
    nblocks = ceil(Int, nrows/bsize)
    #First block seeds the reduction; remaining blocks are lazy subarrays
    seed = qrl!(M[1:min(bsize, nrows), :])
    tail = [sub(M, ((k-1)*bsize+1):min(k*bsize, nrows), :) for k=2:nblocks]
    mapreduce(qrl!, vc, seed, tail)
end
#########################
# Version 2: Use naive QR
#Reduce step without DTPQRT: stack the two triangles explicitly as [◥; ◥]
#and run an ordinary dense QR on the result, keeping only R.
function vc2(P, Q)
    stacked = [P; Q]
    UpperTriangular(qrfact!(stacked)[:R])
end
#TSQR driver, version 2: same block structure as tsqr!, but the reduction
#uses the naive stacked-QR (vc2) instead of the DTPQRT wrapper.
function tsqr2!(M, bsize::Int)
    nrows = size(M, 1)
    nblocks = ceil(Int, nrows/bsize)
    seed = qrl!(M[1:min(bsize, nrows), :])
    tail = [sub(M, ((k-1)*bsize+1):min(k*bsize, nrows), :) for k=2:nblocks]
    mapreduce(qrl!, vc2, seed, tail)
end
#Test problem: a tall, skinny random matrix; each variant gets its own copy
#since all three factorizations work in place
bsize = 500
M  = randn(10000, bsize)
M2 = copy(M)
M3 = copy(M)

#Time each variant
info("TSQR with fancy QR - DTPQRT")
@time R1 = tsqr!(M, bsize)
info("TSQR with naive QR")
@time R2 = tsqr2!(M2, bsize)
info("Ordinary QR")
@time R3 = qrl!(M3)

#R is unique only up to the signs of its rows, so compare entrywise
#absolute values against the ordinary QR result
@assert norm(abs(R1) - abs(R3), 1) ≤ √eps()
@assert norm(abs(R2) - abs(R3), 1) ≤ √eps()
# Results on my MacBook Pro i5
# 2.832194 seconds (591.17 k allocations: 146.427 MB, 2.00% gc time)
# 0.893771 seconds (115.54 k allocations: 201.176 MB, 2.49% gc time)
# 0.262478 seconds (163 allocations: 2.189 MB)
#The ordinary QR is undoubtedly the fastest and most memory-efficient thing
#to do when the entire matrix fits in memory.
#TSQR using the fancy QR uses less memory but takes the longest.
#Is the implementation of DTPQRT less efficient?
#It might be possible to rewrite the fancy QR to reuse intermediates,
#but then one has to propagate state through the mapreduce, which seems clumsy.
#TSQR using naive QR uses more memory but is significantly faster than
#the version calling DTPQRT.
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment