#################################################################
# A simple implementation of TSQR (Tall and Skinny QR) in Julia #
# Jiahao Chen <jiahao@mit.edu>, 2015-08-03                      #
#################################################################
#
# TSQR is a communication-avoiding QR algorithm designed for
# distributed computation on tall, skinny matrices, which are
# typical of data matrices in Big Data applications.
#
# The primary reference is
# Communication-optimal parallel and sequential QR and LU factorizations,
# by James W. Demmel, Laura Grigori, Mark Frederick Hoemmen and Julien Langou,
# UCB/EECS-2008-89, August 4, 2008. LAPACK Working Note LAWNS 204,
# http://www.netlib.org/lapack/lawnspdf/lawn204.pdf
#
# More recent literature has focused on how communication-avoiding QR
# can be implemented easily using mapreduce, e.g. David Gleich's presentation
#
# http://www.slideshare.net/dgleich/whatyoucandowithtsqrfactorization
#
# This file contains a very simple _serial_ implementation of TSQR to test
# how it fares against the ordinary dense QR algorithm based on Householder
# reflectors.
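#
# To see the key identity before diving into the implementation: if M is
# split into row blocks [M1; M2], then stacking the blocks' R factors and
# QR-factoring the stack recovers the R factor of M itself, up to the signs
# of the rows. A minimal sketch (the names A, A1, A2 are illustrative only):
let
    A = randn(8, 3)
    A1, A2 = A[1:4, :], A[5:8, :]
    R1 = qrfact(A1)[:R]
    R2 = qrfact(A2)[:R]
    Rstacked = qrfact([R1; R2])[:R] #the reduce step of TSQR
    Rdirect  = qrfact(A)[:R]        #ordinary QR of the whole matrix
    @assert norm(abs(Rstacked) - abs(Rdirect), 1) ≤ √eps()
end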
##############################
# TSQR Version 1: Use fancy QR
# Wrapper for DTPQRT, the "triangular-pentagonal" QR introduced in LAPACK 3.4
import Base.LinAlg.BlasInt
#dtpqrt (M, N, L, NB, A, LDA, B, LDB, T, LDT, WORK, INFO)
function tpqrt!(l::Int, nb::Int, A::StridedMatrix{Float64},
                B::StridedMatrix{Float64})
    #chkstride1(A)
    n = size(A, 2) #order of the triangular matrix A
    m = size(B, 1) #number of rows of the pentagonal matrix B
    @assert n ≥ nb ≥ 1
    info = Array(BlasInt, 1)
    T = Array(Float64, nb, n)
    work = Array(Float64, nb*n)
    #TODO replace ccall symbols with global BLAS settings
    ccall((:dtpqrt_64_, :libopenblas), Void,
          (Ptr{BlasInt}, Ptr{BlasInt}, Ptr{BlasInt}, Ptr{BlasInt},
           Ptr{Float64}, Ptr{BlasInt}, Ptr{Float64}, Ptr{BlasInt},
           Ptr{Float64}, Ptr{BlasInt}, Ptr{Float64}, Ptr{BlasInt}),
          &m, &n, &l, &nb, A, &max(1,stride(A,2)), B, &max(1,stride(B,2)),
          T, &max(1,stride(T,2)), work, info)
    #Base.LinAlg.@lapackerror
    #XXX The UpperTriangular annotation is correct only for square A.
    #    In general, A will be upper trapezoidal, but we don't have a type for this yet
    UpperTriangular(A), B, T
end
# Simplify the method signature when called on two upper triangular matrices
tpqrt!{T}(A::UpperTriangular{T}, B::UpperTriangular{T}, nb::Int=size(A, 2)) =
    tpqrt!(size(A, 1), nb, A.data, B.data)
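#A quick sanity check of the wrapper against naive stacking (the names Ptri,
#Qtri, Rref are illustrative only; this assumes the :dtpqrt_64_ symbol used
#above resolves in the local libopenblas):
let
    Ptri = UpperTriangular(triu(randn(4, 4)))
    Qtri = UpperTriangular(triu(randn(4, 4)))
    Rref = qrfact([full(Ptri); full(Qtri)])[:R] #reference R from naive stacking
    Rtp  = tpqrt!(Ptri, Qtri)[1]                #overwrites Ptri and Qtri in place
    @assert norm(abs(full(Rtp)) - abs(Rref), 1) ≤ √eps()
end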
#The input function to map: Do ordinary QR, extract R part, and wrap with UpperTriangular annotation
qrl!(M) = UpperTriangular(qrfact!(M)[:R])
#The input function to reduce: Do the fancy QR specialized for "triangular-pentagonal" matrices
#In this case, we only need to reduce over matrices which look like [◥; ◥]
#vc = "vcat"
vc(P, Q) = vc(UpperTriangular(P), UpperTriangular(Q))
vc{T}(P::UpperTriangular{T}, Q::UpperTriangular{T}) = tpqrt!(P, Q)[1]
function tsqr!(M, bsize::Int)
    m = size(M, 1)
    nblocks = ceil(Int, m/bsize)
    mapreduce(qrl!, vc, qrl!(M[1:min(bsize, m), :]),
              [sub(M, ((i-1)*bsize+1):min(i*bsize, m), :) for i=2:nblocks])
end
#########################
# Version 2: Use naive QR
#Reduction using naive QR, constructing [◥; ◥] explicitly
vc2(P, Q) = UpperTriangular(qrfact!([P; Q])[:R])
function tsqr2!(M, bsize::Int)
    m = size(M, 1)
    nblocks = ceil(Int, m/bsize)
    mapreduce(qrl!, vc2, qrl!(M[1:min(bsize, m), :]),
              [sub(M, ((i-1)*bsize+1):min(i*bsize, m), :) for i=2:nblocks])
end
#Test problem
bsize = 500
M  = randn(10000, bsize)
M2 = copy(M)
M3 = copy(M)
#Print timings
info("TSQR with fancy QR - DTPQRT")
@time R1 = tsqr!(M, bsize)
info("TSQR with naive QR")
@time R2 = tsqr2!(M2, bsize)
info("Ordinary QR")
@time R3 = qrl!(M3)
#Check correctness: R factors of the same matrix agree up to the sign of each row
@assert norm(abs(R1) - abs(R3), 1) ≤ √eps()
@assert norm(abs(R2) - abs(R3), 1) ≤ √eps()
# Results on my MacBook Pro (i5):
#  TSQR, fancy QR (DTPQRT): 2.832194 seconds (591.17 k allocations: 146.427 MB, 2.00% gc time)
#  TSQR, naive QR:          0.893771 seconds (115.54 k allocations: 201.176 MB, 2.49% gc time)
#  Ordinary QR:             0.262478 seconds (163 allocations: 2.189 MB)
#The ordinary QR is undoubtedly the fastest and most memory-efficient thing
#to do when the entire matrix fits in memory.
#TSQR using the fancy QR allocates less memory than the naive variant but
#takes the longest; perhaps the implementation of DTPQRT is less efficient.
#It might be possible to rewrite the fancy QR to reuse intermediates,
#but then one has to propagate state through the mapreduce, which seems clumsy.
#TSQR using naive QR uses more memory but is significantly faster than
#the version calling DTPQRT.
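#One hedged sketch of what reusing intermediates might look like: replace the
#mapreduce with an explicit loop, so that the running R factor (and, given a
#suitably extended wrapper, the T and work buffers) could be hoisted out of
#the iteration. tsqr3! is a hypothetical name; as written it still allocates
#T and work inside each tpqrt! call.
function tsqr3!(M, bsize::Int)
    m = size(M, 1)
    R = qrl!(M[1:min(bsize, m), :])
    for i = 2:ceil(Int, m/bsize)
        Rblk = qrl!(M[((i-1)*bsize+1):min(i*bsize, m), :])
        R = tpqrt!(R, Rblk)[1] #a buffer-reusing variant of tpqrt! would go here
    end
    R
end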