@dmbates, created May 14, 2012
Least squares on distributed arrays using the Cholesky decomposition
for (potrs, elty) in (("dpotrs_", :Float64), ("spotrs_", :Float32),
                      ("zpotrs_", :Complex128), ("cpotrs_", :Complex64))
    @eval begin
        # Solve A*X = B from the Cholesky factor stored in A by calling the
        # LAPACK xPOTRS routine for this element type.
        function _jl_lapack_potrs(uplo, n, nrhs, A::StridedMatrix{$elty}, lda, B, ldb)
            info = Array(Int32, 1)
            ccall(dlsym(_jl_liblapack, $potrs), Void,
                  (Ptr{Uint8}, Ptr{Int32}, Ptr{Int32}, Ptr{$elty}, Ptr{Int32},
                   Ptr{$elty}, Ptr{Int32}, Ptr{Int32}),
                  uplo, &n, &nrhs, A, &lda, B, &ldb, info)
            return info[1]
        end
    end
end

function (\)(A::DArray{Float64,2,1}, B::DArray{Float64,1,1})
    if (any(procs(A) != procs(B)) || any(dist(A) != dist(B)))
        error("Arrays A and B must be distributed similarly")
    end
    # Form the normal equations: each process computes the crossproducts of its
    # local blocks and the partial results are summed with mapreduce.
    AtA = mapreduce(+, fetch, {@spawnat p _jl_syrk('T', localize(A)) for p in procs(A)})
    AtB = mapreduce(+, fetch, {@spawnat p localize(A)' * localize(B) for p in procs(A)})
    R = chol(AtA)
    # Solve R'R*x = A'B in place; AtB is overwritten with the solution.
    info = _jl_lapack_potrs("U", size(R, 2), 1, R, stride(R, 2), AtB, size(AtB, 1))
    if info == 0; return AtB; end
    if info > 0; error("matrix not positive definite"); end
    error("error in CHOL")
end

My objective is to solve least squares problems with distributed arrays. The model matrix is distributed on the first dimension and the response vector is distributed in a compatible way. The calculation of A'*A and A'*B can be distributed with a mapreduce.
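
To spell out why the mapreduce step works: if the rows of A and the corresponding elements of B are split into local blocks A_p and B_p, one block per process, the crossproducts are simply sums of the local crossproducts,

A^\top A = \sum_p A_p^\top A_p, \qquad A^\top B = \sum_p A_p^\top B_p,

and the least squares solution is then recovered on the master process by solving R'R x = A'B, where R is the Cholesky factor of A'A.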

The initial for loop generates a series of methods for the Julia function _jl_lapack_potrs, each of which calls the appropriate xPOTRS subroutine in LAPACK.
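
Concretely, for the Float64 case the loop substitutes potrs = "dpotrs_" and elty = :Float64, so the method it generates is essentially the following (just the expansion of the @eval body above, shown for reference):

function _jl_lapack_potrs(uplo, n, nrhs, A::StridedMatrix{Float64}, lda, B, ldb)
    info = Array(Int32, 1)
    # call LAPACK's dpotrs_ to solve from a precomputed Cholesky factor
    ccall(dlsym(_jl_liblapack, "dpotrs_"), Void,
          (Ptr{Uint8}, Ptr{Int32}, Ptr{Int32}, Ptr{Float64}, Ptr{Int32},
           Ptr{Float64}, Ptr{Int32}, Ptr{Int32}),
          uplo, &n, &nrhs, A, &lda, B, &ldb, info)
    return info[1]
end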

I suppose that one could be smarter about allowing B to be a distributed array of 1 or 2 dimensions, but this is all I need at the moment.
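
A sketch of the two-dimensional case, untested and reusing the same pieces as above; the only substantive changes are the type of B in the signature and passing size(B, 2) as nrhs:

function (\)(A::DArray{Float64,2,1}, B::DArray{Float64,2,1})
    if (any(procs(A) != procs(B)) || any(dist(A) != dist(B)))
        error("Arrays A and B must be distributed similarly")
    end
    AtA = mapreduce(+, fetch, {@spawnat p _jl_syrk('T', localize(A)) for p in procs(A)})
    AtB = mapreduce(+, fetch, {@spawnat p localize(A)' * localize(B) for p in procs(A)})
    R = chol(AtA)
    # nrhs is now the number of columns of B and AtB is n-by-nrhs
    info = _jl_lapack_potrs("U", size(R, 2), size(B, 2), R, stride(R, 2), AtB, size(AtB, 1))
    if info == 0; return AtB; end
    if info > 0; error("matrix not positive definite"); end
    error("error in CHOL")
end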

There is a satisfying performance boost from the distributed arrays. Using

julia -p 3

on a 4-core processor I get

julia> sum([@elapsed X \ Y for i=1:5])
1.4875037670135498

julia> sum([@elapsed dX \ dY for i=1:5])
0.44181084632873535

where dX is a DArray{Float64,2,1} of dimension (9999,100) and size(dY) is (9999,). The arrays X and Y are the conversions of dX and dY to plain-old arrays. I ran this timing in version

Version 0.0.0+86067061.rd8be
Commit d8be98d19e (2012-05-15 02:10:07)

I should mention that part of the performance boost is due to using a Cholesky decomposition instead of the QR decomposition used in the \ operator for rectangular left-hand sides.
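
For context, the standard operation counts (not measurements from this code) for an m-by-n model matrix with m much larger than n are roughly

m n^2 + n^3/3 \quad \text{(form A'A and factor it)} \qquad \text{vs.} \qquad 2 m n^2 - 2 n^3/3 \quad \text{(Householder QR of A)},

so the normal-equations route does about half the floating-point work in this regime. The usual trade-off is that it squares the condition number of A, so QR remains the safer choice for ill-conditioned problems.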
