Skip to content

Instantly share code, notes, and snippets.

@Roger-luo
Created June 11, 2020 02:09
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save Roger-luo/e7704108157ddea579e140354c4ba6df to your computer and use it in GitHub Desktop.
Save Roger-luo/e7704108157ddea579e140354c4ba6df to your computer and use it in GitHub Desktop.
using CUDA
using ExponentialUtilities
using LinearAlgebra
using BenchmarkTools
using ExponentialUtilities: getV, getH, get_cache, _exp!
using LinearAlgebra: BlasReal, BlasComplex
using SparseArrays
using CUDA: CUBLAS
CUDA.allowscalar(false)
# CUDA patch
# symmetric mul!
# level 2
@inline function LinearAlgebra.mul!(y::CuVector{T}, A::Hermitian{T,<:CuMatrix}, x::CuVector{T},
α::Number, β::Number) where {T<:BlasReal}
alpha, beta = promote(α, β, zero(T))
if alpha isa Union{Bool,T} && beta isa Union{Bool,T}
return CUBLAS.symv!(A.uplo, alpha, A.data, x, beta, y)
else
error("only supports BLAS type, got $T")
end
end
@inline function LinearAlgebra.mul!(y::CuVector{T}, A::Hermitian{T,<:CuMatrix}, x::CuVector{T},
α::Number, β::Number) where {T<:BlasComplex}
alpha, beta = promote(α, β, zero(T))
if alpha isa Union{Bool,T} && beta isa Union{Bool,T}
return CUBLAS.hemv!(A.uplo, alpha, A.data, x, beta, y)
else
error("only supports BLAS type, got $T")
end
end
# level 3
@inline function LinearAlgebra.mul!(C::CuMatrix{T}, A::Hermitian{T,<:CuMatrix}, B::CuMatrix{T},
α::Number, β::Number) where {T<:BlasReal}
alpha, beta = promote(α, β, zero(T))
if alpha isa Union{Bool,T} && beta isa Union{Bool,T}
return CUBLAS.symm!('L', A.uplo, alpha, A.data, B, beta, C)
else
error("only supports BLAS type, got $T")
end
end
@inline function LinearAlgebra.mul!(C::CuMatrix{T}, A::CuMatrix{T}, B::Hermitian{T,<:CuMatrix},
α::Number, β::Number) where {T<:BlasReal}
alpha, beta = promote(α, β, zero(T))
if alpha isa Union{Bool,T} && beta isa Union{Bool,T}
return CUBLAS.symm!('R', B.uplo, alpha, B.data, A, beta, C)
else
error("only supports BLAS type, got $T")
end
end
@inline function LinearAlgebra.mul!(C::CuMatrix{T}, A::Hermitian{T,<:CuMatrix}, B::CuMatrix{T},
α::Number, β::Number) where {T<:BlasComplex}
alpha, beta = promote(α, β, zero(T))
if alpha isa Union{Bool,T} && beta isa Union{Bool,T}
return CUBLAS.hemm!('L', A.uplo, alpha, A.data, B, beta, C)
else
error("only supports BLAS type, got $T")
end
end
@inline function LinearAlgebra.mul!(C::CuMatrix{T}, A::CuMatrix{T}, B::Hermitian{T,<:CuMatrix},
α::Number, β::Number) where {T<:BlasComplex}
alpha, beta = promote(α, β, zero(T))
if alpha isa Union{Bool,T} && beta isa Union{Bool,T}
return CUBLAS.hemm!('R', B.uplo, alpha, B.data, A, beta, C)
else
error("only supports BLAS type, got $T")
end
end
# CUDA.expv!
function ExponentialUtilities.expv!(w::CuVector{Tw}, t::Real, Ks::KrylovSubspace{T, U};
cache=nothing, dexpHe::CuVector = CuVector{U}(undef, Ks.m)) where {Tw, T, U}
m, beta, V, H = Ks.m, Ks.beta, getV(Ks), getH(Ks)
@assert length(w) == size(V, 1) "Dimension mismatch"
if cache == nothing
cache = Matrix{U}(undef, m, m)
elseif isa(cache, ExpvCache)
cache = get_cache(cache, m)
else
throw(ArgumentError("Cache must be an ExpvCache"))
end
copyto!(cache, @view(H[1:m, :]))
if ishermitian(cache)
# Optimize the case for symtridiagonal H
F = eigen!(SymTridiagonal(cache))
expHe = F.vectors * (exp.(lmul!(t,F.values)) .* @view(F.vectors[1, :]))
else
lmul!(t, cache); expH = cache
_exp!(expH)
expHe = @view(expH[:, 1])
end
copyto!(dexpHe, expHe)
lmul!(beta, mul!(w, @view(V[:, 1:m]), dexpHe)) # exp(A) ≈ norm(b) * V * exp(H)e
end
# m = 100
# A = rand(100, m);
# b = rand(m);
# t = 2.0
T = Float32
U = Float32
n = 10_000
krylov_niteration=min(30, n)
maxiter=30
t = 1e-2
dA = CUDA.rand(T, n, n);
db = CUDA.rand(T, n);
A = Array(dA);
b = Array(db);
st = similar(b);
dst = similar(db)
augmented=false
dV = CuMatrix{T}(undef, n + augmented, maxiter + 1);
H = fill(zero(U), maxiter + 1, maxiter + !iszero(augmented));
dKs = KrylovSubspace{T, U, real(T), CuMatrix{T}, Matrix{U}}(maxiter, maxiter, augmented, zero(real(T)), dV, H);
Ks = KrylovSubspace{T, U}(n, maxiter);
dexpHe = CuVector{T}(undef, maxiter)
t1 = @benchmark CUDA.@sync begin
arnoldi!(dKs, dA, db, ishermitian=false);
expv!($dst, t, $dKs; dexpHe=$dexpHe)
end
t2 = @benchmark begin
arnoldi!(Ks, A, b, ishermitian=false);
expv!($st, t, $Ks)
end
minimum(t2).time / minimum(t1).time
# Hermitian
A = Hermitian(randn(T, n, n));
b = randn(T, n);
dA = Hermitian(CuArray(A));
db = CuArray(b);
st = similar(b);
dst = similar(db)
augmented=false
dV = CuMatrix{T}(undef, n + augmented, maxiter + 1);
H = fill(zero(U), maxiter + 1, maxiter + !iszero(augmented));
dKs = KrylovSubspace{T, U, real(T), CuMatrix{T}, Matrix{U}}(maxiter, maxiter, augmented, zero(real(T)), dV, H);
Ks = KrylovSubspace{T, U}(n, maxiter);
t1 = @benchmark CUDA.@sync begin
arnoldi!(dKs, dA, db, ishermitian=true);
expv!($dst, t, $dKs; dexpHe=$dexpHe)
end
t2 = @benchmark begin
arnoldi!(Ks, A, b, ishermitian=true);
expv!($st, t, $Ks)
end
minimum(t2).time / minimum(t1).time
# sparse
A = sprandn(T, n, n, 0.2);
b = randn(T, n);
dA = CUDA.CUSPARSE.CuSparseMatrixCSR(A);
db = CuArray(b);
st = similar(b);
dst = similar(db)
augmented=false
dV = CuMatrix{T}(undef, n + augmented, maxiter + 1);
H = fill(zero(U), maxiter + 1, maxiter + !iszero(augmented));
dKs = KrylovSubspace{T, U, real(T), CuMatrix{T}, Matrix{U}}(maxiter, maxiter, augmented, zero(real(T)), dV, H);
Ks = KrylovSubspace{T, U}(n, maxiter);
t1 = @benchmark CUDA.@sync begin
arnoldi!(dKs, dA, db, ishermitian=false);
expv!($dst, t, $dKs; dexpHe=$dexpHe)
end
t2 = @benchmark begin
arnoldi!(Ks, A, b, ishermitian=false);
expv!($st, t, $Ks)
end
minimum(t2).time / minimum(t1).time
# sparse Hermitian
A = Hermitian(sprandn(T, n, n, 0.2));
b = randn(T, n);
dA = Hermitian(CUDA.CUSPARSE.CuSparseMatrixCSR(parent(A)));
db = CuArray(b);
st = similar(b);
dst = similar(db)
augmented=false
dV = CuMatrix{T}(undef, n + augmented, maxiter + 1);
H = fill(zero(U), maxiter + 1, maxiter + !iszero(augmented));
dKs = KrylovSubspace{T, U, real(T), CuMatrix{T}, Matrix{U}}(maxiter, maxiter, augmented, zero(real(T)), dV, H);
Ks = KrylovSubspace{T, U}(n, maxiter);
t1 = @benchmark CUDA.@sync begin
arnoldi!(dKs, dA, db, ishermitian=true);
expv!($dst, t, $dKs; dexpHe=$dexpHe)
end
t2 = @benchmark begin
arnoldi!(Ks, A, b, ishermitian=true);
expv!($st, t, $Ks)
end
minimum(t2).time / minimum(t1).time
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment