Skip to content

Instantly share code, notes, and snippets.

@willtebbutt
Last active February 6, 2022 14:39
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save willtebbutt/cc268e614d2b73156f5bc24e7408613d to your computer and use it in GitHub Desktop.
Save willtebbutt/cc268e614d2b73156f5bc24e7408613d to your computer and use it in GitHub Desktop.
GPs on the GPU
using Revise
using AbstractGPs
using BenchmarkTools
using CUDA
using KernelFunctions
using LinearAlgebra
using Random
using AbstractGPs: AbstractGP, FiniteGP, ZeroMean
CUDA.allowscalar(false)
x_cpu = ColVecs(randn(25, 1_000))
x_gpu = ColVecs(cu(x_cpu.X))
function KernelFunctions.kernelmatrix(::SEKernel, x::ColVecs{<:Real, <:CuMatrix{<:Real}})
X = x.X
X_ = sum(abs2, X; dims=1)
D = X_ .+ X_' .- 2 .* X' * X
return exp.(.-D ./ 2)
end
function benchmark_stuff()
K_cpu = kernelmatrix(SEKernel(), x_cpu)
K_gpu = collect(kernelmatrix(SEKernel(), x_gpu))
@show maximum(abs.(K_cpu .- K_gpu))
println("CPU")
display(@benchmark kernelmatrix(SEKernel(), $x_cpu))
println()
println("GPU")
display(@benchmark kernelmatrix(SEKernel(), $x_gpu))
println()
end
function AbstractGPs._map_meanfunction(::ZeroMean{T}, x::ColVecs{<:Real, <:CuMatrix{<:Real}}) where {T}
return CUDA.zeros(T, length(x))
end
# We could easily add a hook for the thing that generates ε into our current code, to make specialisation easier here.
function AbstractGPs.rand(
rng::AbstractRNG, f::FiniteGP{<:AbstractGP, <:ColVecs{<:Real, <:CuMatrix}}, N::Int
)
m, C_mat = mean_and_cov(f)
C = cholesky(AbstractGPs._symmetric(C_mat))
ε = CUDA.randn(promote_type(eltype(m), eltype(C)), length(m), N)
return m .+ C.U' * ε
end
function sample_gp_cpu(x)
f = GP(SEKernel())
return rand(Xoshiro(1), f(x, 0.1))
end
function sample_gp_gpu(x)
f = GP(ZeroMean{Float32}(), SEKernel())
S = 0.1f0 * CUDA.ones(length(x))
return vec(rand(Xoshiro(1), f(x, S), 1))
end
function benchmark_sampling()
println("CPU")
display(@benchmark sample_gp_cpu($x_cpu))
println()
println("GPU")
display(@benchmark sample_gp_gpu($x_gpu))
println()
end
# Looks like CUDA is missing this definition. Shouldn't be a problem to add.
function LinearAlgebra.logdet(C::Cholesky{<:Real, <:CuMatrix{<:Real}})
return 2 * sum(diag(log.(C.factors)))
end
function logpdf_gp_cpu(x, y)
f = GP(SEKernel())
return logpdf(f(x, 0.1), y)
end
function logpdf_gp_gpu(x, y)
f = GP(ZeroMean{Float32}(), SEKernel())
S = 0.1f0 * CUDA.ones(length(x))
return logpdf(f(x, S), y)
end
function benchmark_logpdf()
y_cpu = sample_gp_cpu(x_cpu)
y_gpu = sample_gp_gpu(x_gpu)
println("CPU")
display(@benchmark logpdf_gp_cpu($x_cpu, $y_cpu))
println()
@show logpdf_gp_gpu(x_gpu, y_gpu)
println("GPU")
display(@benchmark logpdf_gp_gpu($x_gpu, $y_gpu))
println()
end
Revise.track(@__FILE__)
# julia> benchmark_sampling()
# CPU
# BenchmarkTools.Trial: 157 samples with 1 evaluation.
# Range (min … max): 28.447 ms … 41.251 ms ┊ GC (min … max): 0.00% … 0.00%
# Time (median): 29.841 ms ┊ GC (median): 0.00%
# Time (mean ± σ): 31.895 ms ± 3.375 ms ┊ GC (mean ± σ): 5.69% ± 7.03%
# ▄█ █
# ██▄▁▁▁▃█▇▁▁▃▁▁▁▁▁▁▁▁▁▁▁▁▁▁▃▅▅▄▅▃▁▃▄▃▁▁▅▁▄▃▁▁▁▃▃▆▃▃▆▃▃▁▃▃▁▁▃ ▃
# 28.4 ms Histogram: frequency by time 38.6 ms <
# Memory estimate: 30.56 MiB, allocs estimate: 24.
# GPU
# BenchmarkTools.Trial: 1963 samples with 1 evaluation.
# Range (min … max): 1.673 ms … 326.435 ms ┊ GC (min … max): 0.00% … 2.11%
# Time (median): 1.798 ms ┊ GC (median): 0.00%
# Time (mean ± σ): 2.630 ms ± 16.350 ms ┊ GC (mean ± σ): 0.65% ± 0.10%
# ▂▁▁▃▂▃▄▄▃▃▂▃█▄▇▃▃▅▂▅▆▃▃█▃▄▄▃▁▁▁
# ▃▄▆▇████████████████████████████████▇▇▄▄▃▃▃▃▃▂▃▃▁▂▃▃▂▃▂▂▃▃▂ ▅
# 1.67 ms Histogram: frequency by time 2.02 ms <
# Memory estimate: 19.80 KiB, allocs estimate: 356.
# julia> benchmark_logpdf()
# abs(logpdf_gp_cpu(x_cpu, y_cpu) - logpdf_gp_gpu(x_gpu, y_gpu)) = 0.00013706540744351514
# CPU
# BenchmarkTools.Trial: 156 samples with 1 evaluation.
# Range (min … max): 24.175 ms … 46.282 ms ┊ GC (min … max): 0.00% … 10.24%
# Time (median): 29.421 ms ┊ GC (median): 0.00%
# Time (mean ± σ): 32.225 ms ± 4.875 ms ┊ GC (mean ± σ): 5.79% ± 7.10%
# █▄ ▂▅ ▂ ▁ ▂ ▁ ▁ ▁
# ▅▁▁▅▁▁▁▁▁▁▁██▆██▅▁▁▁▁▁▁▁▅▆▅▁█▆▇█▁▆██▆▇▅▁█▆▁█▆▁▁▁█▁▁▁▁▁▁▁▁▁█ ▅
# 24.2 ms Histogram: log(frequency) by time 44.4 ms <
# Memory estimate: 30.55 MiB, allocs estimate: 15.
# GPU
# BenchmarkTools.Trial: 1657 samples with 1 evaluation.
# Range (min … max): 1.876 ms … 327.119 ms ┊ GC (min … max): 0.00% … 2.06%
# Time (median): 2.026 ms ┊ GC (median): 0.00%
# Time (mean ± σ): 3.012 ms ± 17.778 ms ┊ GC (mean ± σ): 0.67% ± 0.11%
# ▁▁▂▆▅▅▄▆▆▇▄▇▆▄▆▇█▅▅▂
# ▃▆█████████████████████▅▄▃▃▃▃▃▃▂▃▃▂▁▂▁▂▂▁▁▁▂▂▁▂▁▁▁▁▁▁▂▁▂▂▁▂ ▄
# 1.88 ms Histogram: frequency by time 2.54 ms <
# Memory estimate: 22.97 KiB, allocs estimate: 415.
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment