CUDAnative-based einsum! on GPU - the prototype
using TupleTools
using Base.Cartesian
using CuArrays, CUDAnative
""" | |
A naive implementation of `einsum!` | |
* `ixs`: input tensor indices, | |
* `xs`: input tensors, | |
* `iy`: output tensor indices, | |
* `y`: accumulated tensor, notice it is initialized to 0 as output! | |
""" | |
function naive_einsum!(ixs::Tuple, xs, iy::NTuple{NO, Int}, y) where NO
    # outer legs and inner legs
    outer_indices = unique(iy)
    inner_indices = setdiff(TupleTools.vcat(ixs...), outer_indices)
    # find the size of each leg
    all_indices = TupleTools.vcat(ixs..., iy)
    all_sizes = TupleTools.vcat(size.(xs)..., size(y))
    outer_sizes = Tuple(all_sizes[i] for i in indexin(outer_indices, [all_indices...]))
    inner_sizes = Tuple(all_sizes[i] for i in indexin(inner_indices, [all_indices...]))
    # cartesian indices for the outer and inner legs
    outer_ci = CartesianIndices(outer_sizes)
    inner_ci = CartesianIndices(inner_sizes)
    # for indexing the tensors (leg binding)
    indices = (outer_indices..., inner_indices...)
    locs_xs = Tuple(Tuple(findfirst(isequal(i), indices) for i in ix) for ix in ixs)
    locs_y = Tuple(findfirst(isequal(i), outer_indices) for i in iy)
    loop!(locs_xs, xs, locs_y, y, outer_ci, inner_ci)
end
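
# Worked example (illustration): for matrix multiplication, ixs = ((1, 2), (2, 3))
# and iy = (1, 3), so outer_indices = [1, 3] and inner_indices = [2]: `outer_ci`
# ranges over (i, k) and `inner_ci` over (j,). The leg bindings come out as
# locs_xs = ((1, 3), (3, 2)) and locs_y = (1, 2), so `loop!` accumulates
# y[i, k] += xs[1][i, j] * xs[2][j, k].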
"""take an index subset from `ind`""" | |
index_map(ind::CartesianIndex, locs::Tuple) = CartesianIndex(TupleTools.getindices(Tuple(ind), locs)) | |
"""indiex tensors, and return the product of elements""" | |
@inline @generated function map_prod(::Type{T}, xs::Tuple, ind::CartesianIndex, locs_xs::NTuple{N}) where {N, T} | |
quote | |
p = one(T) | |
@nexprs $N i -> @inbounds p *= xs[i][index_map(ind, locs_xs[i])] | |
end | |
end | |
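
# For N = 2, the generated body above unrolls to (roughly):
#     p = one(T)
#     @inbounds p *= xs[1][index_map(ind, locs_xs[1])]
#     @inbounds p *= xs[2][index_map(ind, locs_xs[2])]
#     p
# i.e. the product of one element from each input tensor at the bound indices.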
"""decide the number of threads and blocks to be launched.""" | |
@inline function cudiv(x::Int) | |
max_threads = 256 | |
threads_x = min(max_threads, x) | |
threads_x, ceil(Int, x/threads_x) | |
end | |
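
# A quick sanity check of the launch configuration (illustrative):
@assert cudiv(100) == (100, 1)    # a single block of 100 threads
@assert cudiv(1000) == (256, 4)   # four 256-thread blocks cover 1000 outer indices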
""" | |
loop and accumulate products to y, the GPU version. | |
## References | |
* CUDAnative.jl: https://github.com/JuliaGPU/CUDAnative.jl | |
""" | |
function loop!(locs_xs::NTuple{N}, xs::NTuple{N, CuArray}, locs_y, y::CuArray{T}, outer_ci::CartesianIndices, inner_ci::CartesianIndices) where {N, T} | |
function loop_kernel(locs_xs, xs, locs_y, y, outer_ci, inner_ci) | |
i = (blockIdx().x-1) * blockDim().x + threadIdx().x | |
i > length(outer_ci) && return nothing | |
@inbounds ind_y = outer_ci[i] | |
iy = index_map(ind_y, locs_y) | |
# To avoid race condition (https://mc.stanford.edu/cgi-bin/images/3/34/Darve_cme343_cuda_3.pdf), | |
# inner loops (cumulative operations) can not be avoided inside a single CUDA core, | |
# which means, reduction of dimensions can be slow. | |
# to increase the parallism, we should use a different strategy described in | |
# http://people.cs.vt.edu/yongcao/teaching/cs5234/spring2013/slides/Lecture10.pdf | |
for ind_x in inner_ci | |
ind_xy = CartesianIndex(TupleTools.vcat(ind_y.I, ind_x.I)) | |
@inbounds y[iy] += map_prod(T, xs, ind_xy, locs_xs) | |
end | |
nothing | |
end | |
X, Y = cudiv(length(outer_ci)) | |
@cuda threads=X blocks=Y loop_kernel(locs_xs, xs, locs_y, y, outer_ci, inner_ci) | |
y | |
end | |
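
# A minimal sketch of the tree-reduction strategy referenced in the comment
# above, assuming the CUDAnative-era API (`@cuStaticSharedMem`, `sync_threads`);
# `reduce_kernel` is a hypothetical name, not part of this prototype. Each
# 256-thread block loads its chunk of `x` into shared memory and halves the
# number of active threads per step, writing one partial sum per block to `out`.
function reduce_kernel(out, x)
    tid = threadIdx().x
    i = (blockIdx().x - 1) * blockDim().x + tid
    shared = @cuStaticSharedMem(Float32, 256)
    @inbounds shared[tid] = i <= length(x) ? x[i] : 0f0
    sync_threads()
    s = blockDim().x ÷ 2
    while s > 0
        if tid <= s
            @inbounds shared[tid] += shared[tid + s]
        end
        sync_threads()
        s ÷= 2
    end
    if tid == 1
        @inbounds out[blockIdx().x] = shared[1]
    end
    return nothing
end
# launched e.g. as: @cuda threads=256 blocks=cld(length(x), 256) reduce_kernel(out, x)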
""" | |
loop and accumulate products to y, the GPU version, the CPU version. | |
""" | |
function loop!(locs_xs::NTuple{N}, xs::NTuple{N, AbstractArray}, locs_y, y::AbstractArray{T}, outer_ci::CartesianIndices, inner_ci::CartesianIndices) where {N, T} | |
@simd for i in outer_ci | |
@inbounds ind_y = outer_ci[i] | |
iy = index_map(ind_y, locs_y) | |
for ind_x in inner_ci | |
ind_xy = CartesianIndex(TupleTools.vcat(ind_y.I, ind_x.I)) | |
@inbounds y[iy] += map_prod(T, xs, ind_xy, locs_xs) | |
end | |
end | |
y | |
end | |
using Test
a = randn(3, 3) |> CuArray
@test naive_einsum!(((1,2), (2,3)), (a, a), (1,3), zeros(3,3) |> CuArray) ≈ a*a
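
# A couple more contractions to illustrate the index convention (a sketch; these
# run through the CPU method of `loop!`):
b = randn(4, 4)
# permutation: y[j, i] = x[i, j]
@test naive_einsum!(((1, 2),), (b,), (2, 1), zeros(4, 4)) ≈ transpose(b)
# reduction over the second leg: y[i] = Σⱼ x[i, j]
@test naive_einsum!(((1, 2),), (b,), (1,), zeros(4)) ≈ vec(sum(b, dims=2))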
# benchmarks
using BenchmarkTools
a = randn(Float32, 50, 50)
c = zeros(Float32, 50, 50)
y = zeros(Float32, 50, 50, 50)
@benchmark naive_einsum!(((1,2), (2,3)), (a,a), (1,3), c) seconds = 1
@benchmark naive_einsum!(((1,2), (1,3), (1,4)), ($a,$a,$a), (2,3,4), $y) seconds = 1
@benchmark a*a seconds = 1
a = a |> CuArray
c = c |> CuArray
y = y |> CuArray
@benchmark naive_einsum!(((1,2), (2,3)), (a,a), (1,3), c) seconds = 1
@benchmark naive_einsum!(((1,2), (1,3), (1,4)), ($a,$a,$a), (2,3,4), $y) seconds = 1
@benchmark a*a seconds = 1
CPU benchmark (Single Precision)
julia> @benchmark naive_einsum!(((1,2), (2,3)), (a,a), (1,3), c) seconds = 1
BenchmarkTools.Trial:
memory estimate: 5.19 KiB
allocs estimate: 76
--------------
minimum time: 329.733 μs (0.00% GC)
median time: 333.052 μs (0.00% GC)
mean time: 337.718 μs (0.00% GC)
maximum time: 1.127 ms (0.00% GC)
--------------
samples: 2947
evals/sample: 1
julia> @benchmark naive_einsum!(((1,2), (1,3), (1,4)), ($a,$a,$a), (2,3,4), $y) seconds = 1
BenchmarkTools.Trial:
memory estimate: 6.16 KiB
allocs estimate: 87
--------------
minimum time: 32.770 ms (0.00% GC)
median time: 33.418 ms (0.00% GC)
mean time: 33.526 ms (0.00% GC)
maximum time: 36.492 ms (0.00% GC)
--------------
samples: 30
evals/sample: 1
julia> @benchmark a*a seconds = 1
BenchmarkTools.Trial:
memory estimate: 9.94 KiB
allocs estimate: 1
--------------
minimum time: 9.463 μs (0.00% GC)
median time: 9.724 μs (0.00% GC)
mean time: 10.295 μs (0.00% GC)
maximum time: 571.920 μs (0.00% GC)
--------------
samples: 10000
evals/sample: 1
GPU benchmark (Single Precision)
julia> @benchmark naive_einsum!(((1,2), (2,3)), (a,a), (1,3), c) seconds = 1
BenchmarkTools.Trial:
memory estimate: 8.64 KiB
allocs estimate: 150
--------------
minimum time: 29.382 μs (0.00% GC)
median time: 33.277 μs (0.00% GC)
mean time: 44.514 μs (23.13% GC)
maximum time: 78.504 ms (99.84% GC)
--------------
samples: 10000
evals/sample: 1
julia> @benchmark naive_einsum!(((1,2), (1,3), (1,4)), ($a,$a,$a), (2,3,4), $y) seconds = 1
BenchmarkTools.Trial:
memory estimate: 10.41 KiB
allocs estimate: 162
--------------
minimum time: 36.074 μs (0.00% GC)
median time: 193.258 μs (0.00% GC)
mean time: 163.580 μs (9.80% GC)
maximum time: 79.719 ms (99.85% GC)
--------------
samples: 6043
evals/sample: 1
julia> @benchmark a*a seconds = 1
BenchmarkTools.Trial:
memory estimate: 592 bytes
allocs estimate: 19
--------------
minimum time: 7.418 μs (0.00% GC)
median time: 10.534 μs (0.00% GC)
mean time: 11.524 μs (0.00% GC)
maximum time: 91.886 μs (0.00% GC)
--------------
samples: 10000
evals/sample: 1
CPU/GPU information
julia> versioninfo()
Julia Version 1.1.0
Commit 80516ca202 (2019-01-21 21:24 UTC)
Platform Info:
OS: Linux (x86_64-pc-linux-gnu)
CPU: Intel(R) Xeon(R) CPU E5-2650 v4 @ 2.20GHz
WORD_SIZE: 64
LIBM: libopenlibm
LLVM: libLLVM-6.0.1 (ORCJIT, broadwell)
julia> LinearAlgebra.BLAS.vendor()
:openblas64
CPU benchmark (Double Precision)
julia> @benchmark naive_einsum!(((1,2), (2,3)), (a,a), (1,3), c) seconds = 1
BenchmarkTools.Trial:
memory estimate: 4.55 KiB
allocs estimate: 61
--------------
minimum time: 325.154 μs (0.00% GC)
median time: 327.675 μs (0.00% GC)
mean time: 333.885 μs (0.00% GC)
maximum time: 431.217 μs (0.00% GC)
--------------
samples: 2980
evals/sample: 1
julia> @benchmark naive_einsum!(((1,2), (1,3), (1,4)), ($a,$a,$a), (2,3,4), $y) seconds = 1
BenchmarkTools.Trial:
memory estimate: 5.17 KiB
allocs estimate: 66
--------------
minimum time: 32.724 ms (0.00% GC)
median time: 33.061 ms (0.00% GC)
mean time: 33.193 ms (0.00% GC)
maximum time: 35.801 ms (0.00% GC)
--------------
samples: 31
evals/sample: 1
julia> @benchmark a*a seconds = 1
BenchmarkTools.Trial:
memory estimate: 19.64 KiB
allocs estimate: 2
--------------
minimum time: 15.668 μs (0.00% GC)
median time: 16.284 μs (0.00% GC)
mean time: 17.773 μs (5.29% GC)
maximum time: 1.735 ms (97.46% GC)
--------------
samples: 10000
evals/sample: 1
GPU benchmark (Double Precision)
julia> @benchmark naive_einsum!(((1,2), (2,3)), (a,a), (1,3), c) seconds = 1
BenchmarkTools.Trial:
memory estimate: 8.00 KiB
allocs estimate: 135
--------------
minimum time: 24.836 μs (0.00% GC)
median time: 28.040 μs (0.00% GC)
mean time: 30.410 μs (3.91% GC)
maximum time: 4.185 ms (98.44% GC)
--------------
samples: 10000
evals/sample: 1
julia> @benchmark naive_einsum!(((1,2), (1,3), (1,4)), ($a,$a,$a), (2,3,4), $y) seconds = 1
BenchmarkTools.Trial:
memory estimate: 9.42 KiB
allocs estimate: 141
--------------
minimum time: 29.886 μs (0.00% GC)
median time: 215.284 μs (0.00% GC)
mean time: 177.240 μs (0.91% GC)
maximum time: 4.749 ms (98.66% GC)
--------------
samples: 5583
evals/sample: 1
julia> @benchmark a*a seconds = 1
BenchmarkTools.Trial:
memory estimate: 592 bytes
allocs estimate: 19
--------------
minimum time: 7.608 μs (0.00% GC)
median time: 10.062 μs (0.00% GC)
mean time: 9.750 μs (0.00% GC)
maximum time: 111.823 μs (0.00% GC)
--------------
samples: 10000
evals/sample: 1
For those who want to optimize this code
Some performance tips:
https://docs.nvidia.com/cuda/cuda-c-best-practices-guide/index.html
https://juliagpu.gitlab.io/CUDAnative.jl/man/performance/
Tooling
- the CUDA profiler nvprof
- the CUDA memory checker cuda-memcheck
- the macros in CUDAnative (a sketch follows)
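
As a sketch of the last point, the reflection macros can dump the device code generated for a launch (assuming the CUDAnative-era API; `@device_code_ptx` is one such macro, and `add_one!` is a made-up toy kernel):

using CUDAnative, CuArrays

function add_one!(x)
    i = (blockIdx().x - 1) * blockDim().x + threadIdx().x
    i <= length(x) && (@inbounds x[i] += 1f0)
    return nothing
end

x = CuArrays.zeros(Float32, 256)
@device_code_ptx @cuda threads=256 add_one!(x)   # print the PTX for this launch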