@GiggleLiu · Last active January 20, 2020
CUDAnative based einsum! on GPU - the prototype
using TupleTools
using Base.Cartesian
using CuArrays, CUDAnative
"""
A naive implementation of `einsum!`
* `ixs`: input tensor indices,
* `xs`: input tensors,
* `iy`: output tensor indices,
* `y`: output tensor; note that it must be initialized to zero, since products are accumulated into it!
"""
function naive_einsum!(ixs::Tuple, xs, iy::NTuple{NO, Int}, y) where NO
    # outer legs and inner legs
    outer_indices = unique(iy)
    inner_indices = setdiff(TupleTools.vcat(ixs...), outer_indices)
    # find the size of each leg
    all_indices = TupleTools.vcat(ixs..., iy)
    all_sizes = TupleTools.vcat(size.(xs)..., size(y))
    outer_sizes = Tuple(all_sizes[i] for i in indexin(outer_indices, [all_indices...]))
    inner_sizes = Tuple(all_sizes[i] for i in indexin(inner_indices, [all_indices...]))
    # Cartesian indices for the outer and inner legs
    outer_ci = CartesianIndices(outer_sizes)
    inner_ci = CartesianIndices(inner_sizes)
    # for indexing tensors (leg binding)
    indices = (outer_indices..., inner_indices...)
    locs_xs = Tuple(Tuple(findfirst(isequal(i), indices) for i in ix) for ix in ixs)
    locs_y = Tuple(findfirst(isequal(i), outer_indices) for i in iy)
    loop!(locs_xs, xs, locs_y, y, outer_ci, inner_ci)
end
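# A hedged walk-through of the bookkeeping above, for the matrix-multiplication
# case ixs = ((1,2), (2,3)), iy = (1,3), i.e. y[i,k] = sum_j x1[i,j] * x2[j,k]:
#   outer_indices == [1, 3]            # legs that survive in the output
#   inner_indices == [2]               # the contracted leg
#   indices       == (1, 3, 2)         # joint ordering (outer legs first)
#   locs_xs       == ((1, 3), (3, 2))  # where each tensor's legs sit in `indices`
#   locs_y        == (1, 2)            # where the output legs sit in `outer_indices`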
"""take an index subset from `ind`"""
index_map(ind::CartesianIndex, locs::Tuple) = CartesianIndex(TupleTools.getindices(Tuple(ind), locs))
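# For example, restricting a joint index to a subset of its legs (a sketch):
#   index_map(CartesianIndex(2, 5, 7), (1, 3)) == CartesianIndex((2, 7))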
"""indiex tensors, and return the product of elements"""
@inline @generated function map_prod(::Type{T}, xs::Tuple, ind::CartesianIndex, locs_xs::NTuple{N}) where {N, T}
quote
p = one(T)
@nexprs $N i -> @inbounds p *= xs[i][index_map(ind, locs_xs[i])]
end
end
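# For N = 2, the generated body unrolls (roughly) to
#   p = one(T)
#   @inbounds p *= xs[1][index_map(ind, locs_xs[1])]
#   @inbounds p *= xs[2][index_map(ind, locs_xs[2])]
#   p
# so no loop or heap allocation is needed inside the kernel.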
"""decide the number of threads and blocks to be launched."""
@inline function cudiv(x::Int)
max_threads = 256
threads_x = min(max_threads, x)
threads_x, ceil(Int, x/threads_x)
end
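# e.g. cudiv(2500) == (256, 10): 256 threads per block and 10 blocks cover
# 2560 >= 2500 indices; the surplus threads are masked out by the bounds
# check at the top of the kernel below.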
"""
Loop and accumulate products into `y`; the GPU version.
## References
* CUDAnative.jl: https://github.com/JuliaGPU/CUDAnative.jl
"""
function loop!(locs_xs::NTuple{N}, xs::NTuple{N, CuArray}, locs_y, y::CuArray{T}, outer_ci::CartesianIndices, inner_ci::CartesianIndices) where {N, T}
    function loop_kernel(locs_xs, xs, locs_y, y, outer_ci, inner_ci)
        i = (blockIdx().x-1) * blockDim().x + threadIdx().x
        i > length(outer_ci) && return nothing
        @inbounds ind_y = outer_ci[i]
        iy = index_map(ind_y, locs_y)
        # To avoid a race condition (https://mc.stanford.edu/cgi-bin/images/3/34/Darve_cme343_cuda_3.pdf),
        # the inner loop (the cumulative reduction) runs serially inside a single
        # CUDA thread, which means reduction over large dimensions can be slow.
        # To increase the parallelism, one should use a different strategy, described in
        # http://people.cs.vt.edu/yongcao/teaching/cs5234/spring2013/slides/Lecture10.pdf
        for ind_x in inner_ci
            ind_xy = CartesianIndex(TupleTools.vcat(ind_y.I, ind_x.I))
            @inbounds y[iy] += map_prod(T, xs, ind_xy, locs_xs)
        end
        nothing
    end
    threads, blocks = cudiv(length(outer_ci))
    @cuda threads=threads blocks=blocks loop_kernel(locs_xs, xs, locs_y, y, outer_ci, inner_ci)
    y
end
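# Note that each thread owns exactly one output element (outer_ci enumerates
# distinct output positions), so no atomic operations are needed here; the
# race-condition concern above applies only to parallelizing the inner reduction.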
"""
loop and accumulate products to y, the GPU version, the CPU version.
"""
function loop!(locs_xs::NTuple{N}, xs::NTuple{N, AbstractArray}, locs_y, y::AbstractArray{T}, outer_ci::CartesianIndices, inner_ci::CartesianIndices) where {N, T}
    @simd for i in 1:length(outer_ci)
        @inbounds ind_y = outer_ci[i]
        iy = index_map(ind_y, locs_y)
        for ind_x in inner_ci
            ind_xy = CartesianIndex(TupleTools.vcat(ind_y.I, ind_x.I))
            @inbounds y[iy] += map_prod(T, xs, ind_xy, locs_xs)
        end
    end
    y
end
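# For the matmul example, this loop! is equivalent to the textbook triple
# loop (a sketch, with y zero-initialized):
#   for i in 1:size(y,1), k in 1:size(y,2)  # outer (output) legs
#       for j in 1:size(a,2)                # inner (contracted) leg
#           y[i,k] += a[i,j] * b[j,k]
#       end
#   end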
using Test
a = randn(3, 3) |> CuArray
@test naive_einsum!(((1,2), (2,3)), (a, a), (1,3), zeros(3,3) |> CuArray) ≈ a*a
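# A further cross-check one might add, comparing against a CPU reference
# comprehension for the three-tensor "star" contraction
# y[j,k,l] = sum_i b[i,j] * b[i,k] * b[i,l] (a sketch, not in the original gist):
b = randn(3, 3)
y_ref = [sum(b[i,j] * b[i,k] * b[i,l] for i in 1:3) for j in 1:3, k in 1:3, l in 1:3]
bc = CuArray(b)
@test naive_einsum!(((1,2), (1,3), (1,4)), (bc, bc, bc), (2,3,4), CuArray(zeros(3,3,3))) ≈ CuArray(y_ref)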
# benchmarks
using BenchmarkTools
a = randn(Float32, 50, 50)
c = zeros(Float32, 50, 50)
y = zeros(Float32, 50, 50, 50)
@benchmark naive_einsum!(((1,2), (2,3)), (a,a), (1,3), c) seconds = 1
@benchmark naive_einsum!(((1,2), (1,3), (1,4)), ($a,$a,$a), (2,3,4), $y) seconds = 1
@benchmark a*a seconds = 1
a = a |> CuArray
c = c |> CuArray
y = y |> CuArray
@benchmark naive_einsum!(((1,2), (2,3)), (a,a), (1,3), c) seconds = 1
@benchmark naive_einsum!(((1,2), (1,3), (1,4)), ($a,$a,$a), (2,3,4), $y) seconds = 1
@benchmark a*a seconds = 1
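# Note: CUDA kernel launches are asynchronous, so the GPU timings may partly
# measure launch overhead rather than execution time. A hedged sketch, assuming
# the installed CuArrays version provides the CuArrays.@sync macro:
# @benchmark CuArrays.@sync(naive_einsum!(((1,2), (2,3)), ($a,$a), (1,3), $c)) seconds = 1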
@GiggleLiu (Author)
For anyone who wants to optimize this code, here are some performance tips:

  • https://docs.nvidia.com/cuda/cuda-c-best-practices-guide/index.html
  • https://juliagpu.gitlab.io/CUDAnative.jl/man/performance/

Tooling: the CUDA profiler nvprof, the CUDA memory checker cuda-memcheck, and the introspection macros in CUDAnative.
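As an illustration (a sketch, assuming the gist's variables are still in scope), CUDAnative's introspection macros can dump the device code generated for a kernel launch:

julia> @device_code_ptx naive_einsum!(((1,2), (2,3)), (a, a), (1,3), c)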

@GiggleLiu (Author)
CPU benchmark (Single Precision)

julia> @benchmark naive_einsum!(((1,2), (2,3)), (a,a), (1,3), c) seconds = 1
BenchmarkTools.Trial: 
  memory estimate:  5.19 KiB
  allocs estimate:  76
  --------------
  minimum time:     329.733 μs (0.00% GC)
  median time:      333.052 μs (0.00% GC)
  mean time:        337.718 μs (0.00% GC)
  maximum time:     1.127 ms (0.00% GC)
  --------------
  samples:          2947
  evals/sample:     1

julia> @benchmark naive_einsum!(((1,2), (1,3), (1,4)), ($a,$a,$a), (2,3,4), $y) seconds = 1

BenchmarkTools.Trial: 
  memory estimate:  6.16 KiB
  allocs estimate:  87
  --------------
  minimum time:     32.770 ms (0.00% GC)
  median time:      33.418 ms (0.00% GC)
  mean time:        33.526 ms (0.00% GC)
  maximum time:     36.492 ms (0.00% GC)
  --------------
  samples:          30
  evals/sample:     1

julia> @benchmark a*a seconds = 1
BenchmarkTools.Trial: 
  memory estimate:  9.94 KiB
  allocs estimate:  1
  --------------
  minimum time:     9.463 μs (0.00% GC)
  median time:      9.724 μs (0.00% GC)
  mean time:        10.295 μs (0.00% GC)
  maximum time:     571.920 μs (0.00% GC)
  --------------
  samples:          10000
  evals/sample:     1

GPU benchmark (Single Precision)

julia> @benchmark naive_einsum!(((1,2), (2,3)), (a,a), (1,3), c) seconds = 1
BenchmarkTools.Trial: 
  memory estimate:  8.64 KiB
  allocs estimate:  150
  --------------
  minimum time:     29.382 μs (0.00% GC)
  median time:      33.277 μs (0.00% GC)
  mean time:        44.514 μs (23.13% GC)
  maximum time:     78.504 ms (99.84% GC)
  --------------
  samples:          10000
  evals/sample:     1

julia> @benchmark naive_einsum!(((1,2), (1,3), (1,4)), ($a,$a,$a), (2,3,4), $y) seconds = 1

BenchmarkTools.Trial: 
  memory estimate:  10.41 KiB
  allocs estimate:  162
  --------------
  minimum time:     36.074 μs (0.00% GC)
  median time:      193.258 μs (0.00% GC)
  mean time:        163.580 μs (9.80% GC)
  maximum time:     79.719 ms (99.85% GC)
  --------------
  samples:          6043
  evals/sample:     1

julia> @benchmark a*a seconds = 1
BenchmarkTools.Trial: 
  memory estimate:  592 bytes
  allocs estimate:  19
  --------------
  minimum time:     7.418 μs (0.00% GC)
  median time:      10.534 μs (0.00% GC)
  mean time:        11.524 μs (0.00% GC)
  maximum time:     91.886 μs (0.00% GC)
  --------------
  samples:          10000
  evals/sample:     1

@GiggleLiu (Author)
CPU/GPU information
julia> versioninfo()
Julia Version 1.1.0
Commit 80516ca202 (2019-01-21 21:24 UTC)
Platform Info:
  OS: Linux (x86_64-pc-linux-gnu)
  CPU: Intel(R) Xeon(R) CPU E5-2650 v4 @ 2.20GHz
  WORD_SIZE: 64
  LIBM: libopenlibm
  LLVM: libLLVM-6.0.1 (ORCJIT, broadwell)

julia> LinearAlgebra.BLAS.vendor()
:openblas64

@GiggleLiu (Author)
CPU benchmark (Double Precision)

julia> @benchmark naive_einsum!(((1,2), (2,3)), (a,a), (1,3), c) seconds = 1
BenchmarkTools.Trial: 
  memory estimate:  4.55 KiB
  allocs estimate:  61
  --------------
  minimum time:     325.154 μs (0.00% GC)
  median time:      327.675 μs (0.00% GC)
  mean time:        333.885 μs (0.00% GC)
  maximum time:     431.217 μs (0.00% GC)
  --------------
  samples:          2980
  evals/sample:     1

julia> @benchmark naive_einsum!(((1,2), (1,3), (1,4)), ($a,$a,$a), (2,3,4), $y) seconds = 1

BenchmarkTools.Trial: 
  memory estimate:  5.17 KiB
  allocs estimate:  66
  --------------
  minimum time:     32.724 ms (0.00% GC)
  median time:      33.061 ms (0.00% GC)
  mean time:        33.193 ms (0.00% GC)
  maximum time:     35.801 ms (0.00% GC)
  --------------
  samples:          31
  evals/sample:     1

julia> @benchmark a*a seconds = 1
BenchmarkTools.Trial: 
  memory estimate:  19.64 KiB
  allocs estimate:  2
  --------------
  minimum time:     15.668 μs (0.00% GC)
  median time:      16.284 μs (0.00% GC)
  mean time:        17.773 μs (5.29% GC)
  maximum time:     1.735 ms (97.46% GC)
  --------------
  samples:          10000
  evals/sample:     1

GPU benchmark (Double Precision)

julia> @benchmark naive_einsum!(((1,2), (2,3)), (a,a), (1,3), c) seconds = 1
BenchmarkTools.Trial: 
  memory estimate:  8.00 KiB
  allocs estimate:  135
  --------------
  minimum time:     24.836 μs (0.00% GC)
  median time:      28.040 μs (0.00% GC)
  mean time:        30.410 μs (3.91% GC)
  maximum time:     4.185 ms (98.44% GC)
  --------------
  samples:          10000
  evals/sample:     1

julia> @benchmark naive_einsum!(((1,2), (1,3), (1,4)), ($a,$a,$a), (2,3,4), $y) seconds = 1

BenchmarkTools.Trial: 
  memory estimate:  9.42 KiB
  allocs estimate:  141
  --------------
  minimum time:     29.886 μs (0.00% GC)
  median time:      215.284 μs (0.00% GC)
  mean time:        177.240 μs (0.91% GC)
  maximum time:     4.749 ms (98.66% GC)
  --------------
  samples:          5583
  evals/sample:     1

julia> @benchmark a*a seconds = 1
BenchmarkTools.Trial: 
  memory estimate:  592 bytes
  allocs estimate:  19
  --------------
  minimum time:     7.608 μs (0.00% GC)
  median time:      10.062 μs (0.00% GC)
  mean time:        9.750 μs (0.00% GC)
  maximum time:     111.823 μs (0.00% GC)
  --------------
  samples:          10000
  evals/sample:     1
