Last active
January 20, 2020 09:16
-
-
Save GiggleLiu/d72b04fd4d2123a4dba0d024e210da6c to your computer and use it in GitHub Desktop.
CUDAnative based einsum! on GPU - the prototype
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
using TupleTools | |
using Base.Cartesian | |
using CuArrays, CUDAnative | |
"""
A naive implementation of `einsum!`.

* `ixs`: index labels for each input tensor,
* `xs`: input tensors,
* `iy`: index labels of the output tensor,
* `y`: output tensor, accumulated into — pass it zero-initialized!
"""
function naive_einsum!(ixs::Tuple, xs, iy::NTuple{NO, Int}, y) where NO
    # split legs into those kept in the output and those summed over
    legs_out = unique(iy)
    legs_in = setdiff(TupleTools.vcat(ixs...), legs_out)
    # collect every leg label alongside its dimension size
    legs_all = [TupleTools.vcat(ixs..., iy)...]
    sizes_all = TupleTools.vcat(size.(xs)..., size(y))
    size_out = Tuple(sizes_all[k] for k in indexin(legs_out, legs_all))
    size_in = Tuple(sizes_all[k] for k in indexin(legs_in, legs_all))
    # cartesian ranges enumerating output (outer) and summed (inner) indices
    outer_ci = CartesianIndices(size_out)
    inner_ci = CartesianIndices(size_in)
    # bind each tensor's legs to positions in the combined (outer..., inner...) index
    legs = (legs_out..., legs_in...)
    locs_xs = Tuple(Tuple(findfirst(isequal(l), legs) for l in ix) for ix in ixs)
    locs_y = Tuple(findfirst(isequal(l), legs_out) for l in iy)
    loop!(locs_xs, xs, locs_y, y, outer_ci, inner_ci)
end
"""take an index subset from `ind`"""
function index_map(ind::CartesianIndex, locs::Tuple)
    # pick out the components of `ind` listed in `locs`, preserving order
    return CartesianIndex(map(l -> Tuple(ind)[l], locs))
end
"""index tensors, and return the product of elements"""
@inline @generated function map_prod(::Type{T}, xs::Tuple, ind::CartesianIndex, locs_xs::NTuple{N}) where {N, T}
    quote
        p = one(T)
        # unrolled at compile time: multiply in the element picked from each of
        # the N input tensors; the last `*=` is the generated block's return value
        @nexprs $N i -> @inbounds p *= xs[i][index_map(ind, locs_xs[i])]
    end
end
"""decide the number of threads and blocks to be launched."""
@inline function cudiv(x::Int)
    # cap threads per block at 256; blocks cover the remainder
    nthreads = min(256, x)
    return nthreads, cld(x, nthreads)
end
"""
loop and accumulate products to y, the GPU version.
## References
* CUDAnative.jl: https://github.com/JuliaGPU/CUDAnative.jl
"""
function loop!(locs_xs::NTuple{N}, xs::NTuple{N, CuArray}, locs_y, y::CuArray{T}, outer_ci::CartesianIndices, inner_ci::CartesianIndices) where {N, T}
    # kernel body: one GPU thread handles exactly one outer (output) index
    function loop_kernel(locs_xs, xs, locs_y, y, outer_ci, inner_ci)
        # 1-based global linear thread id
        i = (blockIdx().x-1) * blockDim().x + threadIdx().x
        # guard: the launch may round up to more threads than outer indices
        i > length(outer_ci) && return nothing
        @inbounds ind_y = outer_ci[i]
        # location in the output tensor for this thread
        iy = index_map(ind_y, locs_y)
        # To avoid race condition (https://mc.stanford.edu/cgi-bin/images/3/34/Darve_cme343_cuda_3.pdf),
        # inner loops (cumulative operations) can not be avoided inside a single CUDA core,
        # which means, reduction of dimensions can be slow.
        # to increase the parallism, we should use a different strategy described in
        # http://people.cs.vt.edu/yongcao/teaching/cs5234/spring2013/slides/Lecture10.pdf
        for ind_x in inner_ci
            # full index over the combined (outer..., inner...) legs
            ind_xy = CartesianIndex(TupleTools.vcat(ind_y.I, ind_x.I))
            @inbounds y[iy] += map_prod(T, xs, ind_xy, locs_xs)
        end
        nothing
    end
    # choose launch geometry so every outer index gets a thread
    X, Y = cudiv(length(outer_ci))
    @cuda threads=X blocks=Y loop_kernel(locs_xs, xs, locs_y, y, outer_ci, inner_ci)
    y
end
"""
loop and accumulate products to y, the CPU version.
"""
function loop!(locs_xs::NTuple{N}, xs::NTuple{N, AbstractArray}, locs_y, y::AbstractArray{T}, outer_ci::CartesianIndices, inner_ci::CartesianIndices) where {N, T}
    # iterate the outer (output) indices directly — the original indexed
    # `outer_ci[i]` with `i::CartesianIndex`, a redundant self-lookup
    @simd for ind_y in outer_ci
        # location in the output tensor
        iy = index_map(ind_y, locs_y)
        for ind_x in inner_ci
            # full index over the combined (outer..., inner...) legs;
            # plain tuple splatting replaces TupleTools.vcat (equivalent for two tuples)
            ind_xy = CartesianIndex((ind_y.I..., ind_x.I...))
            @inbounds y[iy] += map_prod(T, xs, ind_xy, locs_xs)
        end
    end
    return y
end
using Test
# correctness check: contracting (1,2)x(2,3) -> (1,3) is matrix multiplication
a = randn(3, 3) |> CuArray
@test naive_einsum!(((1,2), (2,3)), (a, a), (1,3), zeros(3,3) |> CuArray) ≈ a*a
# benchmarks
using BenchmarkTools
a = randn(Float32, 50, 50)
c = zeros(Float32, 50, 50)
y = zeros(Float32, 50, 50, 50)
# interpolate globals with `$` so BenchmarkTools times the call itself,
# not untyped global-variable lookups (consistent with the 3-tensor case below)
@benchmark naive_einsum!(((1,2), (2,3)), ($a,$a), (1,3), $c) seconds = 1
@benchmark naive_einsum!(((1,2), (1,3), (1,4)), ($a,$a,$a), (2,3,4), $y) seconds = 1
@benchmark $a*$a seconds = 1
# same benchmarks on the GPU
a = a |> CuArray
c = c |> CuArray
y = y |> CuArray
@benchmark naive_einsum!(((1,2), (2,3)), ($a,$a), (1,3), $c) seconds = 1
@benchmark naive_einsum!(((1,2), (1,3), (1,4)), ($a,$a,$a), (2,3,4), $y) seconds = 1
@benchmark $a*$a seconds = 1
Author
GiggleLiu
commented
May 28, 2019
CPU benchmark (Double Precision)
julia> @benchmark naive_einsum!(((1,2), (2,3)), (a,a), (1,3), c) seconds = 1
BenchmarkTools.Trial:
memory estimate: 4.55 KiB
allocs estimate: 61
--------------
minimum time: 325.154 μs (0.00% GC)
median time: 327.675 μs (0.00% GC)
mean time: 333.885 μs (0.00% GC)
maximum time: 431.217 μs (0.00% GC)
--------------
samples: 2980
evals/sample: 1
julia> @benchmark naive_einsum!(((1,2), (1,3), (1,4)), ($a,$a,$a), (2,3,4), $y) seconds = 1
BenchmarkTools.Trial:
memory estimate: 5.17 KiB
allocs estimate: 66
--------------
minimum time: 32.724 ms (0.00% GC)
median time: 33.061 ms (0.00% GC)
mean time: 33.193 ms (0.00% GC)
maximum time: 35.801 ms (0.00% GC)
--------------
samples: 31
evals/sample: 1
julia> @benchmark a*a seconds = 1
BenchmarkTools.Trial:
memory estimate: 19.64 KiB
allocs estimate: 2
--------------
minimum time: 15.668 μs (0.00% GC)
median time: 16.284 μs (0.00% GC)
mean time: 17.773 μs (5.29% GC)
maximum time: 1.735 ms (97.46% GC)
--------------
samples: 10000
evals/sample: 1
GPU benchmark (Double Precision)
julia> @benchmark naive_einsum!(((1,2), (2,3)), (a,a), (1,3), c) seconds = 1
BenchmarkTools.Trial:
memory estimate: 8.00 KiB
allocs estimate: 135
--------------
minimum time: 24.836 μs (0.00% GC)
median time: 28.040 μs (0.00% GC)
mean time: 30.410 μs (3.91% GC)
maximum time: 4.185 ms (98.44% GC)
--------------
samples: 10000
evals/sample: 1
julia> @benchmark naive_einsum!(((1,2), (1,3), (1,4)), ($a,$a,$a), (2,3,4), $y) seconds = 1
BenchmarkTools.Trial:
memory estimate: 9.42 KiB
allocs estimate: 141
--------------
minimum time: 29.886 μs (0.00% GC)
median time: 215.284 μs (0.00% GC)
mean time: 177.240 μs (0.91% GC)
maximum time: 4.749 ms (98.66% GC)
--------------
samples: 5583
evals/sample: 1
julia> @benchmark a*a seconds = 1
BenchmarkTools.Trial:
memory estimate: 592 bytes
allocs estimate: 19
--------------
minimum time: 7.608 μs (0.00% GC)
median time: 10.062 μs (0.00% GC)
mean time: 9.750 μs (0.00% GC)
maximum time: 111.823 μs (0.00% GC)
--------------
samples: 10000
evals/sample: 1
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment