Skip to content

Instantly share code, notes, and snippets.

@Moelf

Moelf/repl.jl Secret

Last active February 28, 2023 19:48
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save Moelf/e75fdb68034f13e070f92302b1faf622 to your computer and use it in GitHub Desktop.
Save Moelf/e75fdb68034f13e070f92302b1faf622 to your computer and use it in GitHub Desktop.
RNTuple Split decoding on Julia with GPU
julia> bytes = rand(UInt8, 2^16);
julia> unpackcpu(bytes) == Array(unpackgpu(CuArray(bytes)))
true
julia> @btime unpackcpu(bytes) setup=(bytes=rand(UInt8, 65536)); # CPU algorithm, cpu array
2.885 μs (2 allocations: 64.11 KiB)
julia> @btime unpackgpu(bytes) setup=(bytes=rand(UInt8, 65536)); # GPU algorithm, cpu array
34.646 μs (2 allocations: 64.11 KiB)
julia> @btime unpackgpu(bytes) setup=(bytes=CUDA.rand(UInt8, 65536)); # GPU algorithm, gpu array
14.076 μs (136 allocations: 6.41 KiB)
julia> @btime unpackgpu2(bytes) setup=(bytes=CUDA.rand(UInt8, 65536)); # GPU algorithm 2 (reshape + permutedims), gpu array
6.072 μs (48 allocations: 2.03 KiB)
using CUDA
"""
    unpackcpu(src::Vector{UInt8}) -> Vector{UInt8}

Interleave the four equal-length byte planes stored back to back in `src`.

With `n = length(src) ÷ 4`, the `i`-th 32-bit word of the result is assembled from
`src[i]`, `src[n + i]`, `src[2n + i]` and `src[3n + i]`: the first byte lands in the
lowest 8 bits and the last one in the highest 8 bits. The words are written through an
`Int32` reinterpretation of a freshly allocated buffer, and that buffer is returned as
bytes. `src` itself is not modified.
"""
function unpackcpu(src::Vector{UInt8})
    nwords = length(src) ÷ 4
    out = similar(src)
    # Same memory as `out`, addressed one 32-bit word at a time.
    words = reinterpret(Int32, out)
    # Every read index is at most 4 * nwords <= length(src), and `words` has exactly
    # `nwords` entries once the reinterpretation above has succeeded.
    @inbounds for w in 1:nwords
        low = Int32(src[w]) | (Int32(src[nwords + w]) << 8)
        high = (Int32(src[2 * nwords + w]) << 16) | (Int32(src[3 * nwords + w]) << 24)
        words[w] = low | high
    end
    return out
end
"""
    unpackgpu(src::AbstractVector{UInt8})

Interleave the four equal-length byte planes stored back to back in `src`, using only
whole-array operations (strided assignment from a view) rather than an element loop.

With `n = length(src) ÷ 4`, plane `k` (for `k` in `1:4`) is `src[(k-1)*n + 1 : k*n]` and
is written to positions `k, k + 4, k + 8, …` of the result. The result is allocated with
`similar(src)`; `src` is not modified.
"""
function unpackgpu(src::AbstractVector{UInt8})
    nwords = length(src) ÷ 4
    out = similar(src)
    for plane in 1:4
        first_byte = (plane - 1) * nwords + 1
        last_byte = plane * nwords
        # A view avoids copying the plane before it is scattered into `out`.
        out[plane:4:end] = view(src, first_byte:last_byte)
    end
    return out
end
"""
    unpackgpu2(src)

Interleave the four equal-length planes stored back to back in `src` by transposing.

`src` is reshaped into a matrix with four columns (one plane per column), the matrix is
transposed with `permutedims`, and the result is flattened back into a vector. Element
`i` of plane `k` therefore ends up at position `4(i-1) + k` of the output.
"""
function unpackgpu2(src)
    planes = reshape(src, :, 4)        # one plane per column
    interleaved = permutedims(planes)  # one output word per column
    return vec(interleaved)
end
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment