Skip to content

Instantly share code, notes, and snippets.

@kose-y
Created August 7, 2020 01:46
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save kose-y/afe9f14fb4075ce276cb752d6391928b to your computer and use it in GitHub Desktop.
Save kose-y/afe9f14fb4075ce276cb752d6391928b to your computer and use it in GitHub Desktop.
CUDA bitonic sort with Julia
# Code based on: https://discourse.julialang.org/t/gpu-sort-wip-gpu-1000x-faster-than-cpu-i-must-be-doing-something-wrong/20310/4
# bitonic sorts a vector of any length.
using CUDA
function bisort!(shared, j, k)
tid = UInt(((blockIdx().x - 1) * blockDim().x + threadIdx().x)-1)
ixj = tid⊻UInt(j)
if ixj > tid
if (tid & k) == 0
if shared[tid+1] > shared[ixj+1]
tmp = shared[ixj+1]
shared[ixj+1] = shared[tid+1]
shared[tid+1] = tmp
end
else
if shared[tid+1] < shared[ixj+1]
tmp = shared[ixj+1]
shared[ixj+1] = shared[tid+1]
shared[tid+1] = tmp
end
end
end
return
end
function bitonicsort_pow2!(cushared)
CUDA.@sync begin
k = UInt(2)
NUM = length(cushared)
nblocks = ceil(NUM/256) |> Int
while (k <= NUM)
j = div(k, 2)
while j >= 1
@cuda threads = 256 blocks = nblocks bisort!(cushared, j, k)
j = div(j,2)
end
k = UInt(k*2)
end
end
end
function bitonicsort!(cushared)
N = length(cushared)
K = convert(Int, ceil(log2(N)))
paddedlen = 1<<K
cushared2 = similar(cushared, paddedlen)
fill!(cushared2, Inf)
cushared2[1:N] .= @view cushared[1:N]
bitonicsort_pow2!(cushared2)
cushared .= @view cushared2[1:N]
cushared
end
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment