kose-y/bitonic-sort.jl

## bitonic-sort.jl
# Code based on: https://discourse.julialang.org/t/gpu-sort-wip-gpu-1000x-faster-than-cpu-i-must-be-doing-something-wrong/20310/4
# bitonic sorts a vector of any length.
using CUDA
function bisort!(shared, j, k)
    tid = UInt(((blockIdx().x - 1) * blockDim().x + threadIdx().x)-1)
    ixj = tid⊻UInt(j)
    if ixj > tid
        if (tid & k) == 0
            if shared[tid+1] > shared[ixj+1]
                tmp = shared[ixj+1]
                shared[ixj+1] = shared[tid+1]
                shared[tid+1] = tmp
            end
        else
            if shared[tid+1] < shared[ixj+1]
                tmp = shared[ixj+1]
                shared[ixj+1] = shared[tid+1]
                shared[tid+1] = tmp
            end
        end
    end
    return
end

function bitonicsort_pow2!(cushared)
    CUDA.@sync begin
        k = UInt(2)
        NUM = length(cushared)
        nblocks = ceil(NUM/256) |> Int
        while (k <= NUM)
            j = div(k, 2)
            while j >= 1
                @cuda threads = 256 blocks = nblocks bisort!(cushared, j, k)
                j =  div(j,2)
            end
            k = UInt(k*2)
        end
    end
end

function bitonicsort!(cushared)
    N = length(cushared)
    K = convert(Int, ceil(log2(N)))
    paddedlen = 1<<K
    cushared2 = similar(cushared, paddedlen)
    fill!(cushared2, Inf)
    cushared2[1:N] .= @view cushared[1:N]
    bitonicsort_pow2!(cushared2)
    cushared .= @view cushared2[1:N]
    cushared
end
	# Code based on: https://discourse.julialang.org/t/gpu-sort-wip-gpu-1000x-faster-than-cpu-i-must-be-doing-something-wrong/20310/4
	# bitonic sorts a vector of any length.
	using CUDA
	function bisort!(shared, j, k)
	tid = UInt(((blockIdx().x - 1) * blockDim().x + threadIdx().x)-1)
	ixj = tid⊻UInt(j)
	if ixj > tid
	if (tid & k) == 0
	if shared[tid+1] > shared[ixj+1]
	tmp = shared[ixj+1]
	shared[ixj+1] = shared[tid+1]
	shared[tid+1] = tmp
	end
	else
	if shared[tid+1] < shared[ixj+1]
	tmp = shared[ixj+1]
	shared[ixj+1] = shared[tid+1]
	shared[tid+1] = tmp
	end
	end
	end
	return
	end

	function bitonicsort_pow2!(cushared)
	CUDA.@sync begin
	k = UInt(2)
	NUM = length(cushared)
	nblocks = ceil(NUM/256) \|> Int
	while (k <= NUM)
	j = div(k, 2)
	while j >= 1
	@cuda threads = 256 blocks = nblocks bisort!(cushared, j, k)
	j = div(j,2)
	end
	k = UInt(k*2)
	end
	end
	end

	function bitonicsort!(cushared)
	N = length(cushared)
	K = convert(Int, ceil(log2(N)))
	paddedlen = 1<<K
	cushared2 = similar(cushared, paddedlen)
	fill!(cushared2, Inf)
	cushared2[1:N] .= @view cushared[1:N]
	bitonicsort_pow2!(cushared2)
	cushared .= @view cushared2[1:N]
	cushared
	end