Skip to content

Instantly share code, notes, and snippets.

@maleadt
Created April 13, 2020 07:19
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save maleadt/08bd84906382ed8b9c57e154356f6a0b to your computer and use it in GitHub Desktop.
Save maleadt/08bd84906382ed8b9c57e154356f6a0b to your computer and use it in GitHub Desktop.
GPU sort using dynamic parallelism (WIP, slow)
using Test
using CUDA
const MAX_DEPTH = 16
const SELECTION_SORT = 32
function selection_sort(data, left, right)
@inbounds for i in left:right
min_val = data[i]
min_idx = i
# find the smallest value in the range [left, right]
for j in i+1:right
val_j = data[j]
if val_j < min_val
min_idx = j
min_val = val_j
end
end
# swap the values
if i != min_idx
data[min_idx] = data[i]
data[i] = min_val
end
end
end
function quicksort(data, left, right, depth)
# If we're too deep or there are few elements left, we use an insertion sort...
if depth >= MAX_DEPTH || right-left <= SELECTION_SORT
selection_sort(data, left, right)
return
end
lptr = left
rptr = right
pivot = @inbounds data[(left+right)÷2]
# do the partitioning
@inbounds while lptr <= rptr
# find the next left- and right-hand values to swap
lval = data[lptr]
rval = data[rptr]
# move the left pointer as long as the pointed element is smaller than the pivot
while lval < pivot && lptr < right
lptr += 1
lval = data[lptr]
end
# move the right pointer as long as the pointed element is larger than the pivot
while rval > pivot && rptr > left
rptr -= 1
rval = data[rptr]
end
# if the swap points are valid, do the swap!
if lptr <= rptr
data[lptr] = rval
data[rptr] = lval
lptr += 1
rptr -= 1
end
end
# launch a new block to sort the left part
if left < rptr
s = CuDeviceStream()
@cuda dynamic=true stream=s quicksort(data, left, rptr, depth+1)
CUDAnative.unsafe_destroy!(s)
end
# launch a new block to sort the right part
if lptr < right
s = CuDeviceStream()
@cuda dynamic=true stream=s quicksort(data, lptr, right, depth+1)
CUDAnative.unsafe_destroy!(s)
end
return
end
function Base.sort!(a::CuArray)
limit!(CUDAdrv.LIMIT_DEV_RUNTIME_SYNC_DEPTH, MAX_DEPTH)
left = 1
right = length(a)
@cuda quicksort(a, left, right, 0)
return a
end
using Test
using BenchmarkTools
function main(N=1000)
a = rand(N)
@btime sort!(a) setup=(a = copy($a))
b = CuArray(a)
@test sort!(copy(a)) == Array(sort!(copy(b)))
CUDA.@profile for i in 1:10
sort!(copy(b))
end
@btime CuArrays.@sync(sort!(b)) setup=(b = copy($b))
@btime begin
x = Array(b)
sort!(b)
copyto!(x, b)
end setup=(b = copy($b))
return
end
isinteractive() || main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment