Created
April 13, 2020 07:19
-
-
Save maleadt/08bd84906382ed8b9c57e154356f6a0b to your computer and use it in GitHub Desktop.
GPU sort using dynamic parallelism (WIP, slow)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
using Test | |
using CUDA | |
const MAX_DEPTH = 16 | |
const SELECTION_SORT = 32 | |
function selection_sort(data, left, right) | |
@inbounds for i in left:right | |
min_val = data[i] | |
min_idx = i | |
# find the smallest value in the range [left, right] | |
for j in i+1:right | |
val_j = data[j] | |
if val_j < min_val | |
min_idx = j | |
min_val = val_j | |
end | |
end | |
# swap the values | |
if i != min_idx | |
data[min_idx] = data[i] | |
data[i] = min_val | |
end | |
end | |
end | |
function quicksort(data, left, right, depth) | |
# If we're too deep or there are few elements left, we use an insertion sort... | |
if depth >= MAX_DEPTH || right-left <= SELECTION_SORT | |
selection_sort(data, left, right) | |
return | |
end | |
lptr = left | |
rptr = right | |
pivot = @inbounds data[(left+right)÷2] | |
# do the partitioning | |
@inbounds while lptr <= rptr | |
# find the next left- and right-hand values to swap | |
lval = data[lptr] | |
rval = data[rptr] | |
# move the left pointer as long as the pointed element is smaller than the pivot | |
while lval < pivot && lptr < right | |
lptr += 1 | |
lval = data[lptr] | |
end | |
# move the right pointer as long as the pointed element is larger than the pivot | |
while rval > pivot && rptr > left | |
rptr -= 1 | |
rval = data[rptr] | |
end | |
# if the swap points are valid, do the swap! | |
if lptr <= rptr | |
data[lptr] = rval | |
data[rptr] = lval | |
lptr += 1 | |
rptr -= 1 | |
end | |
end | |
# launch a new block to sort the left part | |
if left < rptr | |
s = CuDeviceStream() | |
@cuda dynamic=true stream=s quicksort(data, left, rptr, depth+1) | |
CUDAnative.unsafe_destroy!(s) | |
end | |
# launch a new block to sort the right part | |
if lptr < right | |
s = CuDeviceStream() | |
@cuda dynamic=true stream=s quicksort(data, lptr, right, depth+1) | |
CUDAnative.unsafe_destroy!(s) | |
end | |
return | |
end | |
function Base.sort!(a::CuArray) | |
limit!(CUDAdrv.LIMIT_DEV_RUNTIME_SYNC_DEPTH, MAX_DEPTH) | |
left = 1 | |
right = length(a) | |
@cuda quicksort(a, left, right, 0) | |
return a | |
end | |
using Test | |
using BenchmarkTools | |
function main(N=1000) | |
a = rand(N) | |
@btime sort!(a) setup=(a = copy($a)) | |
b = CuArray(a) | |
@test sort!(copy(a)) == Array(sort!(copy(b))) | |
CUDA.@profile for i in 1:10 | |
sort!(copy(b)) | |
end | |
@btime CuArrays.@sync(sort!(b)) setup=(b = copy($b)) | |
@btime begin | |
x = Array(b) | |
sort!(b) | |
copyto!(x, b) | |
end setup=(b = copy($b)) | |
return | |
end | |
isinteractive() || main() |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment