CPU bitonic sort
shared = rand(16) | |
bisort(shared) = begin | |
NUM = UInt(length(shared)) | |
k = UInt(2) | |
while (k <= NUM) | |
j = div(k,2) | |
while(j >=1) | |
for tid in UInt(0):UInt(NUM-1) | |
ixj = tid⊻j | |
if(ixj > tid) | |
if((tid & k) == 0) | |
if (shared[tid+1] > shared[ixj+1]) | |
tmp = shared[ixj+1] | |
shared[ixj+1] = shared[tid+1] | |
shared[tid+1] = tmp | |
end | |
else | |
if (shared[tid+1] < shared[ixj+1]) | |
tmp = shared[ixj+1] | |
shared[ixj+1] = shared[tid+1] | |
shared[tid+1] = tmp | |
end | |
end | |
end | |
end | |
j = div(j,2) | |
end | |
k = UInt(k*2) | |
end | |
shared | |
end |
bisort(shared, NUM) = begin | |
tid = UInt(((blockIdx().x - 1) * blockDim().x + threadIdx().x)-1) | |
k = UInt(2) | |
while (k <= NUM) | |
j = div(k, 2) | |
while j >= 1 | |
ixj = tid⊻UInt(j) | |
if ixj > tid | |
if (tid & k) == 0 | |
if shared[tid+1] > shared[ixj+1] | |
tmp = shared[ixj+1] | |
shared[ixj+1] = shared[tid+1] | |
shared[tid+1] = tmp | |
end | |
else | |
if shared[tid+1] < shared[ixj+1] | |
tmp = shared[ixj+1] | |
shared[ixj+1] = shared[tid+1] | |
shared[tid+1] = tmp | |
end | |
end | |
end | |
j = div(j,2) | |
sync_threads() | |
end | |
k = UInt(k*2) | |
end | |
return | |
end | |
using CuArrays, CUDAnative | |
shared = rand(Float32, 1024) | |
cushared = cu(shared) | |
bitonicsort!(cushared) = begin | |
CuArrays.@sync begin | |
@cuda threads = 512 blocks = 2 bisort(cushared, 1024) | |
end | |
end | |
using BenchmarkTools | |
@time bitonicsort!(cushared) | |
using SortingAlgorithms | |
@time sort(shared, alg=RadixSort) | |
using Flux | |
cpu(cushared) |> issorted |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment