xiaodaigh/cpu_bitonic_sort.jl

## cpu_bitonic_sort.jl
shared = rand(16)

bisort(shared) = begin
NUM = UInt(length(shared))

k = UInt(2)
while (k <= NUM)
  j = div(k,2)
  while(j >=1)
    for tid in UInt(0):UInt(NUM-1)
      ixj = tid⊻j
      if(ixj > tid)
        if((tid & k) == 0)
          if (shared[tid+1] > shared[ixj+1])
            tmp = shared[ixj+1]
            shared[ixj+1] = shared[tid+1]
            shared[tid+1] = tmp
          end
        else
          if (shared[tid+1] < shared[ixj+1])
            tmp = shared[ixj+1]
            shared[ixj+1] = shared[tid+1]
            shared[tid+1] = tmp
          end
        end
      end
    end
    j = div(j,2)
  end
  k = UInt(k*2)
end
shared
end

## gpu_bitonic_sort.jl

bisort(shared, NUM) = begin
  tid = UInt(((blockIdx().x - 1) * blockDim().x + threadIdx().x)-1)
  k = UInt(2)
  while (k <= NUM)
    j = div(k, 2)
    while j >= 1
      ixj = tid⊻UInt(j)
      if ixj > tid
        if (tid & k) == 0
            if shared[tid+1] > shared[ixj+1]
              tmp = shared[ixj+1]
              shared[ixj+1] = shared[tid+1]
              shared[tid+1] = tmp
            end
        else
            if shared[tid+1] < shared[ixj+1]
              tmp = shared[ixj+1]
              shared[ixj+1] = shared[tid+1]
              shared[tid+1] = tmp
            end
        end
      end
      j =  div(j,2)
      sync_threads()
    end
    k = UInt(k*2)
  end
  return
end

using CuArrays, CUDAnative

shared = rand(Float32, 1024)
cushared = cu(shared)

bitonicsort!(cushared) = begin
  CuArrays.@sync begin
    @cuda threads = 512 blocks = 2  bisort(cushared, 1024)
  end
end

using BenchmarkTools
@time bitonicsort!(cushared)

using SortingAlgorithms
@time sort(shared, alg=RadixSort)

using Flux
cpu(cushared) |> issorted
	shared = rand(16)

	bisort(shared) = begin
	NUM = UInt(length(shared))

	k = UInt(2)
	while (k <= NUM)
	j = div(k,2)
	while(j >=1)
	for tid in UInt(0):UInt(NUM-1)
	ixj = tid⊻j
	if(ixj > tid)
	if((tid & k) == 0)
	if (shared[tid+1] > shared[ixj+1])
	tmp = shared[ixj+1]
	shared[ixj+1] = shared[tid+1]
	shared[tid+1] = tmp
	end
	else
	if (shared[tid+1] < shared[ixj+1])
	tmp = shared[ixj+1]
	shared[ixj+1] = shared[tid+1]
	shared[tid+1] = tmp
	end
	end
	end
	end
	j = div(j,2)
	end
	k = UInt(k*2)
	end
	shared
	end

	bisort(shared, NUM) = begin
	tid = UInt(((blockIdx().x - 1) * blockDim().x + threadIdx().x)-1)
	k = UInt(2)
	while (k <= NUM)
	j = div(k, 2)
	while j >= 1
	ixj = tid⊻UInt(j)
	if ixj > tid
	if (tid & k) == 0
	if shared[tid+1] > shared[ixj+1]
	tmp = shared[ixj+1]
	shared[ixj+1] = shared[tid+1]
	shared[tid+1] = tmp
	end
	else
	if shared[tid+1] < shared[ixj+1]
	tmp = shared[ixj+1]
	shared[ixj+1] = shared[tid+1]
	shared[tid+1] = tmp
	end
	end
	end
	j = div(j,2)
	sync_threads()
	end
	k = UInt(k*2)
	end
	return
	end

	using CuArrays, CUDAnative

	shared = rand(Float32, 1024)
	cushared = cu(shared)

	bitonicsort!(cushared) = begin
	CuArrays.@sync begin
	@cuda threads = 512 blocks = 2 bisort(cushared, 1024)
	end
	end

	using BenchmarkTools
	@time bitonicsort!(cushared)

	using SortingAlgorithms
	@time sort(shared, alg=RadixSort)

	using Flux
	cpu(cushared) \|> issorted