Created
September 30, 2018 22:18
-
-
Save coralmw/1137dfa0f6494f7ed8d2b7f2311e0842 to your computer and use it in GitHub Desktop.
some cuda hash lookup code. probably not optimal!
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
using CUDAdrv, CUDAnative | |
using SparseArrays | |
# Problem sizes: NKMERS queries against a haystack 10x larger.
# `const` so the globals are concretely typed — non-const globals are
# `Any`-typed and slow down every function that reads them.
const NKMERS = 1000
const NHAYSTACK = NKMERS * 10
# Making a mapping between 64-bit "kmers" and "locations" (32-bit ints).
# A dense array indexed by kmer can't work: 73 exabytes for 1000 values.
# So some sparse mapping is needed. On CPU we use a hashmap/Dict; on GPU
# it is more complex.
haystack = rand(Int64, NHAYSTACK) # random values that look like kmers to look up
locs = rand(Int32, NHAYSTACK)     # locations of the above
kmers = haystack[1:NKMERS]        # a subset to look up
kmer_loc_mapping = Dict{Int64,Int32}(zip(haystack, locs))
# On CPU the lookup is a one-liner, and efficient:
map(kmer -> kmer_loc_mapping[kmer], kmers)
# kmer_loc_gpu_mapping = SparseMatrixCSC{Int32, Int64}()
# for (l, hay) in zip(locs, haystack)
#     kmer_loc_gpu_mapping[hay] = l
# end
# More explicitly, to make the comparison to the GPU code easier:
"""
    lookup!(mapping, kmers, output)

Fill `output[i]` with `mapping[kmers[i]]` for every index of `kmers`.
Mutates `output` in place; returns `nothing`. Throws `KeyError` if any
kmer is absent from `mapping`. Argument types are abstract supertypes of
the originals (`Dict` / `Array`) so views and other containers also work.
"""
function lookup!(mapping::AbstractDict{Int64,Int32}, kmers::AbstractArray{Int64}, output::AbstractArray{Int32})
    for (idx, kmer) in enumerate(kmers)
        output[idx] = mapping[kmer]
    end
    return
end
# All arguments to a GPU kernel must be isbits types so they can be copied
# by value to the device. A Dict (heap-allocated hash table) is not.
print("dict is bits? ", isbitstype(Dict{Int64,Int32}))
# A sparse vector seems sensible too: worse random-lookup performance, but small.
# NOTE: SparseVector requires sorted indices in 1:n, so the raw random Int64
# kmers cannot serve as indices — the original
# `SparseVector(NHAYSTACK, haystack, locs)` throws an ArgumentError.
# Build a structurally valid sparse vector just to check the isbits property.
kmer_loc_gpu_mapping = SparseArrays.sparsevec(collect(1:NHAYSTACK), locs, NHAYSTACK)
# ...but no: sparse containers hold pointers to index/value buffers, not isbits.
print("sparse is bits? ", isbitstype(SparseMatrixCSC{Int32, Int64}))
# So we just pass both the haystack and the locations and do the lookup manually.
# Single-GPU-thread version: for every query kmer, linearly scan `haystack`
# and copy the matching entry of `locs` into `output`. With one thread doing
# all the work this will not beat the CPU.
function gpu_lookup!(haystack, locs, kmers, output)
    nhay = length(haystack)
    for out_idx in 1:length(kmers)
        needle = kmers[out_idx]
        found = 0
        scan = 1
        while scan <= nhay
            if haystack[scan] == needle
                found = scan
                break
            end
            scan += 1
        end
        # `found` stays 0 on a miss, so indexing `locs` then errors — same
        # behaviour as the original.
        output[out_idx] = locs[found]
    end
    return
end
# GPU parallel version, over blocks: each block looks up a strided subset of
# the kmers. Same signature and launch configuration as before.
function parallel_gpu_lookup!(haystack, locs, kmers, output)
    col = blockIdx().x  # set from the call to @cuda
    cols = gridDim().x
    # Block-stride loop: block `col` handles indices col, col+cols, col+2cols, …
    # This covers *every* index even when length(kmers) is not a multiple of
    # the block count — the previous div-based chunking silently dropped the
    # remainder kmers. Integer arithmetic only: accidentally promoting to
    # Float forces a dynamic allocation, which the GPU compiler rejects.
    for idx in col:cols:length(kmers)
        kmer = kmers[idx]
        # Linear scan of the haystack for this kmer; 0 means "not found",
        # and locs[0] then faults, as in the serial kernel.
        hay_idx = 0
        for (hay_trial_idx, hay_kmer) in enumerate(haystack)
            if hay_kmer == kmer
                hay_idx = hay_trial_idx
                break
            end
        end
        output[idx] = locs[hay_idx]
    end
    return
end
# CPU reference run: preallocate the output and fill it via the Dict lookup.
cpu_locs = Array{Int32, 1}(undef, NKMERS)
lookup!(kmer_loc_mapping, kmers, cpu_locs)
# gpu_mapping = CuArray(kmer_loc_gpu_mapping) # no, because not isbits
# Instead copy the raw haystack / location / query arrays to the device;
# the kernels search the haystack manually.
gpu_hay = CuArray(haystack)
gpu_locs = CuArray(locs)
gpu_result = CuArray(Array{Int32, 1}(undef, NKMERS))
gpu_kmers = CuArray(kmers)
# Serial kernel launch: one block, one thread — correctness check, not speed.
@cuda blocks=1 threads=1 gpu_lookup!(gpu_hay, gpu_locs, gpu_kmers, gpu_result)
# @device_code_warntype is a useful macro to print out the low-level code.
# I found that accidentally promoting kmers / cols to a float in the parallel
# code will throw a runtime error, as the float gets dynamically allocated
# and the compiler dies.
parallel_gpu_result = CuArray(Array{Int32, 1}(undef, NKMERS))
@device_code_warntype @cuda blocks=10 threads=1 parallel_gpu_lookup!(gpu_hay, gpu_locs, gpu_kmers, parallel_gpu_result)
# For good parallel GPU code we would also need to parallelise over threads.
# NOTE(review): "I think blocks correspond to warps and threads to
# vectorisation within a warp? don't quote" — confirm against the CUDA
# programming model before relying on this.
println(length(cpu_locs))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment