# some cuda hash lookup code. probably not optimal!
using CUDAdrv, CUDAnative
using SparseArrays
NKMERS = 1000
NHAYSTACK = NKMERS * 10
# build a mapping between 64-bit "kmers" and 32-bit integer "locations".
# it can't be stored in a dense array indexed by kmer: 2^64 4-byte entries is about 73 exabytes,
# even though we only have 1000 values. so we need some sparse mapping.
# on the CPU we use a hashmap/Dict; on the GPU it is more complex.
haystack = rand(Int64, NHAYSTACK) # random things that look like kmers to look up
locs = rand(Int32, NHAYSTACK) # locations of the above
kmers = haystack[1:NKMERS] # a subset to look up
kmer_loc_mapping = Dict{Int64,Int32}( zip(haystack, locs) )
# on the CPU we can do the lookup like this; it's efficient.
map(kmer -> kmer_loc_mapping[kmer], kmers)
# an abandoned attempt at a sparse mapping for the GPU:
# kmer_loc_gpu_mapping = SparseMatrixCSC{Int32, Int64}()
# for (l, hay) in zip(locs, haystack)
#     kmer_loc_gpu_mapping[hay] = l
# end
# the same lookup written more explicitly, to make the comparison to the GPU code easier
function lookup!(mapping::Dict{Int64,Int32}, kmers::Array{Int64}, output::Array{Int32})
    for (idx, kmer) in enumerate(kmers)
        output[idx] = mapping[kmer]
    end
    return
end
# all arguments passed to a GPU kernel must be bits types, so they can be copied. Dicts are not:
println("dict is bits? ", isbitstype(Dict{Int64,Int32}))
# a sparse vector makes sense too: worse random-lookup performance, but at least it is small.
# (with random Int64 kmers the indices below are not sorted, positive or bounded by NHAYSTACK,
# so this is only a stand-in to get at the container type)
kmer_loc_gpu_mapping = SparseArrays.SparseVector(NHAYSTACK, haystack, locs)
# but no luck there either
println("sparse is bits? ", isbitstype(typeof(kmer_loc_gpu_mapping)))
# so we just pass both the haystack and the locations as plain arrays and do the lookup manually
# (this works because @cuda converts each CuArray argument into a device-side CuDeviceArray,
# which is a bits type)
# single GPU thread version. this will not be faster than the CPU lookup
function gpu_lookup!(haystack, locs, kmers, output)
    for (idx, kmer) in enumerate(kmers)
        hay_idx = 0
        for (hay_trial_idx, hay_kmer) in enumerate(haystack)
            if hay_kmer == kmer
                hay_idx = hay_trial_idx
                break
            end
        end
        output[idx] = locs[hay_idx]
    end
    return
end
# GPU parallel version, over blocks.
# to go parallel, the work has been statically split up: each block looks up the
# locations for its own chunksize-long slice of the kmers
function parallel_gpu_lookup!(haystack, locs, kmers, output)
    col = blockIdx().x # set by the blocks= argument of the @cuda call
    cols = gridDim().x
    # row = threadIdx().x
    # rows = blockDim().x
    # integer div: a float division here seems to need dynamic allocation on the GPU
    # and throws a runtime error (see the note near @device_code_warntype below)
    chunksize = div(length(kmers), cols)
    # @cuprintf("Greetings from block %ld, thread %ld!\n", Int64(blockIdx().x), Int64(threadIdx().x))
    for idx in (col-1)*chunksize+1:col*chunksize
        kmer = kmers[idx]
        hay_idx = 0
        for (hay_trial_idx, hay_kmer) in enumerate(haystack)
            if hay_kmer == kmer
                hay_idx = hay_trial_idx
                break
            end
        end
        output[idx] = locs[hay_idx]
    end
    return
end
cpu_locs = Array{Int32, 1}(undef, NKMERS)
lookup!(kmer_loc_mapping, kmers, cpu_locs)
# gpu_mapping = CuArray(kmer_loc_gpu_mapping) # no, because not isbits
gpu_hay = CuArray(haystack)
gpu_locs = CuArray(locs)
gpu_result = CuArray(Array{Int32, 1}(undef, NKMERS))
gpu_kmers = CuArray(kmers)
@cuda blocks=1 threads=1 gpu_lookup!(gpu_hay, gpu_locs, gpu_kmers, gpu_result)
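# host-side sanity check (a small addition, assuming the usual CuArray-to-Array conversion
# from the CUDAnative examples): copy the result back and compare with the CPU dict lookup
@assert Array(gpu_result) == cpu_locs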
# @device_code_warntype is a useful macro to print out the low-level code the kernel compiles to.
# I found that accidentally promoting kmers / cols to a float in the parallel code throws a
# runtime error, as the float result gets dynamically allocated and the compiler dies.
parallel_gpu_result = CuArray(Array{Int32, 1}(undef, NKMERS))
@device_code_warntype @cuda blocks=10 threads=1 parallel_gpu_lookup!(gpu_hay, gpu_locs, gpu_kmers, parallel_gpu_result)
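# same check for the block-parallel kernel: with blocks=10 and NKMERS=1000 every kmer falls
# into exactly one chunk, so the result should match the CPU lookup
@assert Array(parallel_gpu_result) == cpu_locs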
# for good parallel GPU code we also need to parallelise over the threads within each block;
# a sketch of that is at the end of this file.
# (blocks map to thread blocks scheduled across the streaming multiprocessors; the threads
# inside a block execute together in warps of 32, which is where the SIMT "vectorisation" happens.)
println(length(cpu_locs))
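# sketch of a thread-parallel version (an illustrative extension of the kernels above, not a
# tuned implementation): give every GPU thread a global index and let it stride over the
# kmers, so that blocks *and* the threads within them all contribute.
function thread_parallel_gpu_lookup!(haystack, locs, kmers, output)
    # global thread id and total number of threads in the grid (grid-stride loop pattern)
    tid    = (Int(blockIdx().x) - 1) * Int(blockDim().x) + Int(threadIdx().x)
    stride = Int(gridDim().x) * Int(blockDim().x)
    idx = tid
    while idx <= length(kmers)
        kmer = kmers[idx]
        hay_idx = 0
        # same linear scan through the haystack as the kernels above
        for hay_trial_idx in 1:length(haystack)
            if haystack[hay_trial_idx] == kmer
                hay_idx = hay_trial_idx
                break
            end
        end
        output[idx] = locs[hay_idx]
        idx += stride
    end
    return
end

thread_parallel_gpu_result = CuArray(Array{Int32, 1}(undef, NKMERS))
@cuda blocks=10 threads=32 thread_parallel_gpu_lookup!(gpu_hay, gpu_locs, gpu_kmers, thread_parallel_gpu_result)
@assert Array(thread_parallel_gpu_result) == cpu_locs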