Created
September 30, 2018 22:18
-
-
Save coralmw/1137dfa0f6494f7ed8d2b7f2311e0842 to your computer and use it in GitHub Desktop.
some cuda hash lookup code. probably not optimal!
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
using CUDAdrv, CUDAnative | |
using SparseArrays | |
# Problem sizes: NKMERS queries against a haystack 10x larger.
# `const` so the globals are concretely typed — non-const globals are
# `Any`-typed and slow down every function that reads them.
const NKMERS = 1000
const NHAYSTACK = NKMERS * 10
# Making a mapping between 64-bit "kmers" and "locations" (32-bit ints).
# A dense array indexed by kmer can't work: 73 exabytes for 1000 values.
# So some sparse mapping is needed. On CPU we use a hashmap/Dict; on GPU
# it is more complex.
haystack = rand(Int64, NHAYSTACK) # random values that look like kmers to look up
locs = rand(Int32, NHAYSTACK)     # locations of the above
kmers = haystack[1:NKMERS]        # a subset to look up
kmer_loc_mapping = Dict{Int64,Int32}(zip(haystack, locs))
# On CPU the lookup is a one-liner, and efficient:
map(kmer -> kmer_loc_mapping[kmer], kmers)
# kmer_loc_gpu_mapping = SparseMatrixCSC{Int32, Int64}()
# for (l, hay) in zip(locs, haystack)
#     kmer_loc_gpu_mapping[hay] = l
# end
# More explicitly, to make the comparison to the GPU code easier:
"""
    lookup!(mapping, kmers, output)

Fill `output[i]` with `mapping[kmers[i]]` for every index of `kmers`.
Mutates `output` in place; returns `nothing`. Throws `KeyError` if any
kmer is absent from `mapping`. Argument types are abstract supertypes of
the originals (`Dict` / `Array`) so views and other containers also work.
"""
function lookup!(mapping::AbstractDict{Int64,Int32}, kmers::AbstractArray{Int64}, output::AbstractArray{Int32})
    for (idx, kmer) in enumerate(kmers)
        output[idx] = mapping[kmer]
    end
    return
end
# All arguments to a GPU kernel must be isbits types so they can be copied
# by value to the device. A Dict (heap-allocated hash table) is not.
print("dict is bits? ", isbitstype(Dict{Int64,Int32}))
# A sparse vector seems sensible too: worse random-lookup performance, but small.
# NOTE: SparseVector requires sorted indices in 1:n, so the raw random Int64
# kmers cannot serve as indices — the original
# `SparseVector(NHAYSTACK, haystack, locs)` throws an ArgumentError.
# Build a structurally valid sparse vector just to check the isbits property.
kmer_loc_gpu_mapping = SparseArrays.sparsevec(collect(1:NHAYSTACK), locs, NHAYSTACK)
# ...but no: sparse containers hold pointers to index/value buffers, not isbits.
print("sparse is bits? ", isbitstype(SparseMatrixCSC{Int32, Int64}))
# So we just pass both the haystack and the locations and do the lookup manually.
# Single-GPU-thread version: for every query kmer, linearly scan `haystack`
# and copy the matching entry of `locs` into `output`. With one thread doing
# all the work this will not beat the CPU.
function gpu_lookup!(haystack, locs, kmers, output)
    nhay = length(haystack)
    for out_idx in 1:length(kmers)
        needle = kmers[out_idx]
        found = 0
        scan = 1
        while scan <= nhay
            if haystack[scan] == needle
                found = scan
                break
            end
            scan += 1
        end
        # `found` stays 0 on a miss, so indexing `locs` then errors — same
        # behaviour as the original.
        output[out_idx] = locs[found]
    end
    return
end
# GPU parallel version, over blocks: each block looks up a strided subset of
# the kmers. Same signature and launch configuration as before.
function parallel_gpu_lookup!(haystack, locs, kmers, output)
    col = blockIdx().x  # set from the call to @cuda
    cols = gridDim().x
    # Block-stride loop: block `col` handles indices col, col+cols, col+2cols, …
    # This covers *every* index even when length(kmers) is not a multiple of
    # the block count — the previous div-based chunking silently dropped the
    # remainder kmers. Integer arithmetic only: accidentally promoting to
    # Float forces a dynamic allocation, which the GPU compiler rejects.
    for idx in col:cols:length(kmers)
        kmer = kmers[idx]
        # Linear scan of the haystack for this kmer; 0 means "not found",
        # and locs[0] then faults, as in the serial kernel.
        hay_idx = 0
        for (hay_trial_idx, hay_kmer) in enumerate(haystack)
            if hay_kmer == kmer
                hay_idx = hay_trial_idx
                break
            end
        end
        output[idx] = locs[hay_idx]
    end
    return
end
# CPU reference run: preallocate the output and fill it via the Dict lookup.
cpu_locs = Array{Int32, 1}(undef, NKMERS)
lookup!(kmer_loc_mapping, kmers, cpu_locs)
# gpu_mapping = CuArray(kmer_loc_gpu_mapping) # no, because not isbits
# Instead copy the raw haystack / location / query arrays to the device;
# the kernels search the haystack manually.
gpu_hay = CuArray(haystack)
gpu_locs = CuArray(locs)
gpu_result = CuArray(Array{Int32, 1}(undef, NKMERS))
gpu_kmers = CuArray(kmers)
# Serial kernel launch: one block, one thread — correctness check, not speed.
@cuda blocks=1 threads=1 gpu_lookup!(gpu_hay, gpu_locs, gpu_kmers, gpu_result)
# @device_code_warntype is a useful macro to print out the low-level code.
# I found that accidentally promoting kmers / cols to a float in the parallel
# code will throw a runtime error, as the float gets dynamically allocated
# and the compiler dies.
parallel_gpu_result = CuArray(Array{Int32, 1}(undef, NKMERS))
@device_code_warntype @cuda blocks=10 threads=1 parallel_gpu_lookup!(gpu_hay, gpu_locs, gpu_kmers, parallel_gpu_result)
# For good parallel GPU code we would also need to parallelise over threads.
# NOTE(review): "I think blocks correspond to warps and threads to
# vectorisation within a warp? don't quote" — confirm against the CUDA
# programming model before relying on this.
println(length(cpu_locs))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment