Last active
August 29, 2015 14:05
-
-
Save MauricioCarneiro/4d90e122262e8ba34494 to your computer and use it in GitHub Desktop.
Data structures design example for batch SAM processing in gamgee
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/* | |
Here are the alignment data structures we covered during today’s call. The storage for alignment data looks like this: | |
*/ | |
template <typename system_tag> | |
struct BAM_alignment_batch_storage | |
{ | |
uint32 num_reads; | |
nvbio::vector<system_tag, cigar_op> cigars; | |
nvbio::PackedVector<system_tag, 4> reads; //VectorDNA16<system_tag> reads; | |
nvbio::vector<system_tag, uint8> qualities; | |
nvbio::vector<system_tag, uint16> flags; | |
nvbio::vector<system_tag, uint32> read_groups; | |
nvbio::vector<system_tag, uint32> alignment_positions; // relative to the sequence! | |
nvbio::vector<system_tag, uint32> alignment_sequence_IDs; | |
nvbio::vector<system_tag, uint8> mapq; | |
nvbio::vector<system_tag, BAM_CRQ_index> crq_index; // CRQ: cigars, reads, qualities | |
}; | |
// device version of alignment data | |
struct BAM_alignment_batch_device : public BAM_alignment_batch_storage<device_tag> | |
{ | |
void load(BAM_alignment_batch_host& batch) | |
{ | |
num_reads = batch.num_reads; | |
cigars = batch.cigars; | |
reads = batch.reads; | |
qualities = batch.qualities; | |
flags = batch.flags; | |
alignment_positions = batch.alignment_positions; | |
alignment_sequence_IDs = batch.alignment_sequence_IDs; | |
mapq = batch.mapq; | |
read_groups = batch.read_groups; | |
crq_index = batch.crq_index; | |
} | |
NVBIO_HOST BAM_alignment_batch_device() { } | |
struct const_view | |
{ | |
uint32 num_reads; | |
D_VectorCigarOp::const_plain_view_type cigars; | |
D_VectorDNA16::const_plain_view_type reads; | |
D_VectorU8::const_plain_view_type qualities; | |
D_VectorU16::const_plain_view_type flags; | |
D_VectorU32::const_plain_view_type read_groups; | |
D_VectorU32::const_plain_view_type alignment_positions; | |
D_VectorU32::const_plain_view_type alignment_sequence_IDs; | |
D_VectorU8::const_plain_view_type mapq; | |
D_VectorCRQIndex::const_plain_view_type crq_index; | |
}; | |
operator const_view() const | |
{ | |
const_view v = { | |
num_reads, | |
plain_view(cigars), | |
plain_view(reads), | |
plain_view(qualities), | |
plain_view(flags), | |
plain_view(read_groups), | |
plain_view(alignment_positions), | |
plain_view(alignment_sequence_IDs), | |
plain_view(mapq), | |
plain_view(crq_index) | |
}; | |
return v; | |
} | |
}; | |
/* | |
BAM_alignment_batch_storage is essentially a placeholder for all fields that are required to be present on both host and device. It’s templated on a system tag, which determines whether the vectors are allocated on the device (nvbio::device_tag) or host (nvbio::host_tag). BAM_alignment_batch_device implements the const_view implicit cast operator, which allows a BAM_alignment_batch_device to be passed into kernels (as an object of type BAM_alignment_batch_device::const_view). The load() method simply copies the data into the GPU from a host-side batch. | |
The "CRQ index" is an index for cigars, reads and qualities, and currently looks like this: | |
*/ | |
struct BAM_CRQ_index | |
{ | |
uint32 cigar_start, cigar_len; | |
uint32 read_start, read_len; | |
uint32 qual_start, qual_len; // qual_len is always the same as read_len for BAM, but not necessarily for SAM | |
NVBIO_HOST_DEVICE BAM_CRQ_index() | |
: cigar_start(0), cigar_len(0), | |
read_start(0), read_len(0), | |
qual_start(0), qual_len(0) | |
{ } | |
NVBIO_HOST_DEVICE BAM_CRQ_index(uint32 cigar_start, uint32 cigar_len, | |
uint32 read_start, uint32 read_len, | |
uint32 qual_start, uint32 qual_len) | |
: cigar_start(cigar_start), cigar_len(cigar_len), | |
read_start(read_start), read_len(read_len), | |
qual_start(qual_start), qual_len(qual_len) | |
{ } | |
}; | |
/* | |
Each of the fields holds the index for the data corresponding to a read in the various storage vectors. I’m not entirely convinced that aggregating all these fields in a single structure is the best approach for performance, so we might want to consider substituting this for separate arrays in BAM_alignment_batch_storage (hiding the details of indexing in the wrapper structures that contain references to the data for a given read). | |
For completeness, the host-side storage looks like this: | |
*/ | |
struct BAM_alignment_batch_host : public BAM_alignment_batch_storage<host_tag> | |
{ | |
// host-only data that we never move to the GPU | |
nvbio::vector<host_tag, BAM_alignment_header> align_headers; | |
nvbio::vector<host_tag, char> aux_data; | |
nvbio::vector<host_tag, char> names; | |
nvbio::vector<host_tag, BAM_alignment_index> index; | |
void reset(uint32 batch_size, bool skip_headers) | |
{ | |
num_reads = 0; | |
align_headers.clear(); | |
index.clear(); | |
crq_index.clear(); | |
aux_data.clear(); | |
names.clear(); | |
cigars.clear(); | |
reads.clear(); | |
qualities.clear(); | |
flags.clear(); | |
alignment_positions.clear(); | |
alignment_sequence_IDs.clear(); | |
mapq.clear(); | |
// read groups are special: they're initialized to zero | |
read_groups.assign(batch_size, 0); | |
if (align_headers.size() < batch_size) | |
{ | |
if (!skip_headers) | |
{ | |
align_headers.reserve(batch_size); | |
index.reserve(batch_size); | |
aux_data.reserve(batch_size * 1024); | |
names.reserve(batch_size * 512); | |
} | |
crq_index.reserve(batch_size); | |
cigars.reserve(batch_size * 32); | |
reads.reserve(batch_size * 350); | |
qualities.reserve(batch_size * 350); | |
flags.reserve(batch_size); | |
alignment_positions.reserve(batch_size); | |
alignment_sequence_IDs.reserve(batch_size); | |
mapq.reserve(batch_size); | |
} | |
} | |
}; | |
/* | |
It contains extra data that is not required on the GPU but will be required to output a complete BAM file in the end, plus a simple utility function to reset a batch for reuse. | |
These structures are wrapped in a single BAM_alingment_batch for convenience: | |
*/ | |
struct BAM_alignment_batch | |
{ | |
BAM_alignment_batch_host host; | |
BAM_alignment_batch_device device; | |
void download(void) | |
{ | |
device.load(host); | |
} | |
}; | |
/* | |
This is the structure that we pass around on the host to the various functions that implement different parts of the pipeline. | |
*/ |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment