Skip to content

Instantly share code, notes, and snippets.

@MauricioCarneiro
Last active August 29, 2015 14:05
Show Gist options
  • Save MauricioCarneiro/4d90e122262e8ba34494 to your computer and use it in GitHub Desktop.
Save MauricioCarneiro/4d90e122262e8ba34494 to your computer and use it in GitHub Desktop.
Data structures design example for batch SAM processing in gamgee
/*
Here are the alignment data structures we covered during today’s call. The storage for alignment data looks like this:
*/
template <typename system_tag>
struct BAM_alignment_batch_storage
{
uint32 num_reads;
nvbio::vector<system_tag, cigar_op> cigars;
nvbio::PackedVector<system_tag, 4> reads; //VectorDNA16<system_tag> reads;
nvbio::vector<system_tag, uint8> qualities;
nvbio::vector<system_tag, uint16> flags;
nvbio::vector<system_tag, uint32> read_groups;
nvbio::vector<system_tag, uint32> alignment_positions; // relative to the sequence!
nvbio::vector<system_tag, uint32> alignment_sequence_IDs;
nvbio::vector<system_tag, uint8> mapq;
nvbio::vector<system_tag, BAM_CRQ_index> crq_index; // CRQ: cigars, reads, qualities
};
// device version of alignment data
struct BAM_alignment_batch_device : public BAM_alignment_batch_storage<device_tag>
{
void load(BAM_alignment_batch_host& batch)
{
num_reads = batch.num_reads;
cigars = batch.cigars;
reads = batch.reads;
qualities = batch.qualities;
flags = batch.flags;
alignment_positions = batch.alignment_positions;
alignment_sequence_IDs = batch.alignment_sequence_IDs;
mapq = batch.mapq;
read_groups = batch.read_groups;
crq_index = batch.crq_index;
}
NVBIO_HOST BAM_alignment_batch_device() { }
struct const_view
{
uint32 num_reads;
D_VectorCigarOp::const_plain_view_type cigars;
D_VectorDNA16::const_plain_view_type reads;
D_VectorU8::const_plain_view_type qualities;
D_VectorU16::const_plain_view_type flags;
D_VectorU32::const_plain_view_type read_groups;
D_VectorU32::const_plain_view_type alignment_positions;
D_VectorU32::const_plain_view_type alignment_sequence_IDs;
D_VectorU8::const_plain_view_type mapq;
D_VectorCRQIndex::const_plain_view_type crq_index;
};
operator const_view() const
{
const_view v = {
num_reads,
plain_view(cigars),
plain_view(reads),
plain_view(qualities),
plain_view(flags),
plain_view(read_groups),
plain_view(alignment_positions),
plain_view(alignment_sequence_IDs),
plain_view(mapq),
plain_view(crq_index)
};
return v;
}
};
/*
BAM_alignment_batch_storage is essentially a placeholder for all fields that are required to be present on both host and device. It’s templated on a system tag, which determines whether the vectors are allocated on the device (nvbio::device_tag) or host (nvbio::host_tag). BAM_alignment_batch_device implements the const_view implicit cast operator, which allows a BAM_alignment_batch_device to be passed into kernels (as an object of type BAM_alignment_batch_device::const_view). The load() method simply copies the data into the GPU from a host-side batch.
The "CRQ index" is an index for cigars, reads and qualities, and currently looks like this:
*/
struct BAM_CRQ_index
{
uint32 cigar_start, cigar_len;
uint32 read_start, read_len;
uint32 qual_start, qual_len; // qual_len is always the same as read_len for BAM, but not necessarily for SAM
NVBIO_HOST_DEVICE BAM_CRQ_index()
: cigar_start(0), cigar_len(0),
read_start(0), read_len(0),
qual_start(0), qual_len(0)
{ }
NVBIO_HOST_DEVICE BAM_CRQ_index(uint32 cigar_start, uint32 cigar_len,
uint32 read_start, uint32 read_len,
uint32 qual_start, uint32 qual_len)
: cigar_start(cigar_start), cigar_len(cigar_len),
read_start(read_start), read_len(read_len),
qual_start(qual_start), qual_len(qual_len)
{ }
};
/*
Each of the fields holds the index for the data corresponding to a read in the various storage vectors. I’m not entirely convinced that aggregating all these fields in a single structure is the best approach for performance, so we might want to consider substituting this for separate arrays in BAM_alignment_batch_storage (hiding the details of indexing in the wrapper structures that contain references to the data for a given read).
For completeness, the host-side storage looks like this:
*/
struct BAM_alignment_batch_host : public BAM_alignment_batch_storage<host_tag>
{
// host-only data that we never move to the GPU
nvbio::vector<host_tag, BAM_alignment_header> align_headers;
nvbio::vector<host_tag, char> aux_data;
nvbio::vector<host_tag, char> names;
nvbio::vector<host_tag, BAM_alignment_index> index;
void reset(uint32 batch_size, bool skip_headers)
{
num_reads = 0;
align_headers.clear();
index.clear();
crq_index.clear();
aux_data.clear();
names.clear();
cigars.clear();
reads.clear();
qualities.clear();
flags.clear();
alignment_positions.clear();
alignment_sequence_IDs.clear();
mapq.clear();
// read groups are special: they're initialized to zero
read_groups.assign(batch_size, 0);
if (align_headers.size() < batch_size)
{
if (!skip_headers)
{
align_headers.reserve(batch_size);
index.reserve(batch_size);
aux_data.reserve(batch_size * 1024);
names.reserve(batch_size * 512);
}
crq_index.reserve(batch_size);
cigars.reserve(batch_size * 32);
reads.reserve(batch_size * 350);
qualities.reserve(batch_size * 350);
flags.reserve(batch_size);
alignment_positions.reserve(batch_size);
alignment_sequence_IDs.reserve(batch_size);
mapq.reserve(batch_size);
}
}
};
/*
It contains extra data that is not required on the GPU but will be required to output a complete BAM file in the end, plus a simple utility function to reset a batch for reuse.
These structures are wrapped in a single BAM_alingment_batch for convenience:
*/
struct BAM_alignment_batch
{
BAM_alignment_batch_host host;
BAM_alignment_batch_device device;
void download(void)
{
device.load(host);
}
};
/*
This is the structure that we pass around on the host to the various functions that implement different parts of the pipeline.
*/
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment