MauricioCarneiro/batch_gamgee_data_structures.cpp

## batch_gamgee_data_structures.cpp
/*
Here are the alignment data structures we covered during today’s call. The storage for alignment data looks like this:
*/

template <typename system_tag>
struct BAM_alignment_batch_storage
{
    uint32 num_reads;

    nvbio::vector<system_tag, cigar_op> cigars;
    nvbio::PackedVector<system_tag, 4> reads; //VectorDNA16<system_tag> reads;
    nvbio::vector<system_tag, uint8> qualities;
    nvbio::vector<system_tag, uint16> flags;
    nvbio::vector<system_tag, uint32> read_groups;
    nvbio::vector<system_tag, uint32> alignment_positions; // relative to the sequence!
    nvbio::vector<system_tag, uint32> alignment_sequence_IDs;
    nvbio::vector<system_tag, uint8> mapq;

    nvbio::vector<system_tag, BAM_CRQ_index> crq_index; // CRQ: cigars, reads, qualities
};

// device version of alignment data
struct BAM_alignment_batch_device : public BAM_alignment_batch_storage<device_tag>
{
    void load(BAM_alignment_batch_host& batch)
    {
        num_reads = batch.num_reads;

        cigars = batch.cigars;
        reads = batch.reads;
        qualities = batch.qualities;
        flags = batch.flags;
        alignment_positions = batch.alignment_positions;
        alignment_sequence_IDs = batch.alignment_sequence_IDs;
        mapq = batch.mapq;
        read_groups = batch.read_groups;
        crq_index = batch.crq_index;

    }

    NVBIO_HOST BAM_alignment_batch_device() { }

    struct const_view
    {
        uint32 num_reads;

        D_VectorCigarOp::const_plain_view_type cigars;
        D_VectorDNA16::const_plain_view_type reads;
        D_VectorU8::const_plain_view_type qualities;
        D_VectorU16::const_plain_view_type flags;
        D_VectorU32::const_plain_view_type read_groups;
        D_VectorU32::const_plain_view_type alignment_positions;
        D_VectorU32::const_plain_view_type alignment_sequence_IDs;
        D_VectorU8::const_plain_view_type mapq;
        D_VectorCRQIndex::const_plain_view_type crq_index;
    };

    operator const_view() const
    {
        const_view v = {
                num_reads,

                plain_view(cigars),
                plain_view(reads),
                plain_view(qualities),
                plain_view(flags),
                plain_view(read_groups),
                plain_view(alignment_positions),
                plain_view(alignment_sequence_IDs),
                plain_view(mapq),
                plain_view(crq_index)
        };

        return v;
    }
};

/*
BAM_alignment_batch_storage is essentially a placeholder for all fields that are required to be present on both host and device. It’s templated on a system tag, which determines whether the vectors are allocated on the device (nvbio::device_tag) or host (nvbio::host_tag). BAM_alignment_batch_device implements the const_view implicit cast operator, which allows a BAM_alignment_batch_device to be passed into kernels (as an object of type BAM_alignment_batch_device::const_view). The load() method simply copies the data into the GPU from a host-side batch.

The "CRQ index" is an index for cigars, reads and qualities, and currently looks like this:
*/

struct BAM_CRQ_index
{
    uint32 cigar_start, cigar_len;
    uint32 read_start, read_len;
    uint32 qual_start, qual_len; // qual_len is always the same as read_len for BAM, but not necessarily for SAM

    NVBIO_HOST_DEVICE BAM_CRQ_index()
        : cigar_start(0), cigar_len(0),
          read_start(0), read_len(0),
          qual_start(0), qual_len(0)
    { }

    NVBIO_HOST_DEVICE BAM_CRQ_index(uint32 cigar_start, uint32 cigar_len,
                                    uint32 read_start, uint32 read_len,
                                    uint32 qual_start, uint32 qual_len)
        : cigar_start(cigar_start), cigar_len(cigar_len),
          read_start(read_start), read_len(read_len),
          qual_start(qual_start), qual_len(qual_len)
    { }
};

/*
Each of the fields holds the index for the data corresponding to a read in the various storage vectors. I’m not entirely convinced that aggregating all these fields in a single structure is the best approach for performance, so we might want to consider substituting this for separate arrays in BAM_alignment_batch_storage (hiding the details of indexing in the wrapper structures that contain references to the data for a given read).

For completeness, the host-side storage looks like this:
*/

struct BAM_alignment_batch_host : public BAM_alignment_batch_storage<host_tag>
{
    // host-only data that we never move to the GPU
    nvbio::vector<host_tag, BAM_alignment_header> align_headers;
    nvbio::vector<host_tag, char> aux_data;
    nvbio::vector<host_tag, char> names;

    nvbio::vector<host_tag, BAM_alignment_index> index;

    void reset(uint32 batch_size, bool skip_headers)
    {
        num_reads = 0;

        align_headers.clear();
        index.clear();
        crq_index.clear();
        aux_data.clear();
        names.clear();
        cigars.clear();
        reads.clear();
        qualities.clear();
        flags.clear();
        alignment_positions.clear();
        alignment_sequence_IDs.clear();
        mapq.clear();

        // read groups are special: they're initialized to zero
        read_groups.assign(batch_size, 0);

        if (align_headers.size() < batch_size)
        {
            if (!skip_headers)
            {
                align_headers.reserve(batch_size);
                index.reserve(batch_size);
                aux_data.reserve(batch_size * 1024);
                names.reserve(batch_size * 512);
            }

            crq_index.reserve(batch_size);
            cigars.reserve(batch_size * 32);
            reads.reserve(batch_size * 350);
            qualities.reserve(batch_size * 350);
            flags.reserve(batch_size);
            alignment_positions.reserve(batch_size);
            alignment_sequence_IDs.reserve(batch_size);
            mapq.reserve(batch_size);
        }
    }
};

/*
It contains extra data that is not required on the GPU but will be required to output a complete BAM file in the end, plus a simple utility function to reset a batch for reuse.

These structures are wrapped in a single BAM_alingment_batch for convenience:
*/

struct BAM_alignment_batch
{
    BAM_alignment_batch_host host;
    BAM_alignment_batch_device device;

    void download(void)
    {
        device.load(host);
    }
};

/*
This is the structure that we pass around on the host to the various functions that implement different parts of the pipeline.
*/
	/*
	Here are the alignment data structures we covered during today’s call. The storage for alignment data looks like this:
	*/

	template <typename system_tag>
	struct BAM_alignment_batch_storage
	{
	uint32 num_reads;

	nvbio::vector<system_tag, cigar_op> cigars;
	nvbio::PackedVector<system_tag, 4> reads; //VectorDNA16<system_tag> reads;
	nvbio::vector<system_tag, uint8> qualities;
	nvbio::vector<system_tag, uint16> flags;
	nvbio::vector<system_tag, uint32> read_groups;
	nvbio::vector<system_tag, uint32> alignment_positions; // relative to the sequence!
	nvbio::vector<system_tag, uint32> alignment_sequence_IDs;
	nvbio::vector<system_tag, uint8> mapq;

	nvbio::vector<system_tag, BAM_CRQ_index> crq_index; // CRQ: cigars, reads, qualities
	};

	// device version of alignment data
	struct BAM_alignment_batch_device : public BAM_alignment_batch_storage<device_tag>
	{
	void load(BAM_alignment_batch_host& batch)
	{
	num_reads = batch.num_reads;

	cigars = batch.cigars;
	reads = batch.reads;
	qualities = batch.qualities;
	flags = batch.flags;
	alignment_positions = batch.alignment_positions;
	alignment_sequence_IDs = batch.alignment_sequence_IDs;
	mapq = batch.mapq;
	read_groups = batch.read_groups;
	crq_index = batch.crq_index;

	}

	NVBIO_HOST BAM_alignment_batch_device() { }

	struct const_view
	{
	uint32 num_reads;

	D_VectorCigarOp::const_plain_view_type cigars;
	D_VectorDNA16::const_plain_view_type reads;
	D_VectorU8::const_plain_view_type qualities;
	D_VectorU16::const_plain_view_type flags;
	D_VectorU32::const_plain_view_type read_groups;
	D_VectorU32::const_plain_view_type alignment_positions;
	D_VectorU32::const_plain_view_type alignment_sequence_IDs;
	D_VectorU8::const_plain_view_type mapq;
	D_VectorCRQIndex::const_plain_view_type crq_index;
	};

	operator const_view() const
	{
	const_view v = {
	num_reads,

	plain_view(cigars),
	plain_view(reads),
	plain_view(qualities),
	plain_view(flags),
	plain_view(read_groups),
	plain_view(alignment_positions),
	plain_view(alignment_sequence_IDs),
	plain_view(mapq),
	plain_view(crq_index)
	};

	return v;
	}
	};

	/*
	BAM_alignment_batch_storage is essentially a placeholder for all fields that are required to be present on both host and device. It’s templated on a system tag, which determines whether the vectors are allocated on the device (nvbio::device_tag) or host (nvbio::host_tag). BAM_alignment_batch_device implements the const_view implicit cast operator, which allows a BAM_alignment_batch_device to be passed into kernels (as an object of type BAM_alignment_batch_device::const_view). The load() method simply copies the data into the GPU from a host-side batch.

	The "CRQ index" is an index for cigars, reads and qualities, and currently looks like this:
	*/

	struct BAM_CRQ_index
	{
	uint32 cigar_start, cigar_len;
	uint32 read_start, read_len;
	uint32 qual_start, qual_len; // qual_len is always the same as read_len for BAM, but not necessarily for SAM

	NVBIO_HOST_DEVICE BAM_CRQ_index()
	: cigar_start(0), cigar_len(0),
	read_start(0), read_len(0),
	qual_start(0), qual_len(0)
	{ }

	NVBIO_HOST_DEVICE BAM_CRQ_index(uint32 cigar_start, uint32 cigar_len,
	uint32 read_start, uint32 read_len,
	uint32 qual_start, uint32 qual_len)
	: cigar_start(cigar_start), cigar_len(cigar_len),
	read_start(read_start), read_len(read_len),
	qual_start(qual_start), qual_len(qual_len)
	{ }
	};

	/*
	Each of the fields holds the index for the data corresponding to a read in the various storage vectors. I’m not entirely convinced that aggregating all these fields in a single structure is the best approach for performance, so we might want to consider substituting this for separate arrays in BAM_alignment_batch_storage (hiding the details of indexing in the wrapper structures that contain references to the data for a given read).

	For completeness, the host-side storage looks like this:
	*/

	struct BAM_alignment_batch_host : public BAM_alignment_batch_storage<host_tag>
	{
	// host-only data that we never move to the GPU
	nvbio::vector<host_tag, BAM_alignment_header> align_headers;
	nvbio::vector<host_tag, char> aux_data;
	nvbio::vector<host_tag, char> names;

	nvbio::vector<host_tag, BAM_alignment_index> index;

	void reset(uint32 batch_size, bool skip_headers)
	{
	num_reads = 0;

	align_headers.clear();
	index.clear();
	crq_index.clear();
	aux_data.clear();
	names.clear();
	cigars.clear();
	reads.clear();
	qualities.clear();
	flags.clear();
	alignment_positions.clear();
	alignment_sequence_IDs.clear();
	mapq.clear();

	// read groups are special: they're initialized to zero
	read_groups.assign(batch_size, 0);

	if (align_headers.size() < batch_size)
	{
	if (!skip_headers)
	{
	align_headers.reserve(batch_size);
	index.reserve(batch_size);
	aux_data.reserve(batch_size * 1024);
	names.reserve(batch_size * 512);
	}

	crq_index.reserve(batch_size);
	cigars.reserve(batch_size * 32);
	reads.reserve(batch_size * 350);
	qualities.reserve(batch_size * 350);
	flags.reserve(batch_size);
	alignment_positions.reserve(batch_size);
	alignment_sequence_IDs.reserve(batch_size);
	mapq.reserve(batch_size);
	}
	}
	};

	/*
	It contains extra data that is not required on the GPU but will be required to output a complete BAM file in the end, plus a simple utility function to reset a batch for reuse.

	These structures are wrapped in a single BAM_alingment_batch for convenience:
	*/

	struct BAM_alignment_batch
	{
	BAM_alignment_batch_host host;
	BAM_alignment_batch_device device;

	void download(void)
	{
	device.load(host);
	}
	};

	/*
	This is the structure that we pass around on the host to the various functions that implement different parts of the pipeline.
	*/