Skip to content

Instantly share code, notes, and snippets.

@vellamike
Created December 10, 2019 20:57
Show Gist options
  • Save vellamike/84ec96f6bd9b8ef19d9a00b8f69212c1 to your computer and use it in GitHub Desktop.
Save vellamike/84ec96f6bd9b8ef19d9a00b8f69212c1 to your computer and use it in GitHub Desktop.
kseq vs htslib
#include <iostream>
#include "seqio.h"
#include <zlib.h>
extern "C" {
#include <htslib/faidx.h>
}
using namespace klibpp;
struct free_deleter
{
template <typename T>
void operator()(T* x)
{
std::free(x);
}
};
typedef struct
{
/// Name of sequence.
std::string name;
/// Base pair representation of sequence.
std::string seq;
} FastaSequence;
class FastaParserHTS
{
public:
FastaParserHTS(const std::string& fasta_file);
~FastaParserHTS();
int32_t get_num_seqences();
FastaSequence get_sequence_by_id(int32_t i);
FastaSequence get_sequence_by_name(const std::string&);
private:
faidx_t* fasta_index_;
mutable std::mutex index_mutex_;
int32_t num_seqequences_;
};
FastaParserHTS::FastaParserHTS(const std::string& fasta_file)
{
fasta_index_ = fai_load3(fasta_file.c_str(), NULL, NULL, FAI_CREATE);
if (fasta_index_ == NULL)
{
throw std::runtime_error("Could not load fasta index!");
}
num_seqequences_ = faidx_nseq(fasta_index_);
if (num_seqequences_ == 0)
{
fai_destroy(fasta_index_);
throw std::runtime_error("FASTA file has 0 sequences");
}
}
FastaParserHTS::~FastaParserHTS()
{
fai_destroy(fasta_index_);
}
int32_t FastaParserHTS::get_num_seqences()
{
return num_seqequences_;
}
FastaSequence FastaParserHTS::get_sequence_by_name(const std::string& name)
{
FastaSequence s{};
{
std::lock_guard<std::mutex> lock(index_mutex_);
int32_t length;
std::unique_ptr<char, free_deleter> seq(fai_fetch(fasta_index_, name.c_str(), &length));
if (length < 0)
{
throw std::runtime_error("Error in reading sequence information for seq ID " + name);
}
s.name = std::string(name);
s.seq = std::string(seq.get());
}
return s;
}
FastaSequence FastaParserHTS::get_sequence_by_id(int32_t i)
{
std::string str_name = "";
{
std::lock_guard<std::mutex> lock(index_mutex_);
const char* name = faidx_iseq(fasta_index_, i);
if (name == NULL)
{
throw std::runtime_error("No sequence found for ID " + std::to_string(i));
}
str_name = std::string(name);
}
return get_sequence_by_name(str_name);
}
class FastaParser
{
public:
/// \brief FastaParser implementations can have custom destructors, so delcare the abstract dtor as default.
virtual ~FastaParser() = default;
/// \brief Return number of sequences in FASTA file
///
/// \return Sequence count in file
virtual int32_t get_num_seqences() const = 0;
/// \brief Fetch an entry from the FASTA file by index position in file.
/// \param id Position of sequence in file. If id is greater than file size,
/// an error is thrown.
///
/// \return A FastaSequence object describing the entry.
virtual FastaSequence get_sequence_by_id(int32_t id) const = 0;
/// \brief Fetch an entry from the FASTA file by name.
/// \param name Name of the sequence in FASTA file. If there is no entry
/// by that name, an error is thrown.
///
/// \return A FastaSequence object describing the entry.
virtual FastaSequence get_sequence_by_name(const std::string& name) const = 0;
};
char* fasta_filepath = "/home/mike/science/cumap_validation/simulated_data/10M_ref_100x_reads.fasta";
int main(int argc, char* argv[])
{
// First test kseq
KSeq record;
SeqStreamIn iss(fasta_filepath);
std::vector<std::string> seqs1;
auto start_time = std::chrono::high_resolution_clock::now();
while (iss >> record) {
//std::cout << record.name << std::endl;
if (!record.comment.empty()) std::cout << record.comment << std::endl;
//std::cout << record.seq << std::endl;
seqs1.push_back(record.seq);
if (!record.qual.empty()) std::cout << record.qual << std::endl;
}
auto end_time = std::chrono::high_resolution_clock::now();
std::cerr << "KSEQ++ duration (ms)" << std::chrono::duration_cast<std::chrono::milliseconds>(end_time - start_time).count() << std::endl;
// Now test htslib
FastaParserHTS f = FastaParserHTS(std::string(fasta_filepath));
int num_sequences = f.get_num_seqences();
std::vector<std::string> seqs2;
start_time = std::chrono::high_resolution_clock::now();
for(int i=0; i< f.get_num_seqences(); i++){
seqs2.push_back((f.get_sequence_by_id(i).seq));
}
end_time = std::chrono::high_resolution_clock::now();
std::cerr << "HTSLib duration (ms)" << std::chrono::duration_cast<std::chrono::milliseconds>(end_time - start_time).count() << std::endl;
for (auto s: seqs1) {
if (s == "blah") {
exit(1);
}
}
for (auto s: seqs2) {
if (s == "blah") {
exit(1);
}
}
assert(seqs1.size() == seqs2.size());
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment