Skip to content

Instantly share code, notes, and snippets.

@winni2k
Created April 4, 2015 12:32
Show Gist options
  • Save winni2k/a14ca190f94af8bd2d14 to your computer and use it in GitHub Desktop.
Save winni2k/a14ca190f94af8bd2d14 to your computer and use it in GitHub Desktop.
How to load and parse a tabixed WTCCC haps file
#include "tabix.hpp"
#include <vector>
#include <string>
// untested code
#if __cplusplus <= 199711L
#error This library needs at least a C++11 compliant compiler
#endif
using namespace std;
vector<vector<unsigned char>> tabhaps_reader(string inTabhaps, string region ){
//inTabhaps is tabixed WTCCC haps file
// region is region to extract
// for example: 20:2000000-3000000
// create tabix object
Tabix tbx(inTabHaps);
// set region to extract from vcf
if (tbx.setRegion(region) != true)
throw std::runtime_error("Malformed region string: " + region);
// put each line into haps
vector<vector<unsigned char>> haps;
// read each line in region
while (tbx.getNextLine(line)) {
// parse haps file
// not sure the outer move statement is necessary, but it won't hurt
haps.push_back(std::move(getLine(std::move(line))));
}
return haps;
}
// wtcccLine will be consumed by this function
// best to move the wtcccLine in
vector<unsigned char> getLine(string wtcccLine, unsigned numSamps) {
vector<string> tokens;
string::size_type lastTokenStart = string::npos;
lastTokenStart = tokenize_partial(line, 6, tokens);
// assuming number of samples in haps file is known and stored in
// numSamps
vector<unsigned char> haps;
haps.reserve(numSamps * 2);
// chromosome is in tokens[0]
// id and position are in tokens[1] and tokens[2]
// ref and alt alleles are stored in tokens[3] and tokens[4]
// the whole haps line is in tokens[5]
assert(tokens.size() == 6);
const string &hapString = tokens[5];
// +1 because there is no EOL char
if (hapString.size() + 1 != lastTokenStart + 4 * numSamps)
throw runtime_error("Input line has " + to_string(hapString.size()) +
" != 4*" + to_string(numSamps) + "+" +
to_string(lastTokenStart) +
" characters. May have to run with slower mode that "
"allows multiple spaces between alleles. Or perhaps "
"this line is terminated with \r");
for (size_t hapNum = 0; hapNum < 2 * numSamps; ++hapNum) {
const size_t charNum = lastTokenStart + 2 * hapNum;
switch (hapString[charNum]) {
case '0':
haps.push_back(0);
break;
case '1':
haps.push_back(1);
break;
default:
throw std::runtime_error("Encountered unexpected allele (" +
string(1, hapString[charNum * 2]) +
") at line " + to_string(m_linesRead + 1));
}
}
assert(haps.size() == numSamps * 2);
return haps;
}
// this returns a vector filled with the first n_max_tokens -1 tokens of str
// the last token contains the rest of str
string::size_type tokenize_partial(const string &str, size_t n_max_tokens,
vector<string> &tokens) {
tokens.clear();
tokens.reserve(n_max_tokens);
string::size_type p_last = str.find_first_not_of(" \t", 0);
string::size_type p_curr = str.find_first_of(" \t", p_last);
if (n_max_tokens > 1)
while ((string::npos != p_curr || string::npos != p_last) &&
tokens.size() + 1 != n_max_tokens) {
tokens.push_back(str.substr(p_last, p_curr - p_last));
p_last = str.find_first_not_of(" \t", p_curr);
p_curr = str.find_first_of(" \t", p_last);
}
return p_last;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment