Created
April 4, 2015 12:32
-
-
Save winni2k/a14ca190f94af8bd2d14 to your computer and use it in GitHub Desktop.
How to load and parse a tabixed WTCCC haps file
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include "tabix.hpp" | |
#include <vector> | |
#include <string> | |
// untested code | |
#if __cplusplus <= 199711L | |
#error This library needs at least a C++11 compliant compiler | |
#endif | |
using namespace std; | |
vector<vector<unsigned char>> tabhaps_reader(string inTabhaps, string region ){ | |
//inTabhaps is tabixed WTCCC haps file | |
// region is region to extract | |
// for example: 20:2000000-3000000 | |
// create tabix object | |
Tabix tbx(inTabHaps); | |
// set region to extract from vcf | |
if (tbx.setRegion(region) != true) | |
throw std::runtime_error("Malformed region string: " + region); | |
// put each line into haps | |
vector<vector<unsigned char>> haps; | |
// read each line in region | |
while (tbx.getNextLine(line)) { | |
// parse haps file | |
// not sure the outer move statement is necessary, but it won't hurt | |
haps.push_back(std::move(getLine(std::move(line)))); | |
} | |
return haps; | |
} | |
// wtcccLine will be consumed by this function | |
// best to move the wtcccLine in | |
vector<unsigned char> getLine(string wtcccLine, unsigned numSamps) { | |
vector<string> tokens; | |
string::size_type lastTokenStart = string::npos; | |
lastTokenStart = tokenize_partial(line, 6, tokens); | |
// assuming number of samples in haps file is known and stored in | |
// numSamps | |
vector<unsigned char> haps; | |
haps.reserve(numSamps * 2); | |
// chromosome is in tokens[0] | |
// id and position are in tokens[1] and tokens[2] | |
// ref and alt alleles are stored in tokens[3] and tokens[4] | |
// the whole haps line is in tokens[5] | |
assert(tokens.size() == 6); | |
const string &hapString = tokens[5]; | |
// +1 because there is no EOL char | |
if (hapString.size() + 1 != lastTokenStart + 4 * numSamps) | |
throw runtime_error("Input line has " + to_string(hapString.size()) + | |
" != 4*" + to_string(numSamps) + "+" + | |
to_string(lastTokenStart) + | |
" characters. May have to run with slower mode that " | |
"allows multiple spaces between alleles. Or perhaps " | |
"this line is terminated with \r"); | |
for (size_t hapNum = 0; hapNum < 2 * numSamps; ++hapNum) { | |
const size_t charNum = lastTokenStart + 2 * hapNum; | |
switch (hapString[charNum]) { | |
case '0': | |
haps.push_back(0); | |
break; | |
case '1': | |
haps.push_back(1); | |
break; | |
default: | |
throw std::runtime_error("Encountered unexpected allele (" + | |
string(1, hapString[charNum * 2]) + | |
") at line " + to_string(m_linesRead + 1)); | |
} | |
} | |
assert(haps.size() == numSamps * 2); | |
return haps; | |
} | |
// this returns a vector filled with the first n_max_tokens -1 tokens of str | |
// the last token contains the rest of str | |
string::size_type tokenize_partial(const string &str, size_t n_max_tokens, | |
vector<string> &tokens) { | |
tokens.clear(); | |
tokens.reserve(n_max_tokens); | |
string::size_type p_last = str.find_first_not_of(" \t", 0); | |
string::size_type p_curr = str.find_first_of(" \t", p_last); | |
if (n_max_tokens > 1) | |
while ((string::npos != p_curr || string::npos != p_last) && | |
tokens.size() + 1 != n_max_tokens) { | |
tokens.push_back(str.substr(p_last, p_curr - p_last)); | |
p_last = str.find_first_not_of(" \t", p_curr); | |
p_curr = str.find_first_of(" \t", p_last); | |
} | |
return p_last; | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment