Last active
August 29, 2015 14:18
-
-
Save winni2k/d17f8bdf4ec4bf9d3d84 to your computer and use it in GitHub Desktop.
Code Snippet for reading WTCCC style haps files quickly
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <string> | |
#include <vector> | |
// untested code, may contain bugs | |
#if __cplusplus <= 199711L | |
#error This library needs at least a C++11 compliant compiler | |
#endif | |
using namespace std; | |
// wtcccLine will be consumed by this function | |
vector<unsigned char> getLine(string &wtcccLine, unsigned numSamps) { | |
vector<string> tokens; | |
string::size_type lastTokenStart = string::npos; | |
lastTokenStart = tokenize_partial(line, 6, tokens); | |
// assuming number of samples in haps file is known and stored in | |
// numSamps | |
vector<unsigned char> haps; | |
haps.reserve(numSamps * 2); | |
// chromosome is in tokens[0] | |
// id and position are in tokens[1] and tokens[2] | |
// ref and alt alleles are stored in tokens[3] and tokens[4] | |
// the whole haps line is in tokens[5] | |
assert(tokens.size() == 6); | |
const string &hapString = tokens[5]; | |
// +1 because there is no EOL char | |
if (hapString.size() + 1 != lastTokenStart + 4 * numSamps) | |
throw runtime_error("Input line has " + to_string(hapString.size()) + | |
" != 4*" + to_string(numSamps) + "+" + | |
to_string(lastTokenStart) + | |
" characters. May have to run with slower mode that " | |
"allows multiple spaces between alleles. Or perhaps " | |
"this line is terminated with \r"); | |
for (size_t hapNum = 0; hapNum < 2 * numSamps; ++hapNum) { | |
const size_t charNum = lastTokenStart + 2 * hapNum; | |
switch (hapString[charNum]) { | |
case '0': | |
haps.push_back(0); | |
break; | |
case '1': | |
haps.push_back(1); | |
break; | |
default: | |
throw std::runtime_error("Encountered unexpected allele (" + | |
string(1, hapString[charNum * 2]) + | |
") at line " + to_string(m_linesRead + 1)); | |
} | |
} | |
assert(haps.size() == numSamps * 2); | |
return haps; | |
} | |
// this returns a vector filled with the first n_max_tokens -1 tokens of str | |
string::size_type tokenize_partial(const string &str, size_t n_max_tokens, | |
vector<string> &tokens) { | |
tokens.clear(); | |
tokens.reserve(n_max_tokens); | |
string::size_type p_last = str.find_first_not_of(" \t", 0); | |
string::size_type p_curr = str.find_first_of(" \t", p_last); | |
if (n_max_tokens > 1) | |
while ((string::npos != p_curr || string::npos != p_last) && | |
tokens.size() + 1 != n_max_tokens) { | |
tokens.push_back(str.substr(p_last, p_curr - p_last)); | |
p_last = str.find_first_not_of(" \t", p_curr); | |
p_curr = str.find_first_of(" \t", p_last); | |
} | |
return p_last; | |
} | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
I made
str
aconst string &
and do not pass it back in thetokens
vector.