Skip to content

Instantly share code, notes, and snippets.

@winni2k
Last active August 29, 2015 14:18
Show Gist options
  • Save winni2k/d17f8bdf4ec4bf9d3d84 to your computer and use it in GitHub Desktop.
Save winni2k/d17f8bdf4ec4bf9d3d84 to your computer and use it in GitHub Desktop.
Code Snippet for reading WTCCC style haps files quickly
#include <string>
#include <vector>
// untested code, may contain bugs
#if __cplusplus <= 199711L
#error This library needs at least a C++11 compliant compiler
#endif
using namespace std;
// wtcccLine will be consumed by this function
vector<unsigned char> getLine(string &wtcccLine, unsigned numSamps) {
vector<string> tokens;
string::size_type lastTokenStart = string::npos;
lastTokenStart = tokenize_partial(line, 6, tokens);
// assuming number of samples in haps file is known and stored in
// numSamps
vector<unsigned char> haps;
haps.reserve(numSamps * 2);
// chromosome is in tokens[0]
// id and position are in tokens[1] and tokens[2]
// ref and alt alleles are stored in tokens[3] and tokens[4]
// the whole haps line is in tokens[5]
assert(tokens.size() == 6);
const string &hapString = tokens[5];
// +1 because there is no EOL char
if (hapString.size() + 1 != lastTokenStart + 4 * numSamps)
throw runtime_error("Input line has " + to_string(hapString.size()) +
" != 4*" + to_string(numSamps) + "+" +
to_string(lastTokenStart) +
" characters. May have to run with slower mode that "
"allows multiple spaces between alleles. Or perhaps "
"this line is terminated with \r");
for (size_t hapNum = 0; hapNum < 2 * numSamps; ++hapNum) {
const size_t charNum = lastTokenStart + 2 * hapNum;
switch (hapString[charNum]) {
case '0':
haps.push_back(0);
break;
case '1':
haps.push_back(1);
break;
default:
throw std::runtime_error("Encountered unexpected allele (" +
string(1, hapString[charNum * 2]) +
") at line " + to_string(m_linesRead + 1));
}
}
assert(haps.size() == numSamps * 2);
return haps;
}
// this returns a vector filled with the first n_max_tokens -1 tokens of str
string::size_type tokenize_partial(const string &str, size_t n_max_tokens,
vector<string> &tokens) {
tokens.clear();
tokens.reserve(n_max_tokens);
string::size_type p_last = str.find_first_not_of(" \t", 0);
string::size_type p_curr = str.find_first_of(" \t", p_last);
if (n_max_tokens > 1)
while ((string::npos != p_curr || string::npos != p_last) &&
tokens.size() + 1 != n_max_tokens) {
tokens.push_back(str.substr(p_last, p_curr - p_last));
p_last = str.find_first_not_of(" \t", p_curr);
p_curr = str.find_first_of(" \t", p_last);
}
return p_last;
}
@winni2k
Copy link
Author

winni2k commented Apr 3, 2015

I made str a const string & and do not pass it back in the tokens vector.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment