winni2k/read tabixed haps

## read tabixed haps
#include "tabix.hpp"
#include <vector>
#include <string>

// untested code

 #if __cplusplus <= 199711L
#error This library needs at least a C++11 compliant compiler
#endif

using namespace std;

vector<vector<unsigned char>> tabhaps_reader(string inTabhaps, string region ){

  //inTabhaps is tabixed WTCCC haps file
  // region is region to extract
  // for example: 20:2000000-3000000

  // create tabix object
  Tabix tbx(inTabHaps);

  // set region to extract from vcf
  if (tbx.setRegion(region) != true)
    throw std::runtime_error("Malformed region string: " + region);

  // put each line into haps
  vector<vector<unsigned char>> haps;
  // read each line in region
  while (tbx.getNextLine(line)) {
    // parse haps file
    // not sure the outer move statement is necessary, but it won't hurt
    haps.push_back(std::move(getLine(std::move(line))));
  }
  return haps;
}

// wtcccLine will be consumed by this function
// best to move the wtcccLine in
vector<unsigned char> getLine(string wtcccLine, unsigned numSamps) {
  vector<string> tokens;
  string::size_type lastTokenStart = string::npos;
  lastTokenStart = tokenize_partial(line, 6, tokens);

  // assuming number of samples in haps file is known and stored in
  // numSamps
  vector<unsigned char> haps;
  haps.reserve(numSamps * 2);

  // chromosome is in tokens[0]
  // id and position are in tokens[1] and tokens[2]
  // ref and alt alleles are stored in tokens[3] and tokens[4]
  // the whole haps line is in tokens[5]
  assert(tokens.size() == 6);

  const string &hapString = tokens[5];

  // +1 because there is no EOL char
  if (hapString.size() + 1 != lastTokenStart + 4 * numSamps)
    throw runtime_error("Input line has " + to_string(hapString.size()) +
                        " != 4*" + to_string(numSamps) + "+" +
                        to_string(lastTokenStart) +
                        " characters. May have to run with slower mode that "
                        "allows multiple spaces between alleles. Or perhaps "
                        "this line is terminated with \r");
  for (size_t hapNum = 0; hapNum < 2 * numSamps; ++hapNum) {

    const size_t charNum = lastTokenStart + 2 * hapNum;
    switch (hapString[charNum]) {
    case '0':
      haps.push_back(0);
      break;
    case '1':
      haps.push_back(1);
      break;
    default:
      throw std::runtime_error("Encountered unexpected allele (" +
                               string(1, hapString[charNum * 2]) +
                               ") at line " + to_string(m_linesRead + 1));
    }
  }
  assert(haps.size() == numSamps * 2);

  return haps;
}

// this returns a vector filled with the first n_max_tokens -1 tokens of str
// the last token contains the rest of str
string::size_type tokenize_partial(const string &str, size_t n_max_tokens,
                                   vector<string> &tokens) {
  tokens.clear();
  tokens.reserve(n_max_tokens);
  string::size_type p_last = str.find_first_not_of(" \t", 0);
  string::size_type p_curr = str.find_first_of(" \t", p_last);
  if (n_max_tokens > 1)
    while ((string::npos != p_curr || string::npos != p_last) &&
           tokens.size() + 1 != n_max_tokens) {
      tokens.push_back(str.substr(p_last, p_curr - p_last));
      p_last = str.find_first_not_of(" \t", p_curr);
      p_curr = str.find_first_of(" \t", p_last);
    }

  return p_last;
}
	#include "tabix.hpp"
	#include <vector>
	#include <string>

	// untested code

	#if __cplusplus <= 199711L
	#error This library needs at least a C++11 compliant compiler
	#endif

	using namespace std;

	vector<vector<unsigned char>> tabhaps_reader(string inTabhaps, string region ){

	//inTabhaps is tabixed WTCCC haps file
	// region is region to extract
	// for example: 20:2000000-3000000

	// create tabix object
	Tabix tbx(inTabHaps);

	// set region to extract from vcf
	if (tbx.setRegion(region) != true)
	throw std::runtime_error("Malformed region string: " + region);

	// put each line into haps
	vector<vector<unsigned char>> haps;
	// read each line in region
	while (tbx.getNextLine(line)) {
	// parse haps file
	// not sure the outer move statement is necessary, but it won't hurt
	haps.push_back(std::move(getLine(std::move(line))));
	}
	return haps;
	}

	// wtcccLine will be consumed by this function
	// best to move the wtcccLine in
	vector<unsigned char> getLine(string wtcccLine, unsigned numSamps) {
	vector<string> tokens;
	string::size_type lastTokenStart = string::npos;
	lastTokenStart = tokenize_partial(line, 6, tokens);

	// assuming number of samples in haps file is known and stored in
	// numSamps
	vector<unsigned char> haps;
	haps.reserve(numSamps * 2);

	// chromosome is in tokens[0]
	// id and position are in tokens[1] and tokens[2]
	// ref and alt alleles are stored in tokens[3] and tokens[4]
	// the whole haps line is in tokens[5]
	assert(tokens.size() == 6);

	const string &hapString = tokens[5];

	// +1 because there is no EOL char
	if (hapString.size() + 1 != lastTokenStart + 4 * numSamps)
	throw runtime_error("Input line has " + to_string(hapString.size()) +
	" != 4*" + to_string(numSamps) + "+" +
	to_string(lastTokenStart) +
	" characters. May have to run with slower mode that "
	"allows multiple spaces between alleles. Or perhaps "
	"this line is terminated with \r");
	for (size_t hapNum = 0; hapNum < 2 * numSamps; ++hapNum) {

	const size_t charNum = lastTokenStart + 2 * hapNum;
	switch (hapString[charNum]) {
	case '0':
	haps.push_back(0);
	break;
	case '1':
	haps.push_back(1);
	break;
	default:
	throw std::runtime_error("Encountered unexpected allele (" +
	string(1, hapString[charNum * 2]) +
	") at line " + to_string(m_linesRead + 1));
	}
	}
	assert(haps.size() == numSamps * 2);

	return haps;
	}

	// this returns a vector filled with the first n_max_tokens -1 tokens of str
	// the last token contains the rest of str
	string::size_type tokenize_partial(const string &str, size_t n_max_tokens,
	vector<string> &tokens) {
	tokens.clear();
	tokens.reserve(n_max_tokens);
	string::size_type p_last = str.find_first_not_of(" \t", 0);
	string::size_type p_curr = str.find_first_of(" \t", p_last);
	if (n_max_tokens > 1)
	while ((string::npos != p_curr \|\| string::npos != p_last) &&
	tokens.size() + 1 != n_max_tokens) {
	tokens.push_back(str.substr(p_last, p_curr - p_last));
	p_last = str.find_first_not_of(" \t", p_curr);
	p_curr = str.find_first_of(" \t", p_last);
	}

	return p_last;
	}