Created
August 7, 2017 08:31
-
-
Save alienzj/6125cba353995ca2fb328c01a4d22cc4 to your computer and use it in GitHub Desktop.
Overall Accurate
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#pragma once | |
#include <boost/algorithm/string.hpp> | |
#include <boost/algorithm/string_regex.hpp> | |
#include <boost/iostreams/filtering_streambuf.hpp> | |
#include <boost/iostreams/filter/gzip.hpp> | |
#include <boost/iostreams/copy.hpp> | |
#include <boost/program_options.hpp> | |
#include <boost/regex.hpp> | |
#include <iostream> | |
#include <fstream> | |
#include <sstream> | |
#include <string> | |
#include <vector> | |
#include <tuple> | |
#include <cmath> | |
namespace bio | |
{ | |
std::tuple<double, int, int, int> | |
calculate_cutoff(std::string& qual, double seed_oa_cutoff, double frag_oa_cutoff) | |
{ | |
int qual_system = 33; | |
double min = 0.0; | |
double oa_0 = 1.0, oa_1 = 1.0, ca = 1.0; | |
int position = 0; | |
std::vector<int> qual_value_vec; | |
std::vector<double> correct_rate_vec; | |
for (auto& qual_base : qual) | |
{ | |
int qual_value = static_cast<int>(qual_base) - qual_system; | |
qual_value_vec.push_back(qual_value); | |
double correct_rate = 1 - std::pow(10, -qual_value / 10); | |
correct_rate = (correct_rate == 0) ? 0.1 : correct_rate; | |
correct_rate_vec.push_back(correct_rate); | |
//std::cout << qual_base << " : " << qual_value << " : " << correct_rate << "\n"; | |
} | |
std::vector<double> seed_oa(qual.size() - 4, 1); | |
int i = 0; | |
while (i < 5 ) | |
{ | |
seed_oa[0] *= correct_rate_vec[i]; | |
++i; | |
} | |
while (i < qual.size()) | |
{ | |
seed_oa[i - 4] = seed_oa[i - 5] * correct_rate_vec[i] / correct_rate_vec[i - 5]; | |
position = (seed_oa[i - 4] > seed_oa[position]) ? i - 4 : position; | |
++i; | |
if (seed_oa[position] > seed_oa_cutoff) | |
break; | |
} | |
//std::cout << position << std::endl; | |
i = position + 5; | |
ca = seed_oa[position]; | |
while (i < qual.size()) | |
{ | |
double acc = correct_rate_vec[i]; | |
if (acc < min) | |
{ | |
oa_1 = ca * min; | |
min = acc; | |
} | |
else | |
{ | |
oa_1 = ca * acc; | |
} | |
if (oa_1 < frag_oa_cutoff) break; | |
++i; | |
ca = oa_1; | |
} | |
int qual_sum = 0; | |
for (int j = position; j < i; ++j) | |
qual_sum += qual_value_vec[j]; | |
i = i - position; | |
return { ca, qual_sum, position, i }; | |
} | |
int filter_se_reads(const std::string& fq_path, | |
const int min_len, | |
double seed_oa_cutoff, | |
double frag_oa_cutoff) | |
{ | |
std::stringstream fastq_str; | |
std::ifstream fq_file(fq_path, std::ios_base::in | std::ios_base::binary); | |
try | |
{ | |
boost::iostreams::filtering_istreambuf in; | |
in.push(boost::iostreams::gzip_decompressor()); | |
in.push(fq_file); | |
boost::iostreams::copy(in, fastq_str); | |
} | |
catch (const boost::iostreams::gzip_error& e) | |
{ | |
std::cerr << e.what() << "\n"; | |
} | |
const std::string fq_path_filter = boost::algorithm::replace_last_copy(fq_path, "fq", "clean.fq"); | |
std::ofstream fq_file_filter(fq_path_filter, std::ios_base::out | std::ios_base::binary); | |
boost::iostreams::filtering_ostreambuf out; | |
out.push(boost::iostreams::gzip_compressor()); | |
out.push(fq_file_filter); | |
std::string line; | |
while (std::getline(fastq_str, line)) | |
{ | |
std::vector<std::string> header; | |
boost::algorithm::split_regex(header, line, boost::regex("\\s+")); | |
std::string reads_id = header[0]; | |
std::getline(fastq_str, line); | |
std::string reads_seq = line; | |
std::getline(fastq_str, line); | |
std::string reads_direction = line; | |
std::getline(fastq_str, line); | |
std::string reads_qual = line; | |
//std::cout << reads_id << "\n"; | |
//std::cout << reads_direction << "\n"; | |
//std::cout << reads_seq << "\n"; | |
//std::cout << reads_qual << "\n"; | |
double ca; | |
int qual_sum, start, len; | |
std::tie(ca, qual_sum, start, len) = bio::calculate_cutoff(reads_qual, seed_oa_cutoff, frag_oa_cutoff); | |
if (len >= min_len && ca >= frag_oa_cutoff) | |
{ | |
std::string seq_filter = reads_seq.substr(len, start); | |
std::string qual_filter = reads_qual.substr(len, start); | |
std::ostringstream reads_filter; | |
reads_filter | |
<< reads_id << " length=" << len << "\n" | |
<< seq_filter << "\n" | |
<< reads_direction << "\n" | |
<< qual_filter << "\n"; | |
try | |
{ | |
boost::iostreams::copy(out, reads_filter); | |
} | |
catch (const boost::iostreams::gzip_error& e) | |
{ | |
std::cerr << e.what() << "\n"; | |
} | |
return 0; | |
} | |
} | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment