-
-
Save bschiffthaler/eace15d8d3eb63aaf782c09616a5a274 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <string> | |
#include <cstring> | |
#include <iostream> | |
#include <fstream> | |
#include <chrono> | |
#include <vector> | |
#include <chrono> | |
#include <cmath> | |
#include <jellyfish/err.hpp> | |
#include <jellyfish/misc.hpp> | |
#include <jellyfish/mer_heap.hpp> | |
#include <jellyfish/jellyfish.hpp> | |
#include <jellyfish/rectangular_binary_matrix.hpp> | |
#include <jellyfish/cpp_array.hpp> | |
#include "robin_hood.h" | |
#define XXH_INLINE_ALL | |
#include "xxhash.h" | |
#define BUFSIZE 1000000000 | |
#define K 17 | |
#define GENOME 480000000 | |
class kmer { | |
public: | |
kmer() { } | |
kmer(std::string const && str) { data = str; } | |
bool operator==(kmer const & other) const { | |
return data == other.data; | |
} | |
std::string data; | |
}; | |
class fasthash_k { | |
public: | |
XXH64_hash_t operator()(kmer const & km) const | |
{ | |
return XXH64(&km.data[0], K, 31415); | |
} | |
}; | |
void add_to_set(std::string const & f, | |
robin_hood::unordered_map<kmer, XXH64_hash_t, fasthash_k>& set, | |
uint64_t& index, | |
uint64_t& cov, | |
std::vector<jellyfish::mer_dna>& m_vec, | |
std::vector<uint64_t>& c_vec) | |
{ | |
std::ifstream in_stream(f.c_str()); | |
jellyfish::file_header header(in_stream); | |
jellyfish::mer_dna::k(header.key_len() / 2); | |
binary_reader reader(in_stream, &header); | |
uint64_t lo_pass = (((cov / GENOME) + 1) / 4); | |
lo_pass = lo_pass < 1 ? 1 : lo_pass; | |
while (reader.next()) | |
{ | |
m_vec.push_back(reader.key()); | |
c_vec.push_back(reader.val()); | |
} | |
for (uint64_t i = 0; i < m_vec.size(); i++) | |
{ | |
uint64_t const & val = c_vec[i]; | |
if (val > lo_pass) | |
{ | |
set[m_vec[i].to_str()]++; | |
} | |
} | |
} | |
uint64_t read_cov(std::string const & x) | |
{ | |
std::ifstream ifs(x.c_str()); | |
uint64_t ret; | |
ifs >> ret; | |
return ret; | |
} | |
int main(int argc, char ** argv) | |
{ | |
robin_hood::unordered_map<kmer, XXH64_hash_t, fasthash_k> set; | |
set.reserve(BUFSIZE); | |
std::vector<jellyfish::mer_dna> m_vec; // k-mers | |
std::vector<uint64_t> c_vec; // k-counts | |
m_vec.reserve(BUFSIZE); | |
c_vec.reserve(BUFSIZE); | |
uint64_t index = 0; | |
for (int i = 1; i < argc; i++) | |
{ | |
m_vec.clear(); | |
c_vec.clear(); | |
std::string cov_path(argv[i]); | |
cov_path += ".cov"; | |
uint64_t cov = read_cov(cov_path); | |
add_to_set(std::string(argv[i]), set, index, cov, m_vec, c_vec); | |
index++; | |
} | |
double nsamples = static_cast<double>(argc); | |
uint64_t cutoff = std::ceil(nsamples / 5); | |
for (auto const & entry : set) | |
{ | |
if (entry.second >= cutoff) | |
{ | |
std::cout << entry.first.data << '\n'; | |
} | |
} | |
return 0; | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment