Skip to content

Instantly share code, notes, and snippets.

@bschiffthaler
Created November 1, 2019 07:52
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save bschiffthaler/eace15d8d3eb63aaf782c09616a5a274 to your computer and use it in GitHub Desktop.
Save bschiffthaler/eace15d8d3eb63aaf782c09616a5a274 to your computer and use it in GitHub Desktop.
#include <string>
#include <cstring>
#include <iostream>
#include <fstream>
#include <chrono>
#include <vector>
#include <chrono>
#include <cmath>
#include <jellyfish/err.hpp>
#include <jellyfish/misc.hpp>
#include <jellyfish/mer_heap.hpp>
#include <jellyfish/jellyfish.hpp>
#include <jellyfish/rectangular_binary_matrix.hpp>
#include <jellyfish/cpp_array.hpp>
#include "robin_hood.h"
#define XXH_INLINE_ALL
#include "xxhash.h"
#define BUFSIZE 1000000000
#define K 17
#define GENOME 480000000
class kmer {
public:
kmer() { }
kmer(std::string const && str) { data = str; }
bool operator==(kmer const & other) const {
return data == other.data;
}
std::string data;
};
class fasthash_k {
public:
XXH64_hash_t operator()(kmer const & km) const
{
return XXH64(&km.data[0], K, 31415);
}
};
void add_to_set(std::string const & f,
robin_hood::unordered_map<kmer, XXH64_hash_t, fasthash_k>& set,
uint64_t& index,
uint64_t& cov,
std::vector<jellyfish::mer_dna>& m_vec,
std::vector<uint64_t>& c_vec)
{
std::ifstream in_stream(f.c_str());
jellyfish::file_header header(in_stream);
jellyfish::mer_dna::k(header.key_len() / 2);
binary_reader reader(in_stream, &header);
uint64_t lo_pass = (((cov / GENOME) + 1) / 4);
lo_pass = lo_pass < 1 ? 1 : lo_pass;
while (reader.next())
{
m_vec.push_back(reader.key());
c_vec.push_back(reader.val());
}
for (uint64_t i = 0; i < m_vec.size(); i++)
{
uint64_t const & val = c_vec[i];
if (val > lo_pass)
{
set[m_vec[i].to_str()]++;
}
}
}
uint64_t read_cov(std::string const & x)
{
std::ifstream ifs(x.c_str());
uint64_t ret;
ifs >> ret;
return ret;
}
int main(int argc, char ** argv)
{
robin_hood::unordered_map<kmer, XXH64_hash_t, fasthash_k> set;
set.reserve(BUFSIZE);
std::vector<jellyfish::mer_dna> m_vec; // k-mers
std::vector<uint64_t> c_vec; // k-counts
m_vec.reserve(BUFSIZE);
c_vec.reserve(BUFSIZE);
uint64_t index = 0;
for (int i = 1; i < argc; i++)
{
m_vec.clear();
c_vec.clear();
std::string cov_path(argv[i]);
cov_path += ".cov";
uint64_t cov = read_cov(cov_path);
add_to_set(std::string(argv[i]), set, index, cov, m_vec, c_vec);
index++;
}
double nsamples = static_cast<double>(argc);
uint64_t cutoff = std::ceil(nsamples / 5);
for (auto const & entry : set)
{
if (entry.second >= cutoff)
{
std::cout << entry.first.data << '\n';
}
}
return 0;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment