Skip to content

Instantly share code, notes, and snippets.

@izaak-coleman
Last active April 29, 2020 21:26
Show Gist options
  • Save izaak-coleman/b46cdb65030e9165ad22323e141e5937 to your computer and use it in GitHub Desktop.
Save izaak-coleman/b46cdb65030e9165ad22323e141e5937 to your computer and use it in GitHub Desktop.
TEST: cqf_memory.cpp
#include <cassert>
#include <cstdint>
#include <cstdlib>
#include <fstream>
#include <iostream>
#include <set>
#include <string>
#include <utility>
#include <vector>
#include "gqf_cpp.h"
/**
 * @brief Get a list of sample files.
 *
 * Reads @p file_list line by line. A trailing carriage return (left behind
 * when the list was written with CRLF line endings) is stripped, and lines
 * that are empty afterwards are skipped so they never turn into empty paths.
 *
 * @param file_list path of a text file that has one file path per line
 * @return the file paths in the order they appear in the list; the vector is
 *         empty when the list cannot be opened or holds no paths
 */
std::vector<std::string> read_file_list(std::string file_list) {
  std::vector<std::string> file_paths;
  std::ifstream fin(file_list);
  std::string line;
  while (std::getline(fin, line)) {
    // std::getline only removes '\n'; drop the '\r' of a CRLF line ending.
    if (!line.empty() && line.back() == '\r') {
      line.pop_back();
    }
    // A blank line is not a path; keeping it would later be treated as a
    // sample file named "".
    if (line.empty()) {
      continue;
    }
    file_paths.push_back(line);
  }
  // fin is closed by its destructor.
  return file_paths;
}
/**
 * @brief Create a set of kmers for each sample in @p file_list and append the
 * sets, in the same order, to @p list_kmer_sets.
 *
 * Each sample file is read as text with one unsigned integer kmer per line.
 * Blank lines (including a lone carriage return) are skipped. A sample file
 * that cannot be opened still contributes an (empty) set, so positions in
 * @p list_kmer_sets keep matching positions in @p file_list; a warning is
 * written to std::cerr in that case.
 *
 * @param file_list      paths of the sample files
 * @param list_kmer_sets output: one set is appended per entry of @p file_list
 * @throws std::invalid_argument / std::out_of_range (from std::stoull) when a
 *         non-blank line does not start with a number that fits in 64 bits
 */
void load_kmer_sets(
    std::vector<std::string> &file_list,
    std::vector<std::set<uint64_t>> &list_kmer_sets
) {
  list_kmer_sets.reserve(list_kmer_sets.size() + file_list.size());
  // Loop through samples (by reference: no per-sample copy of the path)
  for (const std::string &file : file_list) {
    std::ifstream fin(file);
    if (!fin) {
      std::cerr << "Warning: could not open kmer file: " << file << '\n';
    }
    std::set<uint64_t> kmer_set;
    std::string line;
    while (std::getline(fin, line)) {
      // std::stoull throws on a line without digits, so skip blank lines.
      if (line.find_first_not_of(" \t\r") == std::string::npos) {
        continue;
      }
      kmer_set.insert(std::stoull(line));
    }
    // Move instead of copy: the local set is not used after this point.
    list_kmer_sets.push_back(std::move(kmer_set));
  }
}
/**
* @brief Returns a vector of sets of kmers.
* Each set stores the kmers for one sample
* in_file is a text file with filenames for each sample
*/
void loading_list_of_kmer_sets(std::vector<std::set<uint64_t>> &list_kmer_sets) {
std::string in_file = "api_fastq.lst";
std::vector<std::string> file_list = read_file_list(in_file);
load_kmer_sets(file_list, list_kmer_sets);
}
/**
 * @brief Append synthetic kmer sets to @p list_kmer_sets.
 *
 * Despite the name, the kmers are deterministic: for each hard-coded start
 * value one set holding the consecutive values
 * [start, start + kmers_per_sample) is appended. One set is produced per
 * start value (three in total).
 *
 * @param list_kmer_sets   output: one set is appended per start value
 * @param kmers_per_sample number of consecutive kmers per set (default 700000)
 */
void random_loading_list_of_kmer_sets(
    std::vector<std::set<uint64_t>> &list_kmer_sets,
    uint64_t kmers_per_sample = 700000) {
  const std::vector<uint64_t> start_vals = {3146833986914269,
                                            3866702768502618,
                                            925108359212988};
  list_kmer_sets.reserve(list_kmer_sets.size() + start_vals.size());
  // Iterate the start values themselves so the sample count can never drift
  // away from the table above.
  for (const uint64_t start : start_vals) {
    // Build the set in place to avoid copying it into the vector afterwards.
    list_kmer_sets.emplace_back();
    std::set<uint64_t> &kmer_set = list_kmer_sets.back();
    for (uint64_t k = 0; k < kmers_per_sample; ++k) {
      // Values are generated in increasing order, so end() is a good hint.
      kmer_set.insert(kmer_set.end(), start + k);
    }
  }
}
/**
 * @brief Insert key and eq_id into cqf, refusing keys that are already there.
 *
 * The cqf is queried for @p key first. If the query result is non-zero the
 * key is reported on std::cout as already present and the process exits with
 * status 1. Otherwise the key is inserted with @p eq_id passed as the third
 * KeyObject argument.
 *
 * @param key   kmer to insert (passed with QF_KEY_IS_HASH)
 * @param eq_id value stored alongside the key
 * @param cqf   filter to insert into
 * @return the value returned by cqf.insert()
 */
int insert(uint64_t key, uint64_t eq_id, CQF<KeyObject> &cqf) {
  // Guard: the kmer must never have been inserted before.
  const uint64_t existing = cqf.query(KeyObject(key, 0, 0), QF_NO_LOCK | QF_KEY_IS_HASH);
  if (existing != 0) {
    std::cout << "K-mer was already present. kmer: " << key << " eqid: " << existing << std::endl;
    exit(1);
  }
  return cqf.insert(KeyObject(key, 0, eq_id), QF_NO_LOCK | QF_KEY_IS_HASH);
}
/**
 * @brief Insert kmers into cqf
 *
 * The kmers of all samples are merged into one ordered, duplicate-free set
 * and every distinct kmer is inserted once, in ascending order, with the
 * fixed eq_id 2. If an insertion reports a negative return code, the failure
 * is written to std::cerr and the process exits with status 1.
 *
 * @param cqf            filter that receives the kmers
 * @param list_kmer_sets List of kmers from multiple samples
 */
void build(
    CQF<KeyObject> &cqf,
    std::vector<std::set<uint64_t>> &list_kmer_sets
) {
  // Load all the kmers into all_kmers (iterate by reference so the
  // per-sample sets are not copied).
  std::set<uint64_t> all_kmers;
  for (const std::set<uint64_t> &sample_kmers : list_kmer_sets) {
    all_kmers.insert(sample_kmers.begin(), sample_kmers.end());
  }
  const uint64_t eq_id = 2;
  for (const uint64_t key : all_kmers) {
    const int ret = insert(key, eq_id, cqf);
    // NOTE(review): assumes negative return codes signal an insertion
    // failure, as documented for qf_insert in gqf.h -- confirm against the
    // CQF version in use.
    if (ret < 0) {
      std::cerr << "CQF insertion failed. kmer: " << key << " ret: " << ret << std::endl;
      exit(1);
    }
  }
}
/**
 * @brief Driver for the CQF memory experiment.
 *
 * Constructs an auto-resizing CQF, loads the per-sample kmer sets, inserts
 * every distinct kmer and prints the number of distinct elements. The three
 * "options" below differ only in where/how the kmer sets are produced; the
 * outcome recorded for each one is noted next to it.
 */
int main() {
  std::cout << "CQF Memory Test" << std::endl;

  // ------------------------------------------------------------------------
  // Option 1 (disabled): load kmers from multiple samples BEFORE the CQF
  // is created.
  // Recorded outcome: program crashes.
  // ------------------------------------------------------------------------
  // std::vector<std::set<uint64_t>> list_kmer_sets;
  // loading_list_of_kmer_sets(list_kmer_sets);

  // ------------------------------------------------------------------------
  // CQF initialization: q_bits = 20 and 62-bit keys, so the filter is
  // expected to start out with 2^20 slots (checked by the assert).
  // ------------------------------------------------------------------------
  uint64_t q_bits = 20;
  uint64_t key_remainder_bits = 62 - q_bits;
  uint64_t key_bits = q_bits + key_remainder_bits;
  CQF<KeyObject> cqf(q_bits, key_bits, QF_HASH_INVERTIBLE, SEED);
  cqf.set_auto_resize();
  assert(cqf.numslots() == (1ULL << q_bits));

  // ------------------------------------------------------------------------
  // Option 2 (active): load kmers from multiple samples AFTER the CQF is
  // created.
  // Recorded outcome: program successfully completes.
  // ------------------------------------------------------------------------
  std::vector<std::set<uint64_t>> list_kmer_sets;
  loading_list_of_kmer_sets(list_kmer_sets);

  // ------------------------------------------------------------------------
  // Option 3 (disabled): load hard-coded kmers AFTER the CQF is created.
  // Recorded outcome: program crashes after resizing once.
  // ------------------------------------------------------------------------
  // std::vector<std::set<uint64_t>> list_kmer_sets;
  // random_loading_list_of_kmer_sets(list_kmer_sets);

  build(cqf, list_kmer_sets);
  std::cout << "Total insertions in CQF: " << cqf.dist_elts() << std::endl;
  return 0;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment