Last active
April 29, 2020 21:26
-
-
Save izaak-coleman/b46cdb65030e9165ad22323e141e5937 to your computer and use it in GitHub Desktop.
TEST: cqf_memory.cpp
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <cassert>
#include <cstdint>
#include <cstdlib>
#include <fstream>
#include <iostream>
#include <set>
#include <string>
#include <utility>
#include <vector>

#include "gqf_cpp.h"
/**
 * @brief Read a list of sample file paths from a text file.
 *
 * Every non-empty line of the list is taken as one path. A trailing carriage
 * return (left over from CRLF line endings) is stripped and blank lines are
 * skipped, so no empty or corrupted path is handed on to the caller.
 *
 * @param file_list Path of a text file holding one sample file path per line.
 * @return The paths in file order. Empty if the list cannot be opened, in
 *         which case a warning is written to std::cerr.
 */
std::vector<std::string> read_file_list(std::string file_list) {
  std::vector<std::string> file_paths;

  std::ifstream fin(file_list);
  if (!fin.is_open()) {
    // Keep the old "empty result" contract, but do not fail silently.
    std::cerr << "Could not open file list: " << file_list << '\n';
    return file_paths;
  }

  std::string line;
  while (std::getline(fin, line)) {
    if (!line.empty() && line.back() == '\r') {
      line.pop_back();  // tolerate lists written with Windows line endings
    }
    if (line.empty()) {
      continue;  // a blank line is not a path
    }
    file_paths.push_back(line);
  }
  return file_paths;  // fin is closed by its destructor
}
/**
 * @brief Build one k-mer set per sample file and append it to
 *        @p list_kmer_sets.
 *
 * Each file named in @p file_list is read line by line and every non-blank
 * line is parsed with std::stoull as an unsigned 64-bit k-mer. Exactly one set
 * is appended per entry of @p file_list, in the same order. A file that cannot
 * be opened contributes an empty set and a warning on std::cerr.
 *
 * @param file_list      Paths of the sample files, one k-mer per line each.
 * @param list_kmer_sets Output vector the per-sample sets are appended to.
 * @throws std::invalid_argument if a non-blank line does not start with a
 *         number (propagated from std::stoull).
 * @throws std::out_of_range if a value does not fit the parsed integer type
 *         (propagated from std::stoull).
 */
void load_kmer_sets(
    std::vector<std::string> &file_list,
    std::vector<std::set<uint64_t>> &list_kmer_sets
) {
  // Loop through samples
  for (const std::string &file : file_list) {
    std::ifstream fin(file);
    if (!fin.is_open()) {
      std::cerr << "Could not open k-mer file: " << file << '\n';
    }

    std::set<uint64_t> kmer_set;
    std::string line;
    while (std::getline(fin, line)) {
      // std::stoull throws on input without digits; a blank (or CR-only)
      // line, e.g. at the end of the file, is not an error.
      if (line.find_first_not_of(" \t\r") == std::string::npos) {
        continue;
      }
      kmer_set.insert(std::stoull(line));
    }

    // Move instead of copy: a sample set can hold a very large number of
    // nodes and is not used again after this point.
    list_kmer_sets.push_back(std::move(kmer_set));
  }
}
/** | |
* @brief Returns a vector of sets of kmers. | |
* Each set stores the kmers for one sample | |
* in_file is a text file with filenames for each sample | |
*/ | |
void loading_list_of_kmer_sets(std::vector<std::set<uint64_t>> &list_kmer_sets) { | |
std::string in_file = "api_fastq.lst"; | |
std::vector<std::string> file_list = read_file_list(in_file); | |
load_kmer_sets(file_list, list_kmer_sets); | |
} | |
/**
 * @brief Append three synthetic k-mer sets to @p list_kmer_sets.
 *
 * Despite the name nothing here is random: each set is the run of
 * @p kmers_per_sample consecutive integers beginning at one of three fixed
 * start values, so the generated data is identical on every run.
 *
 * @param list_kmer_sets   Output vector; one set per start value is appended.
 * @param kmers_per_sample Number of consecutive k-mers generated per set.
 *                         Defaults to 700000, the value that used to be
 *                         hard-coded here.
 */
void random_loading_list_of_kmer_sets(std::vector<std::set<uint64_t>> &list_kmer_sets,
                                      uint64_t kmers_per_sample = 700000) {
  const uint64_t start_vals[] = {3146833986914269ULL,
                                 3866702768502618ULL,
                                 925108359212988ULL};

  // Iterate over the start values themselves so the number of samples can
  // never drift out of sync with the table above.
  for (const uint64_t start : start_vals) {
    std::set<uint64_t> kmer_set;
    for (uint64_t k = 0; k < kmers_per_sample; ++k) {
      // Keys are produced in ascending order, so end() is always the correct
      // insertion hint and each insert is amortized constant time.
      kmer_set.insert(kmer_set.end(), start + k);
    }
    // Move the finished set into the list instead of deep-copying it.
    list_kmer_sets.push_back(std::move(kmer_set));
  }
}
/** | |
* @brief Insert key and eq_id into cqf | |
*/ | |
int insert(uint64_t key, uint64_t eq_id, CQF<KeyObject> &cqf) { | |
// Make sure the kmer has never been inserted before | |
uint64_t count = cqf.query(KeyObject(key,0,0), QF_NO_LOCK |QF_KEY_IS_HASH); | |
if (count > 0){ | |
std::cout << "K-mer was already present. kmer: " << key << " eqid: " << count << std::endl;; | |
exit(1); | |
} | |
int ret = cqf.insert(KeyObject(key,0,eq_id), QF_NO_LOCK | QF_KEY_IS_HASH); | |
return ret; | |
} | |
/** | |
* @brief Insert kmers into cqf | |
* @param list_kmer_sets List of kmers from multiple samples | |
*/ | |
void build( | |
CQF<KeyObject> &cqf, | |
std::vector<std::set<uint64_t>> &list_kmer_sets | |
) { | |
// Load all the kmers into all_kmers | |
std::set<uint64_t> all_kmers; | |
for (std::set<uint64_t> sq: list_kmer_sets){ | |
all_kmers.insert(sq.begin(), sq.end()); | |
} | |
for (std::set<uint64_t>::iterator kmer_it = all_kmers.begin(); kmer_it != all_kmers.end(); kmer_it++) { | |
uint64_t key = *kmer_it; | |
uint64_t eq_id = 2; | |
int ret = insert(key, eq_id, cqf); | |
} | |
} | |
int main() { | |
std::cout << "CQF Memory Test" << std::endl; | |
// ------------------------------------------------------------------------ | |
// Option 1: Before CQF | |
// Load kmers from multiple samples | |
// -- Program Crashes -- | |
// ------------------------------------------------------------------------ | |
// std::vector<std::set<uint64_t>> list_kmer_sets; | |
// loading_list_of_kmer_sets(list_kmer_sets); | |
// ------------------------------------------------------------------------ | |
// ------------------------------------------------------------------------ | |
// Initialization CQF | |
// ------------------------------------------------------------------------ | |
uint64_t q_bits = 20; | |
uint64_t key_remainder_bits = 62-q_bits; | |
uint64_t key_bits = q_bits + key_remainder_bits; | |
enum qf_hashmode hashmode = QF_HASH_INVERTIBLE; | |
CQF<KeyObject> cqf(q_bits, key_bits, hashmode, SEED); | |
cqf.set_auto_resize(); | |
uint64_t initial_nslot = 1ULL << q_bits; | |
assert(cqf.numslots() == initial_nslot); | |
// ------------------------------------------------------------------------ | |
// Option 2: After CQF | |
// Load kmers from multiple samples | |
// -- Program Successfully Completes -- | |
// ------------------------------------------------------------------------ | |
std::vector<std::set<uint64_t>> list_kmer_sets; | |
loading_list_of_kmer_sets(list_kmer_sets); | |
// ------------------------------------------------------------------------ | |
// ------------------------------------------------------------------------ | |
// Option 3: After CQF | |
// Load hard-coded kmers | |
// -- Program Crashes After Resizing Once -- | |
// ------------------------------------------------------------------------ | |
// std::vector<std::set<uint64_t>> list_kmer_sets; | |
// random_loading_list_of_kmer_sets(list_kmer_sets); | |
// ------------------------------------------------------------------------ | |
build(cqf, list_kmer_sets); | |
std::cout << "Total insertions in CQF: " << cqf.dist_elts() << std::endl; | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment