Skip to content

Instantly share code, notes, and snippets.

@pmenzel
Created July 16, 2018 18:23
Show Gist options
  • Save pmenzel/af5257fa272e3dddf3a369406c64de41 to your computer and use it in GitHub Desktop.
Save pmenzel/af5257fa272e3dddf3a369406c64de41 to your computer and use it in GitHub Desktop.
calculate Least Common Ancestor from NCBI taxon ids
/* lca.cpp, 2018, Peter Menzel
1. Download https://github.com/bioinformatics-centre/kaiju/ and compile
2. Copy lca.cpp to kaiju/src
3. Compile lca.cpp with:
g++ -O3 -std=c++11 -I./include/ncbi-blast+ -o lca lca.cpp Config.o util.o bwt/bwt.o bwt/compactfmi.o bwt/sequence.o bwt/suffixArray.o include/ncbi-blast+/algo/blast/core/blast_seg.o include/ncbi-blast+/algo/blast/core/blast_util.o include/ncbi-blast+/algo/blast/core/blast_filter.o include/ncbi-blast+/algo/blast/core/ncbi_std.o include/ncbi-blast+/algo/blast/core/blast_program.o include/ncbi-blast+/algo/blast/core/blast_encoding.o include/ncbi-blast+/algo/blast/core/blast_query_info.o include/ncbi-blast+/algo/blast/core/blast_stat.o include/ncbi-blast+/algo/blast/core/blast_options.o include/ncbi-blast+/algo/blast/core/blast_message.o include/ncbi-blast+/algo/blast/core/ncbi_math.o include/ncbi-blast+/algo/blast/core/pattern.o include/ncbi-blast+/algo/blast/core/blast_psi_priv.o include/ncbi-blast+/algo/blast/core/blast_dynarray.o include/ncbi-blast+/algo/blast/core/matrix_freq_ratios.o include/ncbi-blast+/algo/blast/core/blast_posit.o include/ncbi-blast+/util/tables/raw_scoremat.o include/ncbi-blast+/algo/blast/composition_adjustment/matrix_frequency_data.o
4. ./lca -t nodes.dmp -i input.tsv -o output.txt
*/
#include <stdint.h>
#include <unistd.h>
#include <stdio.h>
#include <stdlib.h>
#include <string>
#include <stdexcept>
#include <iostream>
#include <sstream>
#include <fstream>
#include <unordered_map>
#include <vector>
#include <unordered_set>
#include <climits>
#include "Config.hpp"
#include "version.hpp"
#include "util.hpp"
void usage(char *progname);
int main(int argc, char **argv) {
Config * config = new Config();
std::unordered_map<uint64_t,uint64_t> * nodes = new std::unordered_map<uint64_t,uint64_t>();
std::unordered_map<uint64_t,unsigned int> node2depth;
std::string nodes_filename;
std::string in_filename;
std::string out_filename;
bool verbose = false;
bool debug = false;
// --------------------- START ------------------------------------------------------------------
// Read command line params
int c;
while ((c = getopt(argc, argv, "ahdvrl:g:t:i:o:")) != -1) {
switch (c) {
case 'h':
usage(argv[0]);
case 'd':
debug = true; break;
case 'v':
verbose = true; break;
case 'i':
in_filename = optarg; break;
case 't':
nodes_filename = optarg; break;
case 'o':
out_filename = optarg; break;
default:
usage(argv[0]);
}
}
if(nodes_filename.length() == 0) { error("Please specify the location of the nodes.dmp file, using the -t option."); usage(argv[0]); }
if(in_filename.length() == 0) { error("Please specify the location of the input file, using the -i option."); usage(argv[0]); }
if(out_filename.length() == 0) { error("Please specify the name of the output file, using the -o option."); usage(argv[0]); }
config->nodes = nodes;
config->debug = debug;
config->verbose = verbose;
std::ifstream nodes_file;
nodes_file.open(nodes_filename.c_str());
if(!nodes_file.is_open()) { error("Could not open file " + nodes_filename); exit(EXIT_FAILURE); }
parseNodesDmp(*nodes,nodes_file);
nodes_file.close();
std::ifstream inputfile;
inputfile.open(in_filename);
if(!inputfile.is_open()) { error("Could not open file " + in_filename); exit(EXIT_FAILURE); }
std::ofstream out_file;
out_file.open(out_filename);
if(!out_file.is_open()) { error("Could not open file " + out_filename + " for writing."); exit(EXIT_FAILURE); }
std::set<uint64_t> ids;
std::string line;
while(getline(inputfile, line)){
if(line.length() == 0) { continue; }
line += "\t";
std::string curr_id;
ids.clear();
if(debug) std::cerr << "Processing line: " << line << std::endl;
size_t start = 0, end = 0;
while((end = line.find_first_of("\t",start)) != std::string::npos) {
std::string curr_id = line.substr(start, end - start);
if(debug) std::cerr << "Start: " << start << " End: " << end << std::endl;
if(debug) std::cerr << "Current field: " << ">" << curr_id << "<" << std::endl;
uint64_t id;
try {
id = stoul(curr_id);
}
catch(const std::invalid_argument& ia) {
std::cerr << "Error: Bad number in taxon id " << curr_id << std::endl;
break;
}
catch (const std::out_of_range& oor) {
std::cerr << "Error: Bad number (out of range error) in taxon id " << curr_id << std::endl;
break;
}
if(nodes->count(id)>0) {
ids.insert(id);
}
else {
std::cerr << "Taxon ID " << id << " not found in nodes.dmp" << std::endl;
}
start = end+1;
}
if(!ids.empty()) {
uint64_t lca = (ids.size()==1) ? *(ids.begin()) : lca_from_ids(config, node2depth, ids);
if(debug) std::cerr << "LCA=" << lca << std::endl;
out_file << lca << "\n";
}
}
inputfile.close();
out_file.close();
}
void usage(char *progname) {
fprintf(stderr, "Usage:\n %s -t nodes.dmp -i input.tsv -o output.txt\n", progname);
fprintf(stderr, "Mandatory arguments:\n");
fprintf(stderr, " -t FILENAME Name of nodes.dmp file.\n");
fprintf(stderr, " -i FILENAME Name of tab-delimited input file.\n");
fprintf(stderr, " -o FILENAME Name of output file.\n");
exit(EXIT_FAILURE);
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment