Created
January 26, 2022 22:11
-
-
Save benfulton/dbb1be9edd3a023093fb5a6cee3a03af to your computer and use it in GitHub Desktop.
Reverse engineering the R APE package tree reader, to determine how the nodes are numbered
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/* tree_build.c 2020-02-12 */ | |
/* Copyright 2008-2020 Emmanuel Paradis, 2017 Klaus Schliep */ | |
/* This file is part of the R-package `ape'. */ | |
/* See the file ../COPYING for licensing issues. */ | |
//#include <R.h> | |
//#include <Rinternals.h> | |
#include <cmath> | |
#include <string> | |
#include <vector> | |
#include <iostream> | |
using namespace std; | |
struct tree_data | |
{ | |
vector<int> Nnode, edge; | |
vector<double> edge_length; | |
vector<string> node_label; | |
vector<string> tip_label; | |
int nnode = 0; | |
vector<double> root_edge; | |
}; | |
void extract_portion_Newick(string x, int a, int b, char *y) | |
{ | |
int i, j; | |
for (i = a, j = 0; i <= b; i++, j++) y[j] = x[i]; | |
y[j] = '\0'; | |
} | |
void decode_internal_edge(string x, int a, int b, char *lab, double *w) | |
{ | |
int co = a; | |
char *endstr, str[100]; | |
while (x[co] != ':' && co <= b) co++; | |
if (a == co) lab[0] = '\0'; /* if no node label */ | |
else extract_portion_Newick(x, a, co - 1, lab); | |
if (co < b) { | |
extract_portion_Newick(x, co + 1, b, str); | |
*w = strtod(str, &endstr); | |
} else *w = NAN; | |
} | |
void decode_terminal_edge(string x, int a, int b, char *tip, double *w) | |
{ | |
int co = a; | |
char *endstr, str[100]; | |
while (x[co] != ':' && co <= b) co++; | |
extract_portion_Newick(x, a, co - 1, tip); | |
if (co < b) { | |
extract_portion_Newick(x, co + 1, b, str); | |
*w = strtod(str, &endstr); | |
} else *w = NAN; | |
} | |
#define ADD_INTERNAL_EDGE \ | |
phy.edge[j] = curnode; \ | |
phy.edge[j + nedge] = curnode = ++node; \ | |
stack_internal[k++] = j; \ | |
j++ | |
#define GO_DOWN \ | |
decode_internal_edge(x, ps + 1, pt - 1, lab, &tmpd); \ | |
phy.node_label[curnode - 1 - ntip] = lab; \ | |
l = stack_internal[--k]; \ | |
phy.edge_length[l] = tmpd; \ | |
curnode = phy.edge[l] | |
void add_terminal_edge_tiplabel(tree_data& phy, int&curtip, int&j, int curnode, string x, int pr, int ps, char *tip, int nedge) | |
{ | |
double tmpd; | |
phy.edge[j] = curnode; | |
decode_terminal_edge(x, pr + 1, ps - 1, tip, &tmpd); | |
if (phy.tip_label.size() < curtip) phy.tip_label.resize(curtip); | |
phy.tip_label[curtip - 1] = tip; | |
phy.edge[j + nedge] = curtip; | |
phy.edge_length[j] = tmpd; | |
curtip++; | |
j++; | |
} | |
#define ADD_TERMINAL_EDGE_TIPLABEL add_terminal_edge_tiplabel(phy, curtip, j, curnode, x, pr, ps, tip, nedge) | |
tree_data treeBuild(string nwk) | |
{ | |
string x; | |
int n, i, ntip = 1, nleft = 0, nright = 0, nedge, curnode, node, j, nsk = 0, ps, pr, pt, l, k, stack_internal[10000], curtip = 1; | |
double *el, tmpd; | |
char lab[512], tip[512]; | |
tree_data phy; | |
phy.tip_label.resize(1); | |
x = nwk; | |
n = x.size(); | |
vector<int> skeleton(n); | |
for (i = 0; i < n; i++) { | |
if (x[i] == '(') { | |
skeleton[nsk] = i; | |
nsk++; | |
nleft++; | |
continue; | |
} | |
if (x[i] == ',') { | |
skeleton[nsk] = i; | |
nsk++; | |
ntip++; | |
continue; | |
} | |
if (x[i] == ')') { | |
skeleton[nsk] = i; | |
nsk++; | |
nright++; | |
phy.nnode++; | |
} | |
} | |
if (nleft != nright) perror("numbers of left and right parentheses in Newick string not equaln"); | |
nedge = ntip + phy.nnode - 1; | |
phy.Nnode.resize(1); | |
phy.edge.resize(nedge*2); | |
phy.edge_length.resize(nedge); | |
phy.node_label.resize(phy.nnode); | |
phy.Nnode[0] = phy.nnode; | |
curnode = node = ntip + 1; | |
k = j = 0; | |
for (i = 1; i < nsk - 1; i++) { | |
ps = skeleton[i]; | |
if (x[ps] == '(') { | |
ADD_INTERNAL_EDGE; | |
continue; | |
} | |
pr = skeleton[i - 1]; | |
if (x[ps] == ',') { | |
if (x[pr] != ')') { | |
ADD_TERMINAL_EDGE_TIPLABEL; | |
} | |
continue; | |
} | |
if (x[ps] == ')') { | |
pt = skeleton[i + 1]; | |
if (x[pr] == ',') { | |
ADD_TERMINAL_EDGE_TIPLABEL; | |
GO_DOWN; | |
continue; | |
} | |
if (x[pr] == '(') { | |
ADD_TERMINAL_EDGE_TIPLABEL; | |
GO_DOWN; | |
continue; | |
} | |
if (x[pr] == ')') { | |
GO_DOWN; | |
} | |
} | |
} | |
pr = skeleton[nsk - 2]; | |
ps = skeleton[nsk - 1]; | |
if (x[pr] == ',' && x[ps] == ')') { | |
ADD_TERMINAL_EDGE_TIPLABEL; | |
} | |
if (ps < n - 2) { | |
i = ps + 1; | |
while (i < n - 2 && x[i] != ':') i++; | |
if (i < n - 2) { | |
decode_internal_edge(x, ps + 1, n - 2, lab, &tmpd); | |
phy.root_edge.push_back(tmpd); | |
phy.node_label[0] = lab; | |
} else { | |
extract_portion_Newick(x, ps + 1, n - 2, lab); | |
phy.node_label[0] = lab; | |
} | |
} | |
return phy; | |
} | |
int main() | |
{ | |
// auto t = treeBuild("(A:1,B:1):1"); | |
// auto t = treeBuild("((((cat:68.710687,horse:68.710687):4.566771,cow:73.277458):20.722542,(((((chimp:4.444178,human:4.444178):6.682660,orang:11.126837):2.285866,gibbon:13.412704):7.211528,(macaque:4.567239,baboon:4.567239):16.056993):16.060691,marmoset:36.684923):57.315077)mancat:38.738115,(rat:36.302467,mouse:36.302467):96.435648)"); | |
auto t = treeBuild("((Parasteatoda_tepidariorum<285>_1:138.531,Stegodyphus_mimosarum<284>_1:138.531)<297>_1:187.093,Centruroides_sculpturatus<296>_3:325.625)<313>_1:46.2175;"); | |
cout << "Tips: " << t.tip_label.size() << endl; | |
return 0; | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment