Skip to content

Instantly share code, notes, and snippets.

@therohk
Last active May 29, 2021 09:53
Show Gist options
  • Save therohk/664fe94755dab40edd2b76046fee5818 to your computer and use it in GitHub Desktop.
Save therohk/664fe94755dab40edd2b76046fee5818 to your computer and use it in GitHub Desktop.
Code for converting the Tree of Life from its XML source to a CSV dataset using jsoup
/**
 * Converts the Tree of Life skeletal XML dump into two comma-delimited datasets.
 *
 * <p>Every {@code node} element that carries an {@code id} attribute produces one row in the
 * link set (enclosing node id, node id) and one row in the node set (id, name and the node's
 * {@code childcount}, {@code leaf}, {@code haspage}, {@code extinct}, {@code confidence} and
 * {@code phylesis} attributes). The link set is handed to {@code StorageUtils.flushDataSet}
 * as {@code treeoflife_links.csv} and the node set as {@code treeoflife_nodes.csv}.
 *
 * <p>If reading the XML fails with an {@link IOException}, the error is reported on stderr and
 * nothing is flushed, so output from an earlier successful run is not replaced by empty data.
 *
 * @param args command-line arguments; not used
 */
public static void main(String[] args) {
    DataSet linkSet = new DataSet();
    linkSet.getHeaders().setHeaders("source_node_id", "target_node_id");
    DataSet nodeSet = new DataSet();
    nodeSet.getHeaders().setHeaders("node_id", "node_name", "child_nodes", "leaf_node", "tolorg_link", "extinct", "confidence", "phylesis");

    try {
        Connection connection = HttpConnection.connect("http://tolweb.org/");
        // NOTE(review): presumably requests tolskeletaldump.xml relative to the base URL with
        // the given content type -- confirm against Connection.process.
        connection.process("tolskeletaldump.xml", "application/xml");
        // Parse the response as XML rather than with the default HTML parser.
        connection.parser(new Parser(new XmlTreeBuilder()));
        Document document = connection.response().parse();

        Elements elements = document.select("node[id]");
        for (Element element : elements) {
            DataTuple tuple = new DataTuple();
            tuple.setRecord("source_node_id", enclosingNodeId(element));
            tuple.setRecord("target_node_id", element.attr("id"));
            linkSet.getStore().addTuple(tuple);

            DataTuple nodeTuple = new DataTuple();
            String nodeName = directNameText(element);
            // Names containing the CSV delimiter are wrapped in quotes.
            if (StringUtils.containsAny(nodeName, ',')) {
                nodeName = TextUtils.encloseQuotes(nodeName);
            }
            // NOTE(review): presumably meant to capitalise the first letter, but no upper-casing
            // call is visible here -- confirm what leftSub/snip return.
            if (StringUtils.isAllLowerCase(nodeName)) {
                nodeName = StringUtils.leftSub(nodeName, 1) + StringUtils.snip(nodeName, 1, 0);
            }
            // Unnamed nodes get a placeholder so the column is never empty.
            if (StringUtils.isBlank(nodeName)) {
                nodeName = "none";
            }
            nodeTuple.setRecord("node_id", element.attr("id"));
            nodeTuple.setRecord("node_name", nodeName);
            nodeTuple.setRecord("child_nodes", element.attr("childcount"));
            nodeTuple.setRecord("leaf_node", element.attr("leaf"));
            nodeTuple.setRecord("tolorg_link", element.attr("haspage"));
            nodeTuple.setRecord("extinct", element.attr("extinct"));
            nodeTuple.setRecord("confidence", element.attr("confidence"));
            nodeTuple.setRecord("phylesis", element.attr("phylesis"));
            nodeSet.getStore().addTuple(nodeTuple);
        }
    } catch (IOException e) {
        // Abort instead of falling through: flushing now would write datasets with no rows.
        System.err.println("Could not read tolskeletaldump.xml from http://tolweb.org/; no CSV output written");
        e.printStackTrace();
        return;
    }

    // In these two columns the value "2" is rewritten to "1".
    nodeSet.getStore().updateTuples("tolorg_link", s -> "2".equals(s), s -> "1");
    nodeSet.getStore().updateTuples("extinct", s -> "2".equals(s), s -> "1");

    linkSet.getConfig().setDelimiter(",");
    StorageUtils.flushDataSet("treeoflife_links.csv", linkSet);
    nodeSet.getConfig().setDelimiter(",");
    StorageUtils.flushDataSet("treeoflife_nodes.csv", nodeSet);
}

/**
 * Returns the {@code id} attribute of the element two levels above {@code node}, or an empty
 * string when {@code node} has no parent or no grandparent.
 */
private static String enclosingNodeId(Element node) {
    // Same two-level walk as before (node -> parent -> grandparent), but null-safe.
    Element container = node.parent();
    Element owner = (container == null) ? null : container.parent();
    return (owner == null) ? "" : owner.attr("id");
}

/**
 * Returns the own text of the first direct {@code <name>} child of {@code node}, or an empty
 * string when the node has no such child.
 */
private static String directNameText(Element node) {
    // Only direct children are considered, so a node without its own <name> never picks up
    // the name of one of its descendants.
    for (Element child : node.children()) {
        if ("name".equalsIgnoreCase(child.tagName())) {
            return child.ownText();
        }
    }
    return "";
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment