Skip to content

Instantly share code, notes, and snippets.

@lindenb
Created May 25, 2012 12:32
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save lindenb/2787783 to your computer and use it in GitHub Desktop.
Save lindenb/2787783 to your computer and use it in GitHub Desktop.
import java.io.*;
import java.util.*;
import java.util.regex.*;
import javax.xml.stream.XMLOutputFactory;
import javax.xml.stream.XMLStreamWriter;
/**
 * Command-line tool (Biostar question 45691) that reads a taxonomy dump
 * directory containing {@code nodes.dmp} and {@code names.dmp}, counts how
 * often each taxon id occurs in a second file, and writes the sub-tree made of
 * the counted taxa and all their ancestors as a GEXF 1.2 graph on stdout.
 *
 * <p>NOTE(review): the dump files are presumably in the NCBI "taxdump" layout
 * (fields separated by {@code |}); only the first two columns of
 * {@code nodes.dmp} and columns 1, 2 and 4 of {@code names.dmp} are read.
 *
 * <p>Usage: {@code java Biostar45691 taxonomy-dir taxon-id-file}
 */
public class Biostar45691
{
    /** Field separator of the dump files. */
    private static final Pattern PIPE=Pattern.compile("[\\|]");

    /**
     * One taxon. Identity (equals/hashCode) is defined by {@link #tax_id} only,
     * so nodes can be stored in hash based collections.
     */
    private static class Node
    {
        /** identifier of this taxon */
        int tax_id;
        /** identifier of the parent taxon; equal to tax_id for the root */
        int parent_id=-1;
        /** display name, null until names.dmp provides one */
        String name=null;
        /** number of input ids that are this taxon or one of its descendants */
        int count=0;
        /** number of input ids that are exactly this taxon */
        int count_leaf=0;

        @Override
        public int hashCode() {
            return tax_id;
        }

        @Override
        public boolean equals(Object obj) {
            if(obj==this) return true;
            // guard against null and foreign types instead of throwing
            if(!(obj instanceof Node)) return false;
            return ((Node)obj).tax_id==this.tax_id;
        }
    }

    /** All taxa from nodes.dmp, indexed by tax_id. */
    private Map<Integer,Node> id2node=new HashMap<Integer, Biostar45691.Node>(898000);

    /**
     * Opens a text file as UTF-8 so that reading does not depend on the
     * platform default charset (the output document is declared UTF-8).
     */
    private static BufferedReader openReader(File f) throws IOException
    {
        return new BufferedReader(new InputStreamReader(new FileInputStream(f),"UTF-8"));
    }

    /**
     * Loads the taxonomy tree and the taxon names into {@link #id2node}.
     *
     * @param taxDir directory containing {@code nodes.dmp} and {@code names.dmp}
     * @throws Exception on I/O failure, on a line with fewer than two fields,
     *         or on a non-numeric identifier
     */
    private void parseTaxonomy(File taxDir) throws Exception
    {
        String line;
        BufferedReader in=openReader(new File(taxDir,"nodes.dmp"));
        try
        {
            while((line=in.readLine())!=null)
            {
                if(line.trim().isEmpty()) continue;
                // only the two first columns (tax_id, parent tax_id) are needed
                String tokens[]=PIPE.split(line,3);
                if(tokens.length<2) throw new IOException("Malformed line in nodes.dmp: "+line);
                Node n=new Node();
                n.tax_id=Integer.parseInt(tokens[0].trim());
                n.parent_id=Integer.parseInt(tokens[1].trim());
                id2node.put(n.tax_id,n);
            }
        }
        finally
        {
            in.close();
        }

        in=openReader(new File(taxDir,"names.dmp"));
        try
        {
            while((line=in.readLine())!=null)
            {
                if(line.trim().isEmpty()) continue;
                String tokens[]=PIPE.split(line);
                if(tokens.length<2) throw new IOException("Malformed line in names.dmp: "+line);
                int tax_id=Integer.parseInt(tokens[0].trim());
                Node n=id2node.get(tax_id);
                if(n==null) continue;
                // the 4th column is the name class; a "scientific name" always
                // wins, any other name is only used as long as nothing is set
                boolean scientific=tokens.length>3 && tokens[3].contains("scientific name");
                if(n.name==null || scientific)
                {
                    n.name=tokens[1].trim();
                }
            }
        }
        finally
        {
            in.close();
        }
    }

    /**
     * Increments {@code count} of the node and of every ancestor up to and
     * including the root.
     */
    private void touch(Node n)
    {
        for(Node a:lineage(n))
        {
            a.count++;
        }
    }

    /**
     * Reads one taxon id per line and updates the counters of the matching
     * nodes. Blank lines and ids absent from the taxonomy are ignored.
     *
     * @param taxons text file with one integer taxon id per line
     * @throws Exception on I/O failure or on a non-numeric line
     */
    private void parseTaxonIds(File taxons) throws Exception
    {
        String line;
        BufferedReader in=openReader(taxons);
        try
        {
            while((line=in.readLine())!=null)
            {
                // trim BEFORE testing for emptiness: whitespace-only lines are blank too
                line=line.trim();
                if(line.isEmpty()) continue;
                int taxon_id=Integer.parseInt(line);
                Node n=id2node.get(taxon_id);
                if(n==null) continue;
                n.count_leaf++;
                touch(n);
            }
        }
        finally
        {
            in.close();
        }
    }

    /**
     * Returns the node and all its known ancestors, the root included.
     * The walk stops at the root (a node that is its own parent), at a parent
     * id missing from {@link #id2node}, or as soon as a node is met twice
     * (which protects against a cyclic parent chain).
     */
    private Set<Node> lineage(Node n)
    {
        Set<Node> L=new HashSet<Node>();
        while(n!=null && L.add(n))
        {
            if(n.tax_id==n.parent_id) break;
            n=id2node.get(n.parent_id);
        }
        return L;
    }

    /**
     * Writes every node having a non-zero count, together with its lineage,
     * as a GEXF 1.2 document. Each non-root node gets one directed edge to its
     * parent; edges are only written when the parent is itself a written node.
     *
     * @param out destination stream; it is flushed but not closed
     * @throws Exception if the XML cannot be written
     */
    private void dump(OutputStream out) throws Exception
    {
        XMLOutputFactory xmlfactory= XMLOutputFactory.newInstance();
        XMLStreamWriter w= xmlfactory.createXMLStreamWriter(out,"UTF-8");
        w.writeStartDocument("UTF-8","1.0");
        w.writeStartElement("gexf");
        w.writeAttribute("xmlns","http://www.gexf.net/1.2draft");
        w.writeAttribute("xmlns:xsi","http://www.w3.org/2001/XMLSchema-instance");
        w.writeAttribute("xsi:schemaLocation","http://www.gexf.net/1.2draft http://www.gexf.net/1.2draft/gexf.xsd");
        w.writeAttribute("version","1.2");

        w.writeStartElement("meta");
        w.writeStartElement("creator");
        w.writeCharacters("Pierre Lindenbaum");
        w.writeEndElement();
        w.writeStartElement("description");
        w.writeCharacters("biostar 45691");
        w.writeEndElement();
        w.writeEndElement();

        w.writeStartElement("graph");
        w.writeAttribute("defaultedgetype","directed");

        // declaration of the two per-node attributes
        w.writeStartElement("attributes");
        w.writeAttribute("class", "node");
        w.writeAttribute("mode", "static");
        w.writeStartElement("attribute");
        w.writeAttribute("id", "count");
        w.writeAttribute("title", "Count");
        w.writeAttribute("type", "float");
        w.writeEndElement();
        w.writeStartElement("attribute");
        w.writeAttribute("id", "countleaf");
        w.writeAttribute("title", "CountLeaf");
        w.writeAttribute("type", "float");
        w.writeEndElement();
        w.writeEndElement();

        // collect the counted nodes and their ancestors
        Set<Node> toprint=new HashSet<Node>();
        for(Node n:this.id2node.values())
        {
            if(n.count==0) continue;
            toprint.addAll(lineage(n));
        }

        w.writeStartElement("nodes");
        for(Node a:toprint)
        {
            w.writeStartElement("node");
            w.writeAttribute("id", String.valueOf(a.tax_id));
            w.writeAttribute("label",String.valueOf(a.name));
            w.writeStartElement("attvalues");
            w.writeStartElement("attvalue");
            w.writeAttribute("for","count");
            w.writeAttribute("value",String.valueOf(a.count));
            w.writeEndElement();
            w.writeStartElement("attvalue");
            w.writeAttribute("for","countleaf");
            w.writeAttribute("value",String.valueOf(a.count_leaf));
            w.writeEndElement();
            w.writeEndElement();
            w.writeEndElement();
        }
        w.writeEndElement();

        w.writeStartElement("edges");
        int edgeId=0;
        for(Node a:toprint)
        {
            if(a.tax_id==a.parent_id) continue;
            // never reference a node that was not declared in <nodes>
            Node parent=id2node.get(a.parent_id);
            if(parent==null || !toprint.contains(parent)) continue;
            w.writeStartElement("edge");
            w.writeAttribute("id",String.valueOf(++edgeId));
            w.writeAttribute("source",String.valueOf(a.tax_id));
            w.writeAttribute("target",String.valueOf(a.parent_id));
            w.writeEndElement();
        }
        w.writeEndElement();

        w.writeEndElement();
        w.writeEndElement();
        w.writeEndDocument();
        w.flush();
        out.flush();
    }

    /**
     * Entry point.
     *
     * @param args {@code args[0]}: taxonomy directory, {@code args[1]}: file of taxon ids
     * @throws Exception if parsing or writing fails
     */
    public static void main(String args[]) throws Exception
    {
        Biostar45691 app=new Biostar45691();
        if(args.length!=2)
        {
            System.err.println("Usage: taxonomy-dir taxon-id-files");
            return;
        }
        app.parseTaxonomy(new File(args[0]));
        app.parseTaxonIds(new File(args[1]));
        app.dump(System.out);
    }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment