Created
May 25, 2012 12:32
-
-
Save lindenb/2787783 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import java.io.*; | |
import java.util.*; | |
import java.util.regex.*; | |
import javax.xml.stream.XMLOutputFactory; | |
import javax.xml.stream.XMLStreamWriter; | |
/**
 * Reads a taxonomy dump directory (files {@code nodes.dmp} and
 * {@code names.dmp}, fields separated by '|') and a file containing one
 * taxon id per line, counts how often each taxon and each of its ancestors
 * is hit, and writes the resulting sub-tree as a GEXF 1.2draft graph to
 * standard output.
 *
 * Usage: {@code java Biostar45691 taxonomy-dir taxon-id-file}
 */
public class Biostar45691
{
    /** Field separator of the dump files; compiled once and shared. */
    private static final Pattern PIPE = Pattern.compile("[\\|]");

    /** One taxon of the taxonomy tree. Identity is defined by {@link #tax_id} only. */
    private static class Node
    {
        /** Taxon identifier (first column of nodes.dmp). */
        int tax_id;
        /** Identifier of the parent taxon; a root has parent_id == tax_id. */
        int parent_id = -1;
        /** Display name taken from names.dmp, or null when none was found. */
        String name = null;
        /** Number of input ids that are this taxon or one of its descendants. */
        int count = 0;
        /** Number of input ids that are exactly this taxon. */
        int count_leaf = 0;

        @Override
        public int hashCode()
        {
            return tax_id;
        }

        @Override
        public boolean equals(Object obj)
        {
            if (obj == this) return true;
            // instanceof also rejects null, so equals() never throws
            if (!(obj instanceof Node)) return false;
            return ((Node) obj).tax_id == this.tax_id;
        }
    }

    /** All taxons read from nodes.dmp, indexed by tax_id. */
    private final Map<Integer, Node> id2node = new HashMap<Integer, Node>(898000);

    /**
     * Opens a text file as UTF-8 instead of relying on the platform charset.
     *
     * @param f file to read
     * @return a buffered reader the caller must close
     * @throws IOException if the file cannot be opened
     */
    private static BufferedReader openReader(File f) throws IOException
    {
        return new BufferedReader(new InputStreamReader(new FileInputStream(f), "UTF-8"));
    }

    /**
     * Loads {@code nodes.dmp} (tax_id | parent_id | ...) into {@link #id2node},
     * then attaches a name to each node from {@code names.dmp}
     * (tax_id | name | ... | name class | ...). A name whose fourth field
     * contains "scientific name" replaces any name seen before; otherwise the
     * first name encountered is kept.
     *
     * @param taxDir directory containing nodes.dmp and names.dmp
     * @throws Exception on I/O error or when an id is not an integer
     */
    private void parseTaxonomy(File taxDir) throws Exception
    {
        String line;
        BufferedReader in = openReader(new File(taxDir, "nodes.dmp"));
        try
        {
            while ((line = in.readLine()) != null)
            {
                String[] tokens = PIPE.split(line, 3);
                if (tokens.length < 2) continue; // blank or truncated line
                Node n = new Node();
                n.tax_id = Integer.parseInt(tokens[0].trim());
                n.parent_id = Integer.parseInt(tokens[1].trim());
                id2node.put(n.tax_id, n);
            }
        }
        finally
        {
            in.close(); // released even when parsing fails
        }

        in = openReader(new File(taxDir, "names.dmp"));
        try
        {
            while ((line = in.readLine()) != null)
            {
                String[] tokens = PIPE.split(line);
                if (tokens.length < 2) continue; // blank or truncated line
                int tax_id = Integer.parseInt(tokens[0].trim());
                Node n = id2node.get(tax_id);
                if (n == null) continue;
                boolean scientific = tokens.length > 3 && tokens[3].contains("scientific name");
                if (n.name == null || scientific)
                {
                    n.name = tokens[1].trim();
                }
            }
        }
        finally
        {
            in.close();
        }
    }

    /**
     * Increments {@code count} on the given node and on every ancestor up to
     * the root (the node whose parent is itself) or until a parent id is
     * unknown.
     *
     * @param n starting node
     * @throws IllegalStateException if the parent links form a cycle
     */
    private void touch(Node n)
    {
        // a chain of distinct nodes cannot need more steps than there are nodes
        int remaining = id2node.size();
        Node current = n;
        while (current != null)
        {
            current.count++;
            if (current.parent_id == current.tax_id) break; // reached the root
            if (remaining-- <= 0)
            {
                throw new IllegalStateException("cycle in taxonomy near tax_id " + current.tax_id);
            }
            current = id2node.get(current.parent_id);
        }
    }

    /**
     * Reads one taxon id per line and updates the counters of the matching
     * node and its ancestors. Blank lines, whitespace-only lines and ids
     * absent from the taxonomy are ignored.
     *
     * @param taxons file containing the taxon ids
     * @throws Exception on I/O error or when a non-blank line is not an integer
     */
    private void parseTaxonIds(File taxons) throws Exception
    {
        BufferedReader in = openReader(taxons);
        try
        {
            String line;
            while ((line = in.readLine()) != null)
            {
                // trim BEFORE the emptiness test so "   " is skipped, not parsed
                String trimmed = line.trim();
                if (trimmed.isEmpty()) continue;
                Node n = id2node.get(Integer.parseInt(trimmed));
                if (n == null) continue;
                n.count_leaf++;
                touch(n);
            }
        }
        finally
        {
            in.close();
        }
    }

    /**
     * Returns the node together with all of its known ancestors, the root
     * included. The root must be part of the result because edges written by
     * {@link #dump(OutputStream)} point at it.
     *
     * @param n starting node, may be null
     * @return the set of nodes on the path from {@code n} to the root
     */
    private Set<Node> lineage(Node n)
    {
        Set<Node> chain = new HashSet<Node>();
        Node current = n;
        // add() returning false means the node was already seen: stop on cycles
        while (current != null && chain.add(current))
        {
            if (current.tax_id == current.parent_id) break; // root added, done
            current = id2node.get(current.parent_id);
        }
        return chain;
    }

    /** Writes one {@code <attribute id=.. title=.. type="float"/>} declaration. */
    private static void writeAttributeDecl(XMLStreamWriter w, String id, String title) throws Exception
    {
        w.writeStartElement("attribute");
        w.writeAttribute("id", id);
        w.writeAttribute("title", title);
        w.writeAttribute("type", "float");
        w.writeEndElement();
    }

    /** Writes one {@code <attvalue for=.. value=..>} element. */
    private static void writeAttValue(XMLStreamWriter w, String attributeId, int value) throws Exception
    {
        w.writeStartElement("attvalue");
        w.writeAttribute("for", attributeId);
        w.writeAttribute("value", String.valueOf(value));
        w.writeEndElement();
    }

    /**
     * Writes every node having a non-zero count, plus its ancestors, as a
     * GEXF document. Each non-root node gets a directed edge to its parent;
     * an edge is only written when the parent exists in the taxonomy so that
     * no edge references an undeclared node.
     *
     * @param out destination stream; flushed but not closed
     * @throws Exception on XML or I/O error
     */
    private void dump(OutputStream out) throws Exception
    {
        XMLOutputFactory xmlfactory = XMLOutputFactory.newInstance();
        XMLStreamWriter w = xmlfactory.createXMLStreamWriter(out, "UTF-8");
        w.writeStartDocument("UTF-8", "1.0");
        w.writeStartElement("gexf");
        w.writeAttribute("xmlns", "http://www.gexf.net/1.2draft");
        w.writeAttribute("xmlns:xsi", "http://www.w3.org/2001/XMLSchema-instance");
        w.writeAttribute("xsi:schemaLocation", "http://www.gexf.net/1.2draft http://www.gexf.net/1.2draft/gexf.xsd");
        w.writeAttribute("version", "1.2");

        w.writeStartElement("meta");
        w.writeStartElement("creator");
        w.writeCharacters("Pierre Lindenbaum");
        w.writeEndElement();
        w.writeStartElement("description");
        w.writeCharacters("biostar 45691");
        w.writeEndElement();
        w.writeEndElement(); // meta

        w.writeStartElement("graph");
        w.writeAttribute("defaultedgetype", "directed");

        w.writeStartElement("attributes");
        w.writeAttribute("class", "node");
        w.writeAttribute("mode", "static");
        writeAttributeDecl(w, "count", "Count");
        writeAttributeDecl(w, "countleaf", "CountLeaf");
        w.writeEndElement(); // attributes

        Set<Node> toprint = new HashSet<Node>();
        for (Node n : this.id2node.values())
        {
            if (n.count == 0) continue;
            toprint.addAll(lineage(n));
        }

        w.writeStartElement("nodes");
        for (Node a : toprint)
        {
            w.writeStartElement("node");
            w.writeAttribute("id", String.valueOf(a.tax_id));
            w.writeAttribute("label", String.valueOf(a.name));
            w.writeStartElement("attvalues");
            writeAttValue(w, "count", a.count);
            writeAttValue(w, "countleaf", a.count_leaf);
            w.writeEndElement(); // attvalues
            w.writeEndElement(); // node
        }
        w.writeEndElement(); // nodes

        w.writeStartElement("edges");
        int edgeId = 0;
        for (Node a : toprint)
        {
            if (a.tax_id == a.parent_id) continue; // root has no parent edge
            // lineage() adds every ancestor that is in id2node, so a parent
            // missing from the map is the only way a target could be undeclared
            if (!id2node.containsKey(a.parent_id)) continue;
            w.writeStartElement("edge");
            w.writeAttribute("id", String.valueOf(++edgeId));
            w.writeAttribute("source", String.valueOf(a.tax_id));
            w.writeAttribute("target", String.valueOf(a.parent_id));
            w.writeEndElement();
        }
        w.writeEndElement(); // edges

        w.writeEndElement(); // graph
        w.writeEndElement(); // gexf
        w.writeEndDocument();
        w.flush();
        w.close(); // frees the writer; the underlying stream stays open
        out.flush();
    }

    /**
     * Entry point.
     *
     * @param args {@code args[0]} taxonomy directory, {@code args[1]} file of taxon ids
     * @throws Exception on any parsing or I/O failure
     */
    public static void main(String args[]) throws Exception
    {
        Biostar45691 app = new Biostar45691();
        if (args.length != 2)
        {
            System.err.println("Usage: taxonomy-dir taxon-id-files");
            return;
        }
        app.parseTaxonomy(new File(args[0]));
        app.parseTaxonIds(new File(args[1]));
        app.dump(System.out);
    }
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment