Skip to content

Instantly share code, notes, and snippets.

@lindenb
Created March 3, 2010 12:45
Show Gist options
  • Save lindenb/320585 to your computer and use it in GitHub Desktop.
Save lindenb/320585 to your computer and use it in GitHub Desktop.
package org.lindenb.acn2taxonomy;
import java.io.BufferedReader;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.net.URL;
import java.net.URLConnection;
import java.util.ArrayList;
import java.util.List;
import java.util.logging.Level;
import java.util.logging.Logger;
import java.util.regex.Pattern;
import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.SAXParser;
import javax.xml.parsers.SAXParserFactory;
import org.lindenb.berkeley.db.PrimaryDB;
import org.lindenb.io.IOUtils;
import org.lindenb.me.Me;
import org.lindenb.util.C;
import org.lindenb.util.Compilation;
import org.lindenb.util.StringUtils;
import org.lindenb.xml.XMLUtilities;
import org.w3c.dom.Document;
import org.w3c.dom.Element;
import org.xml.sax.Attributes;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;
import org.xml.sax.helpers.DefaultHandler;
import com.sleepycat.bind.tuple.IntegerBinding;
import com.sleepycat.bind.tuple.TupleBinding;
import com.sleepycat.bind.tuple.TupleInput;
import com.sleepycat.bind.tuple.TupleOutput;
import com.sleepycat.je.DatabaseConfig;
import com.sleepycat.je.Environment;
import com.sleepycat.je.EnvironmentConfig;
public class AcnToTaxonomy
{
/** Logger shared by the whole tool; main() turns it off unless --log-level is given. */
private static final Logger LOG=Logger.getLogger("org.lindenb");
/** Directory in which the temporary database home is created; defaults to java.io.tmpdir. */
private File baseDir=new File(System.getProperty("java.io.tmpdir"));
/** Temporary directory created by open() and removed by close(); null while closed. */
private File dbHome=null;
/** BerkeleyDB environment living in dbHome; null while closed. */
private Environment environment=null;
/** Local store mapping a taxon id to its TaxonNode; filled by taxopath(). */
private PrimaryDB<Integer, TaxonNode> id2taxon=null;
/** DOM parser built in the constructor and used by taxopath() to read taxonomy records. */
private DocumentBuilder docBuilder;
/** Pause, in milliseconds, made by run() after each processed accession. */
private long sleep_time=100;
/**
 * One taxonomy node as kept in the local id2taxon store:
 * its identifier, its scientific name and the identifier of its parent.
 */
private static class TaxonNode
{
    /** identifier of this taxon */
    int id=0;
    /** identifier of the parent taxon; -1 while no parent has been assigned */
    int parent_id=-1;
    /** name of this taxon; empty until filled */
    String name="";
}
/**
 * Tuple binding converting a TaxonNode to and from its stored form.
 * The three fields are always written and read in the same order:
 * id, name, parent_id.
 */
private static class TaxonBinding
    extends TupleBinding<TaxonNode>
{
    /** Rebuilds a TaxonNode from the tuple, reading id, name then parent_id. */
    @Override
    public TaxonNode entryToObject(TupleInput in)
    {
        final TaxonNode taxon=new TaxonNode();
        taxon.id=in.readInt();
        taxon.name=in.readString();
        taxon.parent_id=in.readInt();
        return taxon;
    }

    /** Serializes the node, writing id, name then parent_id. */
    @Override
    public void objectToEntry(TaxonNode node, TupleOutput out)
    {
        // each write returns the same TupleOutput, so the calls can be chained
        out.writeInt(node.id)
           .writeString(node.name)
           .writeInt(node.parent_id);
    }
}
private class TinyXmlHandler
extends DefaultHandler
{
private StringBuilder text=null;
private int TSeq_taxid=-1;
private String TSeq_defline=null;
private String error=null;
TinyXmlHandler(String acn)
{
}
@Override
public void startElement(String uri, String localName, String name,
Attributes attributes) throws SAXException
{
text=null;
if(StringUtils.isIn(name,"TSeq_taxid","TSeq_defline","Error"))
{
this.text=new StringBuilder();
}
}
@Override
public void endElement(String uri, String localName, String name) throws SAXException
{
if(name.equals("TSeq_taxid")) { this.TSeq_taxid= Integer.parseInt(this.text.toString());}
else if(name.equals("TSeq_defline")) { this.TSeq_defline= this.text.toString();}
else if(name.equals("Error")) { this.error= this.text.toString();}
text=null;
}
@Override
public void characters(char[] ch, int start, int length)
throws SAXException {
if(this.text!=null) text.append(ch, start, length);
}
}
private AcnToTaxonomy()
throws Exception
{
DocumentBuilderFactory f=DocumentBuilderFactory.newInstance();
f.setCoalescing(true);
f.setNamespaceAware(false);
f.setValidating(false);
f.setExpandEntityReferences(true);
f.setIgnoringComments(true);
f.setIgnoringElementContentWhitespace(true);
this.docBuilder= f.newDocumentBuilder();
}
/**
 * Creates a new temporary directory under baseDir, opens a writable
 * BerkeleyDB environment in it (creating it if needed) and opens the
 * "id2taxon" database in that environment.
 *
 * @throws IOException if the temporary directory cannot be created
 */
private void open() throws IOException
{
    final File home=IOUtils.createTempDir(this.baseDir);
    this.dbHome=home;
    LOG.info("created "+home);

    final EnvironmentConfig environmentSettings=new EnvironmentConfig();
    environmentSettings.setAllowCreate(true);
    environmentSettings.setReadOnly(false);
    this.environment=new Environment(home, environmentSettings);
    LOG.info("opened bdbd env");

    final DatabaseConfig databaseSettings=new DatabaseConfig();
    databaseSettings.setAllowCreate(true);
    databaseSettings.setReadOnly(false);
    final IntegerBinding keyBinding=new IntegerBinding();
    final TaxonBinding valueBinding=new TaxonBinding();
    this.id2taxon=new PrimaryDB<Integer, TaxonNode>(
        this.environment,
        null,
        "id2taxon",
        databaseSettings,
        keyBinding,
        valueBinding
        );
}
/**
 * Releases everything acquired by open(): closes the database, closes the
 * environment, then deletes the files of the temporary database home and the
 * directory itself. Safe to call when nothing (or only part) was opened.
 * Each step runs even if the previous one threw, so a failure while closing
 * the database no longer leaves the environment open or the directory on disk.
 */
private void close()
{
    try
    {
        if(this.id2taxon!=null)
        {
            LOG.info("closing database");
            this.id2taxon.close();
        }
    }
    finally
    {
        this.id2taxon=null;
        try
        {
            if(this.environment!=null)
            {
                LOG.info("closing bdbd env");
                this.environment.close();
            }
        }
        finally
        {
            this.environment=null;
            if(this.dbHome!=null)
            {
                // listFiles() returns null when the directory is gone or unreadable
                final File[] content=this.dbHome.listFiles();
                if(content!=null)
                {
                    for(File f: content)
                    {
                        if(!f.delete()) LOG.warning("cannot delete "+f);
                    }
                }
                if(!this.dbHome.delete()) LOG.warning("cannot delete "+this.dbHome);
                this.dbHome=null;
            }
        }
    }
}
/**
 * Opens a stream on the given URL, retrying on failure.
 * Up to 10 attempts are made, with a 10 second connect timeout and a
 * 10 second pause between two attempts.
 *
 * @param url the resource to open
 * @return an open stream; the caller must close it
 * @throws IOException if every attempt failed (the last failure is the cause),
 *         or if the thread was interrupted while waiting between two attempts
 */
private InputStream openURL(URL url)throws IOException
{
    final int max_try=10;
    IOException lastError=null;
    for(int try_count=0;try_count<max_try;++try_count)
    {
        try
        {
            URLConnection con=url.openConnection();
            con.setConnectTimeout(10*1000);
            return con.getInputStream();
        }
        catch(IOException err)
        {
            lastError=err;
            // progress is "attempt/total"
            System.err.println("Cannot open "+url+" trying... "+(try_count+1)+"/"+max_try);
        }
        // no point in waiting after the last attempt
        if(try_count+1>=max_try) break;
        try
        {
            Thread.sleep(10*1000);
        }
        catch (InterruptedException e)
        {
            // keep the interrupted status visible to the caller and stop retrying
            Thread.currentThread().interrupt();
            throw new IOException("Interrupted while waiting to retry "+url,e);
        }
    }
    throw new IOException("Cannot open "+url,lastError);
}
/**
 * Prepends to <code>str</code> the lineage ending at the given taxon, written as
 * <code>"name"(id)</code> items separated by <code> &gt; </code>, ancestors first.
 * Taxons are looked up in the local id2taxon store; on a miss the taxon and its
 * whole lineage are fetched from NCBI and stored.
 * The requested taxon is always part of the path, whether it came from the
 * store or from the network, so the same taxon always yields the same path.
 *
 * @param taxonid the taxon whose lineage is wanted
 * @param str the buffer receiving the path; items already in it are kept at the end
 * @return <code>str</code>
 * @throws Exception on network, XML or database failure
 */
private StringBuilder taxopath(int taxonid,StringBuilder str) throws Exception
{
    TaxonNode node=this.id2taxon.get(null, taxonid);
    if(node==null)
    {
        node=fetchAndStoreTaxon(taxonid);
    }
    str.insert(0,"\""+C.escape(node.name)+"\"("+node.id+")"+(str.length()==0?"":" > "));
    if(node.parent_id>0)
    {
        taxopath(node.parent_id,str);
    }
    return str;
}

/**
 * Downloads the taxonomy record of one taxon and stores it, together with every
 * node of its lineage, in id2taxon. Nodes already stored are left untouched.
 * The first lineage node is stored too (it keeps parent_id=-1), so walking up
 * a lineage never needs another download.
 *
 * @param taxonid the taxon to download
 * @return the node built for the downloaded taxon, its parent_id set
 */
private TaxonNode fetchAndStoreTaxon(int taxonid) throws Exception
{
    String url="http://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=taxonomy&id="+taxonid+"&retmode=xml&tool=acn2tax&email=plindenbaum_at_yahoo_fr";
    InputStream in=openURL(new URL(url));
    Document dom;
    try
    {
        dom=this.docBuilder.parse(new InputSource(in));
    }
    finally
    {
        // release the connection even when the XML is broken
        in.close();
    }
    Element root=dom.getDocumentElement();
    Element Taxon=XMLUtilities.one(root, "Taxon");
    TaxonNode node=new TaxonNode();
    node.id=Integer.parseInt(XMLUtilities.one(Taxon, "TaxId").getTextContent());
    node.name=XMLUtilities.one(Taxon, "ScientificName").getTextContent();

    Element LineageEx=XMLUtilities.one(Taxon, "LineageEx");
    List<Element> taxons=XMLUtilities.elements(LineageEx, "Taxon");
    // ancestors in document order, followed by the taxon itself
    List<TaxonNode> nodes=new ArrayList<TaxonNode>(taxons.size()+1);
    for(Element e: taxons)
    {
        TaxonNode newnode=new TaxonNode();
        newnode.id=Integer.parseInt(XMLUtilities.one(e, "TaxId").getTextContent());
        newnode.name=XMLUtilities.one(e, "ScientificName").getTextContent();
        nodes.add(newnode);
    }
    nodes.add(node);

    for(int i=0;i< nodes.size();i++)
    {
        TaxonNode current=nodes.get(i);
        // each node's parent is the one listed just before it
        if(i>0) current.parent_id=nodes.get(i-1).id;
        if(!this.id2taxon.containsKey(null,current.id))
        {
            this.id2taxon.put(null,current.id,current);
        }
    }
    return node;
}
/**
 * Reads accession numbers, one per line, and prints for each of them a
 * tab-separated line on stdout: accession, quoted definition line, taxonomy path.
 * Blank lines and lines starting with '#' (leading whitespace allowed) are skipped;
 * lines that do not look like an accession, and accessions that NCBI cannot
 * resolve, are reported on stderr and skipped.
 * A pause of sleep_time milliseconds is made after each request.
 *
 * @param in the source of accession numbers; not closed by this method
 * @throws Exception on network, XML or database failure, or if the thread is interrupted
 */
private void run(BufferedReader in) throws Exception
{
    SAXParserFactory f= SAXParserFactory.newInstance();
    f.setNamespaceAware(false);
    f.setValidating(false);
    SAXParser parser=f.newSAXParser();
    Pattern pattern=Pattern.compile("[a-z][a-z_0-9]+(\\.[0-9]+)?",Pattern.CASE_INSENSITIVE);
    String line;
    while((line=in.readLine())!=null)
    {
        // trim first so that an indented comment is still seen as a comment
        line=line.trim();
        if(line.isEmpty() || line.startsWith("#")) continue;
        if(!pattern.matcher(line).matches())
        {
            System.err.println("Invalid acn "+line+" does not match "+pattern.pattern());
            continue;
        }
        String api_url="http://www.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=protein&id="+
            line+
            "&rettype=fasta&retmode=xml&tool=acn2tax&email=plindenbaum_at_yahoo_fr"
            ;
        LOG.info(api_url);
        URL url=new URL(api_url);
        TinyXmlHandler handler=new TinyXmlHandler(line);
        InputStream is=openURL(url);
        try
        {
            parser.parse(is, handler);
        }
        finally
        {
            // release the connection even when the XML is broken
            is.close();
        }
        if(handler.error!=null)
        {
            System.err.println("#Error: cannot get "+line+" : "+handler.error);
        }
        else if(handler.TSeq_taxid<0)
        {
            // neither an error nor a taxon id in the answer: nothing to look up
            System.err.println("#Error: cannot get "+line+" : no TSeq_taxid in response");
        }
        else
        {
            StringBuilder taxonpath=taxopath(handler.TSeq_taxid,new StringBuilder());
            System.out.println(line+"\t\""+C.escape(handler.TSeq_defline)+"\"\t"+taxonpath);
        }
        try
        {
            Thread.sleep(this.sleep_time);
        }
        catch(InterruptedException e2)
        {
            // keep the interrupted status and stop instead of hammering the server
            Thread.currentThread().interrupt();
            throw e2;
        }
    }
}
/**
 * Command-line entry point.
 * <pre>
 *  -b &lt;dir&gt;              base directory for the temporary database files
 *  --log-level &lt;level&gt;   a java.util.logging.Level name (logging is off by default)
 *  -h | -help | --help   print the help and exit
 *  --                    end of options
 * </pre>
 * Remaining arguments are files of accession numbers; without any, stdin is read.
 * Errors are printed on stderr; the database is always closed before returning.
 *
 * @param args the command line
 */
public static void main(String[] args)
{
    AcnToTaxonomy app=null;
    try
    {
        app=new AcnToTaxonomy();
        LOG.setLevel(Level.OFF);
        int optind=0;
        while(optind< args.length)
        {
            if(args[optind].equals("-h") ||
               args[optind].equals("-help") ||
               args[optind].equals("--help"))
            {
                System.err.println(Me.FIRST_NAME+" "+Me.LAST_NAME+" "+Me.MAIL);
                System.err.println(Compilation.getLabel());
                System.err.println("Options:");
                System.err.println(" -b <dir> base directory for bdb files:"+app.baseDir);
                System.err.println(" --log-level <level> one of "+Level.class.getName());
                System.err.println(" -h help; This screen.");
                return;
            }
            else if(args[optind].equals("--log-level"))
            {
                if(optind+1>=args.length)
                {
                    System.err.println("Missing argument for "+args[optind]);
                    return;
                }
                LOG.setLevel(Level.parse(args[++optind]));
            }
            else if(args[optind].equals("-b"))
            {
                if(optind+1>=args.length)
                {
                    System.err.println("Missing argument for "+args[optind]);
                    return;
                }
                // the directory is the argument FOLLOWING -b
                app.baseDir=new File(args[++optind]);
                if(!app.baseDir.exists())
                {
                    System.err.println("File does not exist: "+app.baseDir);
                    return;
                }
                if(!app.baseDir.isDirectory())
                {
                    System.err.println("File is not a directory: "+app.baseDir);
                    return;
                }
                // no break: other options may follow, and the final ++optind
                // steps over the directory name
            }
            else if(args[optind].equals("--"))
            {
                optind++;
                break;
            }
            else if(args[optind].startsWith("-"))
            {
                System.err.println("Unknown option "+args[optind]);
                return;
            }
            else
            {
                break;
            }
            ++optind;
        }
        app.open();
        if(optind==args.length)
        {
            app.run(new BufferedReader(new InputStreamReader(System.in)));
        }
        else
        {
            while(optind< args.length)
            {
                java.io.BufferedReader r= IOUtils.openReader(args[optind++]);
                try
                {
                    app.run(r);
                }
                finally
                {
                    // do not leak the file when run() fails
                    r.close();
                }
            }
        }
    }
    catch(Throwable err)
    {
        err.printStackTrace();
    }
    finally
    {
        if(app!=null) app.close();
    }
}
}
# Compiles AcnToTaxonomy against BerkeleyDB JE, packages it as acn2tax/lib/acn2tax.jar,
# copies the JE jar next to it and zips the acn2tax directory.
.PHONY: all
all:
	mkdir -p acn2tax/lib
	mkdir -p acn2tax/tmp
	javac -d acn2tax/tmp -cp /usr/local/package/je-4.0.71/lib/je-4.0.71.jar \
		-sourcepath src:/home/pierre/lindenb/src/java \
		src/org/lindenb/acn2taxonomy/AcnToTaxonomy.java
	jar cvf acn2tax/lib/acn2tax.jar -C acn2tax/tmp .
	# the leading '-' tells make to carry on even if the copy fails
	-cp /usr/local/package/je-4.0.71/lib/je-4.0.71.jar acn2tax/lib/
	rm -rf acn2tax/tmp
	zip -r acn2tax.zip acn2tax
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment