Created
March 3, 2010 12:45
-
-
Save lindenb/320585 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
package org.lindenb.acn2taxonomy; | |
import java.io.BufferedReader; | |
import java.io.File; | |
import java.io.IOException; | |
import java.io.InputStream; | |
import java.io.InputStreamReader; | |
import java.net.URL; | |
import java.net.URLConnection; | |
import java.util.ArrayList; | |
import java.util.List; | |
import java.util.logging.Level; | |
import java.util.logging.Logger; | |
import java.util.regex.Pattern; | |
import javax.xml.parsers.DocumentBuilder; | |
import javax.xml.parsers.DocumentBuilderFactory; | |
import javax.xml.parsers.SAXParser; | |
import javax.xml.parsers.SAXParserFactory; | |
import org.lindenb.berkeley.db.PrimaryDB; | |
import org.lindenb.io.IOUtils; | |
import org.lindenb.me.Me; | |
import org.lindenb.util.C; | |
import org.lindenb.util.Compilation; | |
import org.lindenb.util.StringUtils; | |
import org.lindenb.xml.XMLUtilities; | |
import org.w3c.dom.Document; | |
import org.w3c.dom.Element; | |
import org.xml.sax.Attributes; | |
import org.xml.sax.InputSource; | |
import org.xml.sax.SAXException; | |
import org.xml.sax.helpers.DefaultHandler; | |
import com.sleepycat.bind.tuple.IntegerBinding; | |
import com.sleepycat.bind.tuple.TupleBinding; | |
import com.sleepycat.bind.tuple.TupleInput; | |
import com.sleepycat.bind.tuple.TupleOutput; | |
import com.sleepycat.je.DatabaseConfig; | |
import com.sleepycat.je.Environment; | |
import com.sleepycat.je.EnvironmentConfig; | |
public class AcnToTaxonomy | |
{ | |
/** Shared logger, registered under the name "org.lindenb". */
private static final Logger LOG=Logger.getLogger("org.lindenb"); | |
/** Directory handed to IOUtils.createTempDir() in open(); defaults to the JVM temp directory. */
private File baseDir=new File(System.getProperty("java.io.tmpdir")); | |
/** Temporary database home created by open() and deleted by close(); null while closed. */
private File dbHome=null; | |
/** Berkeley DB environment opened on dbHome; null while closed. */
private Environment environment=null; | |
/** Local store mapping a taxon id to its TaxonNode, filled by taxopath(); null while closed. */
private PrimaryDB<Integer, TaxonNode> id2taxon=null; | |
/** DOM parser used by taxopath() to read taxonomy efetch responses. */
private DocumentBuilder docBuilder; | |
/** Pause, in milliseconds, observed by run() after each accession. */
private long sleep_time=100; | |
private static class TaxonNode | |
{ | |
int id; | |
String name=""; | |
int parent_id=-1; | |
} | |
private static class TaxonBinding | |
extends TupleBinding<TaxonNode> | |
{ | |
@Override | |
public TaxonNode entryToObject(TupleInput in) | |
{ | |
TaxonNode n=new TaxonNode(); | |
n.id=in.readInt(); | |
n.name=in.readString(); | |
n.parent_id=in.readInt(); | |
return n; | |
} | |
@Override | |
public void objectToEntry(TaxonNode node, TupleOutput out) | |
{ | |
out.writeInt(node.id); | |
out.writeString(node.name); | |
out.writeInt(node.parent_id); | |
} | |
} | |
private class TinyXmlHandler | |
extends DefaultHandler | |
{ | |
private StringBuilder text=null; | |
private int TSeq_taxid=-1; | |
private String TSeq_defline=null; | |
private String error=null; | |
TinyXmlHandler(String acn) | |
{ | |
} | |
@Override | |
public void startElement(String uri, String localName, String name, | |
Attributes attributes) throws SAXException | |
{ | |
text=null; | |
if(StringUtils.isIn(name,"TSeq_taxid","TSeq_defline","Error")) | |
{ | |
this.text=new StringBuilder(); | |
} | |
} | |
@Override | |
public void endElement(String uri, String localName, String name) throws SAXException | |
{ | |
if(name.equals("TSeq_taxid")) { this.TSeq_taxid= Integer.parseInt(this.text.toString());} | |
else if(name.equals("TSeq_defline")) { this.TSeq_defline= this.text.toString();} | |
else if(name.equals("Error")) { this.error= this.text.toString();} | |
text=null; | |
} | |
@Override | |
public void characters(char[] ch, int start, int length) | |
throws SAXException { | |
if(this.text!=null) text.append(ch, start, length); | |
} | |
} | |
private AcnToTaxonomy() | |
throws Exception | |
{ | |
DocumentBuilderFactory f=DocumentBuilderFactory.newInstance(); | |
f.setCoalescing(true); | |
f.setNamespaceAware(false); | |
f.setValidating(false); | |
f.setExpandEntityReferences(true); | |
f.setIgnoringComments(true); | |
f.setIgnoringElementContentWhitespace(true); | |
this.docBuilder= f.newDocumentBuilder(); | |
} | |
private void open() throws IOException | |
{ | |
this.dbHome=IOUtils.createTempDir(this.baseDir); | |
LOG.info("created "+this.dbHome); | |
EnvironmentConfig envConfig= new EnvironmentConfig(); | |
envConfig.setAllowCreate(true); | |
envConfig.setReadOnly(false); | |
this.environment= new Environment(dbHome, envConfig); | |
LOG.info("opened bdbd env"); | |
DatabaseConfig dbConfig=new DatabaseConfig(); | |
dbConfig.setAllowCreate(true); | |
dbConfig.setReadOnly(false); | |
this.id2taxon=new PrimaryDB<Integer, TaxonNode>(this.environment, null, "id2taxon", dbConfig, new IntegerBinding(), new TaxonBinding()); | |
} | |
private void close() | |
{ | |
if(this.id2taxon!=null) | |
{ | |
LOG.info("closing database"); | |
this.id2taxon.close(); | |
this.id2taxon=null; | |
} | |
if(this.environment!=null) | |
{ | |
LOG.info("closing bdbd env"); | |
this.environment.close(); | |
this.environment=null; | |
} | |
if(this.dbHome!=null) | |
{ | |
for(File f: this.dbHome.listFiles()) | |
{ | |
f.delete(); | |
} | |
this.dbHome.delete(); | |
this.dbHome=null; | |
} | |
} | |
private InputStream openURL(URL url)throws IOException | |
{ | |
final int max_try=10; | |
for(int try_count=0;try_count<max_try;++try_count) | |
{ | |
InputStream is=null; | |
try | |
{ | |
URLConnection con=url.openConnection(); | |
con.setConnectTimeout(10*1000); | |
is=con.getInputStream(); | |
return is; | |
} | |
catch(Exception err) | |
{ | |
System.err.println("Cannot open "+url+" trying... "+(try_count+1)+"/"+try_count); | |
try | |
{ | |
Thread.sleep(10*1000); | |
} | |
catch (InterruptedException e) | |
{ | |
} | |
} | |
} | |
throw new IOException("Cannot open "+url); | |
} | |
private StringBuilder taxopath(int taxonid,StringBuilder str) throws Exception | |
{ | |
TaxonNode node=this.id2taxon.get(null, taxonid); | |
if(node==null) | |
{ | |
String url="http://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=taxonomy&id="+taxonid+"&retmode=xml&tool=acn2tax&email=plindenbaum_at_yahoo_fr"; | |
InputStream in=openURL(new URL(url)); | |
Document dom= this.docBuilder.parse(new InputSource(in)); | |
in.close(); | |
Element root=dom.getDocumentElement(); | |
Element Taxon=XMLUtilities.one(root, "Taxon"); | |
node=new TaxonNode(); | |
Element TaxId=XMLUtilities.one(Taxon, "TaxId"); | |
Element ScientificName=XMLUtilities.one(Taxon, "ScientificName"); | |
node.id= Integer.parseInt(TaxId.getTextContent()); | |
node.name= ScientificName.getTextContent(); | |
Element LineageEx=XMLUtilities.one(Taxon, "LineageEx"); | |
List<Element> taxons= XMLUtilities.elements(LineageEx, "Taxon"); | |
List<TaxonNode> nodes= new ArrayList<TaxonNode>(taxons.size()); | |
for(Element e: taxons) | |
{ | |
TaxId=XMLUtilities.one(e, "TaxId"); | |
ScientificName=XMLUtilities.one(e, "ScientificName"); | |
TaxonNode newnode= new TaxonNode(); | |
newnode.id= Integer.parseInt(TaxId.getTextContent()); | |
newnode.name= ScientificName.getTextContent(); | |
nodes.add(newnode); | |
} | |
nodes.add(node); | |
for(int i=1;i< nodes.size();i++) | |
{ | |
nodes.get(i).parent_id=nodes.get(i-1).id; | |
if(!this.id2taxon.containsKey(null,nodes.get(i).id)) | |
{ | |
this.id2taxon.put(null,nodes.get(i).id,nodes.get(i)); | |
} | |
} | |
} | |
else | |
{ | |
str.insert(0,"\""+C.escape(node.name)+"\"("+node.id+")"+(str.length()==0?"":" > ")); | |
} | |
if(node.parent_id>0) | |
{ | |
taxopath(node.parent_id,str); | |
} | |
return str; | |
} | |
private void run(BufferedReader in) throws Exception | |
{ | |
SAXParserFactory f= SAXParserFactory.newInstance(); | |
f.setNamespaceAware(false); | |
f.setValidating(false); | |
SAXParser parser=f.newSAXParser(); | |
Pattern pattern=Pattern.compile("[a-z][a-z_0-9]+(\\.[0-9]+)?",Pattern.CASE_INSENSITIVE); | |
String line; | |
while((line=in.readLine())!=null) | |
{ | |
if(line.startsWith("#")) continue; | |
line=line.trim(); | |
if(line.isEmpty()) continue; | |
if(!pattern.matcher(line).matches()) | |
{ | |
System.err.println("Invalid acn "+line+" does not match "+pattern.pattern()); | |
continue; | |
} | |
String api_url="http://www.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=protein&id="+ | |
line+ | |
"&rettype=fasta&retmode=xml&tool=acn2tax&email=plindenbaum_at_yahoo_fr" | |
; | |
LOG.info(api_url); | |
URL url=new URL(api_url); | |
InputStream is=openURL(url); | |
TinyXmlHandler handler=new TinyXmlHandler(line); | |
parser.parse(is, handler); | |
is.close(); | |
if(handler.error!=null) | |
{ | |
System.err.println("#Error: cannot get "+line+" : "+handler.error); | |
} | |
else | |
{ | |
StringBuilder taxonpath=taxopath(handler.TSeq_taxid,new StringBuilder()); | |
System.out.println(line+"\t\""+C.escape(handler.TSeq_defline)+"\"\t"+taxonpath); | |
} | |
try { Thread.sleep(this.sleep_time);}catch(Exception e2) {} | |
} | |
} | |
public static void main(String[] args) | |
{ | |
AcnToTaxonomy app=null; | |
try | |
{ | |
app=new AcnToTaxonomy(); | |
LOG.setLevel(Level.OFF); | |
int optind=0; | |
while(optind< args.length) | |
{ | |
if(args[optind].equals("-h") || | |
args[optind].equals("-help") || | |
args[optind].equals("--help")) | |
{ | |
System.err.println(Me.FIRST_NAME+" "+Me.LAST_NAME+" "+Me.MAIL); | |
System.err.println(Compilation.getLabel()); | |
System.err.println("Options:"); | |
System.err.println(" -b <dir> base directory for bdb files:"+app.baseDir); | |
System.err.println(" --log-level <level> one of "+Level.class.getName()); | |
System.err.println(" -h help; This screen."); | |
return; | |
} | |
else if(args[optind].equals("--log-level")) | |
{ | |
LOG.setLevel(Level.parse(args[++optind])); | |
} | |
else if(args[optind].equals("-b")) | |
{ | |
app.baseDir=new File(args[optind++]); | |
if(!app.baseDir.exists()) | |
{ | |
System.err.println("File does not exist: "+app.baseDir); | |
return; | |
} | |
if(!app.baseDir.isDirectory()) | |
{ | |
System.err.println("File is not a directory: "+app.baseDir); | |
return; | |
} | |
break; | |
} | |
else if(args[optind].equals("--")) | |
{ | |
optind++; | |
break; | |
} | |
else if(args[optind].startsWith("-")) | |
{ | |
System.err.println("Unknown option "+args[optind]); | |
return; | |
} | |
else | |
{ | |
break; | |
} | |
++optind; | |
} | |
app.open(); | |
if(optind==args.length) | |
{ | |
app.run(new BufferedReader(new InputStreamReader(System.in))); | |
} | |
else | |
{ | |
while(optind< args.length) | |
{ | |
java.io.BufferedReader r= IOUtils.openReader(args[optind++]); | |
app.run(r); | |
r.close(); | |
} | |
} | |
} | |
catch(Throwable err) | |
{ | |
err.printStackTrace(); | |
} | |
finally | |
{ | |
if(app!=null) app.close(); | |
} | |
} | |
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Builds the acn2tax distribution: compiles the tool, packages it as a jar,
# copies the Berkeley DB JE jar next to it and zips the whole directory.
all:
	mkdir -p acn2tax/lib
	mkdir -p acn2tax/tmp
	# one javac command, continued with backslashes (it had been hard-wrapped
	# into three separate recipe lines, which make would run as three commands)
	javac -d acn2tax/tmp -cp /usr/local/package/je-4.0.71/lib/je-4.0.71.jar \
		-sourcepath src:/home/pierre/lindenb/src/java \
		src/org/lindenb/acn2taxonomy/AcnToTaxonomy.java
	jar cvf acn2tax/lib/acn2tax.jar -C acn2tax/tmp .
	# leading '-' = run cp but ignore its failure
	-cp /usr/local/package/je-4.0.71/lib/je-4.0.71.jar acn2tax/lib/
	rm -rf acn2tax/tmp
	zip -r acn2tax.zip acn2tax
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment