Created
December 14, 2010 14:29
-
-
Save lindenb/740496 to your computer and use it in GitHub Desktop.
Given a gene, identify the world experts http://biostar.stackexchange.com/questions/4296
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/** | |
* Author: Pierre Lindenbaum PhD | |
* WWW: http://plindenbaum.blogspot.com | |
* Motivation: | |
* Given a gene, identify the world experts | |
* http://biostar.stackexchange.com/questions/4296 | |
*/ | |
import java.net.URLEncoder; | |
import java.text.Collator; | |
import java.util.ArrayList; | |
import java.util.Collections; | |
import java.util.Comparator; | |
import java.util.HashSet; | |
import java.util.List; | |
import java.util.Locale; | |
import java.util.Set; | |
import java.util.TreeSet; | |
import java.util.logging.Level; | |
import java.util.logging.Logger; | |
import javax.xml.parsers.DocumentBuilder; | |
import javax.xml.parsers.DocumentBuilderFactory; | |
import javax.xml.stream.XMLOutputFactory; | |
import javax.xml.stream.XMLStreamWriter; | |
import javax.xml.xpath.XPath; | |
import javax.xml.xpath.XPathConstants; | |
import javax.xml.xpath.XPathFactory; | |
import org.w3c.dom.Document; | |
import org.w3c.dom.Node; | |
import org.w3c.dom.NodeList; | |
public class BioStar4296 | |
{ | |
private Logger LOG=Logger.getLogger(BioStar4296.class.getName()); | |
private String organism="Homo Sapiens"; | |
private DocumentBuilder docBuilder; | |
private XPath xpath; | |
private Collator collator; | |
static class Author | |
{ | |
String suffix=""; | |
String firstName=""; | |
String lastName=""; | |
String initials=""; | |
Set<String> mails=new HashSet<String>(); | |
Set<Integer> pmids=new TreeSet<Integer>(); | |
int factor=1; | |
Set<String> affilitations=new HashSet<String>(); | |
@Override | |
public int hashCode() { | |
final int prime = 31; | |
int result = 1; | |
result = prime * result | |
+ ((firstName == null) ? 0 : firstName.hashCode()); | |
result = prime * result | |
+ ((lastName == null) ? 0 : lastName.hashCode()); | |
return result; | |
} | |
@Override | |
public boolean equals(Object obj) { | |
if (this == obj) | |
return true; | |
if (obj == null) | |
return false; | |
if (getClass() != obj.getClass()) | |
return false; | |
Author other = (Author) obj; | |
if (firstName == null) { | |
if (other.firstName != null) | |
return false; | |
} else if (!firstName.equals(other.firstName)) | |
return false; | |
if (lastName == null) { | |
if (other.lastName != null) | |
return false; | |
} else if (!lastName.equals(other.lastName)) | |
return false; | |
return true; | |
} | |
@Override | |
public String toString() { | |
return firstName+" "+lastName+" lab:"+this.affilitations+" mails:"+this.mails; | |
} | |
void write(XMLStreamWriter w) | |
throws Exception | |
{ | |
w.writeStartElement("Person"); | |
w.writeCharacters("\n"); | |
w.writeStartElement("firstName"); | |
w.writeCharacters(firstName); | |
w.writeEndElement(); | |
w.writeCharacters("\n"); | |
w.writeStartElement("lastName"); | |
w.writeCharacters(lastName); | |
w.writeEndElement(); | |
w.writeCharacters("\n"); | |
for(Integer s:pmids) | |
{ | |
w.writeStartElement("pmid"); | |
w.writeCharacters(String.valueOf(s)); | |
w.writeEndElement(); | |
w.writeCharacters("\n"); | |
} | |
for(String s:mails) | |
{ | |
w.writeStartElement("mail"); | |
w.writeCharacters(s); | |
w.writeEndElement(); | |
w.writeCharacters("\n"); | |
} | |
for(String s:affilitations) | |
{ | |
w.writeStartElement("affilitation"); | |
w.writeCharacters(s); | |
w.writeEndElement(); | |
w.writeCharacters("\n"); | |
} | |
w.writeEndElement(); | |
w.writeCharacters("\n"); | |
} | |
} | |
private BioStar4296() throws Exception | |
{ | |
LOG.setLevel(Level.OFF); | |
DocumentBuilderFactory f=DocumentBuilderFactory.newInstance(); | |
f.setNamespaceAware(false); | |
f.setCoalescing(true); | |
f.setIgnoringComments(true); | |
f.setIgnoringElementContentWhitespace(true); | |
f.setValidating(false); | |
this.docBuilder=f.newDocumentBuilder(); | |
XPathFactory factory=XPathFactory.newInstance(); | |
this.xpath=factory.newXPath(); | |
this.collator= Collator.getInstance(Locale.FRENCH); | |
this.collator.setStrength(Collator.PRIMARY); | |
} | |
private int search(XMLStreamWriter w,String geneName) | |
throws Exception | |
{ | |
w.writeCharacters("\n"); | |
w.writeStartElement("gene"); | |
w.writeAttribute("name", geneName); | |
String url= "http://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?db=gene&term="+ | |
URLEncoder.encode(geneName+"[PREF] \""+this.organism+"\"[ORGN]", "UTF-8"); | |
LOG.info(url); | |
Document dom=this.docBuilder.parse(url); | |
NodeList list=(NodeList)this.xpath.evaluate( | |
"/eSearchResult/IdList/Id", | |
dom,XPathConstants.NODESET); | |
if(list.getLength()==0) | |
{ | |
w.writeComment("Cannot find any entry for "+geneName); | |
w.writeEndElement(); | |
return -1; | |
} | |
else if(list.getLength()!=1) | |
{ | |
w.writeComment("Ambigous name "+geneName); | |
w.writeEndElement(); | |
return -1; | |
} | |
String geneId= list.item(0).getTextContent(); | |
LOG.info("GeneId:"+geneId); | |
w.writeAttribute("geneId", geneId); | |
url="http://www.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=gene&id="+ | |
geneId+ | |
"&rettype=text&retmode=xml"; | |
LOG.info(url); | |
dom=this.docBuilder.parse(url); | |
list=(NodeList)this.xpath.evaluate( | |
"//PubMedId", | |
dom,XPathConstants.NODESET); | |
if(list.getLength()==0) | |
{ | |
w.writeComment("No pubmed for "+geneName); | |
w.writeEndElement(); | |
return -1; | |
} | |
List<Author> authors=new ArrayList<Author>(); | |
Set<Integer> pmidSet=new TreeSet<Integer>(); | |
for(int articleIdx=0;articleIdx< list.getLength();++articleIdx) | |
{ | |
String pmid= list.item(articleIdx).getTextContent(); | |
LOG.info("PMID:"+pmid); | |
pmidSet.add(Integer.parseInt(pmid)); | |
} | |
w.writeAttribute("count-pmids",String.valueOf(pmidSet.size())); | |
w.writeCharacters("\n"); | |
for(Integer pmid: pmidSet) | |
{ | |
LOG.info("PMID:"+pmid); | |
url="http://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=pubmed&id="+pmid+"&retmode=xml"; | |
LOG.info("url:"+url); | |
dom=this.docBuilder.parse(url); | |
Node n=(Node)this.xpath.evaluate("//Affiliation", dom,XPathConstants.NODE); | |
if(n==null) continue; | |
String affiliation=n.getTextContent(); | |
String adressFragments[]=affiliation.split("[ \t\\:\\<,\\>\\(\\)]"); | |
LOG.info("affiliation:"+affiliation); | |
NodeList authorList=(NodeList)this.xpath.evaluate( | |
"//AuthorList/Author", | |
dom,XPathConstants.NODESET); | |
LOG.info("Authors:"+authorList.getLength()); | |
if(authorList.getLength()==0) continue; | |
for(int j=0;j< authorList.getLength();++j) | |
{ | |
boolean collective=false; | |
Author author=new Author(); | |
for(Node c1=authorList.item(j).getFirstChild();c1!=null;c1=c1.getNextSibling()) | |
{ | |
if(c1.getNodeType()!=Node.ELEMENT_NODE) continue; | |
String tag=c1.getNodeName(); | |
String content= c1.getTextContent(); | |
if(tag.equals("LastName")) | |
{ | |
author.lastName= content; | |
} | |
else if(tag.equals("FirstName") || tag.equals("ForeName")) | |
{ | |
author.firstName= content; | |
} | |
else if(tag.equals("Initials")) | |
{ | |
author.initials= content; | |
} | |
else if(tag.equals("CollectiveName")) | |
{ | |
collective=true; | |
break; | |
} | |
else if(tag.equals("Suffix")) | |
{ | |
author.suffix= content; | |
} | |
} | |
if(collective) continue; | |
LOG.info("Make New Author:"+author); | |
int k=0; | |
for(k=0;k< authors.size();++k) | |
{ | |
Author p=authors.get(k); | |
if( !p.firstName.isEmpty() && | |
this.collator.compare(p.firstName,author.firstName)==0 && | |
this.collator.compare(p.lastName,author.lastName)==0) | |
{ | |
LOG.info("Same: "+p+" "+author); | |
author=p; | |
break; | |
} | |
} | |
if(k==authors.size()) | |
{ | |
k=0; | |
for(k=0;k< authors.size();++k) | |
{ | |
Author p=authors.get(k); | |
if( | |
( | |
(!author.initials.isEmpty() && p.firstName.toLowerCase().startsWith(author.initials.toLowerCase())) || | |
(!p.initials.isEmpty() && author.firstName.toLowerCase().startsWith(p.initials) )|| | |
this.collator.compare(p.initials,author.initials)==0 )&& | |
this.collator.compare(p.lastName,author.lastName)==0) | |
{ | |
LOG.info("Same: "+p+" "+author); | |
if(p.firstName.length()< author.firstName.length()) | |
{ | |
p.firstName=author.firstName; | |
} | |
author=p; | |
break; | |
} | |
} | |
} | |
if(k==authors.size()) | |
{ | |
LOG.info("Adding: "+author); | |
authors.add(author); | |
} | |
author.factor*=j; | |
author.affilitations.add(affiliation); | |
author.pmids.add(pmid); | |
if(affiliation.indexOf('@')!=-1) | |
{ | |
for(String mail: adressFragments) | |
{ | |
mail.replaceAll("\\{\\}", ""); | |
if(mail.endsWith(".")) mail= mail.substring(0,mail.length()-1); | |
int index=mail.indexOf('@'); | |
if(index==-1) continue; | |
String mailPrefix=mail.substring(0,index).toLowerCase(); | |
if(mailPrefix.contains(author.lastName.toLowerCase()) || | |
collator.compare(mailPrefix, author.lastName)==0) | |
{ | |
LOG.info("Adding: "+mail+" to "+author); | |
author.mails.add(mail.toLowerCase()); | |
} | |
else if( author.firstName.length()>1 && | |
(mailPrefix.contains( author.firstName.toLowerCase()) || | |
collator.compare(mailPrefix, author.firstName)==0)) | |
{ | |
LOG.info("Adding: "+mail+" to "+author); | |
author.mails.add(mail.toLowerCase()); | |
} | |
} | |
} | |
} | |
} | |
if(authors.isEmpty()) | |
{ | |
w.writeComment("No Author found"); | |
w.writeEndElement(); | |
return -1; | |
} | |
Collections.sort(authors,new Comparator<Author>() | |
{ | |
@Override | |
public int compare(Author o1, Author o2) | |
{ | |
int i= o2.pmids.size()-o1.pmids.size(); | |
if(i!=0) return i; | |
i= o2.factor-o1.factor;//later is more interesting ? not sure... | |
return i; | |
} | |
}); | |
authors.get(0).write(w); | |
w.writeEndElement(); | |
return 0; | |
} | |
public static void main(String[] args) | |
{ | |
try { | |
BioStar4296 app= new BioStar4296(); | |
int optind=0; | |
while(optind<args.length) | |
{ | |
if(args[optind].equals("-h")) | |
{ | |
System.err.println("Pierre Lindenbaum"); | |
System.err.println("Options:"); | |
System.err.println(" -o <organism> ["+app.organism+"]"); | |
System.err.println(" -v show logs"); | |
return; | |
} | |
else if(args[optind].equals("-o")) | |
{ | |
app.organism=args[++optind]; | |
} | |
else if(args[optind].equals("-v")) | |
{ | |
app.LOG.setLevel(Level.ALL); | |
} | |
else if(args[optind].equals("--")) | |
{ | |
optind++; | |
break; | |
} | |
else if(args[optind].startsWith("-")) | |
{ | |
System.err.println("Unnown option: "+args[optind]); | |
return; | |
} | |
else | |
{ | |
break; | |
} | |
++optind; | |
} | |
if(optind==args.length) | |
{ | |
System.err.println("Gene Name missing"); | |
} | |
else | |
{ | |
XMLOutputFactory xmlfactory= XMLOutputFactory.newInstance(); | |
XMLStreamWriter w= xmlfactory.createXMLStreamWriter(System.out,"UTF-8"); | |
w.writeStartDocument("UTF-8","1.0"); | |
w.writeCharacters("\n"); | |
w.writeStartElement("experts"); | |
w.writeCharacters("\n"); | |
while(optind < args.length) | |
{ | |
app.search(w,args[optind]); | |
optind++; | |
w.writeCharacters("\n"); | |
} | |
w.writeEndElement(); | |
w.writeEndDocument(); | |
w.flush(); | |
} | |
} catch (Exception e) | |
{ | |
e.printStackTrace(); | |
} | |
} | |
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<?xml version="1.0" encoding="UTF-8"?> | |
<experts> | |
<gene name="ZC3H7B" geneId="23264" count-pmids="13"> | |
<Person> | |
<firstName>Sumio</firstName> | |
<lastName>Sugano</lastName> | |
<pmid>8125298</pmid> | |
<pmid>9373149</pmid> | |
<pmid>14702039</pmid> | |
<affilitation>International and Interdisciplinary Studies, The University of Tokyo, Japan.</affilitation> | |
<affilitation>Institute of Medical Science, University of Tokyo, Japan.</affilitation> | |
<affilitation>Helix Research Institute, 1532-3 Yana, Kisarazu, Chiba 292-0812, Japan.</affilitation> | |
</Person> | |
</gene> | |
<gene name="eif4G1" geneId="1981" count-pmids="106"> | |
<Person> | |
<firstName>Nahum</firstName> | |
<lastName>Sonenberg</lastName> | |
<pmid>7651417</pmid> | |
<pmid>7935836</pmid> | |
<pmid>8449919</pmid> | |
<pmid>8521827</pmid> | |
<pmid>9372926</pmid> | |
<pmid>9418880</pmid> | |
<pmid>9857202</pmid> | |
<pmid>9878069</pmid> | |
<pmid>10523622</pmid> | |
<pmid>10753870</pmid> | |
<pmid>10872469</pmid> | |
<pmid>10996799</pmid> | |
<pmid>15193258</pmid> | |
<pmid>15234964</pmid> | |
<pmid>15314020</pmid> | |
<pmid>15961545</pmid> | |
<pmid>16698552</pmid> | |
<pmid>19114555</pmid> | |
<pmid>19203580</pmid> | |
<pmid>20053821</pmid> | |
<affilitation>Department of Biochemistry and McGill Cancer Center, McGill University, Montreal, H3G 1Y6, Quebec, Canada.</affilitation> | |
<affilitation>Department of Biochemistry, McGill University, Montreal, Quebec, Canada.</affilitation> | |
<affilitation>Laboratories of Molecular Biophysics, The Rockefeller University, New York, New York 10021, USA.</affilitation> | |
<affilitation>Molecular Histology Unit, San Raffaele Institute, 20132 Milan, Italy.</affilitation> | |
<affilitation>Department of Biochemistry McGill University, Montréal, Québec, Canada. gingras@med.mcgill.ca</affilitation> | |
<affilitation>Department of Biochemistry and McGill Cancer Cancer Center, McGill University, 3655 Drummond Street, Montréal, Québec, H3G 1Y6 Canada.</affilitation> | |
<affilitation>Department of Biochemistry and McGill Cancer Centre, McGill University, Montreal, Quebec, Canada.</affilitation> | |
<affilitation>Department of Biological Chemistry and Molecular Pharmacology, Harvard Medical School, Boston, MA, USA. amarint@bu.edu</affilitation> | |
<affilitation>Dana-Farber Cancer Institute, Harvard Medical School, Boston, Massachusetts 02115, USA.</affilitation> | |
<affilitation>Department of Biochemistry, University of California, Riverside, Riverside, California 92521, USA.</affilitation> | |
<affilitation>Department of Biochemistry, McGill University, Montréal, Quebec, Canada.</affilitation> | |
<affilitation>Department of Biochemistry and McGill Cancer Center, McGill University, Montréal, Québec, Canada.</affilitation> | |
<affilitation>Department of Biochemistry and McGill Cancer Centre, McGill University, Drummond Street 3655, Montreal, Quebec, Canada H3G 1Y6.</affilitation> | |
<affilitation>Department of Biochemistry, McGill Cancer Center, McGill University, Montreal, Quebec, Canada H3G 1Y6.</affilitation> | |
<affilitation>Department of Biochemistry and Molecular Genetics, University of Illinois at Chicago, 60607, USA. nhay@uic.edu</affilitation> | |
<affilitation>Department of Biochemistry, McGill University, Montréal, Québec, Canada.</affilitation> | |
<affilitation>Department of Medicine, University of Minnesota, Minneapolis, MN 55455, USA.</affilitation> | |
<affilitation>Department of Biochemistry, Case Western Reserve University, Cleveland, Ohio 44106.</affilitation> | |
</Person> | |
</gene> | |
<gene name="PRNP" geneId="5621" count-pmids="429"> | |
<Person> | |
<firstName>John</firstName> | |
<lastName>Collinge</lastName> | |
<pmid>1352724</pmid> | |
<pmid>1677164</pmid> | |
<pmid>2159587</pmid> | |
<pmid>2567794</pmid> | |
<pmid>8364585</pmid> | |
<pmid>11283320</pmid> | |
<pmid>11704923</pmid> | |
<pmid>12514748</pmid> | |
<pmid>12621436</pmid> | |
<pmid>12690204</pmid> | |
<pmid>15123682</pmid> | |
<pmid>15539564</pmid> | |
<pmid>16099923</pmid> | |
<pmid>16156720</pmid> | |
<pmid>16342955</pmid> | |
<pmid>16824036</pmid> | |
<pmid>16847141</pmid> | |
<pmid>16925523</pmid> | |
<pmid>17709704</pmid> | |
<pmid>18638557</pmid> | |
<pmid>19081515</pmid> | |
<pmid>19218199</pmid> | |
<pmid>19321423</pmid> | |
<pmid>19369250</pmid> | |
<pmid>19923577</pmid> | |
<pmid>20109837</pmid> | |
<pmid>20583301</pmid> | |
<mail>j.collinge@ic.ac.uk</mail> | |
<affilitation>MRC Prion Unit, Department of Neurodegenerative Disease, Institute of Neurology, Queen Square, London WC1N 3BG, UK.</affilitation> | |
<affilitation>CNS Infection and Immunity Group, Department of Neurogenetics, Division of Neurosciences and Psychological Medicine, Faculty of Medicine, Imperial College, Norfolk Place, London W2 1PG, UK.</affilitation> | |
<affilitation>Krebs Institute for Biomolecular Research, Department of Molecular Biology and Biotechnology, University of Sheffield, Sheffield S10 2TN, UK.</affilitation> | |
<affilitation>MRC Prion Unit and Department of Neurogenetics, Imperial College School of Medicine at St. Mary's, London, United Kingdom. J.Collinge@ic.ac.uk</affilitation> | |
<affilitation>Division of Neuroscience (Neurophysiology), Medical School, University of Birmingham, Edgbaston, Birmingham, UK. sratte@pitt.edu</affilitation> | |
<affilitation>Division of Psychiatry, Clinical Research Centre, Harrow, Middlesex, UK.</affilitation> | |
<affilitation>MRC Prion Unit and National Prion Clinic, UCL Institute of Neurology and National Hospital for Neurology and Neurosurgery, London, UK.</affilitation> | |
<affilitation>Medical Research Council Prion Unit and Department of Neurodegenerative Disease, Institute of Neurology, Queen Square, London, UK.</affilitation> | |
<affilitation>MRC Prion Unit and Department of Neurodegenerative Disease, UCL Institute of Neurology, National Hospital for Neurology and Neurosurgery, Queen Square, London WC1N 3BG, UK.</affilitation> | |
<affilitation>MRC Prion Unit, Department of Neurodegenerative Disease, UCL Institute of Neurology, National Hospital for Neurology and Neurosurgery, Queen Square, London, WC1N 3BG, UK.</affilitation> | |
<affilitation>MRC Prion Unit, Department of Neurodegenerative Disease, Institute of Neurology, University College London, Queen Square, London WC1N 3BG, UK.</affilitation> | |
<affilitation>MRC Prion Unit and Department of Neurogenetics, Imperial College, St. Mary's Hospital, London, United Kingdom.</affilitation> | |
<affilitation>Medical Research Council Prion Unit, Department of Neurodegenerative Disease, Institute of Neurology, University College London, Queen Square, London WC1N 3BG, United Kingdom.</affilitation> | |
<affilitation>Medical Research Council (MRC) Prion Unit and Department of Neurodegenerative Disease, Institute of Neurology, University College London, Queen Square, London WC1N 3BG, UK.</affilitation> | |
<affilitation>MRC Prion Unit and National Prion Clinic, Institute of Neurology and National Hospital for Neurology and Neurosurgery, Queen Square, London, UK. l.l.hosszu@shef.ac.uk</affilitation> | |
<affilitation>MRC Prion Unit, Department of Neurodegenerative Diseases, Institute of Neurology, University College London, Queen Square, London WC1N 3BG, UK.</affilitation> | |
<affilitation>Medical Research Council Prion Unit, Institute of Neurology, Queen Square, London WC1N 3BG, United Kingdom.</affilitation> | |
<affilitation>Division of Psychiatry, Clinical Research Centre, Harrow, Middlesex, U.K.</affilitation> | |
<affilitation>Medical Research Council Prion Unit, Department of Neurodegenerative Disease, University College London Institute of Neurology, United Kingdom.</affilitation> | |
<affilitation>Division of Psychiatry, Clinical Research Centre, Harrow, UK.</affilitation> | |
<affilitation>Department of Biochemistry and Molecular Genetics, St Mary's Hospital Medical School, London, UK.</affilitation> | |
<affilitation>Department of Biochemistry and Molecular Genetics, St. Mary's Hospital Medical School, Norfolk Place, London, UK.</affilitation> | |
<affilitation>MRC Prion Unit and Department of Neurodegenerative Disease, Institute of Neurology, Queen Square, London, UK.</affilitation> | |
<affilitation>Medical Research Council Prion Unit, and Department of Neurodegenerative Disease, Institute of Neurology, University College, Queen Square, London WC1N 3BG, UK.</affilitation> | |
</Person> | |
</gene> | |
</experts> |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment