Skip to content

Instantly share code, notes, and snippets.

@lindenb
Created December 14, 2010 14:29
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save lindenb/740496 to your computer and use it in GitHub Desktop.
Save lindenb/740496 to your computer and use it in GitHub Desktop.
Given a gene, identify the world experts http://biostar.stackexchange.com/questions/4296
/**
* Author: Pierre Lindenbaum PhD
* WWW: http://plindenbaum.blogspot.com
* Motivation:
* Given a gene, identify the world experts
* http://biostar.stackexchange.com/questions/4296
*/
import java.net.URLEncoder;
import java.text.Collator;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashSet;
import java.util.List;
import java.util.Locale;
import java.util.Set;
import java.util.TreeSet;
import java.util.logging.Level;
import java.util.logging.Logger;
import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.stream.XMLOutputFactory;
import javax.xml.stream.XMLStreamWriter;
import javax.xml.xpath.XPath;
import javax.xml.xpath.XPathConstants;
import javax.xml.xpath.XPathFactory;
import org.w3c.dom.Document;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
/**
 * Command-line tool: for each gene name given on the command line, looks the
 * gene up in NCBI Gene (restricted to the configured organism), fetches the
 * PubMed articles referenced by the gene record, merges the authors of those
 * articles and writes the author having the most articles as XML on stdout.
 */
public class BioStar4296
{
    /** Base URL of the NCBI E-utilities (HTTPS endpoint). */
    private static final String EUTILS = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/";
    /** Separators used to cut an affiliation string into candidate e-mail tokens. */
    private static final String ADDRESS_SEPARATORS = "[ \t\\:\\<,\\>\\(\\)]";

    private final Logger LOG = Logger.getLogger(BioStar4296.class.getName());
    /** Organism used to restrict the gene search; changed with option -o. */
    private String organism = "Homo Sapiens";
    private DocumentBuilder docBuilder;
    private XPath xpath;
    /** Accent- and case-insensitive comparator used to match author names. */
    private Collator collator;

    /** One (merged) author, accumulated over all the articles of a gene. */
    static class Author
    {
        String suffix = "";
        String firstName = "";
        String lastName = "";
        String initials = "";
        Set<String> mails = new HashSet<String>();
        Set<Integer> pmids = new TreeSet<Integer>();
        /** Running product of the zero-based positions held in the author lists. */
        int factor = 1;
        Set<String> affilitations = new HashSet<String>();

        /** Null-safe string equality (kept hand-written: the file targets pre-Java-7 APIs). */
        private static boolean same(String a, String b)
        {
            return a == null ? b == null : a.equals(b);
        }

        /** Hash over first and last name, consistent with {@link #equals(Object)}. */
        @Override
        public int hashCode()
        {
            final int prime = 31;
            int result = 1;
            result = prime * result + ((firstName == null) ? 0 : firstName.hashCode());
            result = prime * result + ((lastName == null) ? 0 : lastName.hashCode());
            return result;
        }

        /** Two authors are equal when both first name and last name are equal. */
        @Override
        public boolean equals(Object obj)
        {
            if (this == obj) return true;
            if (obj == null || getClass() != obj.getClass()) return false;
            Author other = (Author) obj;
            return same(firstName, other.firstName) && same(lastName, other.lastName);
        }

        @Override
        public String toString()
        {
            return firstName + " " + lastName + " lab:" + this.affilitations + " mails:" + this.mails;
        }

        /** Writes <code>&lt;name&gt;text&lt;/name&gt;</code> followed by a newline. */
        private static void writeTextElement(XMLStreamWriter w, String name, String text)
            throws Exception
        {
            w.writeStartElement(name);
            w.writeCharacters(text);
            w.writeEndElement();
            w.writeCharacters("\n");
        }

        /**
         * Writes this author as a <code>Person</code> element: names, then one
         * element per pmid, mail and affiliation.
         *
         * @param w the XML writer receiving the element
         * @throws Exception if the writer fails
         */
        void write(XMLStreamWriter w)
            throws Exception
        {
            w.writeStartElement("Person");
            w.writeCharacters("\n");
            writeTextElement(w, "firstName", firstName);
            writeTextElement(w, "lastName", lastName);
            for (Integer s : pmids)
            {
                writeTextElement(w, "pmid", String.valueOf(s));
            }
            for (String s : mails)
            {
                writeTextElement(w, "mail", s);
            }
            for (String s : affilitations)
            {
                // element name (including its spelling) is part of the emitted format
                writeTextElement(w, "affilitation", s);
            }
            w.writeEndElement();
            w.writeCharacters("\n");
        }
    }

    /** Builds the XML parser, the XPath engine and the name collator; logging is off by default. */
    private BioStar4296() throws Exception
    {
        LOG.setLevel(Level.OFF);
        DocumentBuilderFactory f = DocumentBuilderFactory.newInstance();
        f.setNamespaceAware(false);
        f.setCoalescing(true);
        f.setIgnoringComments(true);
        f.setIgnoringElementContentWhitespace(true);
        f.setValidating(false);
        this.docBuilder = f.newDocumentBuilder();
        XPathFactory factory = XPathFactory.newInstance();
        this.xpath = factory.newXPath();
        this.collator = Collator.getInstance(Locale.FRENCH);
        // PRIMARY strength: differences of case and accents are ignored
        this.collator.setStrength(Collator.PRIMARY);
    }

    /** Explains in an XML comment why the gene has no result, closes the gene element, returns -1. */
    private int giveUp(XMLStreamWriter w, String reason)
        throws Exception
    {
        w.writeComment(reason);
        w.writeEndElement();
        return -1;
    }

    /**
     * Reads one <code>Author</code> element of a PubMed record.
     *
     * @return the author, or null when the element holds a CollectiveName
     */
    private Author readAuthor(Node authorNode)
    {
        Author author = new Author();
        for (Node c1 = authorNode.getFirstChild(); c1 != null; c1 = c1.getNextSibling())
        {
            if (c1.getNodeType() != Node.ELEMENT_NODE) continue;
            String tag = c1.getNodeName();
            String content = c1.getTextContent();
            if (tag.equals("LastName"))
            {
                author.lastName = content;
            }
            else if (tag.equals("FirstName") || tag.equals("ForeName"))
            {
                author.firstName = content;
            }
            else if (tag.equals("Initials"))
            {
                author.initials = content;
            }
            else if (tag.equals("CollectiveName"))
            {
                return null;
            }
            else if (tag.equals("Suffix"))
            {
                author.suffix = content;
            }
        }
        return author;
    }

    /**
     * Searches the already known authors for the same person as the candidate.
     * A first pass requires matching first and last names; a second pass
     * accepts matching initials with a matching last name, and then keeps the
     * longer of the two first names on the known author.
     *
     * @return the known author, or null if the candidate is new
     */
    private Author findKnownAuthor(List<Author> authors, Author candidate)
    {
        for (Author p : authors)
        {
            if (!p.firstName.isEmpty() &&
                this.collator.compare(p.firstName, candidate.firstName) == 0 &&
                this.collator.compare(p.lastName, candidate.lastName) == 0)
            {
                LOG.info("Same: " + p + " " + candidate);
                return p;
            }
        }
        String candidateInitials = candidate.initials.toLowerCase(Locale.ROOT);
        String candidateFirstName = candidate.firstName.toLowerCase(Locale.ROOT);
        for (Author p : authors)
        {
            boolean initialsMatch =
                (!candidateInitials.isEmpty() &&
                    p.firstName.toLowerCase(Locale.ROOT).startsWith(candidateInitials)) ||
                // both sides lower-cased: the original compared against raw upper-case initials
                (!p.initials.isEmpty() &&
                    candidateFirstName.startsWith(p.initials.toLowerCase(Locale.ROOT))) ||
                this.collator.compare(p.initials, candidate.initials) == 0;
            if (initialsMatch && this.collator.compare(p.lastName, candidate.lastName) == 0)
            {
                LOG.info("Same: " + p + " " + candidate);
                if (p.firstName.length() < candidate.firstName.length())
                {
                    p.firstName = candidate.firstName;
                }
                return p;
            }
        }
        return null;
    }

    /**
     * Adds to the author every e-mail token of the affiliation whose local
     * part contains (or collates equal to) the author's last or first name.
     */
    private void collectMails(Author author, String[] fragments)
    {
        String lastName = author.lastName.toLowerCase(Locale.ROOT);
        String firstName = author.firstName.toLowerCase(Locale.ROOT);
        for (String fragment : fragments)
        {
            // Strings are immutable: the stripped value must be kept
            String mail = fragment.replaceAll("[\\{\\}]", "");
            if (mail.endsWith(".")) mail = mail.substring(0, mail.length() - 1);
            int at = mail.indexOf('@');
            if (at == -1) continue;
            String mailPrefix = mail.substring(0, at).toLowerCase(Locale.ROOT);
            // an empty name would be "contained" in every prefix, hence the guards
            boolean lastNameHit = !lastName.isEmpty() &&
                (mailPrefix.contains(lastName) ||
                 collator.compare(mailPrefix, author.lastName) == 0);
            boolean firstNameHit = author.firstName.length() > 1 &&
                (mailPrefix.contains(firstName) ||
                 collator.compare(mailPrefix, author.firstName) == 0);
            if (lastNameHit || firstNameHit)
            {
                LOG.info("Adding: " + mail + " to " + author);
                author.mails.add(mail.toLowerCase(Locale.ROOT));
            }
        }
    }

    /**
     * Writes a <code>gene</code> element for the gene: either the top-ranked
     * author or an XML comment explaining why nothing was found.
     *
     * @param w the XML writer receiving the output
     * @param geneName the gene name searched in NCBI Gene
     * @return 0 when an author was written, -1 otherwise
     * @throws Exception on network, parsing or writing failure
     */
    private int search(XMLStreamWriter w,String geneName)
        throws Exception
    {
        w.writeCharacters("\n");
        w.writeStartElement("gene");
        w.writeAttribute("name", geneName);

        // 1) gene name -> gene id
        String url = EUTILS + "esearch.fcgi?db=gene&term=" +
            URLEncoder.encode(geneName + "[PREF] \"" + this.organism + "\"[ORGN]", "UTF-8");
        LOG.info(url);
        Document dom = this.docBuilder.parse(url);
        NodeList list = (NodeList) this.xpath.evaluate(
            "/eSearchResult/IdList/Id",
            dom, XPathConstants.NODESET);
        if (list.getLength() == 0)
        {
            return giveUp(w, "Cannot find any entry for " + geneName);
        }
        if (list.getLength() != 1)
        {
            return giveUp(w, "Ambigous name " + geneName);
        }
        String geneId = list.item(0).getTextContent();
        LOG.info("GeneId:" + geneId);
        w.writeAttribute("geneId", geneId);

        // 2) gene id -> PubMed ids referenced by the gene record
        url = EUTILS + "efetch.fcgi?db=gene&id=" + geneId + "&rettype=text&retmode=xml";
        LOG.info(url);
        dom = this.docBuilder.parse(url);
        list = (NodeList) this.xpath.evaluate(
            "//PubMedId",
            dom, XPathConstants.NODESET);
        if (list.getLength() == 0)
        {
            return giveUp(w, "No pubmed for " + geneName);
        }
        Set<Integer> pmidSet = new TreeSet<Integer>();
        for (int articleIdx = 0; articleIdx < list.getLength(); ++articleIdx)
        {
            String pmid = list.item(articleIdx).getTextContent();
            LOG.info("PMID:" + pmid);
            pmidSet.add(Integer.parseInt(pmid.trim()));
        }
        w.writeAttribute("count-pmids", String.valueOf(pmidSet.size()));
        w.writeCharacters("\n");

        // 3) one PubMed record per id -> merged authors
        List<Author> authors = new ArrayList<Author>();
        for (Integer pmid : pmidSet)
        {
            LOG.info("PMID:" + pmid);
            url = EUTILS + "efetch.fcgi?db=pubmed&id=" + pmid + "&retmode=xml";
            LOG.info("url:" + url);
            dom = this.docBuilder.parse(url);
            // only the first Affiliation of the record is used, for all its authors
            Node n = (Node) this.xpath.evaluate("//Affiliation", dom, XPathConstants.NODE);
            if (n == null) continue;
            String affiliation = n.getTextContent();
            String[] addressFragments = affiliation.split(ADDRESS_SEPARATORS);
            LOG.info("affiliation:" + affiliation);
            NodeList authorList = (NodeList) this.xpath.evaluate(
                "//AuthorList/Author",
                dom, XPathConstants.NODESET);
            LOG.info("Authors:" + authorList.getLength());
            for (int j = 0; j < authorList.getLength(); ++j)
            {
                Author author = readAuthor(authorList.item(j));
                if (author == null) continue; // collective name
                LOG.info("Make New Author:" + author);
                Author known = findKnownAuthor(authors, author);
                if (known == null)
                {
                    LOG.info("Adding: " + author);
                    authors.add(author);
                }
                else
                {
                    author = known;
                }
                // NOTE(review): j is zero-based, so one first-author position
                // zeroes the factor for good — confirm this is the intended heuristic.
                author.factor *= j;
                author.affilitations.add(affiliation);
                author.pmids.add(pmid);
                if (affiliation.indexOf('@') != -1)
                {
                    collectMails(author, addressFragments);
                }
            }
        }
        if (authors.isEmpty())
        {
            return giveUp(w, "No Author found");
        }

        // 4) rank: most articles first, then highest factor
        Collections.sort(authors, new Comparator<Author>()
        {
            @Override
            public int compare(Author o1, Author o2)
            {
                int bySize = Integer.valueOf(o2.pmids.size()).compareTo(Integer.valueOf(o1.pmids.size()));
                if (bySize != 0) return bySize;
                // compareTo instead of a subtraction: factor is a product and may overflow
                return Integer.valueOf(o2.factor).compareTo(Integer.valueOf(o1.factor));
            }
        });
        authors.get(0).write(w);
        w.writeEndElement();
        return 0;
    }

    /**
     * Entry point. Options: -h (help), -o &lt;organism&gt;, -v (show logs),
     * -- (end of options); the remaining arguments are gene names.
     */
    public static void main(String[] args)
    {
        try
        {
            BioStar4296 app = new BioStar4296();
            int optind = 0;
            while (optind < args.length)
            {
                if (args[optind].equals("-h"))
                {
                    System.err.println("Pierre Lindenbaum");
                    System.err.println("Options:");
                    System.err.println(" -o <organism> [" + app.organism + "]");
                    System.err.println(" -v show logs");
                    return;
                }
                else if (args[optind].equals("-o"))
                {
                    if (optind + 1 >= args.length)
                    {
                        // previously an ArrayIndexOutOfBoundsException stack trace
                        System.err.println("Option -o requires an organism name");
                        return;
                    }
                    app.organism = args[++optind];
                }
                else if (args[optind].equals("-v"))
                {
                    app.LOG.setLevel(Level.ALL);
                }
                else if (args[optind].equals("--"))
                {
                    optind++;
                    break;
                }
                else if (args[optind].startsWith("-"))
                {
                    System.err.println("Unnown option: " + args[optind]);
                    return;
                }
                else
                {
                    break;
                }
                ++optind;
            }
            if (optind == args.length)
            {
                System.err.println("Gene Name missing");
            }
            else
            {
                XMLOutputFactory xmlfactory = XMLOutputFactory.newInstance();
                XMLStreamWriter w = xmlfactory.createXMLStreamWriter(System.out, "UTF-8");
                w.writeStartDocument("UTF-8", "1.0");
                w.writeCharacters("\n");
                w.writeStartElement("experts");
                w.writeCharacters("\n");
                while (optind < args.length)
                {
                    app.search(w, args[optind]);
                    optind++;
                    w.writeCharacters("\n");
                }
                w.writeEndElement();
                w.writeEndDocument();
                w.flush();
                w.close();
            }
        }
        catch (Exception e)
        {
            e.printStackTrace();
        }
    }
}
<?xml version="1.0" encoding="UTF-8"?>
<experts>
<gene name="ZC3H7B" geneId="23264" count-pmids="13">
<Person>
<firstName>Sumio</firstName>
<lastName>Sugano</lastName>
<pmid>8125298</pmid>
<pmid>9373149</pmid>
<pmid>14702039</pmid>
<affilitation>International and Interdisciplinary Studies, The University of Tokyo, Japan.</affilitation>
<affilitation>Institute of Medical Science, University of Tokyo, Japan.</affilitation>
<affilitation>Helix Research Institute, 1532-3 Yana, Kisarazu, Chiba 292-0812, Japan.</affilitation>
</Person>
</gene>
<gene name="eif4G1" geneId="1981" count-pmids="106">
<Person>
<firstName>Nahum</firstName>
<lastName>Sonenberg</lastName>
<pmid>7651417</pmid>
<pmid>7935836</pmid>
<pmid>8449919</pmid>
<pmid>8521827</pmid>
<pmid>9372926</pmid>
<pmid>9418880</pmid>
<pmid>9857202</pmid>
<pmid>9878069</pmid>
<pmid>10523622</pmid>
<pmid>10753870</pmid>
<pmid>10872469</pmid>
<pmid>10996799</pmid>
<pmid>15193258</pmid>
<pmid>15234964</pmid>
<pmid>15314020</pmid>
<pmid>15961545</pmid>
<pmid>16698552</pmid>
<pmid>19114555</pmid>
<pmid>19203580</pmid>
<pmid>20053821</pmid>
<affilitation>Department of Biochemistry and McGill Cancer Center, McGill University, Montreal, H3G 1Y6, Quebec, Canada.</affilitation>
<affilitation>Department of Biochemistry, McGill University, Montreal, Quebec, Canada.</affilitation>
<affilitation>Laboratories of Molecular Biophysics, The Rockefeller University, New York, New York 10021, USA.</affilitation>
<affilitation>Molecular Histology Unit, San Raffaele Institute, 20132 Milan, Italy.</affilitation>
<affilitation>Department of Biochemistry McGill University, Montréal, Québec, Canada. gingras@med.mcgill.ca</affilitation>
<affilitation>Department of Biochemistry and McGill Cancer Cancer Center, McGill University, 3655 Drummond Street, Montréal, Québec, H3G 1Y6 Canada.</affilitation>
<affilitation>Department of Biochemistry and McGill Cancer Centre, McGill University, Montreal, Quebec, Canada.</affilitation>
<affilitation>Department of Biological Chemistry and Molecular Pharmacology, Harvard Medical School, Boston, MA, USA. amarint@bu.edu</affilitation>
<affilitation>Dana-Farber Cancer Institute, Harvard Medical School, Boston, Massachusetts 02115, USA.</affilitation>
<affilitation>Department of Biochemistry, University of California, Riverside, Riverside, California 92521, USA.</affilitation>
<affilitation>Department of Biochemistry, McGill University, Montréal, Quebec, Canada.</affilitation>
<affilitation>Department of Biochemistry and McGill Cancer Center, McGill University, Montréal, Québec, Canada.</affilitation>
<affilitation>Department of Biochemistry and McGill Cancer Centre, McGill University, Drummond Street 3655, Montreal, Quebec, Canada H3G 1Y6.</affilitation>
<affilitation>Department of Biochemistry, McGill Cancer Center, McGill University, Montreal, Quebec, Canada H3G 1Y6.</affilitation>
<affilitation>Department of Biochemistry and Molecular Genetics, University of Illinois at Chicago, 60607, USA. nhay@uic.edu</affilitation>
<affilitation>Department of Biochemistry, McGill University, Montréal, Québec, Canada.</affilitation>
<affilitation>Department of Medicine, University of Minnesota, Minneapolis, MN 55455, USA.</affilitation>
<affilitation>Department of Biochemistry, Case Western Reserve University, Cleveland, Ohio 44106.</affilitation>
</Person>
</gene>
<gene name="PRNP" geneId="5621" count-pmids="429">
<Person>
<firstName>John</firstName>
<lastName>Collinge</lastName>
<pmid>1352724</pmid>
<pmid>1677164</pmid>
<pmid>2159587</pmid>
<pmid>2567794</pmid>
<pmid>8364585</pmid>
<pmid>11283320</pmid>
<pmid>11704923</pmid>
<pmid>12514748</pmid>
<pmid>12621436</pmid>
<pmid>12690204</pmid>
<pmid>15123682</pmid>
<pmid>15539564</pmid>
<pmid>16099923</pmid>
<pmid>16156720</pmid>
<pmid>16342955</pmid>
<pmid>16824036</pmid>
<pmid>16847141</pmid>
<pmid>16925523</pmid>
<pmid>17709704</pmid>
<pmid>18638557</pmid>
<pmid>19081515</pmid>
<pmid>19218199</pmid>
<pmid>19321423</pmid>
<pmid>19369250</pmid>
<pmid>19923577</pmid>
<pmid>20109837</pmid>
<pmid>20583301</pmid>
<mail>j.collinge@ic.ac.uk</mail>
<affilitation>MRC Prion Unit, Department of Neurodegenerative Disease, Institute of Neurology, Queen Square, London WC1N 3BG, UK.</affilitation>
<affilitation>CNS Infection and Immunity Group, Department of Neurogenetics, Division of Neurosciences and Psychological Medicine, Faculty of Medicine, Imperial College, Norfolk Place, London W2 1PG, UK.</affilitation>
<affilitation>Krebs Institute for Biomolecular Research, Department of Molecular Biology and Biotechnology, University of Sheffield, Sheffield S10 2TN, UK.</affilitation>
<affilitation>MRC Prion Unit and Department of Neurogenetics, Imperial College School of Medicine at St. Mary's, London, United Kingdom. J.Collinge@ic.ac.uk</affilitation>
<affilitation>Division of Neuroscience (Neurophysiology), Medical School, University of Birmingham, Edgbaston, Birmingham, UK. sratte@pitt.edu</affilitation>
<affilitation>Division of Psychiatry, Clinical Research Centre, Harrow, Middlesex, UK.</affilitation>
<affilitation>MRC Prion Unit and National Prion Clinic, UCL Institute of Neurology and National Hospital for Neurology and Neurosurgery, London, UK.</affilitation>
<affilitation>Medical Research Council Prion Unit and Department of Neurodegenerative Disease, Institute of Neurology, Queen Square, London, UK.</affilitation>
<affilitation>MRC Prion Unit and Department of Neurodegenerative Disease, UCL Institute of Neurology, National Hospital for Neurology and Neurosurgery, Queen Square, London WC1N 3BG, UK.</affilitation>
<affilitation>MRC Prion Unit, Department of Neurodegenerative Disease, UCL Institute of Neurology, National Hospital for Neurology and Neurosurgery, Queen Square, London, WC1N 3BG, UK.</affilitation>
<affilitation>MRC Prion Unit, Department of Neurodegenerative Disease, Institute of Neurology, University College London, Queen Square, London WC1N 3BG, UK.</affilitation>
<affilitation>MRC Prion Unit and Department of Neurogenetics, Imperial College, St. Mary's Hospital, London, United Kingdom.</affilitation>
<affilitation>Medical Research Council Prion Unit, Department of Neurodegenerative Disease, Institute of Neurology, University College London, Queen Square, London WC1N 3BG, United Kingdom.</affilitation>
<affilitation>Medical Research Council (MRC) Prion Unit and Department of Neurodegenerative Disease, Institute of Neurology, University College London, Queen Square, London WC1N 3BG, UK.</affilitation>
<affilitation>MRC Prion Unit and National Prion Clinic, Institute of Neurology and National Hospital for Neurology and Neurosurgery, Queen Square, London, UK. l.l.hosszu@shef.ac.uk</affilitation>
<affilitation>MRC Prion Unit, Department of Neurodegenerative Diseases, Institute of Neurology, University College London, Queen Square, London WC1N 3BG, UK.</affilitation>
<affilitation>Medical Research Council Prion Unit, Institute of Neurology, Queen Square, London WC1N 3BG, United Kingdom.</affilitation>
<affilitation>Division of Psychiatry, Clinical Research Centre, Harrow, Middlesex, U.K.</affilitation>
<affilitation>Medical Research Council Prion Unit, Department of Neurodegenerative Disease, University College London Institute of Neurology, United Kingdom.</affilitation>
<affilitation>Division of Psychiatry, Clinical Research Centre, Harrow, UK.</affilitation>
<affilitation>Department of Biochemistry and Molecular Genetics, St Mary's Hospital Medical School, London, UK.</affilitation>
<affilitation>Department of Biochemistry and Molecular Genetics, St. Mary's Hospital Medical School, Norfolk Place, London, UK.</affilitation>
<affilitation>MRC Prion Unit and Department of Neurodegenerative Disease, Institute of Neurology, Queen Square, London, UK.</affilitation>
<affilitation>Medical Research Council Prion Unit, and Department of Neurodegenerative Disease, Institute of Neurology, University College, Queen Square, London WC1N 3BG, UK.</affilitation>
</Person>
</gene>
</experts>
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment