Created
February 14, 2011 16:54
-
-
Save lindenb/826151 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
package sandbox; | |
import java.io.BufferedReader; | |
import java.io.File; | |
import java.io.FileInputStream; | |
import java.io.FileOutputStream; | |
import java.io.IOException; | |
import java.io.InputStream; | |
import java.io.InputStreamReader; | |
import java.io.OutputStream; | |
import java.util.ArrayList; | |
import java.util.HashMap; | |
import java.util.HashSet; | |
import java.util.Iterator; | |
import java.util.List; | |
import java.util.Map; | |
import java.util.Set; | |
import java.util.TreeSet; | |
import java.util.logging.Logger; | |
import java.util.zip.GZIPInputStream; | |
import javax.xml.namespace.QName; | |
import javax.xml.parsers.DocumentBuilder; | |
import javax.xml.parsers.DocumentBuilderFactory; | |
import javax.xml.parsers.ParserConfigurationException; | |
import javax.xml.stream.XMLEventReader; | |
import javax.xml.stream.XMLInputFactory; | |
import javax.xml.stream.XMLOutputFactory; | |
import javax.xml.stream.XMLStreamException; | |
import javax.xml.stream.XMLStreamWriter; | |
import javax.xml.stream.events.Attribute; | |
import javax.xml.stream.events.StartElement; | |
import javax.xml.stream.events.XMLEvent; | |
import org.w3c.dom.Attr; | |
import org.w3c.dom.Document; | |
import org.w3c.dom.Element; | |
import org.w3c.dom.Node; | |
import org.w3c.dom.Text; | |
/**
 * Command-line tool that reads a PSI-MI XML file, keeps the interactions
 * involving a set of candidate genes and writes the resulting graph as GEXF.
 *
 * The PSI-MI source is streamed three times (see {@link #run()}):
 * first to find the interactors whose shortLabel is a candidate, then to
 * collect the interactions touching those interactors, and finally to
 * resolve the labels of the non-candidate partners found in the second pass.
 */
public class PsiToGexf
{
    private static final Logger LOG=Logger.getLogger("psi2gexf");
    /** PSI-MI XML source: a local path or an http/https/ftp URL; a name ending with ".gz" is gunzipped */
    private String psiSource;
    /** organism taxId; when non-null, only interactors of that organism can be selected as candidates */
    private Integer organismTaxId=null;
    /** candidate genes (matched case-insensitively against the interactors' shortLabel) */
    private Set<String> candidates=new TreeSet<String>(String.CASE_INSENSITIVE_ORDER);
    /** interactor id to interactor */
    private Map<String,Interactor> id2interactor=new HashMap<String, Interactor>();
    /** interactions kept for the output graph */
    private Set<Interaction> interactions=new HashSet<Interaction>();
    /** ids of non-candidate interactors whose label remains to be found */
    private Set<String> remainsInteractorIds=new HashSet<String>();
    /** restrict the interactions to those having only candidates as participants */
    private boolean restrictToCandidates=false;

    /** A node of the graph: the interactor's @id and its shortLabel. */
    private static class Interactor
    {
        String id;
        String shortLabel;
    }

    /**
     * An undirected edge of the graph. Only the first two ids of
     * {@link #interactorsId} are used; they are ordered so that two
     * interactions between the same pair are equal whatever the order
     * in which the ids were added.
     */
    private static class Interaction
    {
        List<String> interactorsId=new ArrayList<String>(2);

        /** @return the smaller (String order) of the first two ids */
        public String getSource()
        {
            String a=this.interactorsId.get(0);
            String b=this.interactorsId.get(1);
            return a.compareTo(b)<0?a:b;
        }

        /** @return the other one of the first two ids, see {@link #getSource()} */
        public String getTarget()
        {
            String a=this.interactorsId.get(0);
            String b=this.interactorsId.get(1);
            return a.compareTo(b)<0?b:a;
        }

        @Override
        public int hashCode()
        {
            final int prime = 31;
            int result = 1;
            result = prime * result + getSource().hashCode();
            result = prime * result + getTarget().hashCode();
            return result;
        }

        @Override
        public boolean equals(Object obj)
        {
            if (this == obj)
                return true;
            if (obj == null)
                return false;
            if (getClass() != obj.getClass())
                return false;
            Interaction other = (Interaction) obj;
            return other.getTarget().equals(this.getTarget()) &&
                other.getSource().equals(this.getSource());
        }
    }

    /**
     * One streaming pass over {@link PsiToGexf#psiSource}. Each
     * &lt;interactor&gt; and &lt;interaction&gt; element is turned into a small DOM
     * tree and handed to {@link #interactor(Element)} or
     * {@link #interaction(Element)}; subclasses override what they need.
     */
    private abstract class AbstractParser
    {
        /** empty document used as a factory for the DOM fragments */
        protected Document dom=null;

        protected AbstractParser()
        {
            DocumentBuilderFactory factory=DocumentBuilderFactory.newInstance();
            factory.setCoalescing(true);
            factory.setExpandEntityReferences(true);
            factory.setValidating(false);
            factory.setNamespaceAware(true);
            factory.setIgnoringComments(true);
            factory.setIgnoringElementContentWhitespace(true);
            try
            {
                DocumentBuilder builder=factory.newDocumentBuilder();
                this.dom=builder.newDocument();
            }
            catch (ParserConfigurationException e)
            {
                throw new RuntimeException(e);
            }
        }

        /** called for each &lt;interactor&gt; element; does nothing by default */
        public void interactor(Element e)
        {
        }

        /** called for each &lt;interaction&gt; element; does nothing by default */
        public void interaction(Element e)
        {
        }

        /** @return true to stop the pass when the next &lt;interactor&gt; is met */
        public boolean isBreakingWhenInteractor()
        {
            return false;
        }

        /** @return true to stop the pass when the next &lt;interaction&gt; is met */
        public boolean isBreakingWhenInteraction()
        {
            return false;
        }

        /**
         * Opens the PSI-MI source and scans it, dispatching the interactor
         * and interaction elements to the callbacks.
         *
         * @throws IOException if the source cannot be opened
         * @throws XMLStreamException if the XML cannot be read
         */
        public void parse() throws IOException,XMLStreamException
        {
            InputStream input=null;
            XMLEventReader in=null;
            try
            {
                LOG.info("opening "+psiSource+" for "+getClass());
                input=open(psiSource);
                XMLInputFactory factory = XMLInputFactory.newInstance();
                factory.setProperty(XMLInputFactory.IS_NAMESPACE_AWARE, Boolean.TRUE);
                factory.setProperty(XMLInputFactory.IS_VALIDATING, Boolean.FALSE);
                factory.setProperty(XMLInputFactory.IS_COALESCING, Boolean.TRUE);
                factory.setProperty(XMLInputFactory.IS_REPLACING_ENTITY_REFERENCES, Boolean.TRUE);
                // the source may be a remote URL: never resolve external entities
                factory.setProperty(XMLInputFactory.IS_SUPPORTING_EXTERNAL_ENTITIES, Boolean.FALSE);
                in=factory.createXMLEventReader(input);
                while(in.hasNext())
                {
                    XMLEvent evt=in.nextEvent();
                    if(!evt.isStartElement()) continue;
                    String localName=evt.asStartElement().getName().getLocalPart();
                    if(localName.equals("interactor"))
                    {
                        if(isBreakingWhenInteractor()) break;
                        interactor(parseDom(dom,in,evt.asStartElement()));
                    }
                    else if(localName.equals("interaction"))
                    {
                        if(isBreakingWhenInteraction()) break;
                        interaction(parseDom(dom,in,evt.asStartElement()));
                    }
                }
            }
            finally
            {
                // best-effort cleanup: a failure to close must not hide the result of the pass
                if(in!=null)
                {
                    try { in.close(); }
                    catch(XMLStreamException err) { LOG.fine("cannot close XML reader: "+err.getMessage()); }
                }
                if(input!=null)
                {
                    try { input.close(); }
                    catch(IOException err) { LOG.fine("cannot close input: "+err.getMessage()); }
                }
            }
        }
    }

    /**
     * First pass: registers in {@link PsiToGexf#id2interactor} the interactors
     * whose shortLabel is a candidate (and, if a taxId was given, whose
     * organism matches). Stops at the first interaction.
     */
    private class CollectCandidateIds
        extends AbstractParser
    {
        @Override
        public boolean isBreakingWhenInteraction()
        {
            return true;
        }

        @Override
        public void interactor(Element root)
        {
            if(organismTaxId!=null)
            {
                Element organism=first(root,"organism");
                if(organism==null)
                {
                    return;
                }
                Attr taxAtt=organism.getAttributeNode("ncbiTaxId");
                if(taxAtt==null)
                {
                    return;
                }
                int taxId;
                try
                {
                    taxId=Integer.parseInt(taxAtt.getValue().trim());
                }
                catch(NumberFormatException err)
                {
                    // one malformed attribute should not abort the whole pass
                    LOG.info("ignoring interactor with bad ncbiTaxId: "+taxAtt.getValue());
                    return;
                }
                if(organismTaxId.intValue()!=taxId) return;
            }
            Element names=first(root,"names");
            if(names==null)
            {
                LOG.info("names missing");
                return;
            }
            Element shortLabelE=first(names,"shortLabel");
            if(shortLabelE==null)
            {
                LOG.info("shortLabel missing");
                return;
            }
            String shortLabel=shortLabelE.getTextContent().trim();
            if(!candidates.contains(shortLabel)) return;
            LOG.info(shortLabel);
            Attr att= root.getAttributeNode("id");
            if(att==null) return;
            Interactor interactor=new Interactor();
            interactor.id=att.getValue();
            interactor.shortLabel=shortLabel;
            id2interactor.put(interactor.id, interactor);
        }
    }

    /**
     * Third pass: resolves the ids listed in
     * {@link PsiToGexf#remainsInteractorIds} into interactors. Stops as soon
     * as nothing remains to be found, or at the first interaction.
     */
    private class CollectRemainingInteractors
        extends AbstractParser
    {
        @Override
        public boolean isBreakingWhenInteractor()
        {
            return remainsInteractorIds.isEmpty();
        }

        @Override
        public boolean isBreakingWhenInteraction()
        {
            return true;
        }

        @Override
        public void interactor(Element root)
        {
            Attr idAtt=root.getAttributeNode("id");
            if(idAtt==null) return ;
            if(!remainsInteractorIds.contains(idAtt.getValue())) return;
            LOG.info("found remains @id="+idAtt.getValue());
            Interactor interactor=new Interactor();
            interactor.id=idAtt.getValue();
            // the id is the fallback label when no shortLabel is available
            interactor.shortLabel=interactor.id;
            remainsInteractorIds.remove(interactor.id);
            id2interactor.put(interactor.id, interactor);
            Element names=first(root,"names");
            if(names!=null)
            {
                Element shortLabelE=first(names,"shortLabel");
                if(shortLabelE!=null)
                {
                    interactor.shortLabel=shortLabelE.getTextContent().trim();
                }
            }
        }
    }

    /**
     * Second pass: keeps the interactions having at least one known
     * interactor (all of them known when restricting to candidates) and
     * records the unknown partners in {@link PsiToGexf#remainsInteractorIds}.
     */
    private class CollectInteractions
        extends AbstractParser
    {
        @Override
        public void interaction(Element root)
        {
            Element participantList=first(root,"participantList");
            if(participantList==null) return;
            boolean foundKnownInteractor=false;
            Set<String> ids=new HashSet<String>();
            for(Element participantE:list(participantList,"participant"))
            {
                Element interactorRef=first(participantE,"interactorRef");
                if(interactorRef==null)
                {
                    // e.g. a participant carrying an inline <interactor>: there is no id to follow
                    LOG.info("participant without interactorRef in interaction @id="+root.getAttribute("id"));
                    continue;
                }
                String interactorId=interactorRef.getTextContent().trim();
                ids.add(interactorId);
                if(id2interactor.containsKey(interactorId))
                {
                    foundKnownInteractor=true;
                }
            }
            if(!foundKnownInteractor) return;
            if(restrictToCandidates)
            {
                for(String interactorId:ids)
                {
                    if(!id2interactor.containsKey(interactorId))
                    {
                        return;
                    }
                }
            }
            if(ids.size()==1) return;//dimer
            if(ids.size()!=2)
            {
                // NOTE(review): the interaction is still kept, but Interaction only uses the
                // first two ids in the iteration order of the set; confirm this is intended.
                LOG.info("Found interaction with more than two Interactor: @id="+root.getAttribute("id"));
            }
            Interaction interaction=new Interaction();
            for(String id:ids)
            {
                interaction.interactorsId.add(id);
                if(!restrictToCandidates && !id2interactor.containsKey(id))
                {
                    LOG.info("adding remains @id:"+id);
                    remainsInteractorIds.add(id);
                }
            }
            interactions.add(interaction);
        }
    }

    /**
     * @param root the parent node
     * @param localName the local name to look for
     * @return the first child element of root having that local name, or null
     */
    private static Element first(Node root,String localName)
    {
        for(Node n1=root.getFirstChild();n1!=null;n1=n1.getNextSibling())
        {
            if(n1.getNodeType()!=Node.ELEMENT_NODE) continue;
            Element e1=Element.class.cast(n1);
            // getLocalName() may be null for elements created without a namespace-aware API
            if(localName.equals(e1.getLocalName())) return e1;
        }
        return null;
    }

    /**
     * @param root the parent node
     * @param localName the local name to look for
     * @return all the child elements of root having that local name, in document order
     */
    private static List<Element> list(Node root,String localName)
    {
        List<Element> L=new ArrayList<Element>();
        for(Node n1=root.getFirstChild();n1!=null;n1=n1.getNextSibling())
        {
            if(n1.getNodeType()!=Node.ELEMENT_NODE) continue;
            Element e1=Element.class.cast(n1);
            if(!localName.equals(e1.getLocalName())) continue;
            L.add(e1);
        }
        return L;
    }

    /**
     * Builds a DOM element from the stream, consuming the events up to the
     * end tag matching the given start tag. Attributes are copied using their
     * local name only. Text is kept only for elements having no child
     * element: text read before the first child is discarded once a child
     * shows up, and text read after a child is ignored.
     *
     * @param dom the document used to create the nodes
     * @param in the event reader, positioned just after the start tag
     * @param element the start tag of the element to build
     * @return the new element, not attached to the document
     * @throws XMLStreamException if the XML cannot be read
     */
    private Element parseDom(Document dom,XMLEventReader in,StartElement element)
        throws XMLStreamException
    {
        boolean containsE=false;
        List<Text> textNodes=new ArrayList<Text>();
        QName qName=element.getName();
        Element elt=dom.createElementNS(qName.getNamespaceURI(),qName.getLocalPart());
        Iterator<?> r=element.getAttributes();
        while(r.hasNext())
        {
            Attribute att= (Attribute)r.next();
            elt.setAttribute(att.getName().getLocalPart(), att.getValue());
        }
        while(in.hasNext())
        {
            XMLEvent evt=in.nextEvent();
            if(evt.isStartElement())
            {
                containsE=true;
                elt.appendChild(parseDom(dom,in,evt.asStartElement()));
            }
            else if(evt.isEndElement())
            {
                // the children consumed their own end tags: this one is ours
                break;
            }
            else if(evt.isCharacters() && !containsE)
            {
                Text text=dom.createTextNode(evt.asCharacters().getData());
                textNodes.add(text);
                elt.appendChild(text);
            }
        }
        if(containsE)
        {
            for(Text text:textNodes) elt.removeChild(text);
        }
        return elt;
    }

    /**
     * Runs the three passes over the PSI-MI source: candidates, interactions,
     * then the remaining (non-candidate) interactors.
     */
    private void run() throws Exception
    {
        new CollectCandidateIds().parse();
        new CollectInteractions().parse();
        new CollectRemainingInteractors().parse();
    }

    /**
     * Writes the collected graph as a GEXF document. The stream is flushed
     * but not closed.
     *
     * @param out where to write
     * @throws XMLStreamException if the document cannot be written
     */
    private void writeGexf(OutputStream out) throws XMLStreamException
    {
        LOG.info("saving as GEXF");
        // only the interactors used as an edge end-point are written
        Set<String> keepIds=new HashSet<String>();
        for(Interaction interaction:this.interactions)
        {
            keepIds.add(interaction.getSource());
            keepIds.add(interaction.getTarget());
        }
        XMLOutputFactory xmlfactory= XMLOutputFactory.newInstance();
        XMLStreamWriter w= xmlfactory.createXMLStreamWriter(out,"UTF-8");
        w.writeStartDocument("UTF-8","1.0");
        w.writeStartElement("gexf");
        w.writeNamespace("","http://www.gexf.net/1.2draft");
        w.writeAttribute("version", "1.2");
        /* meta */
        w.writeStartElement("meta");
        w.writeStartElement("creator");
        w.writeCharacters(System.getProperty("user.name","me"));
        w.writeEndElement();
        w.writeStartElement("description");
        w.writeCharacters("Graph for "+this.psiSource);
        w.writeEndElement();
        w.writeEndElement();
        /* graph */
        w.writeStartElement("graph");
        w.writeAttribute("mode", "static");
        w.writeAttribute("defaultedgetype", "undirected");
        /* attributes */
        w.writeStartElement("attributes");
        w.writeAttribute("class","node");
        w.writeAttribute("mode","static");
        w.writeEndElement();//attributes
        /* nodes */
        w.writeStartElement("nodes");
        for(Interactor interactor: this.id2interactor.values())
        {
            if(!keepIds.contains(interactor.id)) continue;
            w.writeStartElement("node");
            w.writeAttribute("id", interactor.id);
            w.writeAttribute("label",interactor.shortLabel);
            w.writeEndElement();
        }
        // an edge end-point that was never resolved to an interactor still needs
        // a node, otherwise the edge would reference an undeclared id
        for(String id:keepIds)
        {
            if(this.id2interactor.containsKey(id)) continue;
            w.writeStartElement("node");
            w.writeAttribute("id", id);
            w.writeAttribute("label",id);
            w.writeEndElement();
        }
        w.writeEndElement();//nodes
        /* edges */
        w.writeStartElement("edges");
        int countEdges=0;
        for(Interaction interaction:this.interactions)
        {
            w.writeEmptyElement("edge");
            w.writeAttribute("id", "E"+(++countEdges));
            w.writeAttribute("source",interaction.getSource());
            w.writeAttribute("target",interaction.getTarget());
        }
        w.writeEndElement();//edges
        w.writeEndElement();//graph
        w.writeEndElement();//gexf
        w.writeEndDocument();
        w.flush();
        // releases the writer; 'out' itself is left open for the caller
        w.close();
    }

    /**
     * Opens a local file or an http/https/ftp URL, transparently
     * gunzipping it when the name ends with ".gz".
     *
     * @param filename path or URL
     * @return the opened stream, to be closed by the caller
     * @throws IOException if the source cannot be opened
     */
    private static InputStream open(String filename) throws IOException
    {
        InputStream input;
        if( filename.startsWith("http://") ||
            filename.startsWith("ftp://") ||
            filename.startsWith("https://"))
        {
            input=new java.net.URL(filename).openStream();
        }
        else
        {
            input=new FileInputStream(filename);
        }
        if(filename.endsWith(".gz"))
        {
            try
            {
                input=new GZIPInputStream(input);
            }
            catch(IOException err)
            {
                // the gzip header could not be read: do not leak the raw stream
                try { input.close(); }
                catch(IOException ignored) { /* the first error is the one to report */ }
                throw err;
            }
        }
        return input;
    }

    /**
     * Reads the candidate genes, one per line. Surrounding whitespace is
     * removed; empty lines and lines starting with '#' are skipped.
     *
     * @param in the reader; it is not closed
     * @throws IOException if the lines cannot be read
     */
    private void readCandidates(BufferedReader in)
        throws IOException
    {
        String line;
        while((line=in.readLine())!=null)
        {
            // shortLabels are trimmed before the lookup, so an untrimmed candidate could never match
            line=line.trim();
            if(line.isEmpty() || line.startsWith("#")) continue;
            this.candidates.add(line);
        }
    }

    /**
     * Entry point: parses the options, reads the candidates (from -c, from
     * the files given as arguments or, when there is none, from stdin),
     * then builds the graph and writes it to the -o file or to stdout.
     *
     * @param args command line, see the -h option
     */
    public static void main(String[] args)
    {
        try
        {
            File outputFile=null;
            PsiToGexf app=new PsiToGexf();
            int optind=0;
            while(optind<args.length)
            {
                String opt=args[optind];
                if(opt.equals("-h"))
                {
                    System.err.println("Pierre Lindenbaum PhD. 2011");
                    System.err.println("Options:");
                    System.err.println(" -R : restrict result to the only candidate genes. The binary interactions will carry two candidates.");
                    System.err.println(" -o <filename> fileout (else stdout)");
                    System.err.println(" -c <gene> add a candidate");
                    System.err.println(" -t <int> restrict to taxon-ID");
                    System.err.println(" -x <psimi2.5> REQUIRED: input XML file");
                    System.err.println("<stdin>|<filenames>");
                    return;
                }
                else if(opt.equals("-R"))
                {
                    app.restrictToCandidates=true;
                }
                else if(opt.equals("-c") || opt.equals("-t") || opt.equals("-o") || opt.equals("-x"))
                {
                    // these options take a value: make sure it is there
                    if(optind+1>=args.length)
                    {
                        System.err.println("Missing argument after option "+opt);
                        return;
                    }
                    String value=args[++optind];
                    if(opt.equals("-c"))
                    {
                        app.candidates.add(value);
                    }
                    else if(opt.equals("-t"))
                    {
                        try
                        {
                            app.organismTaxId=Integer.valueOf(value.trim());
                        }
                        catch(NumberFormatException err)
                        {
                            System.err.println("Bad taxon-ID: "+value);
                            return;
                        }
                    }
                    else if(opt.equals("-o"))
                    {
                        outputFile=new File(value);
                    }
                    else
                    {
                        app.psiSource=value;
                    }
                }
                else if(opt.equals("--"))
                {
                    optind++;
                    break;
                }
                else if(opt.startsWith("-"))
                {
                    System.err.println("Unnown option: "+opt);
                    return;
                }
                else
                {
                    break;
                }
                ++optind;
            }
            if(app.psiSource==null)
            {
                System.err.println("No XML PSI source");
                return;
            }
            if(optind==args.length)
            {
                // no candidate file: fall back to stdin unless -c was used
                if(app.candidates.isEmpty())
                {
                    app.readCandidates(new BufferedReader(new InputStreamReader(System.in)));
                }
            }
            else
            {
                while(optind< args.length)
                {
                    String inputName=args[optind++];
                    BufferedReader r=new BufferedReader(new InputStreamReader(open(inputName)));
                    try
                    {
                        app.readCandidates(r);
                    }
                    finally
                    {
                        r.close();
                    }
                }
            }
            if(app.candidates.isEmpty())
            {
                System.err.println("No Candidates");
                return;
            }
            app.run();
            if(outputFile!=null)
            {
                FileOutputStream fout=new FileOutputStream(outputFile);
                try
                {
                    app.writeGexf(fout);
                    fout.flush();
                }
                finally
                {
                    fout.close();
                }
            }
            else
            {
                app.writeGexf(System.out);
            }
        }
        catch (Exception e)
        {
            e.printStackTrace();
        }
    }
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment