Skip to content

Instantly share code, notes, and snippets.

@lindenb
Created February 14, 2011 16:54
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save lindenb/826151 to your computer and use it in GitHub Desktop.
Save lindenb/826151 to your computer and use it in GitHub Desktop.
package sandbox;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.OutputStream;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.TreeSet;
import java.util.logging.Logger;
import java.util.zip.GZIPInputStream;
import javax.xml.namespace.QName;
import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.ParserConfigurationException;
import javax.xml.stream.XMLEventReader;
import javax.xml.stream.XMLInputFactory;
import javax.xml.stream.XMLOutputFactory;
import javax.xml.stream.XMLStreamException;
import javax.xml.stream.XMLStreamWriter;
import javax.xml.stream.events.Attribute;
import javax.xml.stream.events.StartElement;
import javax.xml.stream.events.XMLEvent;
import org.w3c.dom.Attr;
import org.w3c.dom.Document;
import org.w3c.dom.Element;
import org.w3c.dom.Node;
import org.w3c.dom.Text;
public class PsiToGexf
{
private static final Logger LOG=Logger.getLogger("psi2gexf");
/** PSI-MI XML file: a local path or an http://, https:// or ftp:// URL, optionally ending with .gz */
private String psiSource;
/** organism taxId */
private Integer organismTaxId=null;
/** candidate genes */
private Set<String> candidates=new TreeSet<String>(String.CASE_INSENSITIVE_ORDER);
/** interactors collected so far, indexed by the value of their "id" attribute */
private Map<String,Interactor> id2interactor=new HashMap<String, Interactor>();
/** interactions */
private Set<Interaction> interactions=new HashSet<Interaction>();
/** interactor IDs remaining to find */
private Set<String> remainsInteractorIds=new HashSet<String>();
/** restrict to only candidates */
private boolean restrictToCandidates=false;
/** A node of the exported graph: one interactor read from the PSI-MI source. */
private class Interactor
{
/** value of the interactor's "id" attribute; also the key used in id2interactor */
String id;
/** trimmed text of the names/shortLabel element, or the id itself when no shortLabel was found */
String shortLabel;
}
/**
 * An undirected edge between two interactors.
 * Only the first two entries of {@link #interactorsId} are considered; they are
 * ordered lexicographically so that (a,b) and (b,a) describe the same edge.
 */
private class Interaction
{
    /** ids of the participants, in the order they were added */
    List<String> interactorsId=new ArrayList<String>(2);

    /** @return the lexicographically smaller of the first two interactor ids */
    public String getSource()
    {
        final String first=this.interactorsId.get(0);
        final String second=this.interactorsId.get(1);
        if(first.compareTo(second)<0)
        {
            return first;
        }
        return second;
    }

    /** @return the lexicographically larger (or equal) of the first two interactor ids */
    public String getTarget()
    {
        final String first=this.interactorsId.get(0);
        final String second=this.interactorsId.get(1);
        if(first.compareTo(second)<0)
        {
            return second;
        }
        return first;
    }

    /** Hash built from the ordered pair (source, target). */
    @Override
    public int hashCode()
    {
        int hash=31+getSource().hashCode();
        hash=31*hash+getTarget().hashCode();
        return hash;
    }

    /** Two interactions are equal when they share the same ordered pair (source, target). */
    @Override
    public boolean equals(Object obj)
    {
        if(obj==this)
        {
            return true;
        }
        if(obj==null || obj.getClass()!=getClass())
        {
            return false;
        }
        final Interaction that=(Interaction)obj;
        if(!that.getTarget().equals(getTarget()))
        {
            return false;
        }
        return that.getSource().equals(getSource());
    }
}
/**
 * Base class of the streaming passes over the PSI-MI source.
 *
 * {@link #parse()} reads the document with StAX; each time an
 * <code>interactor</code> or <code>interaction</code> start tag is met, the
 * whole sub-tree is materialized as a small DOM element and handed to the
 * matching callback. Subclasses override the callbacks they need and may ask
 * the scan to stop early through the <code>isBreakingWhen...</code> methods.
 */
private abstract class AbstractParser
{
    /** owner document used to create the DOM fragments handed to the callbacks */
    protected Document dom=null;

    /**
     * Creates the empty owner document.
     * The factory is only used to obtain that document, nothing is parsed with it.
     *
     * @throws RuntimeException wrapping the ParserConfigurationException if no DocumentBuilder can be created
     */
    protected AbstractParser()
    {
        DocumentBuilderFactory factory=DocumentBuilderFactory.newInstance();
        factory.setCoalescing(true);
        factory.setExpandEntityReferences(true);
        factory.setValidating(false);
        factory.setNamespaceAware(true);
        factory.setIgnoringComments(true);
        factory.setIgnoringElementContentWhitespace(true);
        try
        {
            DocumentBuilder builder=factory.newDocumentBuilder();
            this.dom=builder.newDocument();
        }
        catch (ParserConfigurationException e)
        {
            throw new RuntimeException(e);
        }
    }

    /** Callback invoked for each <code>interactor</code> element; does nothing by default. */
    public void interactor(Element e)
    {
    }

    /** Callback invoked for each <code>interaction</code> element; does nothing by default. */
    public void interaction(Element e)
    {
    }

    /** @return true to stop the scan when the next <code>interactor</code> start tag is met */
    public boolean isBreakingWhenInteractor()
    {
        return false;
    }

    /** @return true to stop the scan when the next <code>interaction</code> start tag is met */
    public boolean isBreakingWhenInteraction()
    {
        return false;
    }

    /**
     * Scans the PSI source once and dispatches the interactor/interaction
     * elements to the callbacks. Both the XML reader and the underlying
     * stream are released before returning, whatever the outcome.
     *
     * @throws IOException if the source cannot be opened
     * @throws XMLStreamException if the XML is not well-formed
     */
    public void parse() throws IOException,XMLStreamException
    {
        InputStream input=null;
        XMLEventReader in=null;
        try
        {
            LOG.info("opening "+psiSource+" for "+getClass());
            input=open(psiSource);
            XMLInputFactory factory = XMLInputFactory.newInstance();
            factory.setProperty(XMLInputFactory.IS_NAMESPACE_AWARE, Boolean.TRUE);
            factory.setProperty(XMLInputFactory.IS_VALIDATING, Boolean.FALSE);
            factory.setProperty(XMLInputFactory.IS_COALESCING, Boolean.TRUE);
            factory.setProperty(XMLInputFactory.IS_REPLACING_ENTITY_REFERENCES, Boolean.TRUE);
            // the source may be fetched from the network: never resolve external entities (XXE)
            factory.setProperty(XMLInputFactory.IS_SUPPORTING_EXTERNAL_ENTITIES, Boolean.FALSE);
            in=factory.createXMLEventReader(input);
            while(in.hasNext())
            {
                XMLEvent evt=in.nextEvent();
                if(!evt.isStartElement()) continue;
                String localName=evt.asStartElement().getName().getLocalPart();
                if(localName.equals("interactor"))
                {
                    if(isBreakingWhenInteractor()) break;
                    interactor(parseDom(dom,in,evt.asStartElement()));
                }
                else if(localName.equals("interaction"))
                {
                    if(isBreakingWhenInteraction()) break;
                    interaction(parseDom(dom,in,evt.asStartElement()));
                }
            }
        }
        finally
        {
            // closing is best effort: a failure here must not hide an exception thrown above
            if(in!=null)
            {
                try
                {
                    in.close();
                }
                catch(XMLStreamException err)
                {
                    LOG.warning("cannot close XML reader for "+psiSource+": "+err.getMessage());
                }
            }
            // XMLEventReader.close() does not close the underlying stream
            if(input!=null)
            {
                try
                {
                    input.close();
                }
                catch(IOException err)
                {
                    LOG.warning("cannot close "+psiSource+": "+err.getMessage());
                }
            }
        }
    }
}
/**
 * First pass: finds the interactors whose shortLabel is one of the candidate
 * genes (and, when a taxon id was given, whose organism matches it) and
 * registers them in id2interactor. The scan stops at the first interaction.
 */
private class CollectCandidateIds
extends AbstractParser
{
    @Override
    public boolean isBreakingWhenInteraction()
    {
        return true;
    }

    /**
     * Registers the interactor if it is a candidate.
     *
     * @param root DOM copy of an <code>interactor</code> element
     */
    @Override
    public void interactor(Element root)
    {
        if(organismTaxId!=null)
        {
            Element organism=first(root,"organism");
            if(organism==null)
            {
                return;
            }
            Attr taxAtt=organism.getAttributeNode("ncbiTaxId");
            if(taxAtt==null)
            {
                return;
            }
            int taxId;
            try
            {
                taxId=Integer.parseInt(taxAtt.getValue().trim());
            }
            catch(NumberFormatException err)
            {
                // one malformed record must not abort the whole scan: skip this interactor
                LOG.info("ignoring interactor with bad ncbiTaxId: "+taxAtt.getValue());
                return;
            }
            if(organismTaxId.intValue()!=taxId) return;
        }
        Element names=first(root,"names");
        if(names==null)
        {
            LOG.info("names missing");
            return;
        }
        Element shortLabelE=first(names,"shortLabel");
        if(shortLabelE==null)
        {
            LOG.info("shortLabel missing");
            return;
        }
        String shortLabel=shortLabelE.getTextContent().trim();
        if(!candidates.contains(shortLabel)) return;
        LOG.info(shortLabel);
        Attr att= root.getAttributeNode("id");
        if(att==null) return;
        Interactor interactor=new Interactor();
        interactor.id=att.getValue();
        interactor.shortLabel=shortLabel;
        id2interactor.put(interactor.id, interactor);
    }
}
/**
 * Last pass: resolves the interactors listed in remainsInteractorIds.
 * The scan stops at the first interaction, or as soon as every remaining id
 * has been found.
 */
private class CollectRemainingInteractors
extends AbstractParser
{
    @Override
    public boolean isBreakingWhenInteraction()
    {
        return true;
    }

    @Override
    public boolean isBreakingWhenInteractor()
    {
        return remainsInteractorIds.isEmpty();
    }

    /**
     * Registers the interactor when its id is still awaited.
     *
     * @param root DOM copy of an <code>interactor</code> element
     */
    @Override
    public void interactor(Element root)
    {
        final Attr idAtt=root.getAttributeNode("id");
        if(idAtt==null || !remainsInteractorIds.contains(idAtt.getValue()))
        {
            return;
        }
        final String id=idAtt.getValue();
        LOG.info("found remains @id="+id);

        // the label falls back to the id when there is no names/shortLabel
        String label=id;
        final Element names=first(root,"names");
        final Element shortLabelE=(names==null?null:first(names,"shortLabel"));
        if(shortLabelE!=null)
        {
            label=shortLabelE.getTextContent().trim();
        }

        final Interactor found=new Interactor();
        found.id=id;
        found.shortLabel=label;
        remainsInteractorIds.remove(id);
        id2interactor.put(id, found);
    }
}
/**
 * Second pass: collects the interactions involving at least one interactor
 * already registered in id2interactor. Unless restrictToCandidates is set,
 * the ids of the unknown partners are stored in remainsInteractorIds so that
 * they can be resolved later.
 */
private class CollectInteractions
extends AbstractParser
{
    /**
     * Examines one interaction and records it when it is relevant.
     *
     * NOTE(review): when an interaction has more than two distinct
     * participants it is logged but still recorded; Interaction only uses the
     * first two ids, in the iteration order of a HashSet, so the resulting
     * edge is an arbitrary pair of the participants - confirm this is intended.
     *
     * @param root DOM copy of an <code>interaction</code> element
     */
    @Override
    public void interaction(Element root)
    {
        Element participantList=first(root,"participantList");
        if(participantList==null) return;
        boolean foundKnownInteractor=false;
        Set<String> ids=new HashSet<String>();
        for(Element participantE:list(participantList,"participant"))
        {
            Element interactorRef=first(participantE,"interactorRef");
            if(interactorRef==null)
            {
                // e.g. a participant embedding its interactor instead of referencing it:
                // there is no id to follow, skip it rather than fail on a null reference
                LOG.info("participant without interactorRef in interaction @id="+root.getAttribute("id"));
                continue;
            }
            String interactorId=interactorRef.getTextContent().trim();
            ids.add(interactorId);
            if(id2interactor.containsKey(interactorId))
            {
                foundKnownInteractor=true;
            }
        }
        if(!foundKnownInteractor) return;
        if(restrictToCandidates)
        {
            // every participant must be a known interactor
            for(String interactorId:ids)
            {
                if(!id2interactor.containsKey(interactorId))
                {
                    return;
                }
            }
        }
        if(ids.size()==1) return;//dimer
        if(ids.size()!=2)
        {
            LOG.info("Found interaction with more than two Interactor: @id="+root.getAttribute("id"));
        }
        Interaction interaction=new Interaction();
        for(String id:ids)
        {
            interaction.interactorsId.add(id);
            if(!restrictToCandidates && !id2interactor.containsKey(id))
            {
                LOG.info("adding remains @id:"+id);
                remainsInteractorIds.add(id);
            }
        }
        interactions.add(interaction);
    }
}
/**
 * Finds the first direct child element having the given local name.
 *
 * @param root parent node whose children are scanned
 * @param localName local name to look for
 * @return the first matching child element, or null if there is none
 */
private static Element first(Node root,String localName)
{
    Node child=root.getFirstChild();
    while(child!=null)
    {
        if(child.getNodeType()==Node.ELEMENT_NODE)
        {
            Element candidate=(Element)child;
            if(candidate.getLocalName().equals(localName))
            {
                return candidate;
            }
        }
        child=child.getNextSibling();
    }
    return null;
}
/**
 * Collects the direct child elements having the given local name.
 *
 * @param root parent node whose children are scanned
 * @param localName local name to look for
 * @return the matching child elements in document order; empty if there is none
 */
private static List<Element> list(Node root,String localName)
{
    List<Element> matches=new ArrayList<Element>();
    Node child=root.getFirstChild();
    while(child!=null)
    {
        if(child.getNodeType()==Node.ELEMENT_NODE)
        {
            Element candidate=(Element)child;
            if(candidate.getLocalName().equals(localName))
            {
                matches.add(candidate);
            }
        }
        child=child.getNextSibling();
    }
    return matches;
}
/**
 * Builds a DOM element from the StAX events following a start tag.
 * Reading stops at the end tag matching <code>element</code>; child elements
 * are built recursively. Attributes are copied under their local name.
 * Character data is kept only when the element has no child element.
 *
 * @param dom owner document used to create the nodes
 * @param in event reader positioned just after <code>element</code>
 * @param element the start tag that has just been read
 * @return the new element; it is not attached to <code>dom</code>
 * @throws XMLStreamException if the reader fails
 */
private Element parseDom(Document dom,XMLEventReader in,StartElement element)
throws XMLStreamException
{
    final QName tag=element.getName();
    final Element node=dom.createElementNS(tag.getNamespaceURI(),tag.getLocalPart());
    for(Iterator<?> it=element.getAttributes();it.hasNext();)
    {
        Attribute attribute=(Attribute)it.next();
        node.setAttribute(attribute.getName().getLocalPart(), attribute.getValue());
    }

    boolean hasChildElement=false;
    // text met before the first child element; attached only if no child element shows up
    final List<Text> leadingText=new ArrayList<Text>();
    while(in.hasNext())
    {
        final XMLEvent event=in.nextEvent();
        if(event.isEndElement())
        {
            break;
        }
        if(event.isStartElement())
        {
            hasChildElement=true;
            node.appendChild(parseDom(dom,in,event.asStartElement()));
        }
        else if(!hasChildElement && event.isCharacters())
        {
            leadingText.add(dom.createTextNode(event.asCharacters().getData()));
        }
    }

    if(!hasChildElement)
    {
        for(Text text:leadingText)
        {
            node.appendChild(text);
        }
    }
    return node;
}
/**
 * Runs the three passes over the PSI source, in order: candidate
 * interactors, then interactions, then the remaining interactors.
 *
 * @throws Exception if one of the passes fails
 */
private void run() throws Exception
{
    final AbstractParser candidatePass=new CollectCandidateIds();
    candidatePass.parse();

    final AbstractParser interactionPass=new CollectInteractions();
    interactionPass.parse();

    final AbstractParser remainingPass=new CollectRemainingInteractors();
    remainingPass.parse();
}
/**
 * Writes the collected graph as a GEXF document.
 * A node is written for each known interactor that is the source or the
 * target of at least one interaction; an edge is written for each interaction.
 * The stream is flushed but not closed.
 *
 * @param out destination stream
 * @throws XMLStreamException if the XML cannot be written
 */
private void writeGexf(OutputStream out) throws XMLStreamException
{
    LOG.info("saving as GEXF");

    // ids that are an end point of at least one edge
    final Set<String> connectedIds=new HashSet<String>();
    for(Interaction edge:this.interactions)
    {
        connectedIds.add(edge.getSource());
        connectedIds.add(edge.getTarget());
    }

    final XMLOutputFactory outputFactory=XMLOutputFactory.newInstance();
    final XMLStreamWriter xml=outputFactory.createXMLStreamWriter(out,"UTF-8");
    xml.writeStartDocument("UTF-8","1.0");
    xml.writeStartElement("gexf");
    xml.writeNamespace("","http://www.gexf.net/1.2draft");
    xml.writeAttribute("version", "1.2");

    /* meta : creator + description */
    xml.writeStartElement("meta");
    xml.writeStartElement("creator");
    xml.writeCharacters(System.getProperty("user.name","me"));
    xml.writeEndElement();//creator
    xml.writeStartElement("description");
    xml.writeCharacters("Graph for "+this.psiSource);
    xml.writeEndElement();//description
    xml.writeEndElement();//meta

    /* graph */
    xml.writeStartElement("graph");
    xml.writeAttribute("mode", "static");
    xml.writeAttribute("defaultedgetype", "undirected");

    /* attributes : declared but empty */
    xml.writeStartElement("attributes");
    xml.writeAttribute("class","node");
    xml.writeAttribute("mode","static");
    xml.writeEndElement();//attributes

    /* nodes */
    xml.writeStartElement("nodes");
    for(Interactor node: this.id2interactor.values())
    {
        if(connectedIds.contains(node.id))
        {
            xml.writeStartElement("node");
            xml.writeAttribute("id", node.id);
            xml.writeAttribute("label",node.shortLabel);
            xml.writeEndElement();//node
        }
    }
    xml.writeEndElement();//nodes

    /* edges : numbered E1, E2, ... */
    xml.writeStartElement("edges");
    int edgeNumber=1;
    for(Interaction edge:this.interactions)
    {
        xml.writeEmptyElement("edge");
        xml.writeAttribute("id", "E"+edgeNumber);
        xml.writeAttribute("source",edge.getSource());
        xml.writeAttribute("target",edge.getTarget());
        edgeNumber++;
    }
    xml.writeEndElement();//edges

    xml.writeEndElement();//graph
    xml.writeEndElement();//gexf
    xml.writeEndDocument();
    xml.flush();
}
/**
 * Opens a local file or a http://, https:// or ftp:// URL for reading.
 * The stream is transparently gunzipped when the name ends with ".gz".
 *
 * @param filename path or URL of the resource
 * @return an open stream; the caller must close it
 * @throws IOException if the resource cannot be opened or, for a ".gz" name,
 *         if its content cannot be read as gzip
 */
private static InputStream open(String filename) throws IOException
{
    InputStream input=null;
    if( filename.startsWith("http://") ||
        filename.startsWith("ftp://") ||
        filename.startsWith("https://"))
    {
        input=new java.net.URL(filename).openStream();
    }
    else
    {
        input=new FileInputStream(filename);
    }
    if(filename.endsWith(".gz"))
    {
        // the GZIPInputStream constructor reads the header and may throw:
        // do not leak the underlying stream in that case
        boolean wrapped=false;
        try
        {
            input=new GZIPInputStream(input);
            wrapped=true;
        }
        finally
        {
            if(!wrapped)
            {
                try
                {
                    input.close();
                }
                catch(IOException ignored)
                {
                    // the failure of the constructor is the one worth reporting
                }
            }
        }
    }
    return input;
}
/**
 * Reads candidate gene names, one per line, and adds them to the candidates.
 * Surrounding whitespace is removed because the names are compared to
 * trimmed short labels; blank lines and lines starting with '#' are ignored.
 *
 * @param in source of the lines; it is not closed by this method
 * @throws IOException if the reader fails
 */
private void readCandidates(BufferedReader in)
throws IOException
{
    String line;
    while((line=in.readLine())!=null)
    {
        String candidate=line.trim();
        if(candidate.isEmpty() || candidate.startsWith("#")) continue;
        this.candidates.add(candidate);
    }
}
/**
 * Command line entry point: parses the options, loads the candidate genes
 * (from -c, from the files named after the options, or from stdin when no
 * candidate was given), runs the three passes and writes the GEXF graph to
 * the -o file or to stdout.
 *
 * Usage errors are reported on stderr and the method returns; any other
 * failure is reported with its stack trace.
 *
 * @param args command line arguments, see the -h option
 */
public static void main(String[] args)
{
    try
    {
        File outputFile=null;
        PsiToGexf app=new PsiToGexf();
        int optind=0;
        while(optind<args.length)
        {
            if(args[optind].equals("-h"))
            {
                System.err.println("Pierre Lindenbaum PhD. 2011");
                System.err.println("Options:");
                System.err.println(" -R : restrict result to the only candidate genes. The binary interactions will carry two candidates.");
                System.err.println(" -o <filename> fileout (else stdout)");
                System.err.println(" -c <gene> add a candidate");
                System.err.println(" -t <int> restrict to taxon-ID");
                System.err.println(" -x <psimi2.5> REQUIRED: input XML file");
                System.err.println("<stdin>|<filenames>");
                return;
            }
            else if(args[optind].equals("-R"))
            {
                app.restrictToCandidates=true;
            }
            else if(args[optind].equals("-c"))
            {
                if(!hasOptionValue(args,optind)) return;
                app.candidates.add(args[++optind]);
            }
            else if(args[optind].equals("-t"))
            {
                if(!hasOptionValue(args,optind)) return;
                String taxon=args[++optind];
                try
                {
                    app.organismTaxId=Integer.parseInt(taxon);
                }
                catch(NumberFormatException err)
                {
                    System.err.println("Bad taxon-ID: "+taxon);
                    return;
                }
            }
            else if(args[optind].equals("-o"))
            {
                if(!hasOptionValue(args,optind)) return;
                outputFile=new File(args[++optind]);
            }
            else if(args[optind].equals("-x"))
            {
                if(!hasOptionValue(args,optind)) return;
                app.psiSource=args[++optind];
            }
            else if(args[optind].equals("--"))
            {
                optind++;
                break;
            }
            else if(args[optind].startsWith("-"))
            {
                System.err.println("Unknown option: "+args[optind]);
                return;
            }
            else
            {
                break;
            }
            ++optind;
        }
        if(app.psiSource==null)
        {
            System.err.println("No XML PSI source");
            return;
        }
        if(optind==args.length)
        {
            // no file of candidates: read stdin unless some were given with -c
            if(app.candidates.isEmpty())
            {
                app.readCandidates(new BufferedReader(new InputStreamReader(System.in)));
            }
        }
        else
        {
            while(optind< args.length)
            {
                String inputName=args[optind++];
                BufferedReader r=new BufferedReader(new InputStreamReader(open(inputName)));
                try
                {
                    app.readCandidates(r);
                }
                finally
                {
                    r.close();
                }
            }
        }
        if(app.candidates.isEmpty())
        {
            System.err.println("No Candidates");
            return;
        }
        app.run();
        if(outputFile!=null)
        {
            FileOutputStream fout=new FileOutputStream(outputFile);
            try
            {
                app.writeGexf(fout);
                fout.flush();
            }
            finally
            {
                fout.close();
            }
        }
        else
        {
            app.writeGexf(System.out);
        }
    }
    catch (Exception e)
    {
        e.printStackTrace();
    }
}

/**
 * Checks that the option at <code>args[optind]</code> is followed by a value
 * and prints an error on stderr when it is not.
 *
 * @param args command line arguments
 * @param optind index of the option being processed
 * @return true if <code>args[optind+1]</code> exists
 */
private static boolean hasOptionValue(String[] args,int optind)
{
    if(optind+1<args.length)
    {
        return true;
    }
    System.err.println("Missing argument for option "+args[optind]);
    return false;
}
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment