Navigation Menu

Skip to content

Instantly share code, notes, and snippets.

@lindenb
Created March 21, 2018 15:56
Show Gist options
  • Star 2 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save lindenb/5a76d95397a86b860386cdf3976726a2 to your computer and use it in GitHub Desktop.
Save lindenb/5a76d95397a86b860386cdf3976726a2 to your computer and use it in GitHub Desktop.
Parsing DrugBank using an XML schema and Java. Keywords: drugbank xjc xsd schema xml java javac
/*
Author: Pierre Lindenbaum @yokofakun
related:
http://bionics.it/posts/parsing-drugbank-xml-or-any-large-xml-file-in-streaming-mode-in-go
*/
import ca.drugbank.*;
import java.util.*;
import java.util.stream.*;
import java.io.*;
import javax.xml.stream.*;
import javax.xml.stream.events.*;
import javax.xml.bind.*;
import javax.xml.*;
/**
 * Reads a DrugBank XML document from standard input in streaming mode and
 * writes one tab-separated line per {@code drug} element to standard output.
 *
 * <p>Columns, in order: the drug name; the values of the calculated properties
 * whose kind is {@code IN_CH_I_KEY} (space-separated); the drug groups (joined
 * with {@code "->"}); then one column per entry of {@link #EXTIDS} holding the
 * matching external identifiers (space-separated). A column is left empty when
 * the corresponding section is absent from the record.
 *
 * <p>Each {@code drug} element is unmarshalled on its own with JAXB (classes of
 * package {@code ca.drugbank}), so the whole document is never held in memory.
 */
public class Drugbank
{
    /** External identifier resources exported, one output column each, in this order. */
    private static final ExternalIdentifierResourceType[] EXTIDS = {
        ExternalIdentifierResourceType.CH_EMBL,
        ExternalIdentifierResourceType.PUB_CHEM_COMPOUND,
        ExternalIdentifierResourceType.PUB_CHEM_SUBSTANCE
    };

    /**
     * Program entry point.
     *
     * @param args command-line arguments (unused)
     * @throws Exception if the XML cannot be parsed or unmarshalled, or if
     *         writing to standard output fails
     */
    public static void main(final String args[]) throws Exception {
        final JAXBContext jc = JAXBContext.newInstance("ca.drugbank");
        final Unmarshaller unmarshaller = jc.createUnmarshaller();
        final XMLInputFactory xmlInputFactory = XMLInputFactory.newFactory();
        xmlInputFactory.setProperty(XMLInputFactory.IS_NAMESPACE_AWARE, Boolean.TRUE);
        // The input is an arbitrary stream: do not resolve DTDs or external entities (XXE).
        xmlInputFactory.setProperty(XMLInputFactory.SUPPORT_DTD, Boolean.FALSE);
        xmlInputFactory.setProperty(XMLInputFactory.IS_SUPPORTING_EXTERNAL_ENTITIES, Boolean.FALSE);

        final PrintStream w = System.out;
        final XMLEventReader r = xmlInputFactory.createXMLEventReader(System.in);
        try {
            while (r.hasNext()) {
                // Peek rather than consume: the unmarshaller must see the start tag itself.
                final XMLEvent evt = r.peek();
                if (evt.isStartElement()
                        && "drug".equals(evt.asStartElement().getName().getLocalPart())) {
                    // Unmarshalling consumes the whole element, including any nested
                    // elements that are also named "drug".
                    final DrugType d = unmarshaller.unmarshal(r, DrugType.class).getValue();
                    w.println(toTsvLine(d));
                    // PrintStream never throws on write failure; without this check a closed
                    // pipe or full disk would go unnoticed while the whole dump is parsed.
                    if (w.checkError()) {
                        throw new IOException("Error while writing to standard output");
                    }
                } else {
                    r.next(); // consume the event we are not interested in
                }
            }
        } finally {
            // Release the reader even when parsing or unmarshalling fails.
            r.close();
        }
    }

    /** Formats one drug record as a tab-separated line (without line terminator). */
    private static String toTsvLine(final DrugType d) {
        final StringBuilder line = new StringBuilder();
        line.append(d.getName());

        line.append('\t');
        if (d.getCalculatedProperties() != null) {
            line.append(d.getCalculatedProperties()
                .getProperty()
                .stream()
                // '==' on the enum is null-safe, unlike getKind().equals(...)
                .filter(P -> P.getKind() == CalculatedPropertyKindType.IN_CH_I_KEY)
                .map(P -> P.getValue())
                .collect(Collectors.joining(" ")));
        }

        line.append('\t');
        if (d.getGroups() != null) {
            line.append(d.getGroups()
                .getGroup()
                .stream()
                .map(G -> G.value())
                .collect(Collectors.joining("->")));
        }

        for (final ExternalIdentifierResourceType extId : EXTIDS) {
            line.append('\t');
            line.append(externalIds(d, extId));
        }
        return line.toString();
    }

    /** Returns the space-separated identifiers of {@code d} for one resource, or "" if it has none. */
    private static String externalIds(final DrugType d, final ExternalIdentifierResourceType resource) {
        if (d.getExternalIdentifiers() == null) {
            return "";
        }
        return d.getExternalIdentifiers()
            .getExternalIdentifier()
            .stream()
            .filter(P -> P.getResource() == resource)
            .map(P -> P.getIdentifier())
            .collect(Collectors.joining(" "));
    }
}
# Path to the DrugBank XML dump; override with `make DATABASE=/path/to/file.xml`.
DATABASE?=${HOME}/full_database.xml

# `all` and `clean` name actions, not files: always run them even if a file
# with the same name exists in the directory.
.PHONY: all clean

# Compile the parser if needed, then stream the XML dump into it on stdin.
all: tmp/Drugbank.class $(DATABASE)
	echo "Scanning the XML drug-bank file $(DATABASE)"
	java -cp tmp Drugbank < "$(DATABASE)"

# Compile Drugbank.java together with the xjc-generated sources under tmp/ca/.
tmp/Drugbank.class: Drugbank.java tmp/ca/drugbank/ObjectFactory.java
	javac -d tmp -cp tmp Drugbank.java `find tmp/ca/ -type f -name '*.java'`

# Generate the JAXB classes from the DrugBank XML schema into tmp/.
# `touch -c` refreshes the timestamp of the target without creating it.
tmp/ca/drugbank/ObjectFactory.java:
	mkdir -p tmp && xjc -d tmp "https://www.drugbank.ca/docs/drugbank.xsd" && touch -c $@

# Remove generated sources and compiled classes.
clean:
	rm -rf tmp
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment