// Accompanying code for this blog post:
// http://bionics.it/posts/parsing-drugbank-xml-or-any-large-xml-file-in-streaming-mode-in-go
package main
import (
	"encoding/csv"
	"encoding/xml"
	"io"
	"os"

	sp "github.com/scipipe/scipipe"
)
// --------------------------------------------------------------------------------
// Workflow definition
// --------------------------------------------------------------------------------
func main() { | |
wf := sp.NewWorkflow("exvsdb", 2) | |
// DrugBank XML | |
download := wf.NewProc("download", "curl -Lfv -o {o:zip} -u $(cat drugbank_userinfo.txt) https://www.drugbank.ca/releases/5-0-11/downloads/all-full-database") | |
download.SetPathStatic("zip", "dat/drugbank.zip") | |
unzip := wf.NewProc("unzip", `unzip -d dat/ {i:zip}; mv "dat/full database.xml" {o:xml}`) | |
unzip.SetPathStatic("xml", "dat/drugbank.xml") | |
unzip.In("zip").Connect(download.Out("zip")) | |
xmlToTSV := wf.NewProc("xml2tsv", "# Custom Go code with input: {i:xml} and output: {o:tsv}") | |
xmlToTSV.SetPathExtend("xml", "tsv", ".extr.tsv") | |
xmlToTSV.In("xml").Connect(unzip.Out("xml")) | |
xmlToTSV.CustomExecute = NewXMLToTSVFunc() // Getting the custom Go function in a factory method for readability | |
wf.Run() | |
} | |
// --------------------------------------------------------------------------------
// DrugBank struct definitions
// --------------------------------------------------------------------------------
type Drugbank struct { | |
XMLName xml.Name `xml:"drugbank"` | |
Drugs []Drug `xml:"drug"` | |
} | |
type Drug struct { | |
XMLName xml.Name `xml:"drug"` | |
Name string `xml:"name"` | |
Groups []string `xml:"groups>group"` | |
CalculatedProperties []Property `xml:"calculated-properties>property"` | |
ExternalIdentifiers []ExternalIdentifier `xml:"external-identifiers>external-identifier"` | |
} | |
// Property maps a <property> element with its <kind>, <value> and <source>
// child elements.
type Property struct {
	XMLName xml.Name `xml:"property"`
	Kind    string   `xml:"kind"`
	Value   string   `xml:"value"`
	Source  string   `xml:"source"`
}
// ExternalIdentifier maps an <external-identifier> element: the name of the
// external <resource> and the <identifier> the drug has in that resource.
type ExternalIdentifier struct {
	XMLName    xml.Name `xml:"external-identifier"`
	Resource   string   `xml:"resource"`
	Identifier string   `xml:"identifier"`
}
// --------------------------------------------------------------------------------
// Components
// --------------------------------------------------------------------------------
// NewXMLToTSVFunc returns a CustomExecute function to be used by the XML to TSV | |
// component in the workflow above | |
func NewXMLToTSVFunc() func(t *sp.Task) { | |
return func(t *sp.Task) { | |
fh, err := os.Open(t.InPath("xml")) | |
if err != nil { | |
sp.Fail("Could not open file", t.InPath("xml")) | |
} | |
tsvWrt := csv.NewWriter(t.OutIP("tsv").OpenWriteTemp()) | |
tsvWrt.Comma = '\t' | |
tsvHeader := []string{"inchikey", "status", "chembl_id", "pubchem_sid", "pubchem_cid"} | |
tsvWrt.Write(tsvHeader) | |
// Implement a streaming XML parser according to guide in | |
// http://blog.davidsingleton.org/parsing-huge-xml-files-with-go | |
xmlDec := xml.NewDecoder(fh) | |
for { | |
t, tokenErr := xmlDec.Token() | |
if tokenErr != nil { | |
if tokenErr == io.EOF { | |
break | |
} else { | |
sp.Fail("Failed to read token:", tokenErr) | |
} | |
} | |
switch startElem := t.(type) { | |
case xml.StartElement: | |
if startElem.Name.Local == "drug" { | |
var status string | |
var inchiKey string | |
var chemblID string | |
var pubchemSID string | |
var pubchemCID string | |
drug := &Drug{} | |
decErr := xmlDec.DecodeElement(drug, &startElem) | |
if err != nil { | |
sp.Fail("Could not decode element", decErr) | |
} | |
for _, g := range drug.Groups { | |
if g == "approved" { | |
status = "A" | |
} | |
// Withdrawn till "shadow" (what's the correct term?) approved status | |
if g == "withdrawn" { | |
status = "W" | |
} | |
} | |
for _, p := range drug.CalculatedProperties { | |
if p.Kind == "InChIKey" { | |
inchiKey = p.Value | |
} | |
} | |
for _, eid := range drug.ExternalIdentifiers { | |
if eid.Resource == "ChEMBL" { | |
chemblID = eid.Identifier | |
} else if eid.Resource == "PubChem Substance" { | |
pubchemSID = eid.Identifier | |
} else if eid.Resource == "PubChem Compound" { | |
pubchemCID = eid.Identifier | |
} | |
} | |
tsvWrt.Write([]string{inchiKey, status, chemblID, pubchemSID, pubchemCID}) | |
} | |
case xml.EndElement: | |
continue | |
} | |
} | |
tsvWrt.Flush() | |
fh.Close() | |
} | |
} |
// Web-page footer captured together with the gist (not part of the program):
// "Sign up for free to join this conversation on GitHub. Already have an
// account? Sign in to comment"