Last active
March 26, 2019 03:52
-
-
Save samuell/fc82fad39e7efda7987fc18173777f7f to your computer and use it in GitHub Desktop.
Accompanying code for this blog post: http://bionics.it/posts/parsing-drugbank-xml-or-any-large-xml-file-in-streaming-mode-in-go
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
package main

import (
	"encoding/csv"
	"encoding/xml"
	"io"
	"os"

	sp "github.com/scipipe/scipipe"
)
// --------------------------------------------------------------------------------
// Workflow definition
// --------------------------------------------------------------------------------
func main() { | |
wf := sp.NewWorkflow("exvsdb", 2) | |
// DrugBank XML | |
download := wf.NewProc("download", "curl -Lfv -o {o:zip} -u $(cat drugbank_userinfo.txt) https://www.drugbank.ca/releases/5-0-11/downloads/all-full-database") | |
download.SetPathStatic("zip", "dat/drugbank.zip") | |
unzip := wf.NewProc("unzip", `unzip -d dat/ {i:zip}; mv "dat/full database.xml" {o:xml}`) | |
unzip.SetPathStatic("xml", "dat/drugbank.xml") | |
unzip.In("zip").Connect(download.Out("zip")) | |
xmlToTSV := wf.NewProc("xml2tsv", "# Custom Go code with input: {i:xml} and output: {o:tsv}") | |
xmlToTSV.SetPathExtend("xml", "tsv", ".extr.tsv") | |
xmlToTSV.In("xml").Connect(unzip.Out("xml")) | |
xmlToTSV.CustomExecute = NewXMLToTSVFunc() // Getting the custom Go function in a factory method for readability | |
wf.Run() | |
} | |
// --------------------------------------------------------------------------------
// DrugBank struct definitions
// --------------------------------------------------------------------------------
type Drugbank struct { | |
XMLName xml.Name `xml:"drugbank"` | |
Drugs []Drug `xml:"drug"` | |
} | |
type Drug struct { | |
XMLName xml.Name `xml:"drug"` | |
Name string `xml:"name"` | |
Groups []string `xml:"groups>group"` | |
CalculatedProperties []Property `xml:"calculated-properties>property"` | |
ExternalIdentifiers []ExternalIdentifier `xml:"external-identifiers>external-identifier"` | |
} | |
// Property maps a <property> element, made up of a kind (for example
// "InChIKey"), its value, and the source of that value.
type Property struct {
	XMLName xml.Name `xml:"property"`
	Kind    string   `xml:"kind"`
	Value   string   `xml:"value"`
	Source  string   `xml:"source"`
}
// ExternalIdentifier maps an <external-identifier> element: the name of the
// external resource (for example "ChEMBL") and the drug's identifier there.
type ExternalIdentifier struct {
	XMLName    xml.Name `xml:"external-identifier"`
	Resource   string   `xml:"resource"`
	Identifier string   `xml:"identifier"`
}
// --------------------------------------------------------------------------------
// Components
// --------------------------------------------------------------------------------
// NewXMLToTSVFunc returns a CustomExecute function to be used by the XML to TSV | |
// component in the workflow above | |
func NewXMLToTSVFunc() func(t *sp.Task) { | |
return func(t *sp.Task) { | |
fh, err := os.Open(t.InPath("xml")) | |
if err != nil { | |
sp.Fail("Could not open file", t.InPath("xml")) | |
} | |
tsvWrt := csv.NewWriter(t.OutIP("tsv").OpenWriteTemp()) | |
tsvWrt.Comma = '\t' | |
tsvHeader := []string{"inchikey", "status", "chembl_id", "pubchem_sid", "pubchem_cid"} | |
tsvWrt.Write(tsvHeader) | |
// Implement a streaming XML parser according to guide in | |
// http://blog.davidsingleton.org/parsing-huge-xml-files-with-go | |
xmlDec := xml.NewDecoder(fh) | |
for { | |
t, tokenErr := xmlDec.Token() | |
if tokenErr != nil { | |
if tokenErr == io.EOF { | |
break | |
} else { | |
sp.Fail("Failed to read token:", tokenErr) | |
} | |
} | |
switch startElem := t.(type) { | |
case xml.StartElement: | |
if startElem.Name.Local == "drug" { | |
var status string | |
var inchiKey string | |
var chemblID string | |
var pubchemSID string | |
var pubchemCID string | |
drug := &Drug{} | |
decErr := xmlDec.DecodeElement(drug, &startElem) | |
if err != nil { | |
sp.Fail("Could not decode element", decErr) | |
} | |
for _, g := range drug.Groups { | |
if g == "approved" { | |
status = "A" | |
} | |
// Withdrawn till "shadow" (what's the correct term?) approved status | |
if g == "withdrawn" { | |
status = "W" | |
} | |
} | |
for _, p := range drug.CalculatedProperties { | |
if p.Kind == "InChIKey" { | |
inchiKey = p.Value | |
} | |
} | |
for _, eid := range drug.ExternalIdentifiers { | |
if eid.Resource == "ChEMBL" { | |
chemblID = eid.Identifier | |
} else if eid.Resource == "PubChem Substance" { | |
pubchemSID = eid.Identifier | |
} else if eid.Resource == "PubChem Compound" { | |
pubchemCID = eid.Identifier | |
} | |
} | |
tsvWrt.Write([]string{inchiKey, status, chemblID, pubchemSID, pubchemCID}) | |
} | |
case xml.EndElement: | |
continue | |
} | |
} | |
tsvWrt.Flush() | |
fh.Close() | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment