Skip to content

Instantly share code, notes, and snippets.

@samuell
Last active March 26, 2019 03:52
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 1 You must be signed in to fork a gist
  • Save samuell/fc82fad39e7efda7987fc18173777f7f to your computer and use it in GitHub Desktop.
Save samuell/fc82fad39e7efda7987fc18173777f7f to your computer and use it in GitHub Desktop.
package main
import (
"encoding/csv"
"encoding/xml"
"io"
"os"
sp "github.com/scipipe/scipipe"
)
// --------------------------------------------------------------------------------
// Workflow definition
// --------------------------------------------------------------------------------
// main assembles the "exvsdb" workflow and runs it. The workflow has three
// chained processes: "download" (curl fetches the DrugBank archive, with the
// credentials read from drugbank_userinfo.txt by the shell), "unzip" (unpacks
// the archive and renames the contained XML file) and "xml2tsv" (custom Go
// code that extracts selected fields from the XML into a TSV file).
func main() {
	workflow := sp.NewWorkflow("exvsdb", 2)

	// Declare the processes in pipeline order.
	fetchProc := workflow.NewProc("download", "curl -Lfv -o {o:zip} -u $(cat drugbank_userinfo.txt) https://www.drugbank.ca/releases/5-0-11/downloads/all-full-database")
	unpackProc := workflow.NewProc("unzip", `unzip -d dat/ {i:zip}; mv "dat/full database.xml" {o:xml}`)
	// The shell command of this process is only a placeholder comment; the
	// actual work is done by the CustomExecute function assigned below.
	convertProc := workflow.NewProc("xml2tsv", "# Custom Go code with input: {i:xml} and output: {o:tsv}")
	convertProc.CustomExecute = NewXMLToTSVFunc()

	// Configure where each process writes its output.
	fetchProc.SetPathStatic("zip", "dat/drugbank.zip")
	unpackProc.SetPathStatic("xml", "dat/drugbank.xml")
	// Presumably derives the tsv path by appending ".extr.tsv" to the xml
	// in-port path; confirm against the scipipe documentation.
	convertProc.SetPathExtend("xml", "tsv", ".extr.tsv")

	// Wire the data flow: download -> unzip -> xml2tsv.
	unpackProc.In("zip").Connect(fetchProc.Out("zip"))
	convertProc.In("xml").Connect(unpackProc.Out("xml"))

	workflow.Run()
}
// --------------------------------------------------------------------------------
// DrugBank struct definitions
// --------------------------------------------------------------------------------
// Drugbank maps the root <drugbank> XML element. Drugs collects every direct
// <drug> child element of the root.
type Drugbank struct {
	XMLName xml.Name `xml:"drugbank"`
	Drugs   []Drug   `xml:"drug"`
}
// Drug maps a single <drug> XML element. Only the parts needed for the TSV
// extraction are declared:
//
//   - Name:                 text of the <name> child
//   - Groups:               text of each <groups>/<group> element
//   - CalculatedProperties: each <calculated-properties>/<property> element
//   - ExternalIdentifiers:  each <external-identifiers>/<external-identifier> element
type Drug struct {
	XMLName              xml.Name             `xml:"drug"`
	Name                 string               `xml:"name"`
	Groups               []string             `xml:"groups>group"`
	CalculatedProperties []Property           `xml:"calculated-properties>property"`
	ExternalIdentifiers  []ExternalIdentifier `xml:"external-identifiers>external-identifier"`
}
// Property maps a <property> XML element, exposing the text of its <kind>,
// <value> and <source> child elements.
type Property struct {
	XMLName xml.Name `xml:"property"`
	Kind    string   `xml:"kind"`
	Value   string   `xml:"value"`
	Source  string   `xml:"source"`
}
// ExternalIdentifier maps an <external-identifier> XML element, exposing the
// text of its <resource> and <identifier> child elements.
type ExternalIdentifier struct {
	XMLName    xml.Name `xml:"external-identifier"`
	Resource   string   `xml:"resource"`
	Identifier string   `xml:"identifier"`
}
// --------------------------------------------------------------------------------
// Components
// --------------------------------------------------------------------------------
// NewXMLToTSVFunc returns a CustomExecute function to be used by the XML to TSV
// component in the workflow above.
//
// The returned function reads the XML file on the task's "xml" in-port as a
// token stream, decodes each <drug> element it encounters, and writes one
// tab-separated row per drug to the temporary file of the "tsv" out-port,
// preceded by a header row. Any failure to open, parse, decode or write is
// reported through sp.Fail.
func NewXMLToTSVFunc() func(t *sp.Task) {
	return func(t *sp.Task) {
		xmlPath := t.InPath("xml")
		xmlFile, err := os.Open(xmlPath)
		if err != nil {
			sp.Fail("Could not open file", xmlPath, err)
		}
		defer xmlFile.Close()

		tsvFile := t.OutIP("tsv").OpenWriteTemp()
		tsvWrt := csv.NewWriter(tsvFile)
		tsvWrt.Comma = '\t'
		tsvHeader := []string{"inchikey", "status", "chembl_id", "pubchem_sid", "pubchem_cid"}
		if writeErr := tsvWrt.Write(tsvHeader); writeErr != nil {
			sp.Fail("Could not write TSV header:", writeErr)
		}

		// Implement a streaming XML parser according to guide in
		// http://blog.davidsingleton.org/parsing-huge-xml-files-with-go
		xmlDec := xml.NewDecoder(xmlFile)
		for {
			// Named tok (not t) so the task parameter is not shadowed.
			tok, tokenErr := xmlDec.Token()
			if tokenErr == io.EOF {
				break
			}
			if tokenErr != nil {
				sp.Fail("Failed to read token:", tokenErr)
			}

			startElem, isStart := tok.(xml.StartElement)
			if !isStart || startElem.Name.Local != "drug" {
				continue
			}

			// DecodeElement consumes everything up to the matching end tag.
			drug := &Drug{}
			if decErr := xmlDec.DecodeElement(drug, &startElem); decErr != nil {
				sp.Fail("Could not decode element", decErr)
			}
			if writeErr := tsvWrt.Write(drugToTSVRow(drug)); writeErr != nil {
				sp.Fail("Could not write TSV row:", writeErr)
			}
		}

		// Flush does not return an error itself; it has to be fetched via Error.
		tsvWrt.Flush()
		if flushErr := tsvWrt.Error(); flushErr != nil {
			sp.Fail("Could not flush TSV output:", flushErr)
		}
		if closeErr := tsvFile.Close(); closeErr != nil {
			sp.Fail("Could not close TSV output:", closeErr)
		}
	}
}

// drugToTSVRow flattens a decoded drug into one TSV row, in the column order
// inchikey, status, chembl_id, pubchem_sid, pubchem_cid. Columns for which the
// drug carries no matching entry are left empty; if several entries match the
// same column, the last one wins.
func drugToTSVRow(drug *Drug) []string {
	var status, inchiKey, chemblID, pubchemSID, pubchemCID string

	for _, group := range drug.Groups {
		switch group {
		case "approved":
			// Withdrawn shadows approved, regardless of the order in which
			// the groups are listed.
			if status != "W" {
				status = "A"
			}
		case "withdrawn":
			status = "W"
		}
	}

	for _, prop := range drug.CalculatedProperties {
		if prop.Kind == "InChIKey" {
			inchiKey = prop.Value
		}
	}

	for _, extID := range drug.ExternalIdentifiers {
		switch extID.Resource {
		case "ChEMBL":
			chemblID = extID.Identifier
		case "PubChem Substance":
			pubchemSID = extID.Identifier
		case "PubChem Compound":
			pubchemCID = extID.Identifier
		}
	}

	return []string{inchiKey, status, chemblID, pubchemSID, pubchemCID}
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment