Skip to content

Instantly share code, notes, and snippets.

@yufree
Created November 1, 2018 18:55
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save yufree/f552d865096010445fc7b969e7e9d439 to your computer and use it in GitHub Desktop.
Save yufree/f552d865096010445fc7b969e7e9d439 to your computer and use it in GitHub Desktop.
python and R code to process hmdb xml database
library(tidyverse)
hmdb <- read_csv('hmdb.csv')
hmdb$iupac_name <- gsub( "b'", "", as.character(hmdb$iupac_name))
hmdb$iupac_name <- gsub( "'$", "", as.character(hmdb$iupac_name))
hmdb$name <- gsub( "b'", "", as.character(hmdb$name))
hmdb$name <- gsub( "'$", "", as.character(hmdb$name))
hmdb$omim <- gsub( "^\\[", "", as.character(hmdb$omim))
hmdb$omim <- gsub( "]$", "", as.character(hmdb$omim))
hmdb$dname <- gsub( "^\\[", "", as.character(hmdb$dname))
hmdb$dname <- gsub( "]$", "", as.character(hmdb$dname))
hmdb$smpdb <- gsub( "^\\[", "", as.character(hmdb$smpdb))
hmdb$smpdb <- gsub( "]$", "", as.character(hmdb$smpdb))
hmdb$keggmap <- gsub( "^\\[", "", as.character(hmdb$keggmap))
hmdb$keggmap <- gsub( "]$", "", as.character(hmdb$keggmap))
from io import StringIO
from lxml import etree
import csv
xml = 'hmdb_metabolites.xml'
ns = {'hmdb': 'http://www.hmdb.ca'}
context = etree.iterparse(xml, tag='{http://www.hmdb.ca}metabolite')
csvfile = open('hmdb.csv', 'w')
fieldnames = ['omim', 'dname', 'smpdb', 'keggmap', 'accession', 'monisotopic_molecular_weight', 'iupac_name', 'name', 'chemical_formula', 'InChIKey', 'cas_registry_number', 'kegg', 'food','drugbank','drugbankmet', 'biocyc', 'pubchem', 'chemspider', 'smiles', 'metlin_id', 'kingdom', 'direct_parent', 'super_class', 'class', 'sub_class', 'molecular_framework', 'logpexp']
writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
writer.writeheader()
for event, elem in context:
try:
omim = elem.xpath('hmdb:diseases/hmdb:disease/hmdb:omim_id/text()', namespaces=ns)
except:
omim = 'NA'
try:
dname = elem.xpath('hmdb:diseases/hmdb:disease/hmdb:name/text()', namespaces=ns)
except:
dname = 'NA'
try:
smpdb = elem.xpath('hmdb:pathways/hmdb:pathway/hmdb:smpdb_id/text()', namespaces=ns)
except:
smpdb = 'NA'
try:
keggmap = elem.xpath('hmdb:pathways/hmdb:pathway/hmdb:kegg_map_id/text()', namespaces=ns)
except:
keggmap = 'NA'
accession = elem.xpath('hmdb:accession/text()', namespaces=ns)[0]
try:
monisotopic_molecular_weight = elem.xpath('hmdb:monisotopic_molecular_weight/text()', namespaces=ns)[0]
except:
monisotopic_molecular_weight = 'NA'
try:
iupac_name = elem.xpath('hmdb:iupac_name/text()', namespaces=ns)[0].encode('utf-8')
except:
iupac_name = 'NA'
name = elem.xpath('hmdb:name/text()', namespaces=ns)[0].encode('utf-8')
try:
chemical_formula = elem.xpath('hmdb:chemical_formula/text()', namespaces=ns)[0]
except:
chemical_formula = 'NA'
try:
inchikey = elem.xpath('hmdb:inchikey/text()', namespaces=ns)[0]
except:
inchikey = 'NA'
try:
cas_registry_number = elem.xpath('hmdb:cas_registry_number/text()', namespaces=ns)[0]
except:
cas_registry_number = 'NA'
try:
kegg = elem.xpath('hmdb:kegg_id/text()', namespaces=ns)[0]
except:
kegg = 'NA'
try:
food = elem.xpath('hmdb:foodb_id/text()', namespaces=ns)[0]
except:
food = 'NA'
try:
drugbank = elem.xpath('hmdb:drugbank_id/text()', namespaces=ns)[0]
except:
drugbank = 'NA'
try:
drugbankmet = elem.xpath('hmdb:drugbank_metabolite_id/text()', namespaces=ns)[0]
except:
drugbankmet = 'NA'
try:
biocyc = elem.xpath('hmdb:biocyc_id/text()', namespaces=ns)[0]
except:
biocyc = 'NA'
try:
pubchem = elem.xpath('hmdb:pubchem_compound_id/text()', namespaces=ns)[0]
except:
pubchem = 'NA'
try:
chemspider = elem.xpath('hmdb:chemspider_id/text()', namespaces=ns)[0]
except:
chemspider = 'NA'
try:
smiles = elem.xpath('hmdb:smiles/text()', namespaces=ns)[0]
except:
smiles = 'NA'
try:
metlin_id = elem.xpath('hmdb:metlin_id/text()', namespaces=ns)[0]
except:
metlin_id = 'NA'
try:
logpexp = elem.xpath('hmdb:experimental_properties/hmdb:property[hmdb:kind ="logp"]/hmdb:value/text()', namespaces=ns)[0]
except:
logpexp = 'NA'
try:
kingdom = elem.xpath('hmdb:taxonomy/hmdb:kingdom/text()', namespaces=ns)[0]
except:
kingdom = 'NA'
try:
direct_parent = elem.xpath('hmdb:taxonomy/hmdb:direct_parent/text()', namespaces=ns)[0]
except:
direct_parent = 'NA'
try:
super_class = elem.xpath('hmdb:taxonomy/hmdb:super_class/text()', namespaces=ns)[0]
except:
super_class = 'NA'
try:
classorg = elem.xpath('hmdb:taxonomy/hmdb:class/text()', namespaces=ns)[0]
except:
classorg = 'NA'
try:
sub_class = elem.xpath('hmdb:taxonomy/hmdb:sub_class/text()', namespaces=ns)[0]
except:
sub_class = 'NA'
try:
molecular_framework = elem.xpath('hmdb:taxonomy/hmdb:molecular_framework/text()', namespaces=ns)[0]
except:
molecular_framework = 'NA'
writer.writerow({'omim': omim, 'dname': dname, 'smpdb': smpdb, 'keggmap': keggmap, 'accession': accession, 'monisotopic_molecular_weight': monisotopic_molecular_weight, 'iupac_name': iupac_name, 'name': name, 'chemical_formula': chemical_formula, 'InChIKey': inchikey, 'cas_registry_number': cas_registry_number, 'kegg': kegg, 'food': food, 'drugbank': drugbank, 'drugbankmet': drugbankmet,'biocyc': biocyc, 'pubchem': pubchem, 'chemspider': chemspider, 'smiles': smiles, 'metlin_id': metlin_id, 'kingdom': kingdom, 'direct_parent': direct_parent, 'super_class': super_class, 'class': classorg, 'sub_class': sub_class, 'molecular_framework': molecular_framework, 'logpexp': logpexp})
# It's safe to call clear() here because no descendants will be
# accessed
elem.clear()
# Also eliminate now-empty references from the root node to elem
for ancestor in elem.xpath('ancestor-or-self::*'):
while ancestor.getprevious() is not None:
del ancestor.getparent()[0]
del context
@ajnonev
Copy link

ajnonev commented Feb 9, 2024

Hi yufree, thank you for posting this extremely helpful code. I've utilized it to ingest hmdb xml data, but I see it is skipping over empty list values. Is there a systematic way to address it in the code. Here is an example, if you look at metabolite "HMDB0000002" it has 3 abnormal specimens, but only 2 of them have concentrations. the text() lists extract all non-empty values and the specimen-concentration pairs can't be reproduced, because the indexing is no longer preserved.

Is there a way to address the empty elements and print them as well to preserve the data integrity?

Thank you kindly,
AJ Nonev

from io import StringIO
from lxml import etree
import csv
def hmdbextract(name, file):
  ns = {'hmdb': 'http://www.hmdb.ca'}
  context = etree.iterparse(name, tag='{http://www.hmdb.ca}metabolite')
  csvfile = open(file, 'w')
  fieldnames = ['accession', 'monisotopic_molecular_weight', 'iupac_name', 'name', 'chemical_formula', 'InChIKey', 'cas_registry_number', 'smiles', 'drugbank','chebi_id', 'pubchem', 'phenol_explorer_compound_id','food','knapsack', 'chemspider', 'kegg', 'meta_cyc','bigg','metlin_id','pdb_id', 'logpexp','kingdom',  'direct_parent', 'super_class', 'class', 'sub_class', 'molecular_framework', 'diseases', 'biospecimen_normal','normal_concentration_value', 'biospecimen_abnormal', 'abnormal_concentration_value']
  writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
  writer.writeheader()
  for event, elem in context:

    accession = elem.xpath('hmdb:accession/text()', namespaces=ns)[0]
    try:
        monisotopic_molecular_weight = elem.xpath('hmdb:monisotopic_molecular_weight/text()', namespaces=ns)[0]
    except:
        monisotopic_molecular_weight = 'NA'
    try:
        iupac_name = elem.xpath('hmdb:iupac_name/text()', namespaces=ns)[0]
    except:
        iupac_name = 'NA'
    try:
        name = elem.xpath('hmdb:name/text()', namespaces=ns)[0]
    except:
        name = 'NA'
    try:
        chemical_formula = elem.xpath('hmdb:chemical_formula/text()', namespaces=ns)[0]
    except:
        chemical_formula = 'NA'
    try:
        inchikey = elem.xpath('hmdb:inchikey/text()', namespaces=ns)[0]
    except:
        inchikey = 'NA'
    try:
        cas_registry_number = elem.xpath('hmdb:cas_registry_number/text()', namespaces=ns)[0]
    except:
        cas_registry_number = 'NA'
    try:
        smiles = elem.xpath('hmdb:smiles/text()', namespaces=ns)[0]
    except:
        smiles = 'NA'
    try:
        drugbank = elem.xpath('hmdb:drugbank_id/text()', namespaces=ns)[0]
    except:
        drugbank = 'NA'
    try:
        chebi_id = elem.xpath('hmdb:chebi_id/text()', namespaces=ns)[0]
    except:
        chebi_id = 'NA'
    try:
        pubchem = elem.xpath('hmdb:pubchem_compound_id/text()', namespaces=ns)[0]
    except:
        pubchem = 'NA'
    try:
        phenol_explorer_compound_id = elem.xpath('hmdb:phenol_explorer_compound_id/text()', namespaces=ns)[0]
    except:
        phenol_explorer_compound_id = 'NA'
    try:
        food = elem.xpath('hmdb:foodb_id/text()', namespaces=ns)[0]
    except:
        food = 'NA'
    try:
        knapsack = elem.xpath('hmdb:knapsack_id/text()', namespaces=ns)[0]
    except:
        knapsack = 'NA'
    try:
        chemspider = elem.xpath('hmdb:chemspider_id/text()', namespaces=ns)[0]
    except:
        chemspider = 'NA'
    try:
        kegg = elem.xpath('hmdb:kegg_id/text()', namespaces=ns)[0]
    except:
        kegg = 'NA'
    try:
        meta_cyc = elem.xpath('hmdb:meta_cyc_id/text()', namespaces=ns)[0]
    except:
        meta_cyc = 'NA'
    try:
        bigg = elem.xpath('hmdb:bigg_id/text()', namespaces=ns)[0]
    except:
        bigg = 'NA'
    try:
        metlin_id = elem.xpath('hmdb:metlin_id/text()', namespaces=ns)[0]
    except:
        metlin_id = 'NA'
    try:
        pdb_id = elem.xpath('hmdb:pdb_id/text()', namespaces=ns)[0]
    except:
        pdb_id = 'NA'
    try:
        logpexp = elem.xpath('hmdb:experimental_properties/hmdb:property[hmdb:kind = "logp"]/hmdb:value/text()', namespaces=ns)[0]
    except:
        logpexp = 'NA'
    try:
        kingdom = elem.xpath('hmdb:taxonomy/hmdb:kingdom/text()', namespaces=ns)[0]
    except:
        kingdom = 'NA'
    try:
        direct_parent = elem.xpath('hmdb:taxonomy/hmdb:direct_parent/text()', namespaces=ns)[0]
    except:
        direct_parent = 'NA'
    try:
        super_class = elem.xpath('hmdb:taxonomy/hmdb:super_class/text()', namespaces=ns)[0]
    except:
        super_class = 'NA'
    try:
        classorg = elem.xpath('hmdb:taxonomy/hmdb:class/text()', namespaces=ns)[0]
    except:
        classorg = 'NA'
    try:
        sub_class = elem.xpath('hmdb:taxonomy/hmdb:sub_class/text()', namespaces=ns)[0]
    except:
        sub_class = 'NA'
    try:
        molecular_framework = elem.xpath('hmdb:taxonomy/hmdb:molecular_framework/text()', namespaces=ns)[0]
    except:
        molecular_framework = 'NA'
    try:
        diseases = elem.xpath('hmdb:diseases/hmdb:disease/hmdb:name/text()', namespaces=ns)
    except:
        diseases = 'NA'
    try:
        biospecimen_normal = elem.xpath('hmdb:normal_concentrations/hmdb:concentration/hmdb:biospecimen/text()', namespaces=ns)
    except:
        biospecimen_normal = 'NA'
    try:
        normal_concentration_value = elem.xpath('hmdb:normal_concentrations/hmdb:concentration/hmdb:concentration_value/text()', namespaces=ns)
    except:
        normal_concentration_value = 'NA'
    try:
        biospecimen_abnormal = elem.xpath('hmdb:abnormal_concentrations/hmdb:concentration/hmdb:biospecimen/text()', namespaces=ns)
    except:
        biospecimen_abnormal = 'NA'
    try:
        abnormal_concentration_value = elem.xpath('hmdb:abnormal_concentrations/hmdb:concentration/hmdb:concentration_value/text()', namespaces=ns)
    except:
        abnormal_concentration_value = 'NA'

    writer.writerow({'accession': accession, 'monisotopic_molecular_weight': monisotopic_molecular_weight, 'iupac_name': iupac_name, 'name': name, 'chemical_formula': chemical_formula, 'InChIKey': inchikey, 'cas_registry_number': cas_registry_number, 'smiles': smiles,'drugbank': drugbank,'chebi_id': chebi_id,'pubchem': pubchem,'phenol_explorer_compound_id':phenol_explorer_compound_id, 'food': food,'knapsack': knapsack, 'chemspider': chemspider,'kegg': kegg, 'meta_cyc': meta_cyc, 'bigg':bigg, 'metlin_id': metlin_id, 'pdb_id':pdb_id,'logpexp':logpexp, 'kingdom': kingdom, 'direct_parent': direct_parent, 'super_class': super_class, 'class': classorg, 'sub_class': sub_class, 'molecular_framework': molecular_framework, 'diseases': diseases, 'biospecimen_normal': biospecimen_normal,'normal_concentration_value': normal_concentration_value, 'biospecimen_abnormal': biospecimen_abnormal, 'abnormal_concentration_value': abnormal_concentration_value})
    # It's safe to call clear() here because no descendants will be
    # accessed
    elem.clear()
# Also eliminate now-empty references from the root node to elem
    for ancestor in elem.xpath('ancestor-or-self::*'):
        while ancestor.getprevious() is not None:
            del ancestor.getparent()[0]
  del context
  return;

@yufree
Copy link
Author

yufree commented Feb 10, 2024

Hi, I am in vacation and don't have an IDE to test. Since each compound will have multiple biospecimen/concenrtration pairs, it's better to save those value as a new table to reserve NA value for each biospecimen. You might write another for loop inside the original for loop to extract all the biospecimen with concentration values and save them as a table of "hmdbID-biospecimen-concentration". The original code will only have one line for each hmdbID while you need multiple lines for different biospecimen.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment