Skip to content

Instantly share code, notes, and snippets.

@yufree
Created October 2, 2018 18:55
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save yufree/834c8d2679961f6f0988716dcbb6d189 to your computer and use it in GitHub Desktop.
Save yufree/834c8d2679961f6f0988716dcbb6d189 to your computer and use it in GitHub Desktop.
t3db
# Clean up the table exported to t3db.csv and save the result as t3dbnew.csv.
#
# The name columns may arrive as the textual repr of a Python bytes object,
# i.e. wrapped as b'...' or, when the value itself contains an apostrophe,
# as b"...". Only a complete wrapper is removed: a "b'" occurring inside a
# name, or a genuine trailing apostrophe on an unwrapped value, is left alone.
# NOTE: backslash escapes inside such a repr (e.g. \xNN for non-ASCII bytes)
# are not decoded here.
strip_bytes_repr <- function(x) {
  x <- as.character(x)
  x <- sub("^b'(.*)'$", "\\1", x)
  sub("^b\"(.*)\"$", "\\1", x)
}

t3db <- read_csv("t3db.csv")
t3db$iupac_name <- strip_bytes_repr(t3db$iupac_name)
t3db$name <- strip_bytes_repr(t3db$name)
# Drop the "InChIKey=" label only where it is a prefix of the value.
t3db$InChIKey <- sub("^InChIKey=", "", as.character(t3db$InChIKey))
write.csv(t3db,file = 't3dbnew.csv')
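# Before running: remove the repeated <?xml version="1.0" encoding="UTF-8"?> declarations from the XML file
# and wrap its contents in a single <T3DB></T3DB> root element so that it is one well-formed XML document.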
from io import StringIO
from lxml import etree
import csv
# Stream every <compound> element of toxins.xml and write one CSV row per
# compound to t3db.csv. Fields that are absent from a compound are written
# as the string 'NA'.
xml = 'toxins.xml'

# (CSV column, XPath relative to the <compound> element) for every optional
# field, in output column order. The XPath tag names are exactly the ones the
# script has always queried, including 'monisotopic_moleculate_weight'.
# NOTE(review): confirm that spelling against the XML before "correcting" it.
optional_fields = [
    ('monisotopic_molecular_weight', 'monisotopic_moleculate_weight/text()'),
    ('iupac_name', 'iupac_name/text()'),
    ('name', 'common_name/text()'),
    ('chemical_formula', 'chemical_formula/text()'),
    ('InChIKey', 'inchikey/text()'),
    ('cas_registry_number', 'cas_registry_number/text()'),
    ('hmdb', 'hmdb_id/text()'),
    ('kegg', 'kegg_id/text()'),
    ('omim', 'omim_id/text()'),
    ('biocyc', 'biocyc_id/text()'),
    ('pubchem', 'pubchem_compound_id/text()'),
    ('chemspider', 'chemspider_id/text()'),
    ('smiles', 'smiles/text()'),
    ('kingdom', 'taxonomy/kingdom/text()'),
    ('direct_parent', 'taxonomy/direct_parent/text()'),
    ('super_class', 'taxonomy/super_class/text()'),
    ('class', 'taxonomy/class/text()'),
    ('sub_class', 'taxonomy/sub_class/text()'),
    ('molecular_framework', 'taxonomy/molecular_framework/text()'),
    ('origin', 'origin/text()'),
]
fieldnames = ['accession'] + [column for column, _ in optional_fields]


def first_text(elem, path, default='NA'):
    """Return the first text node matched by *path* under *elem*.

    Returns *default* when the XPath matches nothing (element missing or
    empty), instead of relying on a bare ``except`` around ``[0]``.
    """
    matches = elem.xpath(path)
    return matches[0] if matches else default


context = etree.iterparse(xml, tag='compound')

# newline='' is what the csv module expects from its file object; the explicit
# UTF-8 encoding lets non-ASCII names be written as text. Values are written
# as str (not .encode()d bytes), so no "b'...'" wrappers end up in the CSV.
# The with-block guarantees the file is flushed and closed, even on error.
with open('t3db.csv', 'w', newline='', encoding='utf-8') as csvfile:
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
    writer.writeheader()
    for event, elem in context:
        # The accession is required: a compound without one still raises
        # IndexError, as it did before, rather than being written as 'NA'.
        row = {'accession': elem.xpath('accession/text()')[0]}
        for column, path in optional_fields:
            row[column] = first_text(elem, path)
        writer.writerow(row)
        # It's safe to call clear() here because no descendants will be
        # accessed
        elem.clear()
        # Also eliminate now-empty references from the root node to elem
        for ancestor in elem.xpath('ancestor-or-self::*'):
            while ancestor.getprevious() is not None:
                del ancestor.getparent()[0]
del context
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment