yufree/clean.r

## clean.r
t3db <- read_csv("t3db.csv")
t3db$iupac_name <- gsub( "b'", "", as.character(t3db$iupac_name))
t3db$iupac_name <- gsub( "'$", "", as.character(t3db$iupac_name))

t3db$name <- gsub( "b'", "", as.character(t3db$name))
t3db$name <- gsub( "'$", "", as.character(t3db$name))

t3db$InChIKey <- gsub( "InChIKey=", "", as.character(t3db$InChIKey))
write.csv(t3db,file = 't3dbnew.csv')

## t3db.py
# remove multiple <?xml version="1.0" encoding="UTF-8"?> and add <T3DB></T3DB> to make a formal format
from io import StringIO
from lxml import etree
import csv
xml = 'toxins.xml'

context = etree.iterparse(xml, tag='compound')

csvfile = open('t3db.csv', 'w')
fieldnames = ['accession', 'monisotopic_molecular_weight', 'iupac_name', 'name', 'chemical_formula', 'InChIKey', 'cas_registry_number', 'hmdb', 'kegg', 'omim', 'biocyc', 'pubchem', 'chemspider', 'smiles', 'kingdom', 'direct_parent', 'super_class', 'class', 'sub_class', 'molecular_framework','origin']
writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
writer.writeheader()

for event, elem in context:

    accession = elem.xpath('accession/text()')[0]
    try:
        monisotopic_molecular_weight = elem.xpath('monisotopic_moleculate_weight/text()')[0]
    except:
        monisotopic_molecular_weight = 'NA'
    try:
        iupac_name = elem.xpath('iupac_name/text()')[0].encode('utf-8')
    except:
        iupac_name = 'NA'
    try:
        name = elem.xpath('common_name/text()')[0].encode('utf-8')
    except:
        name = 'NA'
    try:
        chemical_formula = elem.xpath('chemical_formula/text()')[0]
    except:
        chemical_formula = 'NA'

    try:
        inchikey = elem.xpath('inchikey/text()')[0]
    except:
        inchikey = 'NA'

    try:
        cas_registry_number = elem.xpath('cas_registry_number/text()')[0]
    except:
        cas_registry_number = 'NA'
    try:
        hmdb = elem.xpath('hmdb_id/text()')[0]
    except:
        hmdb = 'NA'
    try:
        kegg = elem.xpath('kegg_id/text()')[0]
    except:
        kegg = 'NA'
    try:
        omim = elem.xpath('omim_id/text()')[0]
    except:
        omim = 'NA'
    try:
        biocyc = elem.xpath('biocyc_id/text()')[0]
    except:
        biocyc = 'NA'
    try:
        pubchem = elem.xpath('pubchem_compound_id/text()')[0]
    except:
        pubchem = 'NA'
    try:
        chemspider = elem.xpath('chemspider_id/text()')[0]
    except:
        chemspider = 'NA'
    try:
        smiles = elem.xpath('smiles/text()')[0]
    except:
        smiles = 'NA'
    try:
        kingdom = elem.xpath('taxonomy/kingdom/text()')[0]
    except:
        kingdom = 'NA'
    try:
        direct_parent = elem.xpath('taxonomy/direct_parent/text()')[0]
    except:
        direct_parent = 'NA'
    try:
        super_class = elem.xpath('taxonomy/super_class/text()')[0]
    except:
        super_class = 'NA'
    try:
        classorg = elem.xpath('taxonomy/class/text()')[0]
    except:
        classorg = 'NA'
    try:
        sub_class = elem.xpath('taxonomy/sub_class/text()')[0]
    except:
        sub_class = 'NA'
    try:
        molecular_framework = elem.xpath('taxonomy/molecular_framework/text()')[0]
    except:
        molecular_framework = 'NA'
    try:
        origin = elem.xpath('origin/text()')[0]
    except:
        origin = 'NA'

    writer.writerow({'accession': accession, 'monisotopic_molecular_weight': monisotopic_molecular_weight, 'iupac_name': iupac_name, 'name': name, 'chemical_formula': chemical_formula, 'InChIKey': inchikey, 'cas_registry_number': cas_registry_number, 'hmdb': hmdb, 'kegg': kegg, 'omim': omim, 'biocyc': biocyc, 'pubchem': pubchem, 'chemspider': chemspider, 'smiles': smiles, 'kingdom': kingdom, 'direct_parent': direct_parent, 'super_class': super_class, 'class': classorg, 'sub_class': sub_class, 'molecular_framework': molecular_framework, 'origin':origin})
    # It's safe to call clear() here because no descendants will be
    # accessed
    elem.clear()
# Also eliminate now-empty references from the root node to elem
    for ancestor in elem.xpath('ancestor-or-self::*'):
        while ancestor.getprevious() is not None:
            del ancestor.getparent()[0]
del context
	t3db <- read_csv("t3db.csv")
	t3db$iupac_name <- gsub( "b'", "", as.character(t3db$iupac_name))
	t3db$iupac_name <- gsub( "'$", "", as.character(t3db$iupac_name))

	t3db$name <- gsub( "b'", "", as.character(t3db$name))
	t3db$name <- gsub( "'$", "", as.character(t3db$name))

	t3db$InChIKey <- gsub( "InChIKey=", "", as.character(t3db$InChIKey))
	write.csv(t3db,file = 't3dbnew.csv')
	# remove multiple <?xml version="1.0" encoding="UTF-8"?> and add <T3DB></T3DB> to make a formal format
	from io import StringIO
	from lxml import etree
	import csv
	xml = 'toxins.xml'

	context = etree.iterparse(xml, tag='compound')

	csvfile = open('t3db.csv', 'w')
	fieldnames = ['accession', 'monisotopic_molecular_weight', 'iupac_name', 'name', 'chemical_formula', 'InChIKey', 'cas_registry_number', 'hmdb', 'kegg', 'omim', 'biocyc', 'pubchem', 'chemspider', 'smiles', 'kingdom', 'direct_parent', 'super_class', 'class', 'sub_class', 'molecular_framework','origin']
	writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
	writer.writeheader()

	for event, elem in context:

	accession = elem.xpath('accession/text()')[0]
	try:
	monisotopic_molecular_weight = elem.xpath('monisotopic_moleculate_weight/text()')[0]
	except:
	monisotopic_molecular_weight = 'NA'
	try:
	iupac_name = elem.xpath('iupac_name/text()')[0].encode('utf-8')
	except:
	iupac_name = 'NA'
	try:
	name = elem.xpath('common_name/text()')[0].encode('utf-8')
	except:
	name = 'NA'
	try:
	chemical_formula = elem.xpath('chemical_formula/text()')[0]
	except:
	chemical_formula = 'NA'

	try:
	inchikey = elem.xpath('inchikey/text()')[0]
	except:
	inchikey = 'NA'

	try:
	cas_registry_number = elem.xpath('cas_registry_number/text()')[0]
	except:
	cas_registry_number = 'NA'
	try:
	hmdb = elem.xpath('hmdb_id/text()')[0]
	except:
	hmdb = 'NA'
	try:
	kegg = elem.xpath('kegg_id/text()')[0]
	except:
	kegg = 'NA'
	try:
	omim = elem.xpath('omim_id/text()')[0]
	except:
	omim = 'NA'
	try:
	biocyc = elem.xpath('biocyc_id/text()')[0]
	except:
	biocyc = 'NA'
	try:
	pubchem = elem.xpath('pubchem_compound_id/text()')[0]
	except:
	pubchem = 'NA'
	try:
	chemspider = elem.xpath('chemspider_id/text()')[0]
	except:
	chemspider = 'NA'
	try:
	smiles = elem.xpath('smiles/text()')[0]
	except:
	smiles = 'NA'
	try:
	kingdom = elem.xpath('taxonomy/kingdom/text()')[0]
	except:
	kingdom = 'NA'
	try:
	direct_parent = elem.xpath('taxonomy/direct_parent/text()')[0]
	except:
	direct_parent = 'NA'
	try:
	super_class = elem.xpath('taxonomy/super_class/text()')[0]
	except:
	super_class = 'NA'
	try:
	classorg = elem.xpath('taxonomy/class/text()')[0]
	except:
	classorg = 'NA'
	try:
	sub_class = elem.xpath('taxonomy/sub_class/text()')[0]
	except:
	sub_class = 'NA'
	try:
	molecular_framework = elem.xpath('taxonomy/molecular_framework/text()')[0]
	except:
	molecular_framework = 'NA'
	try:
	origin = elem.xpath('origin/text()')[0]
	except:
	origin = 'NA'

	writer.writerow({'accession': accession, 'monisotopic_molecular_weight': monisotopic_molecular_weight, 'iupac_name': iupac_name, 'name': name, 'chemical_formula': chemical_formula, 'InChIKey': inchikey, 'cas_registry_number': cas_registry_number, 'hmdb': hmdb, 'kegg': kegg, 'omim': omim, 'biocyc': biocyc, 'pubchem': pubchem, 'chemspider': chemspider, 'smiles': smiles, 'kingdom': kingdom, 'direct_parent': direct_parent, 'super_class': super_class, 'class': classorg, 'sub_class': sub_class, 'molecular_framework': molecular_framework, 'origin':origin})
	# It's safe to call clear() here because no descendants will be
	# accessed
	elem.clear()
	# Also eliminate now-empty references from the root node to elem
	for ancestor in elem.xpath('ancestor-or-self::*'):
	while ancestor.getprevious() is not None:
	del ancestor.getparent()[0]
	del context