Last active
January 13, 2024 06:40
-
-
Save rosherbal/56461421c69a8a7da775336c95fa62e0 to your computer and use it in GitHub Desktop.
Extract information from DrugBank xml file
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
DRUG | |
->drugbank-id | |
->drugbank-id | |
->drugbank-id | |
->name | |
->description | |
->cas-number (5) | |
->unii | |
->state | |
->groups | |
->group | |
->general-references | |
->articles | |
->article | |
->textbooks | |
->textbook | |
->links | |
->link | |
->attachments | |
->synthesis-reference (10) | |
-> | |
->indication | |
->pharmacodynamics | |
->mechanism-of-action | |
->toxicity | |
->metabolism (15) | |
->absorption | |
->half-life | |
->protein-binding | |
-> | |
->route-of-elimination | |
->volume-of-distribution (20) | |
->clearance | |
->classification | |
->description | |
->direct-parent | |
->kingdom | |
->superclass | |
->class | |
->subclass | |
->salts | |
-> | |
->synonyms | |
->synonym | |
->products (25) | |
->product | |
->name | |
->labeller | |
->ndc-id | |
->ndc-product-code | |
->dpd-id | |
->ema-product-code | |
->ema-ma-number | |
->started-marketing-on | |
->ended-marketing-on | |
->dosage-form | |
->strength | |
->route | |
->fda-application-number | |
->generic | |
->over-the-counter | |
->approved | |
->country | |
->source | |
->international-brands | |
-> | |
->mixtures | |
->mixture | |
->name | |
->ingredients | |
->packagers | |
->packager | |
->name | |
->url | |
->manufacturers | |
->manufacturer | |
->prices (30) | |
->price | |
->description | |
->cost | |
->unit | |
->categories | |
->category | |
->category | |
->mesh-id | |
->affected-organisms | |
->affected-organism | |
->dosages | |
->dosage | |
->form | |
->route | |
->strength | |
->atc-codes | |
->atc-code | |
->level | |
->ahfs-codes (35) | |
->ahfs-code | |
->pdb-entries | |
->pdb-entry | |
->fda-label | |
->msds | |
->patents | |
->patent | |
->number | |
->country | |
->approved | |
->expires | |
->pediatric-extension | |
->food-interactions (40) | |
->food-interaction | |
->drug-interactions | |
->drug-interaction | |
->drugbank-id | |
->name | |
->description | |
->sequences | |
->sequence | |
->experimental-properties | |
->property | |
->kind | |
->value | |
->source | |
->external-identifiers | |
->external-identifier | |
->resource (DPD, PubChem, KEGG Drug, PharmGKB, UniProtKB, Therapeutic Targets Database, Wikipedia, ChEMBL) | |
->identifier | |
->external-links (45) | |
->external-link | |
->resource (RxList, Drugs.com) | |
->identifier | |
->pathways | |
->pathway | |
->smpdb-id | |
->name | |
->category | |
->drugs | |
->drug | |
->drugbank-id | |
->name | |
->enzymes | |
->uniprot-id | |
->reactions | |
->reaction | |
->snp-effects | |
->snp-effect | |
->snp-adverse-drug-reactions | |
->snp-adverse-drug-reaction | |
->targets | |
->target | |
->id | |
->name | |
->organism | |
->actions | |
->action | |
->references | |
->articles | |
->article | |
->textbooks | |
->textbook | |
->links | |
->link | |
->attachments | |
->known-action (5) | |
->polypeptide | |
->name | |
->general-function | |
->specific-function | |
->gene-name | |
->locus | |
->cellular-location (5) | |
->transmembrane-regions | |
-> | |
->signal-regions | |
->theoretical-pi | |
->molecular-weight | |
->chromosome-location (10) | |
->organism | |
->external-identifiers (HGNC,GenAtlas,GenBank Gene Database,GenBank Protein Database,Guide to Pharmacology,UniProtKB,UniProt Accession) | |
->external-identifier | |
->resource | |
->identifier | |
->synonyms | |
->synonym | |
->amino-acid-sequence | |
->gene-sequence (15) | |
->pfams | |
->pfam | |
->identifier | |
->name | |
->go-classifiers | |
->go-classifier | |
->category | |
->description | |
->enzymes | |
->enzyme | |
->carriers | |
->carrier | |
->transporters | |
->transporter |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
""" | |
TITLE :parsing_DrugBank | |
AUTHOR : Hernansaiz Ballesteros, Rosa. | |
rosa.hernansaiz@bioquant.uni-heidelberg.de | |
DESCRIPTION : Parsing full database of DrugBank to extract information | |
about drugs that affect a specific organism. | |
Information retrieved: | |
- name | |
- synonyms | |
- classification (kingdom and superclass) | |
- drug-interactions (with other drugs) | |
- external-identifiers (to connect to other sources) | |
- pathways | |
- targets (if polypeptides) | |
- target_name | |
- target_uniprot | |
- target_gene_name | |
- action (of the drug over the target) | |
- cell_loc (cellular location) | |
To get the tree structure of the xml file, | |
see drugBank_tree_structure.txt | |
LICENSE : GPL-v3 | |
""" | |
# Classes # | |
class Drug:
    """
    Drug-level record built from the feature dictionary of one DrugBank entry.

    ``features`` must provide the keys 'id', 'name', 'synm', 'kgd', 'sclass',
    'itrc', 'ext_id' and 'pathways'; a missing key raises KeyError.
    The ``target`` list starts empty and is filled through addTarget().
    """

    # (attribute name, key read from ``features``, key used when exporting)
    _FIELDS = (
        ('id', 'id', 'dg_id'),
        ('name', 'name', 'dg_name'),
        ('synonyms', 'synm', 'dg_synm'),
        ('kingdom', 'kgd', 'dg_kingdom'),
        ('superclass', 'sclass', 'dg_superclass'),
        ('interaction', 'itrc', 'dg_interactions'),
        ('external_id', 'ext_id', 'dg_ext_id'),
        ('pathways', 'pathways', 'dg_pathways'),
    )

    def __init__(self, features):
        # Copy every expected feature onto the instance, in declaration order.
        for attribute, source_key, _ in self._FIELDS:
            setattr(self, attribute, features[source_key])
        # Each instance owns its own list of targets.
        self.target = []

    def getDrugfeatures(self):
        """Return the drug fields as a dictionary with 'dg_'-prefixed keys."""
        return {
            export_key: getattr(self, attribute)
            for attribute, _, export_key in self._FIELDS
        }

    def addTarget(self, feature_target):
        """Append one target description to this drug's ``target`` list."""
        self.target.append(feature_target)
# Parameters and required variables # | |
# Path of the DrugBank XML file that the main script parses.
dB_file = '../00-Drugs/drugBank_v515_20200103.xml'
# Only targets whose <organism> text equals this value are exported.
organism = 'Humans'
# Path of the CSV file the resulting drug/target table is written to.
saveFile = '../00-Drugs/drugBank_v515_targetExtracted.csv'
# Main script # | |
''' | |
Get targets from drugBank database for the drugs on the ic50 file | |
''' | |
import xml.etree.ElementTree as ET | |
import time | |
from tqdm import tqdm | |
import pandas as pd | |
def _local_tag(element):
    """Return the tag of *element* without its ``{namespace}`` prefix."""
    return element.tag.rsplit('}', 1)[-1]


def _children(parent, tag):
    """Return the direct children of *parent* whose local tag is *tag*.

    *parent* may be None (element not present); the result is then empty.
    """
    if parent is None:
        return []
    return [child for child in parent if _local_tag(child) == tag]


def _first_child(parent, tag):
    """Return the first direct child named *tag*, or None if there is none."""
    matches = _children(parent, tag)
    return matches[0] if matches else None


def _child_text(parent, tag):
    """Return the text of the first direct child named *tag*, or None."""
    child = _first_child(parent, tag)
    return None if child is None else child.text


def _join(values):
    """Join the non-empty strings in *values* with ';' (None is skipped)."""
    return ';'.join(value for value in values if value)


def extract_drug_features(drug_elem):
    """Collect the drug-level fields of one ``<drug>`` element.

    Children are looked up by exact tag name, and every field is computed
    from *drug_elem* only, so nothing can leak in from a previously
    processed drug when an element is absent.

    Returns a dictionary with the keys expected by ``Drug``: 'id', 'name',
    'synm', 'kgd', 'sclass', 'itrc', 'ext_id' and 'pathways'. Scalar fields
    are None when absent; ';'-joined fields are '' when absent or empty.
    """
    classification = _first_child(drug_elem, 'classification')

    external_ids = []
    for ext_id in _children(_first_child(drug_elem, 'external-identifiers'),
                            'external-identifier'):
        resource = _child_text(ext_id, 'resource')
        identifier = _child_text(ext_id, 'identifier')
        if resource and identifier:  # skip incomplete entries
            external_ids.append(resource + ":" + identifier)

    return {
        # the first <drugbank-id> child is used as the drug's identifier
        "id": _child_text(drug_elem, 'drugbank-id'),
        "name": _child_text(drug_elem, 'name'),
        "synm": _join(
            synm.text
            for synm in _children(_first_child(drug_elem, 'synonyms'),
                                  'synonym')),
        "kgd": _child_text(classification, 'kingdom'),
        "sclass": _child_text(classification, 'superclass'),
        "itrc": _join(
            _child_text(di, 'drugbank-id')
            for di in _children(_first_child(drug_elem, 'drug-interactions'),
                                'drug-interaction')),
        "ext_id": ';'.join(external_ids),
        "pathways": _join(
            _child_text(pathway, 'name')
            for pathway in _children(_first_child(drug_elem, 'pathways'),
                                     'pathway')),
    }


def extract_target_rows(drug_elem, drug_features, organism):
    """Build one output row per target of *drug_elem* acting on *organism*.

    Parameters:
        drug_elem: the ``<drug>`` element whose ``<targets>`` are read.
        drug_features: dictionary of drug-level columns copied into each row.
        organism: a target is kept only if its ``<organism>`` text equals it.

    Returns a list of dictionaries: the drug columns followed by
    'target_name', 'target_uniprot', 'target_gene_name', 'action' and
    'cell_loc'. Polypeptide-derived columns are None when the target has no
    polypeptide or the polypeptide lacks that piece of information.
    """
    rows = []
    for target in _children(_first_child(drug_elem, 'targets'), 'target'):
        if _child_text(target, 'organism') != organism:
            continue

        # Reset per target so values never carry over from another target.
        gene_name = cell_loc = uniprot = None
        polypeptides = _children(target, 'polypeptide')
        if polypeptides:
            # When several polypeptides are listed, the last one is used.
            polypeptide = polypeptides[-1]
            gene_name = _child_text(polypeptide, 'gene-name')
            cell_loc = _child_text(polypeptide, 'cellular-location')
            for ext_id in _children(
                    _first_child(polypeptide, 'external-identifiers'),
                    'external-identifier'):
                if _child_text(ext_id, 'resource') == "UniProtKB":
                    uniprot = _child_text(ext_id, 'identifier')

        row = dict(drug_features)
        row.update({
            "target_name": _child_text(target, 'name'),
            "target_uniprot": uniprot,
            "target_gene_name": gene_name,
            "action": _join(
                action.text
                for action in _children(_first_child(target, 'actions'),
                                        'action')),
            "cell_loc": cell_loc,
        })
        rows.append(row)
    return rows


# Run the extraction only when executed as a script, so that the helper
# functions above can be imported without parsing the whole database.
if __name__ == "__main__":
    xtree = ET.parse(dB_file)
    xroot = xtree.getroot()
    drugs = list(xroot)

    drug_targets = []
    for drug_elem in tqdm(drugs):
        drug = Drug(extract_drug_features(drug_elem))
        drug_targets.extend(
            extract_target_rows(drug_elem, drug.getDrugfeatures(), organism))

    # One row per (drug, target) pair; written with the default integer index.
    dt = pd.DataFrame(drug_targets)
    dt.to_csv(saveFile)
Hello @rollingcole,
This is a general parser, so here I just give it the DrugBank file and extract all the information out of it. If you have your own list of drugs, you can filter at the beginning and only run the code on the ones you are interested in. As far as I remember, there was only Human, but I added this parameter just in case at some point the database grows beyond Human.
I hope it is of use for you!
Hello
Thank you for the code. Is it possible to extract the indication information from the file?
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Hello
Thanks for publishing this code, I am looking to make use of it as a guide to writing my own XML script. Please, Rosa, could you advise on something? I am less familiar with the object-oriented programming that I see here. If I have my own specific drugs I want to look at, I am going to edit the "drug_targets" list? Second, is specifying Organism as humans necessary? I was not aware of other organisms defined in the data base. Thanks so much.