Skip to content

Instantly share code, notes, and snippets.

@ravila4
Created March 8, 2019 04:03
Show Gist options
  • Save ravila4/302b9d450a212c095d6bd6e8f55bd4e9 to your computer and use it in GitHub Desktop.
Save ravila4/302b9d450a212c095d6bd6e8f55bd4e9 to your computer and use it in GitHub Desktop.
Python script that parses an XML database dump from DrugBank and extracts experimental and calculated logP values
import xmltodict
import pandas as pd
# Input/output locations and CSV header, kept together so they are easy to change.
INPUT_PATH = "full_database.xml"
OUTPUT_PATH = "logp_values.csv"
COLUMNS = ["DrugBankID", "SMILES", "expLogP", "calcLogP"]


def _as_list(node):
    """Return *node* as a list.

    xmltodict yields a bare value for an element that occurs once and a list
    when it repeats, so callers normalize before iterating. ``None`` (an empty
    or missing element) becomes an empty list.
    """
    if node is None:
        return []
    if isinstance(node, list):
        return node
    return [node]


def _primary_id(id_node):
    """Return the DrugBank ID text for a ``drugbank-id`` node, or ``None``.

    The node may be a single entry or a list of entries. Each entry is either
    a plain string or a dict with the text under ``"#text"`` (xmltodict's
    representation of an element that carries attributes). An entry whose
    ``"@primary"`` attribute is ``"true"`` is preferred; otherwise the first
    entry with usable text is returned.
    NOTE(review): the ``primary="true"`` attribute is assumed from the
    DrugBank export format -- confirm against the schema in use.
    """
    entries = _as_list(id_node)
    for entry in entries:
        if isinstance(entry, dict) and entry.get("@primary") == "true":
            return entry.get("#text")
    for entry in entries:
        if isinstance(entry, dict):
            if entry.get("#text"):
                return entry["#text"]
        elif isinstance(entry, str):
            return entry
    return None


def _properties(section):
    """Return the list of property dicts inside a ``*-properties`` section."""
    if not isinstance(section, dict):
        return []
    return [p for p in _as_list(section.get("property")) if isinstance(p, dict)]


def _last_value(properties, kind):
    """Return the ``value`` of the last property whose ``kind`` matches.

    The last match wins, mirroring the original loop, which kept overwriting
    the variable on every match. Returns ``None`` when nothing matches.
    """
    found = None
    for prop in properties:
        if prop.get("kind") == kind:
            found = prop.get("value")
    return found


def extract_logp_rows(drugs):
    """Build ``(DrugBankID, SMILES, expLogP, calcLogP)`` tuples.

    Only drugs that have an experimental logP are emitted. SMILES and the
    calculated logP are looked up per drug and are ``None`` when absent, so a
    value from a previous drug can never leak into the current row.

    Args:
        drugs: The ``drug`` node of the parsed document: a list of drug
            dicts, a single drug dict, or ``None``.

    Returns:
        A list of 4-tuples in the column order of ``COLUMNS``.
    """
    rows = []
    for drug in _as_list(drugs):
        if not isinstance(drug, dict):
            continue
        drug_id = _primary_id(drug.get("drugbank-id"))
        if drug_id is None:
            continue
        exp_logp = _last_value(
            _properties(drug.get("experimental-properties")), "logP"
        )
        if exp_logp is None:
            continue
        calculated = _properties(drug.get("calculated-properties"))
        smiles = _last_value(calculated, "SMILES")
        calc_logp = _last_value(calculated, "logP")
        rows.append((drug_id, smiles, exp_logp, calc_logp))
    return rows


def main():
    """Parse the DrugBank XML dump and write the logP table as CSV."""
    with open(INPUT_PATH, encoding="utf-8") as db:
        doc = xmltodict.parse(db.read())
    rows = extract_logp_rows(doc["drugbank"]["drug"])
    # Passing columns to the constructor also works when no rows were found;
    # assigning .columns afterwards fails on an empty frame.
    values_df = pd.DataFrame(rows, columns=COLUMNS)
    values_df.to_csv(OUTPUT_PATH, index=False)


if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment