Skip to content

Instantly share code, notes, and snippets.

@sgsfak
Last active December 23, 2018 08:11
Show Gist options
  • Star 4 You must be signed in to star a gist
  • Fork 3 You must be signed in to fork a gist
  • Save sgsfak/ed87e19a71f09c7517d2 to your computer and use it in GitHub Desktop.
Save sgsfak/ed87e19a71f09c7517d2 to your computer and use it in GitHub Desktop.
Parse DrugBank XML file with xmlstarlet
--
-- PostgreSQL database dump
--
-- Dumped from database version 9.5.2
-- Dumped by pg_dump version 9.5.2
SET statement_timeout = 0;
SET lock_timeout = 0;
SET client_encoding = 'UTF8';
SET standard_conforming_strings = on;
SET check_function_bodies = false;
SET client_min_messages = warning;
SET row_security = off;
CREATE TABLE atc_drugs (
drug_id character varying(10),
atc_code character varying(10)
);
CREATE TABLE drug_brands (
drug_id character varying(10),
brand text,
company text
);
CREATE TABLE drug_interactions (
drug_id1 character varying(10),
drug_id2 character varying(10),
name text,
description text
);
CREATE TABLE drugs (
drug_id character varying(10) NOT NULL,
name text,
description text
);
CREATE TABLE drug_dosages (
drug_id character varying(10) NOT NULL REFERENCES drugs(drug_id),
form text,
route text,
strength text
);
CREATE INDEX drug_dosages_id on drug_dosages(drug_id);
CREATE VIEW drug_ints_vw AS
SELECT di.drug_id1 AS drug_id,
array_agg(di.drug_id2 ORDER BY di.drug_id2) AS drug_ids2,
array_agg(di.name ORDER BY di.drug_id2) AS nmes,
array_agg(di.description ORDER BY di.drug_id2) AS descriptions
FROM (drug_interactions di
JOIN drugs ON (((di.drug_id2)::text = (drugs.drug_id)::text)))
GROUP BY di.drug_id1;
CREATE TABLE drug_synonyms (
drug_id character varying(10),
synonym text
);
CREATE MATERIALIZED VIEW drugs_ftsearch AS
SELECT drugs.drug_id,
to_tsvector('english'::regconfig, concat(drugs.name, ' ', b.brands, ' ', s.synonyms)) AS ftext
FROM ((drugs
LEFT JOIN ( SELECT drug_brands.drug_id,
string_agg(drug_brands.brand, ' '::text) AS brands
FROM drug_brands
GROUP BY drug_brands.drug_id) b USING (drug_id))
LEFT JOIN ( SELECT drug_synonyms.drug_id,
string_agg(drug_synonyms.synonym, ' '::text) AS synonyms
FROM drug_synonyms
GROUP BY drug_synonyms.drug_id) s USING (drug_id))
WITH NO DATA;
CREATE MATERIALIZED VIEW drugs_name_ftsearch AS
SELECT drugs.drug_id,
to_tsvector('english'::regconfig, drugs.name) AS name_ftext
FROM drugs
WITH NO DATA;
CREATE VIEW drugs_vw AS
SELECT drugs.drug_id,
drugs.name,
a.atc_codes,
b.brands,
s.synonyms,
drugs.description
FROM (((drugs
LEFT JOIN ( SELECT atc_drugs.drug_id,
array_agg(atc_drugs.atc_code) AS atc_codes
FROM atc_drugs
GROUP BY atc_drugs.drug_id) a USING (drug_id))
LEFT JOIN ( SELECT drug_brands.drug_id,
array_agg(drug_brands.brand) AS brands
FROM drug_brands
GROUP BY drug_brands.drug_id) b USING (drug_id))
LEFT JOIN ( SELECT drug_synonyms.drug_id,
array_agg(drug_synonyms.synonym) AS synonyms
FROM drug_synonyms
GROUP BY drug_synonyms.drug_id) s USING (drug_id));
ALTER TABLE ONLY drugs
ADD CONSTRAINT drugs_pkey PRIMARY KEY (drug_id);
CREATE INDEX atc_drugs_atc_code_idx ON atc_drugs USING btree (atc_code);
CREATE INDEX atc_drugs_drug_id_idx ON atc_drugs USING btree (drug_id);
CREATE INDEX drug_brands_drug_id_idx ON drug_brands USING btree (drug_id);
CREATE INDEX drug_interactions_drug_id1_idx ON drug_interactions USING btree (drug_id1);
CREATE INDEX drug_interactions_drug_id2_idx ON drug_interactions USING hash (drug_id2);
CREATE INDEX drug_synonyms_drug_id_idx ON drug_synonyms USING btree (drug_id);
CREATE INDEX drugs_ftsearch_ftext_idx ON drugs_ftsearch USING gin (ftext);
CREATE INDEX drugs_name_ftsearch_name_ftext_idx ON drugs_name_ftsearch USING gin (name_ftext);
ALTER TABLE ONLY atc_drugs
ADD CONSTRAINT atc_drugs_drug_id_fkey FOREIGN KEY (drug_id) REFERENCES drugs(drug_id);
ALTER TABLE ONLY drug_brands
ADD CONSTRAINT drug_brands_drug_id_fkey FOREIGN KEY (drug_id) REFERENCES drugs(drug_id);
ALTER TABLE ONLY drug_interactions
ADD CONSTRAINT drug_interactions_drug_id1_fkey FOREIGN KEY (drug_id1) REFERENCES drugs(drug_id);
ALTER TABLE ONLY drug_synonyms
ADD CONSTRAINT drug_synonyms_drug_id_fkey FOREIGN KEY (drug_id) REFERENCES drugs(drug_id);
all : drugs.tsv atc_drugs.tsv drug_interactions.tsv drug_synonyms.tsv drug_brands.tsv drug_dosages.tsv
drugbank.xml:
curl -L -o drugbank.xml.zip -u EMAIL:PASSWORD http://www.drugbank.ca/releases/5-0-1/downloads/all-full-database
rm -f drugbank.xml ; unzip -p drugbank.xml.zip > $@
drugs.tsv: drugbank.xml
echo 'drug_id name description' > $@
xmlstarlet sel -T -N t=http://www.drugbank.ca -t -m 't:drugbank/t:drug/t:drugbank-id[@primary="true"]' -v 'concat(.," ", ../t:name, " ", normalize-space(../t:description))' -n $< >> $@
atc_drugs.tsv: drugbank.xml
echo 'drug_id atc_code' > $@
xmlstarlet sel -T -N t=http://www.drugbank.ca -t -m 't:drugbank/t:drug/t:atc-codes/t:atc-code' -v 'concat(../../t:drugbank-id," ", ./@code)' -n $< >> $@
drug_interactions.tsv: drugbank.xml
echo 'drug_id1 drug_id2 name description' > $@
xmlstarlet sel -T -N t=http://www.drugbank.ca -t -m 't:drugbank/t:drug/t:drug-interactions/t:drug-interaction' -v 'concat(../../t:drugbank-id," ", t:drugbank-id, " ", normalize-space(t:name), " ", normalize-space(t:description))' -n $< >> $@
drug_synonyms.tsv: drugbank.xml
echo 'drug_id synonym' > $@
xmlstarlet sel -T -N t=http://www.drugbank.ca -t -m 't:drugbank/t:drug/t:synonyms/t:synonym' -v 'concat(../../t:drugbank-id," ", normalize-space(.))' -n $< >> $@
drug_brands.tsv: drugbank.xml
echo 'drug_id brand company' > $@
xmlstarlet sel -T -N t=http://www.drugbank.ca -t -m 't:drugbank/t:drug/t:international-brands/t:international-brand' -v 'concat(../../t:drugbank-id," ", normalize-space(.), " ", ./@company)' -n $< >> $@
drug_dosages.tsv: drugbank.xml
echo 'drug_id form route strength' > $@
xmlstarlet sel -T -N t=http://www.drugbank.ca -t -m 't:drugbank/t:drug/t:dosages/t:dosage' -v 'concat(../../t:drugbank-id," ", normalize-space(t:form)," ", normalize-space(t:route), " ", normalize-space(t:strength))' -n $< >> $@
db_insert : drugs.tsv atc_drugs.tsv drug_interactions.tsv drug_synonyms.tsv drug_brands.tsv drug_dosages.tsv
psql -c "TRUNCATE drugs, atc_drugs, drug_interactions, drug_synonyms, drug_brands, drug_dosages" drugbank
psql -c "COPY drugs from '`pwd`/drugs.tsv' with (format csv, delimiter ' ', header true, quote '|')" drugbank
psql -c "COPY atc_drugs from '`pwd`/atc_drugs.tsv' with (format csv, delimiter ' ', header true, quote '|')" drugbank
psql -c "COPY drug_interactions from '`pwd`/drug_interactions.tsv' with (format csv, delimiter ' ', header true, quote '|')" drugbank
psql -c "COPY drug_synonyms from '`pwd`/drug_synonyms.tsv' with (format csv, delimiter ' ', header true, quote '|')" drugbank
psql -c "COPY drug_brands from '`pwd`/drug_brands.tsv' with (format csv, delimiter ' ', header true, quote '|')" drugbank
psql -c "COPY drug_dosages from '`pwd`/drug_dosages.tsv' with (format csv, delimiter ' ', header true, quote '|')" drugbank
psql -c "REFRESH MATERIALIZED VIEW drugs_name_ftsearch" drugbank
psql -c "REFRESH MATERIALIZED VIEW drugs_ftsearch" drugbank
.PHONY: all db_insert
@sgsfak
Copy link
Author

sgsfak commented May 16, 2016

As explained at http://www.drugbank.ca/releases/latest currently in order to access the DrugBank downloads you need to signup for a free DrugBank account. Once you have signed up you will be able to download the complete database manually or programatically (for example through an application or script). Please update the Makefile to replace your email and password in the Makefile in order for this to work.

@mauricioromero86
Copy link

mauricioromero86 commented Feb 2, 2017

@sgsfak, thanks. I'm trying to use this but I'm new and just trying to get a simple crosswalk between TDD ID, DrugBankID and ATC code. I'm trying to run your code in ubuntu but when I run the script in the terminal it doesn't work. I'm unsure on what system requirements may be needed to run this.
Thanks in advance.

@sgsfak
Copy link
Author

sgsfak commented Apr 4, 2017

@mauricioromero86 The requirements are a recent version of Linux (Ubuntu "Trusty" (14.04) is fine), PostgreSQL >= 9.5, curl, and xmlstarlet. For the latter sudo apt-get curl xmlstarlet should be enough.

PostgreSQL is not really needed if you just want the CSV (*.tsv) files.

@closedone
Copy link

Hi, how to use the 'Makefile' script ?

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment