Skip to content

Instantly share code, notes, and snippets.

@samuell
Forked from sgsfak/Drugbank parse to DB
Created March 2, 2018 20:49
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save samuell/d98e23c80273a7042ba8a861cb748022 to your computer and use it in GitHub Desktop.
Save samuell/d98e23c80273a7042ba8a861cb748022 to your computer and use it in GitHub Desktop.
Parse DrugBank XML file with xmlstarlet
--
-- PostgreSQL database dump
--
-- Dumped from database version 9.5.2
-- Dumped by pg_dump version 9.5.2
SET statement_timeout = 0;
SET lock_timeout = 0;
SET client_encoding = 'UTF8';
SET standard_conforming_strings = on;
SET check_function_bodies = false;
SET client_min_messages = warning;
SET row_security = off;
CREATE TABLE atc_drugs (
drug_id character varying(10),
atc_code character varying(10)
);
CREATE TABLE drug_brands (
drug_id character varying(10),
brand text,
company text
);
CREATE TABLE drug_interactions (
drug_id1 character varying(10),
drug_id2 character varying(10),
name text,
description text
);
CREATE TABLE drugs (
drug_id character varying(10) NOT NULL,
name text,
description text
);
CREATE TABLE drug_dosages (
drug_id character varying(10) NOT NULL REFERENCES drugs(drug_id),
form text,
route text,
strength text
);
CREATE INDEX drug_dosages_id on drug_dosages(drug_id);
CREATE VIEW drug_ints_vw AS
SELECT di.drug_id1 AS drug_id,
array_agg(di.drug_id2 ORDER BY di.drug_id2) AS drug_ids2,
array_agg(di.name ORDER BY di.drug_id2) AS nmes,
array_agg(di.description ORDER BY di.drug_id2) AS descriptions
FROM (drug_interactions di
JOIN drugs ON (((di.drug_id2)::text = (drugs.drug_id)::text)))
GROUP BY di.drug_id1;
CREATE TABLE drug_synonyms (
drug_id character varying(10),
synonym text
);
CREATE MATERIALIZED VIEW drugs_ftsearch AS
SELECT drugs.drug_id,
to_tsvector('english'::regconfig, concat(drugs.name, ' ', b.brands, ' ', s.synonyms)) AS ftext
FROM ((drugs
LEFT JOIN ( SELECT drug_brands.drug_id,
string_agg(drug_brands.brand, ' '::text) AS brands
FROM drug_brands
GROUP BY drug_brands.drug_id) b USING (drug_id))
LEFT JOIN ( SELECT drug_synonyms.drug_id,
string_agg(drug_synonyms.synonym, ' '::text) AS synonyms
FROM drug_synonyms
GROUP BY drug_synonyms.drug_id) s USING (drug_id))
WITH NO DATA;
CREATE MATERIALIZED VIEW drugs_name_ftsearch AS
SELECT drugs.drug_id,
to_tsvector('english'::regconfig, drugs.name) AS name_ftext
FROM drugs
WITH NO DATA;
CREATE VIEW drugs_vw AS
SELECT drugs.drug_id,
drugs.name,
a.atc_codes,
b.brands,
s.synonyms,
drugs.description
FROM (((drugs
LEFT JOIN ( SELECT atc_drugs.drug_id,
array_agg(atc_drugs.atc_code) AS atc_codes
FROM atc_drugs
GROUP BY atc_drugs.drug_id) a USING (drug_id))
LEFT JOIN ( SELECT drug_brands.drug_id,
array_agg(drug_brands.brand) AS brands
FROM drug_brands
GROUP BY drug_brands.drug_id) b USING (drug_id))
LEFT JOIN ( SELECT drug_synonyms.drug_id,
array_agg(drug_synonyms.synonym) AS synonyms
FROM drug_synonyms
GROUP BY drug_synonyms.drug_id) s USING (drug_id));
ALTER TABLE ONLY drugs
ADD CONSTRAINT drugs_pkey PRIMARY KEY (drug_id);
CREATE INDEX atc_drugs_atc_code_idx ON atc_drugs USING btree (atc_code);
CREATE INDEX atc_drugs_drug_id_idx ON atc_drugs USING btree (drug_id);
CREATE INDEX drug_brands_drug_id_idx ON drug_brands USING btree (drug_id);
CREATE INDEX drug_interactions_drug_id1_idx ON drug_interactions USING btree (drug_id1);
CREATE INDEX drug_interactions_drug_id2_idx ON drug_interactions USING hash (drug_id2);
CREATE INDEX drug_synonyms_drug_id_idx ON drug_synonyms USING btree (drug_id);
CREATE INDEX drugs_ftsearch_ftext_idx ON drugs_ftsearch USING gin (ftext);
CREATE INDEX drugs_name_ftsearch_name_ftext_idx ON drugs_name_ftsearch USING gin (name_ftext);
ALTER TABLE ONLY atc_drugs
ADD CONSTRAINT atc_drugs_drug_id_fkey FOREIGN KEY (drug_id) REFERENCES drugs(drug_id);
ALTER TABLE ONLY drug_brands
ADD CONSTRAINT drug_brands_drug_id_fkey FOREIGN KEY (drug_id) REFERENCES drugs(drug_id);
ALTER TABLE ONLY drug_interactions
ADD CONSTRAINT drug_interactions_drug_id1_fkey FOREIGN KEY (drug_id1) REFERENCES drugs(drug_id);
ALTER TABLE ONLY drug_synonyms
ADD CONSTRAINT drug_synonyms_drug_id_fkey FOREIGN KEY (drug_id) REFERENCES drugs(drug_id);
all : drugs.tsv atc_drugs.tsv drug_interactions.tsv drug_synonyms.tsv drug_brands.tsv drug_dosages.tsv
drugbank.xml:
curl -L -o drugbank.xml.zip -u EMAIL:PASSWORD http://www.drugbank.ca/releases/5-0-1/downloads/all-full-database
rm -f drugbank.xml ; unzip -p drugbank.xml.zip > $@
drugs.tsv: drugbank.xml
echo 'drug_id name description' > $@
xmlstarlet sel -T -N t=http://www.drugbank.ca -t -m 't:drugbank/t:drug/t:drugbank-id[@primary="true"]' -v 'concat(.," ", ../t:name, " ", normalize-space(../t:description))' -n $< >> $@
atc_drugs.tsv: drugbank.xml
echo 'drug_id atc_code' > $@
xmlstarlet sel -T -N t=http://www.drugbank.ca -t -m 't:drugbank/t:drug/t:atc-codes/t:atc-code' -v 'concat(../../t:drugbank-id," ", ./@code)' -n $< >> $@
drug_interactions.tsv: drugbank.xml
echo 'drug_id1 drug_id2 name description' > $@
xmlstarlet sel -T -N t=http://www.drugbank.ca -t -m 't:drugbank/t:drug/t:drug-interactions/t:drug-interaction' -v 'concat(../../t:drugbank-id," ", t:drugbank-id, " ", normalize-space(t:name), " ", normalize-space(t:description))' -n $< >> $@
drug_synonyms.tsv: drugbank.xml
echo 'drug_id synonym' > $@
xmlstarlet sel -T -N t=http://www.drugbank.ca -t -m 't:drugbank/t:drug/t:synonyms/t:synonym' -v 'concat(../../t:drugbank-id," ", normalize-space(.))' -n $< >> $@
drug_brands.tsv: drugbank.xml
echo 'drug_id brand company' > $@
xmlstarlet sel -T -N t=http://www.drugbank.ca -t -m 't:drugbank/t:drug/t:international-brands/t:international-brand' -v 'concat(../../t:drugbank-id," ", normalize-space(.), " ", ./@company)' -n $< >> $@
drug_dosages.tsv: drugbank.xml
echo 'drug_id form route strength' > $@
xmlstarlet sel -T -N t=http://www.drugbank.ca -t -m 't:drugbank/t:drug/t:dosages/t:dosage' -v 'concat(../../t:drugbank-id," ", normalize-space(t:form)," ", normalize-space(t:route), " ", normalize-space(t:strength))' -n $< >> $@
db_insert : drugs.tsv atc_drugs.tsv drug_interactions.tsv drug_synonyms.tsv drug_brands.tsv drug_dosages.tsv
psql -c "TRUNCATE drugs, atc_drugs, drug_interactions, drug_synonyms, drug_brands, drug_dosages" drugbank
psql -c "COPY drugs from '`pwd`/drugs.tsv' with (format csv, delimiter ' ', header true, quote '|')" drugbank
psql -c "COPY atc_drugs from '`pwd`/atc_drugs.tsv' with (format csv, delimiter ' ', header true, quote '|')" drugbank
psql -c "COPY drug_interactions from '`pwd`/drug_interactions.tsv' with (format csv, delimiter ' ', header true, quote '|')" drugbank
psql -c "COPY drug_synonyms from '`pwd`/drug_synonyms.tsv' with (format csv, delimiter ' ', header true, quote '|')" drugbank
psql -c "COPY drug_brands from '`pwd`/drug_brands.tsv' with (format csv, delimiter ' ', header true, quote '|')" drugbank
psql -c "COPY drug_dosages from '`pwd`/drug_dosages.tsv' with (format csv, delimiter ' ', header true, quote '|')" drugbank
psql -c "REFRESH MATERIALIZED VIEW drugs_name_ftsearch" drugbank
psql -c "REFRESH MATERIALIZED VIEW drugs_ftsearch" drugbank
.PHONY: all db_insert
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment