Last active
July 28, 2023 06:40
-
-
Save rcdelacruz/c7e171d6dbc7ede54f3e1db59b69d1bc to your computer and use it in GitHub Desktop.
Import DrugBank to SQLite
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
######################################## | |
## Import of DrugBank Annotation Data ## | |
######################################## | |
## Function to import a DrugBank XML file into a data.frame.
## The last step may give an error; to help with debugging, the function
## below processes the entries one at a time.
## Note: this function needs major speed improvements. Ideally, it should
## be replaced with a standard XML import method.
## (1) Download | |
## - download DrugBank XML (https://www.drugbank.ca/releases/latest) | |
## - name uncompressed file 'drugbank.xml' | |
## (2) Function to convert XML to data.frame
## - to ease debugging, the function converts each entry separately in a loop
## Convert a DrugBank XML file into a data.frame with one row per entry.
##
## Arguments:
##   xmlfile  Path to the (uncompressed) DrugBank XML file.
##
## Value:
##   A data.frame of character columns with one row per child node of the
##   XML root and one column per distinct field name found across all
##   entries. Fields that an entry does not provide are NA.
##
## Errors:
##   Stops with an explicit message when the 'XML' package is not installed.
dbxml2df <- function(xmlfile) {
  ## Fail early and clearly; require() would only warn and return FALSE.
  if (!requireNamespace("XML", quietly = TRUE)) {
    stop("Package 'XML' is required by dbxml2df(); please install it.",
         call. = FALSE)
  }
  myxml <- XML::xmlParse(file = xmlfile)
  rootnode <- XML::xmlRoot(myxml)
  rootsize <- XML::xmlSize(rootnode)

  ## (a) Convert every entry exactly once and keep its first row as a named
  ##     character vector; column names and values are both taken from it.
  cat("\n", "Extracting column names. This may take some time.", "\n")
  entries <- lapply(seq_len(rootsize), function(i) {
    tmp <- XML::xmlToDataFrame(rootnode[i], stringsAsFactors = FALSE)
    ## NOTE(review): the NA-named columns dropped here presumably come from
    ## entries that repeat a child element name; in that case
    ## xmlToDataFrame() may label values positionally -- verify the column
    ## alignment for such entries.
    ## drop = FALSE keeps a data.frame even when a single column remains.
    tmp <- tmp[, !is.na(colnames(tmp)), drop = FALSE]
    if (nrow(tmp) == 0L || ncol(tmp) == 0L) {
      return(character(0))
    }
    v <- vapply(tmp, function(column) as.character(column[1L]), character(1))
    names(v) <- colnames(tmp)
    v
  })
  mycol <- unique(unlist(lapply(entries, names), use.names = FALSE))
  mycol <- as.character(mycol[!is.na(mycol)])
  cat("\n", "The following columns were identified:", "\n", mycol, "\n")

  ## (b) Inject the cached values into a preallocated character matrix;
  ##     indexing by name yields NA for fields an entry does not have.
  cat("\n", "Extracting data for column names. This may take some time.", "\n")
  values <- matrix(NA_character_, nrow = rootsize, ncol = length(mycol),
                   dimnames = list(seq_len(rootsize), mycol))
  if (length(mycol) > 0L) {
    for (i in seq_len(rootsize)) {
      values[i, ] <- entries[[i]][mycol]
    }
  }
  as.data.frame(values, stringsAsFactors = FALSE)
}
## Usage: convert the DrugBank XML file and save the resulting table as CSV.
dbdf <- dbxml2df("data/drugbank.xml")
write.csv(
  x = dbdf,
  file = "C:/Users/rcdel/Downloads/drugbank_full_June 7 2023.csv",
  row.names = FALSE
)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment