pubmed scraper: R functions to search, download, and summarize XML records from PubMed
# required packages
library(plyr)    # rbind.fill, ldply
library(stringr) # str_pad, str_sub
library(RCurl)   # getURL
library(XML)     # xmlTreeParse, xpathApply, xpathSApply, xmlValue

#' Get a PubMed search index
#' @param query a PubMed search string
#' @return the parsed XML result of the search
#' @examples
#' # Which articles discuss the WHO FCTC?
#' pubmed_ask("FCTC OR 'Framework Convention on Tobacco Control'")
pubmed_ask <- function(query) {

  # change spaces to + and single quotes to URL-encoded double quotes (%22)
  query = gsub("'", "%22", gsub(" ", "+", query))

  # build the esearch URL (note: eutils requires HTTPS)
  query = paste("https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?db=pubmed&term=",
                query, "&usehistory=y", sep = "")

  cat("Querying PubMed on", as.character(Sys.time()), ":\n", query, "\n\n")

  # perform the search and save its history, which stores the matching PMIDs
  query = xmlTreeParse(getURL(query), asText = TRUE)

  cat(xmlValue(query[["doc"]][["eSearchResult"]][["QueryTranslation"]]), "\n\n")
  return(query)

}
#' Get the number of authors per article
#'
#' @param dir the folder containing the XML batch files
#' @param min cut results at a minimum number of articles
#' @return a data frame
#' @examples
#' # Number of authors for articles on the WHO FCTC (not run).
#' # pubmed_get("FCTC OR 'Framework Convention on Tobacco Control'", "fctc")
#' # pubmed_authors("pubmed_fctc", 0)
pubmed_authors <- function(dir, min = 0) {

  tbl = file.path(dir, dir(dir, ".xml"))
  tbl = lapply(tbl, function(x) {

    pub = xmlTreeParse(x, useInternalNodes = TRUE)

    # count the authors of each article
    tbl = xpathSApply(pub, "//PubmedArticle/MedlineCitation/Article")
    tbl = lapply(tbl, xpathApply, "AuthorList/Author/LastName")
    tbl = as.vector(sapply(tbl, length))
    tbl = table(tbl)

    data.frame(authors = names(tbl), count = as.vector(tbl), stringsAsFactors = FALSE)

  })

  # aggregate the per-batch counts
  tbl = rbind.fill(tbl)
  tbl = aggregate(count ~ authors, sum, data = tbl)
  tbl = subset(tbl, count >= min)
  tbl$authors = factor(tbl$authors, levels = 0:max(as.numeric(tbl$authors)))
  return(tbl[ order(as.numeric(tbl$authors)), ])

}
#' Get the number of articles matched by a PubMed search
#' @param query a PubMed search string or PubMed search index
#' @param max optional cap on the number of articles
#' @return the article count, as a single numeric value
#' @examples
#' # How many articles on the WHO FCTC?
#' pubmed_count("FCTC OR 'Framework Convention on Tobacco Control'")
pubmed_count <- function(query, max = 0) {

  if(is.character(query))
    query = pubmed_ask(query)

  stopifnot("XMLDocument" %in% class(query))
  n = as.numeric(xmlValue(query[["doc"]][["eSearchResult"]][["Count"]]))

  # cap the number of articles (for testing purposes; off by default)
  if(max > 0 && max < n)
    n = max

  return(n)

}
#' Get an undirected edge list of coauthors
#'
#' The weights are Newman-Fowler: each dyad formed by an article is weighted
#' inversely to the number of coauthor dyads on that article, and the weights
#' are then summed over all articles.
#' @param dir the folder containing the XML batch files
#' @return a data frame with three columns (sender, receiver, weight)
#' @examples
#' # Network of authors on the WHO FCTC (not run).
#' # pubmed_get("FCTC OR 'Framework Convention on Tobacco Control'", "fctc")
#' # n = pubmed_edges("pubmed_fctc")
#' # Plot with the network package (install first).
#' # require(network)
#' # plot(network(n[ 1:2 ], directed = FALSE))
pubmed_edges <- function(dir) {

  tbl = file.path(dir, dir(dir, ".xml"))
  tbl = lapply(tbl, function(x) {

    pub = xmlTreeParse(x, useInternalNodes = TRUE)
    tbl = xpathSApply(pub, "//PubmedArticle/MedlineCitation")
    tbl = lapply(tbl, function(x) {

      # author names: last name plus initials
      y = paste(xpathSApply(x, "Article/AuthorList/Author/LastName", xmlValue),
                xpathSApply(x, "Article/AuthorList/Author/Initials", xmlValue))

      if(length(y) > 1) {

        # unique coauthor dyads, excluding self-loops
        y = expand.grid(y, y)
        y = subset(y, Var1 != Var2)
        y = unique(apply(y, 1, function(x) paste(sort(x), collapse = ",")))
        y = ldply(strsplit(y, ","))
        y = data.frame(xpathSApply(x, "PMID", xmlValue), y, 1 / nrow(y))
        names(y) = c("pmid", "i", "j", "w")

      } else {

        y = data.frame() # single-authored article: no edges

      }
      return(y)

    })

    tbl = rbind.fill(tbl)
    write.csv(tbl, gsub("\\.xml$", ".csv", x))
    return(tbl)

  })

  tbl = rbind.fill(tbl)

  # sum the weights over unique dyads
  tbl$uid = apply(tbl[, 2:3], 1, function(x) paste(sort(x), collapse = ","))
  tbl = merge(tbl, aggregate(w ~ uid, sum, data = tbl), by = "uid")
  tbl = unique(tbl[, c("i", "j", "w.y") ])
  names(tbl)[3] = "w"
  return(tbl)

}
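
# To illustrate the weighting above: a three-author article yields three
# unique dyads, each weighted 1/3, before the weights are summed over all
# articles. A standalone sketch of the dyad-building step, with made-up
# author names (not run):
#
# y = c("Smith A", "Jones B", "Lee C")
# y = expand.grid(y, y)
# y = subset(y, Var1 != Var2) # drop self-loops
# y = unique(apply(y, 1, function(x) paste(sort(x), collapse = ",")))
# y             # 3 unique dyads
# 1 / length(y) # per-dyad weight: 1/3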
#' Search and fetch XML records from PubMed
#'
#' @param query a PubMed search string
#' @param file a string to name the batch files and folder (the "pubmed_" prefix will be prepended)
#' @param list optionally return a list of details on the data (off by default)
#' @param max optional cap on the number of articles (off by default)
#' @param k how many articles per batch (1,000 by default)
#' @examples
#' query = "FCTC OR 'Framework Convention on Tobacco Control'"
#' # Scrape approx. 230 articles on the WHO FCTC (not run).
#' # pubmed_get(query, "fctc")
#' # Scrape twice to fix possible network errors (not run).
#' # for(ii in 1:2) try(pubmed_get(query, "fctc"))
#' # Scrape safely and save data details (not run).
#' # for(ii in 1:2) FCTC = try(pubmed_get(query, "fctc", list = TRUE))
#' # Plot data details (not run).
#' # require(ggplot2)
#' # qplot(data = FCTC$years, x = year, y = count, stat = "identity", geom = "bar")
#' # qplot(data = FCTC$authors, x = authors, y = count, stat = "identity", geom = "bar")
#' # qplot(data = subset(FCTC$journals, count > 5),
#' #       x = reorder(journal, count), y = count, stat = "identity", geom = "bar") +
#' #   labs(x = NULL) +
#' #   coord_flip()
#' @references \url{http://rpsychologist.com/how-to-download-complete-xml-records-from-pubmed-and-extract-data}
pubmed_get <- function(query, file, list = FALSE, max = 0, k = 10^3) {

  stopifnot(max >= 0)
  dir = paste0("pubmed_", file)

  # where to save the files (batch folder)
  if(!file.exists(dir))
    dir.create(dir, showWarnings = FALSE)

  # batch file paths and log file path
  file = file.path(dir, file)
  log = paste0(dir, ".log")
  sink(log)

  # run the search (spaces and quotes are URL-encoded by pubmed_ask)
  q = pubmed_ask(query)

  # count the number of hits
  n = pubmed_count(q, max)

  # batch download counter
  j = 0

  # calculate how many iterations will be needed
  r = (n %/% k) + 1

  # print details on the download loop to the log file
  cat("Downloading", n, "articles in", r, "batch(es) of", k, "entries\n\n")

  # save the WebEnv string, which contains "links" to all articles in the search
  q = xmlValue(q[["doc"]][["eSearchResult"]][["WebEnv"]])

  # batch download loop
  for(i in r:1) {

    x = paste("https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=pubmed&WebEnv=",
              q, "&query_key=1&retmode=xml&retstart=", j, "&retmax=", k, sep = "")
    y = paste0(file, str_pad(i, nchar(n), pad = "0"), ".xml")

    if(!file.exists(y)) {

      # download XML based on the hits saved in the query history (WebEnv)
      z = try(getURL(x), silent = TRUE)
      if(!"try-error" %in% class(z)) {
        write(z, y) # save to the batch data folder
      } else {
        warning("Error while downloading batch file #", i)
        Sys.sleep(60) # wait a minute before looping
      }

    }

    # save the file name and timestamp to the log file
    cat(i, "\n")
    cat(x, "\nsaved to", y, as.character(file.info(y)$ctime), "\n\n")

    # increment to the next batch
    j = j + k

  }
  sink()

  # announce the final download size
  cat("Completed download from PubMed:", n, "articles",
      as.integer(sum(file.info(file.path(dir, dir(dir, "xml")))$size) / 10^6),
      "MB.\n")

  if(list)
    return(list(
      date = Sys.time(),
      search = query,
      dir = dir,
      log = log,
      ncid = q,
      count = n,
      years = pubmed_years(dir, 0),
      journals = pubmed_journals(dir, 0),
      authors = pubmed_authors(dir, 0)
    ))

}
#' Get journal titles
#'
#' @param dir the folder containing the XML batch files
#' @param min cut results at a minimum number of articles
#' @return a data frame
#' @examples
#' # Journal titles for articles on the WHO FCTC (not run).
#' # pubmed_get("FCTC OR 'Framework Convention on Tobacco Control'", "fctc")
#' # pubmed_journals("pubmed_fctc")
pubmed_journals <- function(dir, min = 0) {

  tbl = file.path(dir, dir(dir, ".xml"))
  tbl = lapply(tbl, function(x) {

    pub = xmlTreeParse(x, useInternalNodes = TRUE)
    pre = "//PubmedArticle/MedlineCitation/Article/"
    tbl = xpathSApply(pub, paste0(pre, "Journal/ISOAbbreviation"), xmlValue)
    tbl = table(tbl)

    data.frame(journal = names(tbl), count = as.vector(tbl))

  })

  tbl = rbind.fill(tbl)
  tbl = aggregate(count ~ journal, sum, data = tbl)
  tbl = subset(tbl, count >= min)
  return(tbl[ order(-tbl$count), ])

}
#' Get counts of authors (by name)
#'
#' @param dir the folder containing the XML batch files
#' @param min cut results at a minimum number of articles
#' @return a data frame
#' @examples
#' # Most frequent author names for articles on the WHO FCTC (not run).
#' # pubmed_get("FCTC OR 'Framework Convention on Tobacco Control'", "fctc")
#' # pubmed_names("pubmed_fctc", 6)
pubmed_names <- function(dir, min = 0) {

  tbl = file.path(dir, dir(dir, ".xml"))
  tbl = lapply(tbl, function(x) {

    pub = xmlTreeParse(x, useInternalNodes = TRUE)
    tbl = xpathSApply(pub, "//PubmedArticle/MedlineCitation/Article")
    last = lapply(tbl, xpathSApply, "AuthorList/Author/LastName", xmlValue)
    init = lapply(tbl, xpathSApply, "AuthorList/Author/Initials", xmlValue)
    stopifnot(length(unlist(last)) == length(unlist(init)))

    # author names: last name plus first initial
    tbl = paste(unlist(last), substr(unlist(init), 1, 1))
    tbl = table(tbl)
    tbl = tbl[ tbl >= min ]

    data.frame(author = names(tbl), count = as.vector(tbl), stringsAsFactors = FALSE)

  })

  tbl = rbind.fill(tbl)
  tbl = aggregate(count ~ author, sum, data = tbl)
  return(tbl[ order(-tbl$count), ])

}
#' Get the number of publications per journal and year
#'
#' @param dir the folder containing the XML batch files
#' @param min cut results at a minimum number of articles
#' @param top label the top n journals (3 by default)
#' @param regex regular expression to include additional journal titles
#' @return a data frame, ordered by journal-year
#' @examples
#' # Publications per journal and year for articles on the WHO FCTC (not run).
#' # pubmed_get("FCTC OR 'Framework Convention on Tobacco Control'", "fctc")
#' # FCTC = pubmed_timeline("pubmed_fctc", 0, regex = "World Health Org|BMJ $|Lancet|Tob Control")
#' # Plot the results (not run).
#' # require(ggplot2)
#' # qplot(data = FCTC[ order(FCTC$label), ], x = year, y = count, fill = label,
#' #       stat = "identity", geom = "bar") +
#' #   scale_fill_brewer("Journal", palette = "Set3", na.value = "grey") +
#' #   labs(x = NULL, y = NULL) +
#' #   theme(legend.position = "bottom")
pubmed_timeline <- function(dir, min = 0, top = 3, regex = NULL) {

  tbl = file.path(dir, dir(dir, ".xml"))
  tbl = lapply(tbl, function(x) {

    pub = xmlTreeParse(x, useInternalNodes = TRUE)
    pre = "//PubmedArticle/MedlineCitation/"
    jl = xpathSApply(pub, paste0(pre, "Article/Journal/ISOAbbreviation"), xmlValue)
    yr = xpathSApply(pub, paste0(pre, "DateCreated/Year"), xmlValue)
    tbl = table(paste(jl, yr))

    data.frame(journal = names(tbl), count = as.vector(tbl))

  })

  tbl = rbind.fill(tbl)
  tbl = aggregate(count ~ journal, sum, data = tbl)
  tbl = subset(tbl, count >= min)

  # split the "journal year" keys back into two columns
  # (journal names keep a trailing space; cf. "BMJ $" in the regex example)
  tbl = data.frame(journal = str_sub(tbl$journal, 1, -5),
                   year = str_sub(tbl$journal, -4, -1),
                   count = tbl$count)
  tbl$year = as.numeric(as.character(tbl$year))

  if(top > 0) {

    # find the top journals
    label = aggregate(count ~ journal, sum, data = tbl)
    label = as.character(label[ order(-label$count), 1 ])
    tbl$label = factor(tbl$journal, levels = label, ordered = TRUE)

    # trim and refactor
    label = head(levels(tbl$label), top)
    if(!is.null(regex))
      label = c(label, levels(tbl$label)[ grepl(regex, levels(tbl$label)) ])

    cat("Labeled journals:\n", paste0(label, collapse = ", "))
    tbl$label[ !tbl$label %in% label ] = NA

  }
  return(tbl[ order(-tbl$year), ])

}
#' Get years of publication
#'
#' @param dir the folder containing the XML batch files
#' @param min cut results at a minimum number of articles
#' @return a data frame, ordered by year
#' @examples
#' # Years of publication for articles on the WHO FCTC (not run).
#' # pubmed_get("FCTC OR 'Framework Convention on Tobacco Control'", "fctc")
#' # pubmed_years("pubmed_fctc", 0)
pubmed_years <- function(dir, min = 0) {

  tbl = file.path(dir, dir(dir, ".xml"))
  tbl = lapply(tbl, function(x) {

    pub = xmlTreeParse(x, useInternalNodes = TRUE)
    pre = "//PubmedArticle/PubmedData/History/"
    tbl = xpathSApply(pub, paste0(pre, "PubMedPubDate[@PubStatus='medline']/Year"), xmlValue)
    tbl = table(tbl)

    data.frame(year = names(tbl), count = as.vector(tbl))

  })

  tbl = rbind.fill(tbl)
  tbl = aggregate(count ~ year, sum, data = tbl)
  tbl = subset(tbl, count >= min)
  tbl$year = as.numeric(as.character(tbl$year))
  tbl$year = factor(tbl$year, levels = min(tbl$year):max(tbl$year))
  return(tbl[ order(-as.numeric(tbl$year)), ])

}
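
# A minimal end-to-end run, using the WHO FCTC query from the examples above
# (not run; the download takes a while and the results will vary over time):
#
# query = "FCTC OR 'Framework Convention on Tobacco Control'"
# for(ii in 1:2)
#   FCTC = try(pubmed_get(query, "fctc", list = TRUE))
#
# # yearly article counts and the coauthorship edge list
# FCTC$years
# edges = pubmed_edges("pubmed_fctc")
# head(edges)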
@gadepallivs commented:

Hi Briatte,
I am working on extracting PubMed info too, but using the rentrez package in R. I am curious to know how you are addressing NULL values at nodes; I am trying to extract abstracts from PubMed. I have posted a question on Stack Overflow; if you have a suggestion, please let me know.
https://stackoverflow.com/questions/32953096/xpathapply-how-to-address-empty-lists
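
One way to handle empty node-sets with the XML package is to extract values per article and substitute NA when the node is missing. A minimal sketch, assuming batch files like those produced by pubmed_get above (the get_abstract helper and the file name are hypothetical):

library(XML)

# return NA when an article carries no abstract, instead of a zero-length vector
get_abstract <- function(article) {
  y = xpathSApply(article, ".//Abstract/AbstractText", xmlValue)
  if(length(y) == 0) NA else paste(y, collapse = " ")
}

pub = xmlTreeParse("pubmed_fctc/fctc1.xml", useInternalNodes = TRUE)
abstracts = sapply(getNodeSet(pub, "//PubmedArticle"), get_abstract)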

@rjalexa commented Nov 5, 2017:

Thank you for your great code, but I am having a few problems:
a) An easy one, but you should maybe list the prerequisite libraries. I found out they are at least XML and RCurl.
b) Trying out your pubmed_ask, I get the following error:

Querying PubMed on 2017-11-05 11:17:41 :
http://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?db=pubmed&term=FCTC+OR+%22Framework+Convention+on+Tobacco+Control%22&usehistory=y

Space required after the Public Identifier
SystemLiteral " or ' expected
SYSTEM or PUBLIC, the URI is missing

Any solution, please? Thank you

@uhcsttam commented Nov 8, 2017:

I was getting the same error. It looks like eutils moved from HTTP to HTTPS; fixing the URL in the function to "https://..." did the trick for me (see the corrected line below). Good luck!
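
For reference, the corrected esearch line in pubmed_ask (the efetch URL in pubmed_get needs the same scheme change):

query = paste("https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?db=pubmed&term=",
              query, "&usehistory=y", sep = "")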

Briatte, this is a great resource! Thanks for sharing!
