## Modified from http://www.r-bloggers.com/how-to-download-complete-xml-records-from-pubmed-and-extract-data/
# Requires the XML and RCurl packages: install them with install.packages(c("XML", "RCurl")) before use.
library(XML)
library(RCurl)
searchPubMed <- function(query.term) {
  # Change spaces to + in the query
  query.gsub <- gsub(" ", "+", query.term)
  # Change single quotes to %22 (the URL-encoded double quote) so quoted phrases survive
  query.gsub <- gsub("'", "%22", query.gsub)
  # Perform the search with usehistory=y, which stores the matching PMIDs server-side
  pub.esearch <- getURL(paste("http://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?db=pubmed&term=",
                              query.gsub, "&usehistory=y", sep = ""))
  # Parse the esearch XML
  pub.esearch <- xmlTreeParse(pub.esearch, asText = TRUE)
  # Count the number of hits (super-assigned so it remains visible after the call)
  pub.count <<- as.numeric(xmlValue(pub.esearch[["doc"]][["eSearchResult"]][["Count"]]))
  # Save the WebEnv string; it is the server-side "link" to all articles in the search
  pub.esearch <- xmlValue(pub.esearch[["doc"]][["eSearchResult"]][["WebEnv"]])
  # Report how many articles are being downloaded
  cat("Searching (downloading", pub.count, "articles)\n")
  ## Download in batches, since efetch caps at 10,000 articles per request ##
  # Offset of the first record in the current batch
  RetStart <- 0
  # Number of records per batch (the efetch maximum)
  RetMax <- 10000
  # Calculate how many iterations are needed
  Runs <- (pub.count %/% 10000) + 1
  # Accumulator for the downloaded XML
  pub.efetch <- NULL
  # Loop to batch-download
  for (i in 1:Runs) {
    # Download XML for the hits saved in the search history (WebEnv)
    x <- getURL(paste("http://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=pubmed&WebEnv=",
                      pub.esearch, "&query_key=1&retmode=xml&retstart=", RetStart, "&retmax=", RetMax, sep = ""))
    # Append this batch to the previous downloads
    pub.efetch <- paste(pub.efetch, x, sep = "")
    # Advance the offset for the next batch; retmax is the batch size and stays at 10,000
    RetStart <- RetStart + 10000
  }
  # Report that the download completed
  cat("Completed download from PubMed.\n")
  # Return the parsed XML
  return(xmlTreeParse(pub.efetch, useInternalNodes = TRUE, asText = TRUE))
}
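
## Sanity-check sketch: the query term "asthma" below is an illustrative
## assumption, not part of the original gist. The number of PubmedArticle
## nodes in the returned document should equal pub.count.
# asthma.xml <- searchPubMed("asthma")
# length(getNodeSet(asthma.xml, "//PubmedArticle")) == pub.count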
# Return the PMIDs of articles whose first listed author affiliation
# matches matchstr (case-insensitive).
Count1st <- function(xmldata, matchstr) {
  return(unlist(xpathSApply(xmldata, "//PubmedArticle/MedlineCitation", function(node) {
    # Skip articles that carry no AffiliationInfo element at all
    affil <- getNodeSet(node, "./Article/AuthorList/Author/AffiliationInfo")
    if (length(affil) == 0) return(NULL)
    if (length(grep(matchstr, xmlValue(affil[[1]]), ignore.case = TRUE)) != 0)
      return(xmlValue(getNodeSet(node, "./PMID")[[1]]))
  })))
}
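
## Usage sketch combining both functions. The query term and the affiliation
## string are illustrative assumptions, not from the original gist.
# pubmed.xml <- searchPubMed("rheumatoid arthritis")
# tokyo.pmids <- Count1st(pubmed.xml, "Tokyo")
# cat(length(tokyo.pmids), "of", pub.count, "articles matched the affiliation.\n")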