## Modified from http://www.r-bloggers.com/how-to-download-complete-xml-records-from-pubmed-and-extract-data/
# Requires the XML and RCurl packages: install them with install.packages(c("XML", "RCurl")) before use.
library(XML)
library(RCurl)
searchPubMed <- function(query.term) {
  # Change spaces to + in the query
  query.gsub <- gsub(" ", "+", query.term)
  # Change single quotes to %22 (the URL-encoded double quote) so quoted phrases survive
  query.gsub <- gsub("'", "%22", query.gsub)
  # Perform the search with usehistory=y, which stores the matching PMIDs server-side
  pub.esearch <- getURL(paste("http://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?db=pubmed&term=",
                              query.gsub, "&usehistory=y", sep = ""))
  # Parse the esearch XML
  pub.esearch <- xmlTreeParse(pub.esearch, asText = TRUE)
  # Count the number of hits (super-assigned so it remains visible after the call)
  pub.count <<- as.numeric(xmlValue(pub.esearch[["doc"]][["eSearchResult"]][["Count"]]))
  # Save the WebEnv string; it is the server-side "link" to all articles in the search
  pub.esearch <- xmlValue(pub.esearch[["doc"]][["eSearchResult"]][["WebEnv"]])
  # Report how many articles are being downloaded
  cat("Searching (downloading", pub.count, "articles)\n")
  ## Download in batches, since efetch caps at 10,000 articles per request ##
  # Offset of the first record in the current batch
  RetStart <- 0
  # Number of records per batch (the efetch maximum)
  RetMax <- 10000
  # Calculate how many iterations are needed
  Runs <- (pub.count %/% 10000) + 1
  # Accumulator for the downloaded XML
  pub.efetch <- NULL
  # Loop to batch-download
  for (i in 1:Runs) {
    # Download XML for the hits saved in the search history (WebEnv)
    x <- getURL(paste("http://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=pubmed&WebEnv=",
                      pub.esearch, "&query_key=1&retmode=xml&retstart=", RetStart, "&retmax=", RetMax, sep = ""))
    # Append this batch to the previous downloads
    pub.efetch <- paste(pub.efetch, x, sep = "")
    # Advance the offset for the next batch; retmax is the batch size and stays at 10,000
    RetStart <- RetStart + 10000
  }
  # Report that the download completed
  cat("Completed download from PubMed.\n")
  # Return the parsed XML
  return(xmlTreeParse(pub.efetch, useInternalNodes = TRUE, asText = TRUE))
}
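
## Sanity-check sketch: the query term "asthma" below is an illustrative
## assumption, not part of the original gist. The number of PubmedArticle
## nodes in the returned document should equal pub.count.
# asthma.xml <- searchPubMed("asthma")
# length(getNodeSet(asthma.xml, "//PubmedArticle")) == pub.count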
# Return the PMIDs of articles whose first listed author affiliation
# matches matchstr (case-insensitive).
Count1st <- function(xmldata, matchstr) {
  return(unlist(xpathSApply(xmldata, "//PubmedArticle/MedlineCitation", function(node) {
    # Skip articles that carry no AffiliationInfo element at all
    affil <- getNodeSet(node, "./Article/AuthorList/Author/AffiliationInfo")
    if (length(affil) == 0) return(NULL)
    if (length(grep(matchstr, xmlValue(affil[[1]]), ignore.case = TRUE)) != 0)
      return(xmlValue(getNodeSet(node, "./PMID")[[1]]))
  })))
}
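
## Usage sketch combining both functions. The query term and the affiliation
## string are illustrative assumptions, not from the original gist.
# pubmed.xml <- searchPubMed("rheumatoid arthritis")
# tokyo.pmids <- Count1st(pubmed.xml, "Tokyo")
# cat(length(tokyo.pmids), "of", pub.count, "articles matched the affiliation.\n")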