Skip to content

Instantly share code, notes, and snippets.

Embed
What would you like to do?
Scrape pubmed or scholar
library(dplyr)
# Web scraping.
library(rvest)
# For synonym list
library(taxize)
#' Count PubMed results for a species name or its ITIS synonyms
#'
#' Looks up synonyms for `sp` with `synonyms(sp, db = 'itis')`, builds an
#' exact-phrase PubMed query (synonyms joined with OR), downloads the result
#' page and reads the last token of the '.result_count' node as the hit count.
#'
#' @param sp Species name. Used for the synonym lookup and, when no usable
#'   synonym list comes back, as the search phrase itself.
#' @return The result count as a numeric value, or `NA` when PubMed reports
#'   that the term was not found or when no count could be read from the page.
scrapePub <- function(sp) {
  # Fixed pause before every request so repeated calls are rate limited.
  Sys.sleep(2)

  # Find synonyms from taxize.
  syns <- synonyms(sp, db = 'itis')
  synTable <- if (length(syns) > 0) syns[[1]] else NULL
  synNames <- character(0)
  if (is.data.frame(synTable)) {
    synNames <- as.character(synTable$syn_name)
    synNames <- synNames[!is.na(synNames) & nzchar(synNames)]
  }

  # A single-row / NA lookup result keeps the original behaviour of searching
  # for `sp` alone. An empty synonym list also falls back to `sp`; previously
  # it produced an empty search phrase.
  if (NROW(synTable) == 1 || length(synNames) == 0) {
    spString <- tolower(gsub(' ', '%20', sp))
  } else {
    spString <- paste(
      tolower(gsub(' ', '%20', synNames)),
      collapse = '%22+OR+%22'
    )
  }

  url <- paste0('http://www.ncbi.nlm.nih.gov/pubmed/?term=%22', spString, '%22')
  # read_html() replaces the deprecated html().
  page <- read_html(url)

  # NOTE(review): the '.icon' and '.result_count' selectors are taken from the
  # original code; confirm they still match the page PubMed currently serves.

  # Test if the exact phrase was rejected. A missing '.icon' node (an error in
  # old rvest, NA text in newer versions) means no such message is present.
  notFound <- tryCatch({
    iconText <- page %>%
      html_node('.icon') %>%
      html_text()
    grepl("The following term was not found in PubMed:", iconText,
          fixed = TRUE)
  }, error = function(e) FALSE)

  if (isTRUE(notFound)) {
    return(NA)
  }

  # Read the count whenever the "not found" message is absent. The original
  # only did so when the '.icon' lookup raised an error, so a page with an
  # unrelated or missing '.icon' node always yielded NA.
  refs <- tryCatch({
    countText <- page %>%
      html_node('.result_count') %>%
      html_text()
    tokens <- strsplit(countText, ' ', fixed = TRUE)[[1]]
    if (length(tokens) == 0) {
      NA
    } else {
      # The count is the last space-separated token; drop thousands separators.
      as.numeric(gsub(',', '', tokens[length(tokens)], fixed = TRUE))
    }
  }, error = function(e) {
    warning("Could not read the PubMed result count: ", conditionMessage(e),
            call. = FALSE)
    NA
  })

  refs
}
#' Count Google Scholar results for a species name or its ITIS synonyms
#'
#' Looks up synonyms for `sp` with `synonyms(sp, db = 'itis')`, builds an
#' exact-phrase Google Scholar query (synonyms joined with OR), downloads the
#' result page and parses the number in the '#gs_ab_md' summary text.
#'
#' @param sp Species name. Used for the synonym lookup and, when no usable
#'   synonym list comes back, as the search phrase itself.
#' @return The result count as a numeric value, or `NA` when no count could
#'   be read from the page.
scrapeScholar <- function(sp) {
  # Randomised pause (mean 120 s, sd 2 s) before every request; clamped at
  # zero because Sys.sleep() rejects negative values.
  wait <- max(0, rnorm(1, 120, 2))
  Sys.sleep(wait)

  syns <- synonyms(sp, db = 'itis')
  synTable <- if (length(syns) > 0) syns[[1]] else NULL
  synNames <- character(0)
  if (is.data.frame(synTable)) {
    synNames <- as.character(synTable$syn_name)
    synNames <- synNames[!is.na(synNames) & nzchar(synNames)]
  }

  # A single-row / NA lookup result keeps the original behaviour of searching
  # for `sp` alone. An empty synonym list also falls back to `sp`; previously
  # it produced an empty search phrase.
  if (NROW(synTable) == 1 || length(synNames) == 0) {
    spString <- tolower(gsub(' ', '%20', sp))
  } else {
    spString <- paste(
      tolower(gsub(' ', '%20', synNames)),
      collapse = '%22+OR+%22'
    )
  }

  url <- paste0('https://scholar.google.co.uk/scholar?hl=en&q=%22',
                spString, '%22&btnG=&as_sdt=1%2C5&as_sdtp=')
  # read_html() replaces the deprecated html().
  page <- read_html(url)

  # `refs` is always assigned here, so the function can no longer fail with
  # "object 'refs' not found" when parsing goes wrong.
  refs <- tryCatch({
    summaryText <- page %>%
      html_node('#gs_ab_md') %>%
      html_text()
    if (length(summaryText) != 1 || is.na(summaryText)) {
      NA
    } else {
      # Matches both "About 1,230 results" and the short form "7 results"
      # (or "1 result"), which has no "About" prefix.
      hit <- regmatches(
        summaryText,
        regexpr("[0-9][0-9,]*(?=\\s+results?)", summaryText, perl = TRUE)
      )
      if (length(hit) == 0) {
        NA
      } else {
        as.numeric(gsub(',', '', hit, fixed = TRUE))
      }
    }
  }, error = function(e) {
    warning("Could not read the Google Scholar result count: ",
            conditionMessage(e), call. = FALSE)
    NA
  })

  refs
}
@luisDVA

This comment has been minimized.

Copy link

commented Oct 27, 2015

for scrapeScholar:
the text of the '#gs_ab_md' node is not always in the same format. When there are fewer than ten results the text will not say "About X results"; it will only say "X results".

I replaced the gsub calls with a strapplyc call from the gsubfn package:

refs <- page %>%
html_node('#gs_ab_md') %>%
html_text() %>%
strapplyc("(\\w+) results") %>%
as.numeric

Now I'll go try it out
cheers

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
You can’t perform that action at this time.