# Download PDF files from R via Google Search / Google Scholar
#Install (if needed) and load the required packages
if(!require("rvest")){ install.packages("rvest") }; library(rvest)
if(!require("xml2")){ install.packages("xml2") }; library(xml2)
if(!require("magrittr")){ install.packages("magrittr") }; library(magrittr)
#Query: a single search term (character)
#Open: if TRUE, open each result URL in the browser
googleSearchPage <- function(Query = "Biochemistry", Open = FALSE){
  url <- URLencode(paste0("https://www.google.com/search?q=", Query))
  #Extract target URLs from Google's "/url?q=..." redirect links,
  #dropping account/login links
  page <- xml2::read_html(url) %>%
    rvest::html_nodes("a") %>%
    rvest::html_attr("href") %>%
    .[startsWith(., "/url?q=")] %>%
    sub("^/url\\?q=(.*?)&sa.*$", "\\1", .) %>%
    .[!startsWith(., "https://accounts.google.com")]
  if(Open){
    for(n in seq_along(page)){
      browseURL(url = page[n])
    }
  }
  return(page)
}
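#Example usage (a minimal sketch; the query is an arbitrary example and
#the results depend on Google's live HTML, which may change or be rate-limited):
#urls <- googleSearchPage(Query = "Biochemistry", Open = FALSE)
#head(urls)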
#Query: one or more search terms (character vector)
#Pages: number of result pages to scan per query
#com: if TRUE, search via "https://www.google.com"; if FALSE, via "https://www.google.co.jp"
#Download: if TRUE, download the PDF files found
googleSearchPDF <- function(Query, Pages = 1, com = TRUE, Download = TRUE){
  A <- list()
  for(n in seq_along(Query)){
    B <- c()
    for(m in seq_len(Pages)){
      Sys.sleep(0.1)
      #Each result page holds 10 hits; "&start=" sets the offset
      if(com){
        url <- URLencode(paste0("https://www.google.com/search?q=", Query[n], " filetype:pdf", "&start=", m*10-10))
      }else{
        url <- URLencode(paste0("https://www.google.co.jp/search?q=", Query[n], " filetype:pdf", "&start=", m*10-10))
      }
      #Keep only redirect links that point at PDF files
      link <- xml2::read_html(url) %>%
        rvest::html_nodes("a") %>%
        rvest::html_attr("href") %>%
        .[startsWith(., "/url?q=")] %>%
        sub("^/url\\?q=(.*?)&sa.*$", "\\1", .) %>%
        .[endsWith(., "pdf")]
      if(length(link) != 0){
        B <- c(B, link)
      }
    }
    A[[n]] <- unique(B)
    names(A)[[n]] <- Query[n]
    if(Download){
      #Save each PDF into a time-stamped folder per query
      Folder <- paste0(Query[n], "-PDFs-", format(Sys.time(), "%Y-%b%d-%H%M"))
      dir.create(Folder)
      for(l in seq_along(A[[n]])){
        suppressMessages(
          try(download.file(url = A[[n]][l],
                            destfile = paste0("./", Folder, "/", Query[n], "_", formatC(l, width = 4, flag = "0"), ".pdf"),
                            mode = "wb"),
              silent = TRUE))
      }
    }
  }
  return(A)
}
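#Example usage (a minimal sketch; the queries are arbitrary examples):
#collect PDF links over the first two result pages without downloading
#pdfs <- googleSearchPDF(Query = c("Biochemistry", "Metabolomics"), Pages = 2, com = TRUE, Download = FALSE)
#str(pdfs)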
#Query: one or more search terms (character vector)
#Download: if TRUE, download the PDF files found
googleScholarSearchPDF <- function(Query, Download = TRUE){
  A <- list()
  for(n in seq_along(Query)){
    B <- c()
    url <- URLencode(paste0("https://scholar.google.com/scholar?q=", Query[n]))
    #On Google Scholar, direct PDF links appear as plain hrefs
    link <- xml2::read_html(url) %>%
      rvest::html_nodes("a") %>%
      rvest::html_attr("href") %>%
      .[endsWith(., "pdf")]
    if(length(link) != 0){
      B <- c(B, unique(link))
    }
    A[[n]] <- unique(B)
    names(A)[[n]] <- Query[n]
    if(Download){
      #Save each PDF into a time-stamped folder per query
      Folder <- paste0(Query[n], "-ScholarPDFs-", format(Sys.time(), "%Y-%b%d-%H%M"))
      dir.create(Folder)
      for(l in seq_along(A[[n]])){
        suppressMessages(
          try(download.file(url = A[[n]][l],
                            destfile = paste0("./", Folder, "/", Query[n], "_", formatC(l, width = 4, flag = "0"), ".pdf"),
                            mode = "wb"),
              silent = TRUE))
      }
    }
  }
  return(A)
}
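#Example usage (a minimal sketch; the query is an arbitrary example):
#list PDF links from the first Scholar result page without downloading
#scholar <- googleScholarSearchPDF(Query = "metagenomics", Download = FALSE)
#scholar[[1]]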
#Query: one or more search terms (character vector)
#Output: maximum number of links to return per query
#Get papers in order of newest first ("&scisbd=1" sorts results by date)
googleScholarSearchNewest <- function(Query, Output = 10){
  A <- list()
  for(n in seq_along(Query)){
    B <- c()
    url <- URLencode(paste0("https://scholar.google.com/scholar?q=", Query[n], "&scisbd=1"))
    link <- xml2::read_html(url) %>%
      rvest::html_nodes("a") %>%
      rvest::html_attr("href") %>%
      .[startsWith(., "https://")] %>%
      .[!startsWith(., "https://accounts.google.com")]
    if(length(link) != 0){
      B <- c(B, unique(link))
    }
    if(length(B) > Output){ B <- B[1:Output] }
    A[[n]] <- unique(B)
    names(A)[[n]] <- Query[n]
  }
  return(A)
}
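#Example usage (a minimal sketch; the query is an arbitrary example):
#newest <- googleScholarSearchNewest(Query = "CRISPR", Output = 5)
#newest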