Skip to content

Instantly share code, notes, and snippets.

Embed
What would you like to do?
require(rvest, quietly=T)
require(dplyr, quietly=T)
require(scholar, quietly=T)
# Result pages fetched by this run. The crawl is done in parts (one range of
# pages per run) to avoid getting error 429; edit the range to fetch another
# part.
index <- seq(from = 81, to = 90)

# Empty container document that accumulates the nodes of every fetched page.
# https://stackoverflow.com/questions/43461907/in-r-how-do-i-combine-two-xml-documents-into-one-document
#
# The xml2 helpers are namespace-qualified on purpose: the script only loads
# rvest, and rvest >= 1.0.0 imports xml2 without attaching it, so the bare
# names read_xml(), xml_children() and xml_add_child() would not be found.
xml0 <- xml2::read_xml("<html></html>")

for (i in index) {
  # The page number is turned into the `start` query parameter as 10 * i.
  url <- paste0(
    "https://scholar.google.com/scholar?start=", 10 * i, "&q=appropedia"
  )
  result <- read_html(url)

  # Copy the top-level children of the fetched page into the container.
  for (child in xml2::xml_children(result)) {
    xml2::xml_add_child(xml0, child)
  }

  # Wait ten seconds between requests (presumably part of avoiding error 429).
  Sys.sleep(10)
}
# One node per search result: elements carrying the classes gs_r, gs_or and
# gs_scl inside the accumulated document.
papers <- xml0 %>% html_nodes(".gs_r.gs_or.gs_scl")

# Reduce a scraped character vector to exactly one value. A selector can match
# zero nodes or several; without this, c() below would drop the field or split
# it into suffixed names (e.g. url1, url2), leaving the column that is read
# later empty for that row.
first_or_na <- function(x) {
  if (length(x) > 0) x[[1]] else NA_character_
}

# Build one named character vector per result. seq_along() makes this a no-op
# when nothing matched, where 1:length(papers) would iterate over c(1, 0).
articles_list <- lapply(seq_along(papers), function(p) {
  paper <- papers[p]
  heading <- paper %>% html_nodes("h3.gs_rt")
  c(
    title = heading %>% html_text() %>% first_or_na(),
    # Concatenated id attributes of the heading's direct <span>/<a> children.
    id = heading %>%
      html_nodes(xpath = "./span | ./a") %>%
      html_attr("id") %>%
      paste0(collapse = ""),
    author = paper %>% html_nodes(".gs_a") %>% html_text() %>% first_or_na(),
    # First link found under .gs_or_ggsm, NA when the result has none.
    url = paper %>%
      html_nodes(".gs_or_ggsm") %>%
      html_nodes("a") %>%
      html_attr("href") %>%
      first_or_na(),
    abstract = paper %>% html_nodes(".gs_rs") %>% html_text() %>% first_or_na()
  )
})

# Stack the per-result vectors into one row each.
articles_df <- do.call(bind_rows, articles_list) %>% as_tibble()
# Derive the printable columns from the raw scraped text. The stringr functions
# are namespace-qualified because the script never attaches stringr, so the
# bare names str_extract() and str_trim() would not be found.
articles_df <- articles_df %>%
  mutate(
    # Extracting year: the first run of four digits in the author line.
    year = stringr::str_extract(author, "\\d{4}"),
    # List of authors, trimmed of surrounding whitespace.
    # Earlier attempt, kept for reference:
    # str_extract(author, "[\\w ]+[, ?][\\w+\\s][\\w+,\\s]*(?= -)")
    authors = stringr::str_trim(
      stringr::str_extract(author, "\\w+[, ][^\\d+][^-]+(?=- )|(\\w ?)+")
    ),
    # Clean the title: keep the text after a leading bracketed tag ("...] "),
    # or the whole title when it does not start with "[".
    clean_title = stringr::str_extract(
      title, "(?<=\\w\\] )[^\\[].*|^[^\\[].*"
    )
  )

# Google Scholar get citation
# https://scholar.google.com.sv/scholar?q=info:43lfyFl0WdUJ:scholar.google.com/&output=cite&scirp=10&hl=en

# Running number of the first entry, derived from the first page in `index`
# instead of being typed by hand: page 81 gives 800, the value that was
# hard-coded before, so the numbering keeps track of the index set at the
# beginning of the script.
result_offset <- 10 * (min(index) - 1)

# seq_len() yields an empty sequence for zero rows, where 1:nrow() gives c(1, 0).
articles_to_print <- bind_cols(
  n = result_offset + seq_len(nrow(articles_df)),
  articles_df
)
# Append one Markdown-style entry per row of `t` to a text file.
#
# Arguments:
#   t     Data frame (or list of equal-length vectors) with the columns
#         n, clean_title, authors, year and url.
#   file  Path of the output file. Defaults to "result_appropedia_2.txt", the
#         name that used to be hard-coded, so existing calls are unaffected.
#
# Each entry has the form
#   # <n>. <clean_title>
#   - authors: <authors>
#   - year: <year>
#   - url: <url>
# Missing values are written as the text "NA" (paste0() behaviour).
# The function is called for its side effect; it returns the value of write().
docmaker <- function(t, file = "result_appropedia_2.txt") {
  entries <- paste0(
    "# ", t$n, ". ", t$clean_title, "\n",
    "- authors: ", t$authors, "\n",
    "- year: ", t$year, "\n",
    "- url: ", t$url, "\n"
  )
  # append = TRUE keeps whatever the file already contains, so results from
  # several runs accumulate in the same file.
  write(entries, file = file, append = TRUE)
}
# Write every numbered entry to the output text file. docmaker() appends, so
# running the script again adds to the existing file instead of replacing it.
docmaker(articles_to_print)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment