# dubsnipe/appropedia-gscholar-2020.R

## appropedia-gscholar-2020.R
require(rvest, quietly=T)
require(dplyr, quietly=T)
require(scholar, quietly=T)

# Result pages to fetch; page index i is requested as `start = 10 * i`.
# This is done in parts (ten pages per run) to avoid getting error 429.
index <- seq(from = 81, to = 90)

# Collect the children of every downloaded page under one empty <html> root so
# the whole batch can be queried as a single document afterwards.
# https://stackoverflow.com/questions/43461907/in-r-how-do-i-combine-two-xml-documents-into-one-document
#
# The xml2 functions are namespace-qualified: rvest >= 1.0.0 imports xml2
# instead of attaching it, so only read_html() is available unqualified.
xml0 <- xml2::read_xml("<html></html>")
for (i in index) {
  url <- paste0(
    "https://scholar.google.com/scholar?start=", 10 * i, "&q=appropedia"
  )
  result <- read_html(url)

  result_children <- xml2::xml_children(result)
  for (child in result_children) {
    xml2::xml_add_child(xml0, child)
  }

  # Wait 10 seconds before requesting the next page.
  Sys.sleep(10)
}

# Every node matching this selector is treated as one search result.
papers <- xml0 %>% html_nodes(".gs_r.gs_or.gs_scl")

# Fail early with a clear message instead of a cryptic indexing error below.
if (length(papers) == 0) {
  stop(
    "No search results found in the downloaded pages ",
    "(selector '.gs_r.gs_or.gs_scl' matched nothing).",
    call. = FALSE
  )
}

# A selector can match zero or several nodes inside one result. Keep the first
# match (or NA) so that every record has exactly the same five fields; with a
# plain c(), a missing match drops the field and several matches rename it
# (url1, url2, ...).
first_or_na <- function(x) {
  if (length(x) > 0) x[[1]] else NA_character_
}

# One named character vector per result; bind_rows() turns each into a row.
articles_list <- lapply(seq_along(papers), function(p) {
  paper <- papers[p]
  heading <- paper %>% html_nodes("h3.gs_rt")
  c(
    title = first_or_na(heading %>% html_text()),
    # Concatenated id attributes of the heading's direct <span>/<a> children.
    id = heading %>%
      html_nodes(xpath = "./span | ./a") %>%
      html_attr("id") %>%
      paste0(collapse = ""),
    author = first_or_na(paper %>% html_nodes(".gs_a") %>% html_text()),
    url = first_or_na(
      paper %>%
        html_nodes(".gs_or_ggsm") %>%
        html_nodes("a") %>%
        html_attr("href")
    ),
    abstract = first_or_na(paper %>% html_nodes(".gs_rs") %>% html_text())
  )
})

articles_df <- do.call(bind_rows, articles_list) %>% as_tibble()

# Derive the printable columns from the raw `author` and `title` text.
# stringr is never attached by this script, so its functions are called with an
# explicit namespace instead of relying on the session having it loaded.
#
# Previous authors pattern, kept for reference:
#   "[\\w ]+[, ?][\\w+\\s][\\w+,\\s]*(?= -)"
articles_df <- articles_df %>%
  mutate(
    # Extracting year: the first run of four digits in the author line.
    year = stringr::str_extract(author, "\\d{4}"),
    # List of authors, with surrounding whitespace removed.
    authors = stringr::str_trim(
      stringr::str_extract(author, "\\w+[, ][^\\d+][^-]+(?=- )|(\\w ?)+")
    ),
    # Clean the title: skip a leading bracketed tag such as "[PDF] ".
    clean_title = stringr::str_extract(
      title, "(?<=\\w\\] )[^\\[].*|^[^\\[].*"
    )
  )

# Google Scholar get citation
# https://scholar.google.com.sv/scholar?q=info:43lfyFl0WdUJ:scholar.google.com/&output=cite&scirp=10&hl=en

# Note that the number 800 is added to keep track of the index at the beginning
# of the script. NOTE(review): it has to be changed by hand together with
# `index`. seq_len() keeps the numbering empty when there are no rows.
articles_to_print <- bind_cols(
  n = 800 + seq_len(nrow(articles_df)),
  articles_df
)
# Append one markdown-style entry per row of `t` to a text file.
#
# Args:
#   t:    data frame (or list of equal-length vectors) with the columns
#         `n`, `clean_title`, `authors`, `year` and `url`.
#   file: path of the file to append to; defaults to the path this script
#         has always written to.
#
# Each entry is a "# <n>. <title>" heading followed by "- authors:",
# "- year:" and "- url:" lines, and is separated from the next by a blank line.
# Nothing is written when `t` has no rows.
docmaker <- function(t, file = "result_appropedia_2.txt") {
  # paste0() would turn zero-length columns into one bogus empty entry.
  if (length(t$n) == 0) {
    return(invisible(NULL))
  }

  final.text <- paste0(
    "# ", t$n, ". ", t$clean_title, "\n",
    "- authors: ", t$authors, "\n",
    "- year: ", t$year, "\n",
    "- url: ", t$url, "\n"
  )
  # write() emits one element per line for character input, which adds the
  # blank separator line after every entry.
  write(final.text, file = file, append = TRUE)
}

# Append one formatted entry per row of articles_to_print to the output file.
docmaker(articles_to_print)
	require(rvest, quietly=T)
	require(dplyr, quietly=T)
	require(scholar, quietly=T)

	# Result pages to fetch; page index i is requested as `start = 10 * i`.
	# This is done in parts (ten pages per run) to avoid getting error 429.
	index <- seq(from = 81, to = 90)

	# Collect the children of every downloaded page under one empty <html> root
	# so the whole batch can be queried as a single document afterwards.
	# https://stackoverflow.com/questions/43461907/in-r-how-do-i-combine-two-xml-documents-into-one-document
	#
	# The xml2 functions are namespace-qualified: rvest >= 1.0.0 imports xml2
	# instead of attaching it, so only read_html() is available unqualified.
	xml0 <- xml2::read_xml("<html></html>")
	for (i in index) {
	  url <- paste0(
	    "https://scholar.google.com/scholar?start=", 10 * i, "&q=appropedia"
	  )
	  result <- read_html(url)

	  result_children <- xml2::xml_children(result)
	  for (child in result_children) {
	    xml2::xml_add_child(xml0, child)
	  }

	  # Wait 10 seconds before requesting the next page.
	  Sys.sleep(10)
	}

	# Every node matching this selector is treated as one search result.
	papers <- xml0 %>% html_nodes(".gs_r.gs_or.gs_scl")

	# Fail early with a clear message instead of a cryptic indexing error below.
	if (length(papers) == 0) {
	  stop(
	    "No search results found in the downloaded pages ",
	    "(selector '.gs_r.gs_or.gs_scl' matched nothing).",
	    call. = FALSE
	  )
	}

	# A selector can match zero or several nodes inside one result. Keep the
	# first match (or NA) so that every record has exactly the same five fields;
	# with a plain c(), a missing match drops the field and several matches
	# rename it (url1, url2, ...).
	first_or_na <- function(x) {
	  if (length(x) > 0) x[[1]] else NA_character_
	}

	# One named character vector per result; bind_rows() turns each into a row.
	articles_list <- lapply(seq_along(papers), function(p) {
	  paper <- papers[p]
	  heading <- paper %>% html_nodes("h3.gs_rt")
	  c(
	    title = first_or_na(heading %>% html_text()),
	    # Concatenated id attributes of the heading's direct <span>/<a> children.
	    # The XPath union operator is a plain "|"; "\|" is not a valid R escape
	    # and stops the file from parsing.
	    id = heading %>%
	      html_nodes(xpath = "./span | ./a") %>%
	      html_attr("id") %>%
	      paste0(collapse = ""),
	    author = first_or_na(paper %>% html_nodes(".gs_a") %>% html_text()),
	    url = first_or_na(
	      paper %>%
	        html_nodes(".gs_or_ggsm") %>%
	        html_nodes("a") %>%
	        html_attr("href")
	    ),
	    abstract = first_or_na(paper %>% html_nodes(".gs_rs") %>% html_text())
	  )
	})

	articles_df <- do.call(bind_rows, articles_list) %>% as_tibble()

	# Derive the printable columns from the raw `author` and `title` text.
	# stringr is never attached by this script, so its functions are called with
	# an explicit namespace instead of relying on the session having it loaded.
	#
	# The regex alternations use a plain "|": "\|" is not a valid R string escape
	# and stops the file from parsing. The title pattern also needs its ".*"
	# quantifiers; without them it captures a single character, not the title.
	#
	# Previous authors pattern, kept for reference:
	#   "[\\w ]+[, ?][\\w+\\s][\\w+,\\s]*(?= -)"
	articles_df <- articles_df %>%
	  mutate(
	    # Extracting year: the first run of four digits in the author line.
	    year = stringr::str_extract(author, "\\d{4}"),
	    # List of authors, with surrounding whitespace removed.
	    authors = stringr::str_trim(
	      stringr::str_extract(author, "\\w+[, ][^\\d+][^-]+(?=- )|(\\w ?)+")
	    ),
	    # Clean the title: skip a leading bracketed tag such as "[PDF] ".
	    clean_title = stringr::str_extract(
	      title, "(?<=\\w\\] )[^\\[].*|^[^\\[].*"
	    )
	  )

	# Google Scholar get citation
	# https://scholar.google.com.sv/scholar?q=info:43lfyFl0WdUJ:scholar.google.com/&output=cite&scirp=10&hl=en

	# Note that the number 800 is added to keep track of the index at the
	# beginning of the script. NOTE(review): it has to be changed by hand
	# together with `index`. seq_len() keeps the numbering empty when there are
	# no rows.
	articles_to_print <- bind_cols(
	  n = 800 + seq_len(nrow(articles_df)),
	  articles_df
	)
	# Append one markdown-style entry per row of `t` to a text file.
	#
	# Args:
	#   t:    data frame (or list of equal-length vectors) with the columns
	#         `n`, `clean_title`, `authors`, `year` and `url`.
	#   file: path of the file to append to; defaults to the path this script
	#         has always written to.
	#
	# Each entry is a "# <n>. <title>" heading followed by "- authors:",
	# "- year:" and "- url:" lines, and is separated from the next by a blank
	# line. Nothing is written when `t` has no rows.
	docmaker <- function(t, file = "result_appropedia_2.txt") {
	  # paste0() would turn zero-length columns into one bogus empty entry.
	  if (length(t$n) == 0) {
	    return(invisible(NULL))
	  }

	  final.text <- paste0(
	    "# ", t$n, ". ", t$clean_title, "\n",
	    "- authors: ", t$authors, "\n",
	    "- year: ", t$year, "\n",
	    "- url: ", t$url, "\n"
	  )
	  # write() emits one element per line for character input, which adds the
	  # blank separator line after every entry.
	  write(final.text, file = file, append = TRUE)
	}

	# Append one formatted entry per row of articles_to_print to the output file.
	docmaker(articles_to_print)