Skip to content

Instantly share code, notes, and snippets.

@sebastianbarfort
Created November 28, 2015 12:12
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save sebastianbarfort/eadf12111473a1764452 to your computer and use it in GitHub Desktop.
Save sebastianbarfort/eadf12111473a1764452 to your computer and use it in GitHub Desktop.
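# AJPS scrape: collect article metadata (title, abstract, page count, journal,
# volume, number, pages, authors) from the article pages linked in a saved
# JSTOR listing (ajps_jstor.html).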
# Packages attached for the whole script: rvest fetches and parses the HTML
# pages, stringr handles the text clean-up. plyr and gtools are called further
# down with explicit pkg:: prefixes, so they must be installed but are not
# attached here.
#
# The workspace is deliberately NOT cleared: every object used below is
# assigned before it is read, so the script does not need a clean environment
# and should not delete the caller's objects.
library("rvest")
library("stringr")
# define functions -----------------------
# Scrape the metadata of one article page into a one-row data.frame.
#
# Args:
#   url: character scalar with the article's base URL; the suffix
#        "?seq=1#references_tab_contents" is appended before fetching.
#
# Returns:
#   A one-row data.frame of character columns link, title, abstract,
#   page.count, journal, volume, number and pages, followed by author1,
#   author2, ... (one column per author; none when no author is found).
#   A field that is missing from the page is NA rather than raising an error
#   or shifting the remaining values into the wrong columns.
#
# Side effects: prints "<url> done" and pauses for one second before
# returning, so repeated calls are spaced out.
grab_data = function(url){
  url = paste0(url, "?seq=1#references_tab_contents")
  page = rvest::read_html(url)

  # Text of every node matching `css`, whitespace-trimmed unless trim = FALSE.
  node_text = function(css, trim = TRUE) {
    txt = rvest::html_text(rvest::html_nodes(page, css))
    if (trim) stringr::str_trim(txt) else txt
  }
  # x[1] is NA for an empty vector, so every scalar field is exactly one value
  # even when the selector matches nothing or matches several nodes.
  first = function(x) x[1]

  abstract = first(node_text(".abstract1", trim = FALSE))
  p.count = first(node_text(".count"))
  title = first(node_text(".title"))
  url1 = first(node_text(".stable"))
  journal = first(node_text("cite"))

  # The first ".mbl" node is split on commas into volume, number and pages.
  # Extra pieces are folded back into the pages field and missing pieces
  # become NA, so there are always exactly three values.
  info = unlist(stringr::str_split(first(node_text(".mbl")), ","))
  if (length(info) > 3) {
    info = c(info[1:2], paste(info[-(1:2)], collapse = ","))
  }
  info = info[seq_len(3)]

  df.i = data.frame(
    link = url1,
    title = title,
    abstract = abstract,
    page.count = p.count,
    journal = journal,
    volume = info[1],
    number = info[2],
    pages = info[3],
    stringsAsFactors = FALSE
  )

  # Contributor strings may list several people; split them into one
  # element per author on " and ", ", " or "&".
  authors = unlist(stringr::str_split(node_text(".contrib"), " and |, |&"))
  if (length(authors) > 0) {
    df.authors = data.frame(rbind(authors), stringsAsFactors = FALSE)
    names(df.authors) = paste0("author", seq_along(authors))
    df.i = data.frame(df.i, df.authors)
  }

  print(paste(url, "done"))
  Sys.sleep(1)
  df.i
}
# Collect the article URLs listed on one issue page.
#
# Args:
#   url: location of the issue page, passed straight to read_html().
#
# Returns:
#   A character vector with the URL found in the text of each
#   "#bulk_citation_export_form .stable" node. Both http and https links are
#   recognised; entries whose text contains no URL are dropped, so the result
#   never contains NA.
grab_article_links = function(url){
  page = rvest::read_html(url)
  stable_nodes = rvest::html_nodes(page, "#bulk_citation_export_form .stable")
  stable_text = stringr::str_trim(rvest::html_text(stable_nodes))
  # Keep everything from the scheme onwards; str_extract() yields NA where the
  # text holds no link at all.
  urls = stringr::str_extract(stable_text, "https?:.*")
  urls[!is.na(urls)]
}
# get journal links -----------------------
# The issue links are read from a locally saved copy of the listing page.
journals = read_html("~/git/abstracts/data/ajps_jstor.html") %>%
  html_nodes(".pbxl a") %>%
  html_attr("href")

# get article-journal links -----------------------
journal.articles = lapply(journals, grab_article_links)

# get article info -----------------------
# Flatten to one vector of distinct, non-missing URLs so each page is fetched
# once and the result list can be preallocated and named up front.
article.urls = unlist(journal.articles)
article.urls = unique(article.urls[!is.na(article.urls)])
article.info = vector("list", length(article.urls))
names(article.info) = article.urls
for (i in seq_along(article.urls)) {
  print(i)
  # try() keeps one broken page from aborting the whole run; the failure is
  # stored as a "try-error" object and removed below.
  article.info[[i]] = try(grab_data(article.urls[i]), silent = TRUE)
}

# Drop failed scrapes explicitly before binding, and report how many there were.
failed = vapply(article.info, inherits, logical(1), what = "try-error")
if (any(failed)) {
  warning(sum(failed), " of ", length(failed),
          " article pages could not be scraped", call. = FALSE)
}

# The list names (the article URLs) become the .id column of the result.
df.9 = plyr::ldply(article.info[!failed], gtools::smartbind)
# Base subsetting instead of filter(): dplyr is never attached in this script,
# so a bare filter() call would resolve to stats::filter and fail.
df.9 = df.9[!is.na(df.9[["link"]]), , drop = FALSE]
rownames(df.9) = NULL
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment