@shawngraham
Last active November 11, 2019 02:24
library("rvest")
library(dplyr)
#https://francojc.github.io/2017/11/02/acquiring-data-for-language-research-web-scraping/
#modified
webpage <- "https://www.masshist.org/digitaladams/archive/browse/diaries_by_date.php"
html <- read_html(webpage) # read the raw html
html
# let's look at the nodes
html %>%
  html_nodes("div")
# let's look at one of the divs
html %>%
  html_nodes("div.browseContent")
# so let's look for links:
html %>%
  html_nodes("a")
# To extract the text contained within a node we use the html_text() function.
html %>%
  html_nodes("a") %>%
  html_text()
# that link text will become our 'page title' column; now let's do it again and get the links themselves
html %>%
  html_nodes("a") %>%
  html_attr("href")
# but we see we're only getting the relative links
# so we'll put the base into 'base_url', then come back to it.
base_url <- "https://www.masshist.org"
# Get link URLs
urls <- html %>%       # feed `html` to the next step
  html_nodes("a") %>%  # get the CSS nodes
  html_attr("href")    # extract the URLs
# Get link text
links <- html %>%      # feed `html` to the next step
  html_nodes("a") %>%  # get the CSS nodes
  html_text()          # extract the link text
# Combine `links` and `urls` into a data.frame
# because the links are all relative, let's add the base url with paste
diaries <- data.frame(links = links, urls = paste(base_url, urls, sep = ""), stringsAsFactors = FALSE)
# cut out the links we don't want (site navigation and the like), keeping only the rows for the diaries
diaries <- diaries %>% slice(9:59)
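# quick sanity check: the row numbers 9:59 were chosen by eye, so confirm the
# slice kept only the diary rows
head(diaries)
tail(diaries)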
# now, let's figure out where the information is that we want from one of those links.
# we'll start with the first one https://www.masshist.org/digitaladams/archive/doc?id=D1 and examine it
diary1 <- "https://www.masshist.org/digitaladams/archive/doc?id=D1"
html_d <- read_html(diary1)
html_d
# let's look at the nodes
html_d %>%
  html_nodes("div")
# div class="entry" looks good, let's go deeper
html_d %>%
  html_nodes("div.entry")
# deeper still: pull the 'title' attribute from the dateline spans
html_d %>%
  html_nodes("div.dateline span") %>%
  html_attr("title")
# this gets the whole transcription, by the way
html_d %>%
  html_nodes("div.transcription") %>%
  html_text()
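# optional aside: html_text() takes a trim argument, handy if you want the
# transcription without leading/trailing whitespace
html_d %>%
  html_nodes("div.transcription") %>%
  html_text(trim = TRUE)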
# the <a> anchors carry id attributes that identify individual entries
html_d %>%
  html_nodes("a") %>%
  html_attr("id")
# so we want to create a table that pairs those entry ids with the diary entries
entry <- html_d %>%
  html_nodes("div.entry") %>%
  html_text()
id <- html_d %>%
  html_nodes("a") %>%
  html_attr("id")
id <- na.omit(id)  # drop anchors that have no id attribute
id <- id[-1]       # drop the first id, which doesn't correspond to an entry
scrape <- tibble(id, entry)
View(scrape)
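# a minimal sketch of scaling this up to every diary in `diaries`; it assumes
# each diary page is structured like D1 (one id-bearing anchor per entry, plus
# one leading id to drop), which is worth verifying before trusting the output
scrape_diary <- function(url) {
  page <- read_html(url)
  entry <- page %>%
    html_nodes("div.entry") %>%
    html_text()
  id <- page %>%
    html_nodes("a") %>%
    html_attr("id")
  id <- na.omit(id)
  id <- id[-1]
  Sys.sleep(1) # be polite to the server between requests
  tibble(url = url, id = id, entry = entry)
}
# all_entries <- bind_rows(lapply(diaries$urls, scrape_diary))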