library("rvest") | |
library(dplyr) | |
#https://francojc.github.io/2017/11/02/acquiring-data-for-language-research-web-scraping/ | |
#modified | |
webpage <- "https://www.masshist.org/digitaladams/archive/browse/diaries_by_date.php" | |
html <- read_html(webpage) # read the raw html | |
html | |
# let's look at the nodes
html %>%
  html_nodes("div")

# let's look at one of the divs
html %>%
  html_nodes("div.browseContent")

# so let's look for links:
html %>%
  html_nodes("a")
# To extract the text contained within a node, we use the html_text() function.
html %>%
  html_nodes("a") %>%
  html_text()

# we'll put that text into a column shortly; first, do it again and get the links themselves
html %>%
  html_nodes("a") %>%
  html_attr("href")

# but we see we're only getting relative links,
# so we'll put the base into `base_url`, then come back to it.
base_url <- "https://www.masshist.org"

# Get link URLs
urls <- html %>%         # feed `html` to the next step
  html_nodes("a") %>%    # get the <a> nodes
  html_attr("href")      # extract the URLs

# Get link text
links <- html %>%        # feed `html` to the next step
  html_nodes("a") %>%    # get the <a> nodes
  html_text()            # extract the link text

# Combine `links` and `urls` into a data.frame;
# because the links are all relative, prepend the base url with paste0()
diaries <- data.frame(links = links,
                      urls = paste0(base_url, urls),
                      stringsAsFactors = FALSE)
# cut out the rows we don't want (site navigation links), keeping only the diaries
diaries <- diaries %>% slice(9:59)
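# (a quick sanity check, not in the original: the 9:59 slice assumes the
# page's current layout, so eyeball the result before moving on)
head(diaries)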
# now, let's figure out where the information we want lives in one of those links.
# we'll start with the first diary, https://www.masshist.org/digitaladams/archive/doc?id=D1, and examine it
diary1 <- "https://www.masshist.org/digitaladams/archive/doc?id=D1"
html_d <- read_html(diary1)
html_d
# let's look at the nodes
html_d %>%
  html_nodes("div")

# div class="entry" looks good, let's go deeper
html_d %>%
  html_nodes("div.entry")

# deeper still: the dateline spans carry a title attribute, so let's pull it out
html_d %>%
  html_nodes("div.dateline span") %>%
  html_attr("title")
# this gets the whole transcription, by the way
html_d %>%
  html_nodes("div.transcription") %>%
  html_text()
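# (a hedged aside, not in the original: you could save that transcription to
# disk for later text analysis; the filename here is just an illustration)
transcript <- html_d %>%
  html_nodes("div.transcription") %>%
  html_text()
writeLines(transcript, "D1-transcription.txt")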
# the anchor ids on the page mark the individual entries
html_d %>%
  html_nodes("a") %>%
  html_attr("id")
# so we want to create a table pairing those ids (the datelines) with the diary entries.
entry <- html_d %>%
  html_nodes("div.entry") %>%
  html_text()

id <- html_d %>%
  html_nodes("a") %>%
  html_attr("id")

id <- na.omit(id) # drop the anchors that have no id
id <- id[-1]      # drop the first id, which doesn't correspond to an entry

scrape <- tibble(id, entry)
View(scrape)
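# --- a sketch, not in the original gist: scraping every diary ---
# This assumes each diary page shares D1's structure (div.entry for the text,
# anchor ids for the datelines); Sys.sleep(1) is a polite pause between
# requests, and pages whose ids and entries don't line up are skipped.
all_entries <- list()
for (u in diaries$urls) {
  page <- read_html(u)
  entry <- page %>% html_nodes("div.entry") %>% html_text()
  id <- page %>% html_nodes("a") %>% html_attr("id") %>% na.omit()
  id <- id[-1] # drop the first id, as above
  if (length(id) == length(entry)) {
    all_entries[[u]] <- tibble(url = u, id = id, entry = entry)
  }
  Sys.sleep(1)
}
scrape_all <- bind_rows(all_entries) # bind_rows() comes with dplyr
View(scrape_all)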