
@shawngraham
Last active November 11, 2019 14:59
why is this not working? the loop is the problem
library(rvest)
base_url <- "https://www.masshist.org"
# Load the page
main.page <- read_html(x = "https://www.masshist.org/digitaladams/archive/browse/diaries_by_date.php")
# Get link URLs
urls <- main.page %>%      # feed `main.page` to the next step
  html_nodes("a") %>%      # get the CSS nodes
  html_attr("href")        # extract the URLs
# Get link text
links <- main.page %>%     # feed `main.page` to the next step
  html_nodes("a") %>%      # get the CSS nodes
  html_text()              # extract the link text
# Combine `links` and `urls` into a data.frame
# because the links are all relative, let's add the base url with paste
diaries <- data.frame(links = links, urls = paste(base_url, urls, sep = ""), stringsAsFactors = FALSE)
# Loop over each row in `diaries`
for(i in seq((diaries))) {
  text <- read_html(diaries$urls[i]) %>% # load the page
    html_nodes(".entry") %>%             # isolate the text; maybe .transcription
    html_text()                          # get the text
  # Create the file name
  filename <- paste0(diaries$links[i], ".txt")
  sink(file = filename) %>% # open file to write
    cat(text)               # write the file
  sink()                    # close the file
}
@shawngraham
Author

Figured it out.

The scrape of links was also grabbing the links to 'home', 'back', etc., and so the loop would get hung up on those or break when it came time to run. So if you filter that stuff out just after the `diaries` data.frame is built (after line 16), with

library(dplyr) # filter() comes from dplyr
# but we have a few links to 'home' etc that we don't want,
# so we'll filter those out with grepl and a regular
# expression that looks for 'John' at the start of
# the links field.
diaries <- diaries %>% filter(grepl("^John", links))

then all is well with the world. Well, that, and line 19 (the for loop) has to be changed to

for(i in seq(nrow(diaries))) {

then you're golden.
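
For reference, here is a minimal sketch of the two fixes combined. Note that `seq(diaries)` on a two-column data.frame returns 1 2, which is why the original loop only ran twice; `seq(nrow(diaries))` walks the rows instead. `writeLines()` is swapped in for the `sink()`/`cat()` pair purely as a simplification, not as the original author's approach:

library(rvest)
library(dplyr)

base_url <- "https://www.masshist.org"
main.page <- read_html("https://www.masshist.org/digitaladams/archive/browse/diaries_by_date.php")

urls  <- main.page %>% html_nodes("a") %>% html_attr("href")
links <- main.page %>% html_nodes("a") %>% html_text()

# keep only the diary links; drops 'home', 'back', etc.
diaries <- data.frame(links = links,
                      urls = paste(base_url, urls, sep = ""),
                      stringsAsFactors = FALSE) %>%
  filter(grepl("^John", links))

# iterate over rows, not columns
for (i in seq(nrow(diaries))) {
  text <- read_html(diaries$urls[i]) %>%
    html_nodes(".entry") %>%   # or .transcription, depending on the page
    html_text()
  writeLines(text, paste0(diaries$links[i], ".txt"))
}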

@jeffblackadar

library(rvest)

base_url <- "https://www.masshist.org"

# Load the page
main.page <-
  read_html(x = "https://www.masshist.org/digitaladams/archive/browse/diaries_by_date.php")

# Get link URLs
urls <- main.page %>%    # feed main.page to the next step
  html_nodes("a") %>%    # get the CSS nodes
  html_attr("href")      # extract the URLs

# Get link text
links <- main.page %>%   # feed main.page to the next step
  html_nodes("a") %>%    # get the CSS nodes
  html_text()            # extract the link text

# Combine links and urls into a data.frame
# because the links are all relative, let's add the base url with paste
diaries <-
  data.frame(
    links = links,
    urls = paste(base_url, urls, sep = ""),
    stringsAsFactors = FALSE
  )
#====================================

# Loop over each row in diaries
for (i in 1:length(diaries$urls)) {
  filename <- paste0(diaries$links[i], ".txt")
  print(filename)
  out <- tryCatch({
    download.file(diaries$urls[i], destfile = filename, quiet = TRUE)
  },
  error = function(cond) {
    message(paste("URL does not seem to exist:", diaries$urls[i]))
    message("Here's the original error message:")
    message(cond)
    # Choose a return value in case of error
    return(NA)
  },
  warning = function(cond) {
    message(paste("URL caused a warning:", diaries$urls[i]))
    message("Here's the original warning message:")
    message(cond)
    # Choose a return value in case of warning
    return(NULL)
  })
}
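
Since this version saves the raw HTML of each page rather than just the diary text, a follow-up pass can pull the transcriptions out of the saved files. A minimal sketch, assuming the downloaded files sit in the working directory with the .txt names from the loop above and that the .entry selector from the original gist still applies (the _entries.txt output name is just illustrative):

library(rvest)

# read each saved page back in and extract just the diary entries
for (f in list.files(pattern = "\\.txt$")) {
  text <- read_html(f) %>%       # the saved files are HTML despite the .txt extension
    html_nodes(".entry") %>%     # selector from the original gist; adjust if needed
    html_text()
  writeLines(text, sub("\\.txt$", "_entries.txt", f))
}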
