# shawngraham/diary-scraper.r

## diary-scraper.r
#after https://francojc.github.io/2015/03/01/web-scraping-with-rvest-in-r/

library(rvest)
library(dplyr)


base_url <- "https://www.masshist.org"
# Load the index page that lists the diaries by date
main.page <- read_html(x = "https://www.masshist.org/digitaladams/archive/browse/diaries_by_date.php")
# Select every <a> element once and read both the target and the text from
# the same node set, so the two vectors stay aligned element for element
anchors <- html_nodes(main.page, "a")
urls <- html_attr(anchors, "href") # link targets
links <- html_text(anchors) # visible link text
# Combine `links` and `urls` into a data.frame.
# Because the links are all relative, prefix each one with the base url.
diaries <- data.frame(
  links = links,
  urls = paste0(base_url, urls),
  stringsAsFactors = FALSE
)

# The page also has a few links to 'home' etc. that we don't want, so keep
# only the rows whose link text starts with 'John'.
#
# This uses base R subsetting instead of dplyr's filter(), which was reported
# (update nov 9) not to work in some versions of R run via binder.
# The positional fallback from that update, slice(9:59), was meant to be used
# *instead of* the filter. Run after it, as the script used to do, it threw
# away the first eight rows of the already-filtered table, so it is gone.
is_diary <- grepl("^John", diaries$links)
diaries <- diaries[is_diary, , drop = FALSE]
rownames(diaries) <- NULL # renumber rows 1..n, as filter() would have

# Create a directory to keep our materials in. Skipped when it already
# exists, so re-running the script does not raise a warning.
if (!dir.exists("diaries")) {
  dir.create("diaries")
}

# Loop over each row in `diaries`, extracting the entries from the linked
# page and writing them to a text file.
# seq_len() gives an empty sequence for a zero-row table, whereas
# seq(nrow(diaries)) would iterate over c(1, 0).
for (i in seq_len(nrow(diaries))) {
  text <- read_html(diaries$urls[i]) %>% # load the page
    html_nodes(".entry") %>% # isolate the entries
    html_text() # get the text

  # Create the file name; this uses the relevant link text as the file name
  filename <- paste0("diaries/", diaries$links[i], ".txt")
  # Write straight to the file instead of diverting console output with
  # sink(). cat() joins the entries with a single space, exactly as the
  # sink()-based version did, so the file contents are unchanged.
  cat(text, file = filename)
}
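	# NOTE(review): the tab-indented code from here on appears to be a second
	# pasted copy of the script above; running the whole file would repeat
	# the scrape. Confirm whether this copy is still needed.
	#after https://francojc.github.io/2015/03/01/web-scraping-with-rvest-in-r/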

	library(rvest)
	library(dplyr)


	base_url <- "https://www.masshist.org"
	# Load the index page that lists the diaries by date
	main.page <- read_html(x = "https://www.masshist.org/digitaladams/archive/browse/diaries_by_date.php")
	# Select every <a> element once and read both the target and the text from
	# the same node set, so the two vectors stay aligned element for element
	anchors <- html_nodes(main.page, "a")
	urls <- html_attr(anchors, "href") # link targets
	links <- html_text(anchors) # visible link text
	# Combine `links` and `urls` into a data.frame.
	# Because the links are all relative, prefix each one with the base url.
	diaries <- data.frame(
	  links = links,
	  urls = paste0(base_url, urls),
	  stringsAsFactors = FALSE
	)

	# The page also has a few links to 'home' etc. that we don't want, so keep
	# only the rows whose link text starts with 'John'.
	#
	# This uses base R subsetting instead of dplyr's filter(), which was reported
	# (update nov 9) not to work in some versions of R run via binder.
	# The positional fallback from that update, slice(9:59), was meant to be used
	# *instead of* the filter. Run after it, as the script used to do, it threw
	# away the first eight rows of the already-filtered table, so it is gone.
	is_diary <- grepl("^John", diaries$links)
	diaries <- diaries[is_diary, , drop = FALSE]
	rownames(diaries) <- NULL # renumber rows 1..n, as filter() would have

	# Create a directory to keep our materials in. Skipped when it already
	# exists, so re-running the script does not raise a warning.
	if (!dir.exists("diaries")) {
	  dir.create("diaries")
	}

	# Loop over each row in `diaries`, extracting the entries from the linked
	# page and writing them to a text file.
	# seq_len() gives an empty sequence for a zero-row table, whereas
	# seq(nrow(diaries)) would iterate over c(1, 0).
	for (i in seq_len(nrow(diaries))) {
	  text <- read_html(diaries$urls[i]) %>% # load the page
	    html_nodes(".entry") %>% # isolate the entries
	    html_text() # get the text

	  # Create the file name; this uses the relevant link text as the file name
	  filename <- paste0("diaries/", diaries$links[i], ".txt")
	  # Write straight to the file instead of diverting console output with
	  # sink(). cat() joins the entries with a single space, exactly as the
	  # sink()-based version did, so the file contents are unchanged.
	  cat(text, file = filename)
	}