library("rvest") | |
library(dplyr) | |
#https://francojc.github.io/2017/11/02/acquiring-data-for-language-research-web-scraping/ | |
#modified | |
webpage <- "https://www.masshist.org/digitaladams/archive/browse/diaries_by_date.php" | |
html <- read_html(webpage) # read the raw html | |
html | |
# let's look at the nodes
html %>%
  html_nodes("div")

# let's look at one of the divs
html %>%
  html_nodes("div.browseContent")

# so let's look for links:
html %>%
  html_nodes("a")
# To extract the text contained within a node, we use the html_text() function.
html %>%
  html_nodes("a") %>%
  html_text()

# we'll put that text into a column shortly; first, do it again and get the links themselves
html %>%
  html_nodes("a") %>%
  html_attr("href")

# but we see we're only getting relative links,
# so we'll put the base into `base_url`, then come back to it.
base_url <- "https://www.masshist.org"

# Get link URLs
urls <- html %>%         # feed `html` to the next step
  html_nodes("a") %>%    # get the <a> nodes
  html_attr("href")      # extract the URLs

# Get link text
links <- html %>%        # feed `html` to the next step
  html_nodes("a") %>%    # get the <a> nodes
  html_text()            # extract the link text

# Combine `links` and `urls` into a data.frame;
# because the links are all relative, prepend the base url with paste0()
diaries <- data.frame(links = links,
                      urls = paste0(base_url, urls),
                      stringsAsFactors = FALSE)
# cut out the rows we don't want (site navigation links), keeping only the diaries
diaries <- diaries %>% slice(9:59)
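# (a quick sanity check, not in the original: the 9:59 slice assumes the
# page's current layout, so eyeball the result before moving on)
head(diaries)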
# now, let's figure out where the information we want lives in one of those links.
# we'll start with the first diary, https://www.masshist.org/digitaladams/archive/doc?id=D1, and examine it
diary1 <- "https://www.masshist.org/digitaladams/archive/doc?id=D1"
html_d <- read_html(diary1)
html_d
# let's look at the nodes
html_d %>%
  html_nodes("div")

# div class="entry" looks good, let's go deeper
html_d %>%
  html_nodes("div.entry")

# deeper still: the dateline spans carry a title attribute, so let's pull it out
html_d %>%
  html_nodes("div.dateline span") %>%
  html_attr("title")
# this gets the whole transcription, by the way
html_d %>%
  html_nodes("div.transcription") %>%
  html_text()
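# (a hedged aside, not in the original: you could save that transcription to
# disk for later text analysis; the filename here is just an illustration)
transcript <- html_d %>%
  html_nodes("div.transcription") %>%
  html_text()
writeLines(transcript, "D1-transcription.txt")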
# the anchor ids on the page mark the individual entries
html_d %>%
  html_nodes("a") %>%
  html_attr("id")
# so we want to create a table pairing those ids (the datelines) with the diary entries.
entry <- html_d %>%
  html_nodes("div.entry") %>%
  html_text()

id <- html_d %>%
  html_nodes("a") %>%
  html_attr("id")

id <- na.omit(id) # drop the anchors that have no id
id <- id[-1]      # drop the first id, which doesn't correspond to an entry

scrape <- tibble(id, entry)
View(scrape)
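# --- a sketch, not in the original gist: scraping every diary ---
# This assumes each diary page shares D1's structure (div.entry for the text,
# anchor ids for the datelines); Sys.sleep(1) is a polite pause between
# requests, and pages whose ids and entries don't line up are skipped.
all_entries <- list()
for (u in diaries$urls) {
  page <- read_html(u)
  entry <- page %>% html_nodes("div.entry") %>% html_text()
  id <- page %>% html_nodes("a") %>% html_attr("id") %>% na.omit()
  id <- id[-1] # drop the first id, as above
  if (length(id) == length(entry)) {
    all_entries[[u]] <- tibble(url = u, id = id, entry = entry)
  }
  Sys.sleep(1)
}
scrape_all <- bind_rows(all_entries) # bind_rows() comes with dplyr
View(scrape_all)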