#after https://francojc.github.io/2015/03/01/web-scraping-with-rvest-in-r/ | |
library(rvest) | |
library(dplyr) | |
base_url <- "https://www.masshist.org"

# Load the index page that lists the diaries by date.
main.page <- read_html(
  x = "https://www.masshist.org/digitaladams/archive/browse/diaries_by_date.php"
)

# Select every <a> element once; both the link targets and the link text are
# read from this same node set, so the page is only queried a single time.
anchors <- main.page %>% html_nodes("a")
urls <- anchors %>% html_attr("href") # link targets (NA when there is no href)
links <- anchors %>% html_text()      # visible link text

# Combine `links` and `urls` into a data.frame.
# The hrefs are treated as relative, so the base URL is prepended to each one.
diaries <- data.frame(
  links = links,
  urls = paste0(base_url, urls),
  stringsAsFactors = FALSE
)

# The page also contains links to 'home' etc. that we don't want, so keep only
# the rows whose link text starts with 'John' and that actually have an href.
#
# Earlier revisions ran dplyr::filter() and then slice(9:59). The slice was
# only meant as a fallback for dplyr versions where filter() failed; running
# both sliced the already-filtered table and dropped its first eight rows.
# Plain logical subsetting does not depend on the dplyr version and does not
# rely on hard-coded row positions.
keep <- !is.na(urls) & grepl("^John", links)
diaries <- diaries[keep, , drop = FALSE]
rownames(diaries) <- NULL # renumber rows 1..n, as filter() would have done
# Create a directory to keep our materials in. Skip the call when the directory
# is already there so that re-running the script does not raise a warning.
if (!dir.exists("diaries")) {
  dir.create("diaries")
}

# Loop over each row in `diaries`, extract the entries from the linked page and
# write them to a text file. seq_len() makes the loop a no-op when `diaries`
# has zero rows (seq(0) would iterate over c(1, 0)).
for (i in seq_len(nrow(diaries))) {
  text <- read_html(diaries$urls[i]) %>% # load the page
    html_nodes(".entry") %>%             # isolate the elements with class "entry"
    html_text()                          # get the text

  # Create the file name from the relevant link text.
  filename <- paste0("diaries/", diaries$links[i], ".txt")

  # Write directly to the file. The file content is the same as the previous
  # sink()/cat()/sink() sequence produced, but console output can no longer be
  # left redirected if an error interrupts the loop.
  cat(text, file = filename)
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment