Last active
November 10, 2019 03:36
-
-
Save shawngraham/c5298bd6852495e9d2a0c787d9768926 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# after https://francojc.github.io/2015/03/01/web-scraping-with-rvest-in-r/
library(rvest)
library(dplyr)

# Site root; it is prepended to every scraped href further down.
base_url <- "https://www.masshist.org"

# Download and parse the browse page.
main.page <- read_html(
  x = "https://www.masshist.org/digitaladams/archive/browse/diaries_by_date.php"
)

# Select every <a> node once, then read both pieces of information from the
# same node set so link targets and link text stay aligned row by row.
anchors <- html_nodes(main.page, "a")
urls <- html_attr(anchors, "href") # link targets
links <- html_text(anchors)        # visible link text

# Combine `links` and `urls` into a data.frame.
# The original author notes that the links are all relative, so the base URL
# is glued on in front of each one.
diaries <- data.frame(
  links = links,
  urls = paste0(base_url, urls),
  stringsAsFactors = FALSE
)
# The page also has a few links to 'home' etc. that we don't want, so keep only
# the rows whose link text starts with 'John'.
#
# Base R subsetting is used here instead of dplyr::filter(), which was reported
# to fail in some versions of R running on Binder; grepl() plus `[` behaves the
# same everywhere. grepl() returns FALSE for NA link text, so such rows are
# dropped as well.
keep <- grepl("^John", diaries$links)
diaries <- diaries[keep, , drop = FALSE]
rownames(diaries) <- NULL # renumber rows 1..n after dropping the unwanted ones

# Positional alternative (update Nov 9): slice away the rows you don't want,
# keeping the range you *do* want. Use this INSTEAD of the pattern match above,
# on the unfiltered data frame. Running it after the pattern match would apply
# 9:59 to the already-filtered rows and discard the first eight diaries.
# diaries <- diaries %>% slice(9:59)
# Create a directory to keep our materials in. Warnings are silenced so that
# re-running the script when the directory already exists stays quiet.
dir.create("diaries", showWarnings = FALSE)

# Loop over each row in `diaries`, extracting the entries from each page and
# then writing them to a file.
# seq_len() gives an empty sequence when `diaries` has no rows, whereas
# seq(0) would iterate over c(1, 0) and request non-existent rows.
for (i in seq_len(nrow(diaries))) {
  text <- read_html(diaries$urls[i]) %>% # load the page
    html_nodes(".entry") %>%             # isolate the entry text
    html_text()                          # get the text

  # Create the file name; this uses the relevant link text as the file name.
  filename <- paste0("diaries/", diaries$links[i], ".txt")

  # Write the entries straight to the file. This replaces the earlier
  # sink() %>% cat() pairing, which only worked because sink() returns NULL and
  # which left all console output diverted if anything failed before the
  # closing sink(). cat() joins multiple entries with a single space, exactly
  # as before.
  cat(text, file = filename)
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment