library(rvest)

base_url <- "https://www.masshist.org"

# Load the page
main.page <- read_html(x = "https://www.masshist.org/digitaladams/archive/browse/diaries_by_date.php")

# Get link URLs
urls <- main.page %>%   # feed `main.page` to the next step
  html_nodes("a") %>%   # get the CSS nodes
  html_attr("href")     # extract the URLs

# Get link text
links <- main.page %>%  # feed `main.page` to the next step
  html_nodes("a") %>%   # get the CSS nodes
  html_text()           # extract the link text

# Combine `links` and `urls` into a data.frame;
# because the links are all relative, let's add the base url with paste
diaries <- data.frame(links = links, urls = paste(base_url, urls, sep = ""), stringsAsFactors = FALSE)

# Loop over each row in `diaries`
for (i in seq((diaries))) {   # NB: see the note at the end -- this needs to loop over rows
  text <- read_html(diaries$urls[i]) %>%  # load the page
    html_nodes(".entry") %>%              # isolate the text; maybe .transcription
    html_text()                           # get the text
  # Create the file name
  filename <- paste0(diaries$links[i], ".txt")
  sink(file = filename)  # open the file to write
  cat(text)              # write the file
  sink()                 # close the file
}
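A revised version of the same script replaces the rvest text extraction inside the loop with download.file() wrapped in tryCatch(), so each page is saved as raw HTML and a dead or misbehaving URL just logs a message instead of stopping the whole run: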
library(rvest)

base_url <- "https://www.masshist.org"

# Load the page
main.page <-
  read_html(x = "https://www.masshist.org/digitaladams/archive/browse/diaries_by_date.php")

# Get link URLs
urls <- main.page %>%   # feed main.page to the next step
  html_nodes("a") %>%   # get the CSS nodes
  html_attr("href")     # extract the URLs

# Get link text
links <- main.page %>%  # feed main.page to the next step
  html_nodes("a") %>%   # get the CSS nodes
  html_text()           # extract the link text

# Combine links and urls into a data.frame;
# because the links are all relative, let's add the base url with paste
diaries <-
  data.frame(
    links = links,
    urls = paste(base_url, urls, sep = ""),
    stringsAsFactors = FALSE
  )

#====================================
# Loop over each row in diaries
for (i in 1:length(diaries$urls)) {
  filename <- paste0(diaries$links[i], ".txt")
  print(filename)
  out <- tryCatch({
    download.file(diaries$urls[i], destfile = filename, quiet = TRUE)
  },
  error = function(cond) {
    message(paste("URL does not seem to exist:", diaries$urls[i]))
    message("Here's the original error message:")
    message(cond)
    # Choose a return value in case of error
    return(NA)
  },
  warning = function(cond) {
    message(paste("URL caused a warning:", diaries$urls[i]))
    message("Here's the original warning message:")
    message(cond)
    # Choose a return value in case of warning
    return(NULL)
  })
}
It took a little fiddling to figure out what was going wrong. The scrape of links was also grabbing the 'home', 'back', etc. navigation links, so the loop would get hung up on those pages or break when it came time to write the files. If you filter that stuff out after line 16 (right after the `diaries` data.frame is built), then all is well with the world. Well, that and line 19 (the `for` loop) has to be changed so it iterates over the rows of `diaries` rather than `seq((diaries))`, as in the revised loop above; then you're golden.
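The exact snippets for those two fixes aren't reproduced here, so what follows is only a minimal sketch. The nav vector of link labels to drop is a hypothetical guess -- inspect diaries$links to see what actually needs filtering out:

# Hypothetical filter: drop the navigation links ('home', 'back', etc.) by their link text
nav <- c("home", "back", "next", "previous")            # assumed labels -- adjust after inspecting diaries$links
diaries <- diaries[!tolower(diaries$links) %in% nav, ]

# Loop over the rows of `diaries`; seq((diaries)) only gives 1:2, the number of columns
for (i in seq_along(diaries$urls)) {
  # ... same body as before: read the page, pull out .entry, write the file ...
}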