Last active
November 6, 2016 14:03
-
-
Save erzk/3087757bf3c2cb570318b95f7fb34b17 to your computer and use it in GitHub Desktop.
Scrape the performance data from ROH collections
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
library(rvest)

# Scrape the opera performance listing from the ROH collections search results
# and save it as a CSV file.
# URL: http://www.rohcollections.org.uk/SearchResults.aspx?searchtype=performance&page=0&genre=Opera

# The page index in the URL starts at 0. The upper bound is hard-coded;
# NOTE(review): presumably 233 was the last result page when the script was
# written - re-check the site before re-running.
page_numbers <- 0:233

# Each table row is expected to contribute exactly this many cells
# (Title, Date, DayTime, Company).
cells_per_row <- 4L

# Download one result page and return the text of every table cell on it.
fetch_page_cells <- function(page) {
  site_perf <- paste0(
    "http://www.rohcollections.org.uk/SearchResults.aspx",
    "?searchtype=performance&page=",
    page,
    "&genre=Opera"
  )
  print(site_perf) # optional print to see the progress
  read_html(site_perf) %>%
    html_nodes("tr td") %>%
    html_text()
}

# Collect the cells page by page and flatten once at the end, instead of
# growing a vector inside a loop (which re-copies it on every iteration).
performances <- unlist(lapply(page_numbers, fetch_page_cells),
                       use.names = FALSE)

# matrix() only warns and recycles values when the length does not fit, which
# would silently shift cells into the wrong columns - fail loudly instead.
if (length(performances) %% cells_per_row != 0L) {
  stop(
    sprintf(
      "Scraped %d table cells, which is not a multiple of %d; page layout differs from what this script expects.",
      length(performances), cells_per_row
    ),
    call. = FALSE
  )
}

# turn into a data frame - splits the vector into a 4-column DF, filled row by
# row; keep the text columns as character (not factor) on every R version
perfdf <- as.data.frame(
  matrix(performances, ncol = cells_per_row, byrow = TRUE),
  stringsAsFactors = FALSE
)
colnames(perfdf) <- c("Title", "Date", "DayTime", "Company") # rename columns

# turn Date column into date type
# NOTE(review): %B matches full month names in the current locale, so this
# assumes an English locale; unparseable values become NA.
perfdf$Date <- as.Date(perfdf$Date, format = "%d %B %Y")

write.csv(perfdf, "perfdf.csv", row.names = FALSE)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment