Last active
July 15, 2020 05:05
-
-
Save pdparker/22074ee2bdf9de1fd0b6 to your computer and use it in GitHub Desktop.
Scrape Australian PM speeches into a MongoDB database
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#################################### Set up database ###########################
# - Make sure the database is set up to be readable, writable and executable
#   without sudo
# - Make sure to start the mongo daemon before setting up the database:
#   usr$ mongod
################################################################################
## Produces mongodb documents with the following fields:
# _id:           Transcript id (stored as an integer) - used to index the files
# title:         Title of the speech or interview
# primeMinister: Who gave the speech in format 'Last name, First name'
# service:       Period of service, split on " - "; 8-digit dates are rewritten
#                as "YYYY-MM-DD" strings
# releaseDate:   Release date as a "YYYY-MM-DD" character string (rewritten
#                from the day/month/year order used by the source)
# releaseType:   Indication of whether context is speech, interview, etc.
# transcript:    Transcript of the actual speech, with whitespace collapsed

# Load required mongo package
library("rmongodb")

# Connect to your local mongodb
mongo <- mongo.create()

# Fail fast with a clear message instead of letting every later call fail
# against a dead connection.
if (!mongo.is.connected(mongo)) {
  stop("Could not connect to the local mongodb - is mongod running?",
       call. = FALSE)
}

# Check what databases are available (should be empty to start)
mongo.get.databases(mongo)

# If you need to drop the collection and start again uncomment the line below
# mongo.drop(mongo, ns = "speeches.transcript")

# Initiate collection and count documents (again should be 0 to start)
mongo.count(mongo, ns = "speeches.transcript")
#### Web scraping ####
# Load web scraping package
library(rvest)
# Load package for string manipulation
library(stringr)

# Prime minister's speeches are logged by transcript number in temporal order.
# NOTE(review): this metadata host (pmc.gov.au) differs from the API host used
# below (dpmc.gov.au) - confirm both are still correct.
linkLocation <- read_html("http://pmtranscripts.pmc.gov.au/transcripts.xml") %>%
  html_nodes("uri") %>%
  html_text()

# Top line is just the link to the meta-data itself
linkLocation <- linkLocation[-1]

# Link 21 is inconsistently named with the others, so its transcript number is
# supplied by hand. Guarded so a shorter listing is not padded with NA entries.
if (length(linkLocation) >= 21L) {
  linkLocation[21] <- "31836"
}

# Keep only the trailing transcript number of each link and build the API URL.
# Entries that do not match the pattern (e.g. the hand-set "31836") pass
# through gsub unchanged.
linkLocationAPI <- paste0("http://pmtranscripts.dpmc.gov.au/query?transcript=",
                          gsub("^(.+transcript-)([0-9]+$)", "\\2", linkLocation))

# Indices of the links to scrape: the first 10, or fewer if the listing is
# shorter. Sized from linkLocation because paste0() turns a zero-length input
# into a single bogus URL.
n <- seq_len(min(10L, length(linkLocation)))
# Loop through URLs and insert into DB.
# Any failed links - of which there are a few - are written to a log.txt file
# in the working dir. A link that fails to download is skipped: nothing is
# parsed or inserted for it, so a stale document from a previous iteration can
# never be written again.
for (i in n) {
  link <- linkLocationAPI[i]
  cat("Now working on link: ", link, "\n")

  # Returns the parsed document, or NULL when the download/parse errors.
  # Warnings are logged and muffled so that an otherwise successful read is
  # still used instead of being discarded.
  metaSpeeches <- tryCatch(
    withCallingHandlers(
      read_html(link),
      warning = function(cond) {
        message(paste("URL caused a warning:", link))
        message("Here's the original warning message:")
        message(conditionMessage(cond))
        # Log the warning
        cat("WARNING-Problem with url: ", link, "\n",
            file = "log.txt", append = TRUE)
        invokeRestart("muffleWarning")
      }
    ),
    error = function(cond) {
      message(paste("URL does not seem to exist:", link))
      message("Here's the original error message:")
      message(conditionMessage(cond))
      # Log the error
      cat("ERROR-Problem with url: ", link, "\n",
          file = "log.txt", append = TRUE)
      NULL
    }
  )

  if (!is.null(metaSpeeches)) {
    # Scrape required elements by tag
    id <- metaSpeeches %>% html_nodes("transcript-id") %>% html_text()

    if (length(id) == 0L) {
      # Without a transcript id there is nothing to index the document by.
      message(paste("No transcript id found for:", link))
      cat("ERROR-No transcript id for url: ", link, "\n",
          file = "log.txt", append = TRUE)
    } else {
      title <- metaSpeeches %>% html_nodes("title") %>% html_text()
      prime <- metaSpeeches %>% html_nodes("prime-minister") %>% html_text()
      # "start - end" split into two entries; 8-digit dates -> YYYY-MM-DD
      service <- metaSpeeches %>%
        html_nodes("period-of-service") %>%
        html_text() %>%
        strsplit(" - ") %>%
        unlist %>%
        gsub("([0-9]{4})([0-9]{2})([0-9]{2})", "\\1-\\2-\\3", .)
      # Reorder day/month/year (any single-character separator) to YYYY-MM-DD
      releaseDate <- metaSpeeches %>%
        html_nodes("release-date") %>%
        html_text() %>%
        gsub("([0-9]{2}).([0-9]{2}).([0-9]{4})", "\\3-\\2-\\1", .)
      releaseType <- metaSpeeches %>% html_nodes("release-type") %>% html_text()
      # Collapse newlines, tabs and runs of whitespace into single spaces
      transcript <- metaSpeeches %>%
        html_node("content") %>%
        html_text %>%
        str_replace_all("(\n|\t|\\s+)", " ")

      # List to change to BSON format
      template <- list("_id" = as.integer(id), "title" = title,
                       "primeMinister" = prime, "service" = service,
                       "releaseDate" = releaseDate,
                       "releaseType" = releaseType,
                       "transcript" = transcript)
      # List to BSON
      outPut <- mongo.bson.from.list(template)
      # Insert into DB
      mongo.insert(mongo, "speeches.transcript", outPut)
      # Log completion to screen
      cat(id, ": complete", "\n\n")
    }
  }

  # BE A GOOD CITIZEN: Don't overload server (also pause after a failure)
  Sys.sleep(1)
}
#### Query database Examples ####

## Find first instance of query
# Find an example of a transcript in interview format
mongo.findOne(mongo, ns = "speeches.transcript",
              query = mongo.bson.from.JSON('{"releaseType":"Interview"}'))

## Quick find all
# Provides a quick find all but not as flexible as below; only _id is requested
idCheck <- mongo.find.batch(mongo, ns = "speeches.transcript",
                            query = mongo.bson.empty(),
                            fields = mongo.bson.from.JSON('{"_id":1}'))
sort(unlist(idCheck))

## Longer find all plus regex example
# Set up query with filter: release dates starting with "19"
reg_1 <- list("releaseDate" = list("$regex" = "^19.+"))

# Set up cursor to move through documents
cursor <- mongo.find(mongo, ns = "speeches.transcript",
                     query = reg_1,
                     fields = mongo.bson.from.JSON(
                       '{"primeMinister":1, "releaseDate":1, "title":1}'
                     ))

# Walk the cursor, collecting each document as an R list. Documents are
# gathered in a list and bound once afterwards instead of calling rbind() on
# every iteration, which re-copies the accumulated result each time.
rows <- list()
while (mongo.cursor.next(cursor)) {
  rows[[length(rows) + 1L]] <- mongo.bson.to.list(mongo.cursor.value(cursor))
}

# Kill the query
err <- mongo.cursor.destroy(cursor)

# One row per matching document (a list-matrix; NULL when nothing matched)
res <- do.call(rbind, rows)

# Return result
res
#### Clean Up ####
# Close the connection first, then destroy the connection object.
mongo.disconnect(mongo)
mongo.destroy(mongo)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment