Skip to content

Instantly share code, notes, and snippets.

@pdparker pdparker/pmScraper.R
Last active Dec 18, 2015

Embed
What would you like to do?
Scrape Australian PM speeches into a MongoDB database
#################################### Set up database ###########################
# - Make sure the database is set up to be readable, writable and executable without sudo
# - Make sure to start the mongo daemon before setting up the database: usr$ mongod
################################################################################
## Produces MongoDB documents with the following fields:
# _id: Transcript id - used to index the documents
# title: Title of the speech or interview
# primeMinister: Who gave the speech in format 'Last name, First name'
# service: Start and end of the period of service, each as a 'YYYY-MM-DD' string
# releaseDate: Release date stored as a 'YYYY-MM-DD' string
# releaseType: Indication of whether the context is a speech, interview, etc.
# transcript: Transcript of the actual speech
# Load the MongoDB client package
library("rmongodb")
# Connect to your local mongodb
mongo <- mongo.create()
# Fail early with a clear message if the connection was not established,
# rather than letting every later database call fail one by one
if (!mongo.is.connected(mongo)) {
  stop("Could not connect to MongoDB - is the mongod daemon running?",
       call. = FALSE)
}
# Check what databases are available (should be empty to start)
mongo.get.databases(mongo)
# If you need to drop the collection and start again uncomment the line below
#mongo.drop(mongo,ns = "speeches.transcript")
# Count documents in the target collection (again should be 0 to start)
mongo.count(mongo, ns = "speeches.transcript")
#### Web scraping ####
# rvest supplies the page download and node-selection helpers
library(rvest)
# stringr supplies string manipulation helpers
library(stringr)
# Prime minister's speeches are logged by transcript number in temporal
# order; the index file carries one <uri> entry per item
linkLocation <- html_text(
  html_nodes(
    read_html("http://pmtranscripts.dpmc.gov.au/transcripts.xml"),
    "uri"
  )
)
# The first entry is just the link to the meta-data itself, so drop it
linkLocation <- linkLocation[-1]
# Entry 21 is named inconsistently with the others, so its transcript
# number is written in directly
linkLocation[21] <- 31836
# Reduce each link to its trailing transcript number and append that to the
# query endpoint (entries that do not match the pattern pass through as-is)
linkLocationAPI <- paste0(
  "http://pmtranscripts.dpmc.gov.au/query?transcript=",
  gsub("^(.+transcript-)([0-9]+$)", "\\2", linkLocation)
)
# Indices of the query URLs to process in this run
n <- 1:10
# Loop through URLs and insert each transcript into the DB.
# Any failed links - of which there are a few - are written to a log.txt file
# in the working dir and skipped, so a bad link never re-inserts the data of
# the previously scraped page.
for (i in n) {
  currentUrl <- linkLocationAPI[i]
  cat("Now working on link: ", currentUrl, "\n")

  # Download and parse the page. Both handlers log the problem and return
  # NULL, which marks this link as failed for the code below.
  metaSpeeches <- tryCatch(
    read_html(currentUrl),
    error = function(cond) {
      message(paste("URL does not seem to exist:", currentUrl))
      message("Here's the original error message:")
      message(conditionMessage(cond))
      # log the error
      cat("ERROR-Problem with url: ", currentUrl, "\n",
          file = "log.txt", append = TRUE)
      NULL
    },
    warning = function(cond) {
      message(paste("URL caused a warning:", currentUrl))
      message("Here's the original warning message:")
      message(conditionMessage(cond))
      # log the warning
      cat("WARNING-Problem with url: ", currentUrl, "\n",
          file = "log.txt", append = TRUE)
      NULL
    }
  )

  if (!is.null(metaSpeeches)) {
    # Scrape required elements by tag
    id <- metaSpeeches %>% html_nodes("transcript-id") %>% html_text()

    # Only build a document when exactly one numeric transcript id was found;
    # otherwise as.integer(id) could not yield a usable `_id`
    if (length(id) == 1L && grepl("^[0-9]+$", id)) {
      title <- metaSpeeches %>% html_nodes("title") %>% html_text()
      prime <- metaSpeeches %>% html_nodes("prime-minister") %>% html_text()
      # Period of service: split "start - end" and rewrite YYYYMMDD as
      # YYYY-MM-DD
      service <- metaSpeeches %>%
        html_nodes("period-of-service") %>%
        html_text() %>%
        strsplit(" - ") %>%
        unlist %>%
        gsub("([0-9]{4})([0-9]{2})([0-9]{2})", "\\1-\\2-\\3", .)
      # Release date: reorder day, month, year into YYYY-MM-DD
      releaseDate <- metaSpeeches %>%
        html_nodes("release-date") %>%
        html_text() %>%
        gsub("([0-9]{2}).([0-9]{2}).([0-9]{4})", "\\3-\\2-\\1", .)
      releaseType <- metaSpeeches %>% html_nodes("release-type") %>% html_text()
      # Collapse newlines, tabs and runs of whitespace into single spaces
      transcript <- metaSpeeches %>%
        html_node("content") %>%
        html_text %>%
        str_replace_all("(\n|\t|\\s+)", " ")

      # list to change to BSON format
      template <- list('_id' = as.integer(id), 'title' = title,
                       'primeMinister' = prime, 'service' = service,
                       'releaseDate' = releaseDate,
                       'releaseType' = releaseType,
                       'transcript' = transcript)
      # list to BSON
      outPut <- mongo.bson.from.list(template)

      # Insert into DB; report completion only when the insert was accepted
      if (isTRUE(mongo.insert(mongo, "speeches.transcript", outPut))) {
        # Log completion to screen
        cat(id, ": complete", "\n\n")
      } else {
        cat("ERROR-Insert failed for url: ", currentUrl, "\n",
            file = "log.txt", append = TRUE)
      }
    } else {
      cat("ERROR-No transcript id found at url: ", currentUrl, "\n",
          file = "log.txt", append = TRUE)
    }
  }

  # BE A GOOD CITIZEN: Don't overload server (pause after failures too)
  Sys.sleep(1)
}
#### Query database examples ####
## Fetch the first document that matches a query
# Here: one transcript whose release type is an interview
mongo.findOne(
  mongo,
  ns = "speeches.transcript",
  query = mongo.bson.from.JSON('{"releaseType":"Interview"}')
)
## Quick find-all
# A single call with an empty query that asks only for the _id field;
# quicker to write but less flexible than the cursor approach
idCheck <- mongo.find.batch(
  mongo,
  ns = "speeches.transcript",
  query = mongo.bson.empty(),
  fields = mongo.bson.from.JSON('{"_id":1}')
)
# Flatten the result and show the ids in ascending order
sort(unlist(idCheck))
## Longer find-all, with a regex example
# Filter: keep documents whose releaseDate matches "^19.+"
reg_1 <- list("releaseDate" = list("$regex" = "^19.+"))
# Open a cursor over the matches, requesting three fields per document
cursor <- mongo.find(
  mongo,
  ns = "speeches.transcript",
  query = reg_1,
  fields = mongo.bson.from.JSON('{"primeMinister":1, "releaseDate":1, "title":1}')
)
# Step through the cursor, converting each document to an R list and
# stacking it as a new row of the result
res <- NULL
while (mongo.cursor.next(cursor)) {
  value <- mongo.cursor.value(cursor)
  Rvalue <- mongo.bson.to.list(value)
  # NOTE: rbind() takes the row label from the argument's symbol, so the
  # name `Rvalue` is part of the output
  res <- rbind(res, Rvalue)
}
# Release the cursor now that the results have been read
err <- mongo.cursor.destroy(cursor)
# Show the collected rows
res
#### Clean up ####
# Close the connection, then release the connection object
mongo.disconnect(mongo)
mongo.destroy(mongo)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
You can’t perform that action at this time.