# pdparker/pmScraper.R

## pmScraper.R
#################################### Set up database ###########################
# - Make sure the database is set up to be readable, writable and executable outside of sudo
# - Make sure to start the mongo daemon before setting up the database: usr$ mongod
################################################################################

##Produces mongodb documents with the following fields:
# _id: Transcript id - used to index the files
# title: Title of the speech or interview
# primeMinister: Who gave the speech in format 'Last name, First name'
# service: Period-of-service dates, each reformatted as a 'YYYY-MM-DD' string
# releaseDate: Release date stored as a 'YYYY-MM-DD' string
# releaseType: Indication of whether context is speech, interview, etc.
# transcript: Transcript of the actual speech

#load required mongo package
library("rmongodb")
# connect to your local mongodb (mongo.create() defaults)
mongo <- mongo.create()
# mongo.create() hands back a handle even when the server cannot be reached,
# so stop here instead of letting every later call fail against a dead connection
if (!mongo.is.connected(mongo)) {
	stop("Could not connect to MongoDB - is the mongod daemon running?", call. = FALSE)
}
#Check what databases are available (should be empty to start)
mongo.get.databases(mongo)
#If you need to drop the collection and start again uncomment the line below
#mongo.drop(mongo,ns = "speeches.transcript")
#Count documents in the target collection (again should be 0 to start)
mongo.count(mongo, ns = "speeches.transcript")

#### Web scraping ####
#load web scraping package
library(rvest)
#load package for string manipulation
library(stringr)
# Prime minister's speeches are logged by transcript number in temporal order
linkLocation <- read_html("http://pmtranscripts.pmc.gov.au/transcripts.xml")
linkLocation <- linkLocation %>% html_nodes("uri") %>% html_text()
#top line is just the link to the meta-data itself
linkLocation <- linkLocation[-1]
#link 21 is inconsistently named with the others, so its transcript id is set by hand.
#Only patch it when the feed really has that many entries: assigning past the end
#of the vector would silently pad it with NA links.
if (length(linkLocation) >= 21L) {
	linkLocation[21] <- "31836"
}
#Keep only the trailing transcript number of each link and build the query URL from it
linkLocationAPI <- paste0("http://pmtranscripts.dpmc.gov.au/query?transcript=",
					   gsub("^(.+transcript-)([0-9]+$)", "\\2", linkLocation))
#Only the first 10 links are processed, capped at the number of links actually found
#(length is taken from linkLocation because paste0() returns one element for empty input)
n <- seq_len(min(10L, length(linkLocation)))
#Loop through URLs and insert into DB
#Any failed links - of which there are a few - are written to a log.txt file in working dir
for (i in n) {
	link <- linkLocationAPI[i]
	cat("Now working on link: ", link, "\n")
	#Fetch the page. Both handlers log the problem and return NULL, so a failed
	#fetch is never confused with a parsed document. The scraping below must not
	#live in `finally`: that clause also runs after a failure and would process
	#whatever document was left over from the previous link.
	metaSpeeches <- tryCatch(
		read_html(link),
		error = function(cond) {
			message(paste("URL does not seem to exist:", link))
			message("Here's the original error message:")
			message(conditionMessage(cond))
			#log the error
			cat("ERROR-Problem with url: ", link, "\n", file = "log.txt", append = TRUE)
			NULL
		},
		warning = function(cond) {
			message(paste("URL caused a warning:", link))
			message("Here's the original warning message:")
			message(conditionMessage(cond))
			#log the warning
			cat("WARNING-Problem with url: ", link, "\n", file = "log.txt", append = TRUE)
			NULL
		}
	)
	#BE A GOOD CITIZEN: Don't overload server - pause after every request, failed or not
	Sys.sleep(1)
	if (is.null(metaSpeeches)) {
		next
	}

	#Scrape required elements by tag name
	id <- metaSpeeches %>% html_nodes("transcript-id") %>% html_text()
	#Without exactly one numeric id there is nothing usable to key the document on
	if (length(id) != 1L || !grepl("^[0-9]+$", trimws(id))) {
		message(paste("No usable transcript id found at:", link))
		cat("ERROR-No transcript id at url: ", link, "\n", file = "log.txt", append = TRUE)
		next
	}
	title <- metaSpeeches %>% html_nodes("title") %>% html_text()
	prime <- metaSpeeches %>% html_nodes("prime-minister") %>% html_text()
	#Period of service: split on " - " and rewrite each yyyymmdd as yyyy-mm-dd
	service <- metaSpeeches %>%
		html_nodes("period-of-service") %>%
		html_text() %>%
		strsplit(" - ") %>%
		unlist() %>%
		gsub("([0-9]{4})([0-9]{2})([0-9]{2})", "\\1-\\2-\\3", .)
	#Release date: reorder dd<sep>mm<sep>yyyy (any single separator) into yyyy-mm-dd
	releaseDate <- metaSpeeches %>%
		html_nodes("release-date") %>%
		html_text() %>%
		gsub("([0-9]{2}).([0-9]{2}).([0-9]{4})", "\\3-\\2-\\1", .)
	releaseType <- metaSpeeches %>% html_nodes("release-type") %>% html_text()
	#Transcript text with newlines, tabs and runs of whitespace replaced by spaces
	transcript <- metaSpeeches %>%
		html_node("content") %>%
		html_text() %>%
		str_replace_all("(\n|\t|\\s+)", " ")

	#list to change to BSON format
	template <- list('_id' = as.integer(id), 'title' = title, 'primeMinister' = prime,
					 'service' = service, 'releaseDate' = releaseDate,
					 'releaseType' = releaseType, 'transcript' = transcript)
	#list to BSON
	outPut <- mongo.bson.from.list(template)
	#Insert into DB; mongo.insert() reports failure through its return value
	if (isTRUE(mongo.insert(mongo, "speeches.transcript", outPut))) {
		#Log completion to screen
		cat(id, ": complete", "\n\n")
	} else {
		message(paste("Insert failed for transcript:", id))
		cat("ERROR-Insert failed for transcript: ", id, "\n", file = "log.txt", append = TRUE)
	}
}

#### Example queries against the database ####
## Fetch the first document matching a query
# Build the filter up front, then ask for one transcript released as an interview
interviewFilter <- mongo.bson.from.JSON('{"releaseType":"Interview"}')
mongo.findOne(mongo, ns = "speeches.transcript", query = interviewFilter)

## Fetch everything in a single batch
# Quick to write, but offers less control than walking a cursor by hand
matchAll <- mongo.bson.empty()
idOnly <- mongo.bson.from.JSON('{"_id":1}')
idCheck <- mongo.find.batch(mongo, ns = "speeches.transcript",
	query = matchAll, fields = idOnly)

# Flatten the returned ids and show them in ascending order
allIds <- unlist(idCheck)
sort(allIds)

## Longer find all plus regex example
#Set up query with filter: releaseDate values (stored as text) that start with "19"
reg_1 <- list("releaseDate" = list("$regex" = "^19.+"))
#Set up cursor to move through documents
cursor <- mongo.find(mongo, ns = "speeches.transcript",
	query = reg_1,
	fields = mongo.bson.from.JSON('{"primeMinister":1, "releaseDate":1, "title":1}')
)
#Walk the cursor and keep each document as an R list.
#Documents are collected first and bound once afterwards, instead of calling
#rbind() on the growing result inside the loop.
rows <- list()
while (mongo.cursor.next(cursor)) {
	rows[[length(rows) + 1L]] <- mongo.bson.to.list(mongo.cursor.value(cursor))
}
#kill the query
err <- mongo.cursor.destroy(cursor)
#One row per document, one column per returned field (a list-matrix, not a
#data.frame); res is NULL when no document matched
res <- do.call(rbind, rows)
#Return the result
res

#### Clean Up ####
# Close the connection held in `mongo` (created earlier with mongo.create())
mongo.disconnect(mongo)
# Destroy the connection object now that the script is finished with it
mongo.destroy(mongo)
	#################################### Set up database ###########################
	# - Make sure the database is set up to be readable, writable and executable outside of sudo
	# - Make sure to start the mongo daemon before setting up the database: usr$ mongod
	################################################################################

	##Produces mongodb documents with the following fields:
	# _id: Transcript id - used to index the files
	# title: Title of the speech or interview
	# primeMinister: Who gave the speech in format 'Last name, First name'
	# service: Period-of-service dates, each reformatted as a 'YYYY-MM-DD' string
	# releaseDate: Release date stored as a 'YYYY-MM-DD' string
	# releaseType: Indication of whether context is speech, interview, etc.
	# transcript: Transcript of the actual speech

	#load required mongo package
	library("rmongodb")
	# connect to your local mongodb (mongo.create() defaults)
	mongo <- mongo.create()
	# mongo.create() hands back a handle even when the server cannot be reached,
	# so stop here instead of letting every later call fail against a dead connection
	if (!mongo.is.connected(mongo)) {
		stop("Could not connect to MongoDB - is the mongod daemon running?", call. = FALSE)
	}
	#Check what databases are available (should be empty to start)
	mongo.get.databases(mongo)
	#If you need to drop the collection and start again uncomment the line below
	#mongo.drop(mongo,ns = "speeches.transcript")
	#Count documents in the target collection (again should be 0 to start)
	mongo.count(mongo, ns = "speeches.transcript")

	#### Web scraping ####
	#load web scraping package
	library(rvest)
	#load package for string manipulation
	library(stringr)
	# Prime minister's speeches are logged by transcript number in temporal order
	linkLocation <- read_html("http://pmtranscripts.pmc.gov.au/transcripts.xml")
	linkLocation <- linkLocation %>% html_nodes("uri") %>% html_text()
	#top line is just the link to the meta-data itself
	linkLocation <- linkLocation[-1]
	#link 21 is inconsistently named with the others, so its transcript id is set by hand.
	#Only patch it when the feed really has that many entries: assigning past the end
	#of the vector would silently pad it with NA links.
	if (length(linkLocation) >= 21L) {
		linkLocation[21] <- "31836"
	}
	#Keep only the trailing transcript number of each link and build the query URL from it
	linkLocationAPI <- paste0("http://pmtranscripts.dpmc.gov.au/query?transcript=",
		gsub("^(.+transcript-)([0-9]+$)", "\\2", linkLocation))
	#Only the first 10 links are processed, capped at the number of links actually found
	#(length is taken from linkLocation because paste0() returns one element for empty input)
	n <- seq_len(min(10L, length(linkLocation)))
	#Loop through URLs and insert into DB
	#Any failed links - of which there are a few - are written to a log.txt file in working dir
	for (i in n) {
		link <- linkLocationAPI[i]
		cat("Now working on link: ", link, "\n")
		#Fetch the page. Both handlers log the problem and return NULL, so a failed
		#fetch is never confused with a parsed document. The scraping below must not
		#live in `finally`: that clause also runs after a failure and would process
		#whatever document was left over from the previous link.
		metaSpeeches <- tryCatch(
			read_html(link),
			error = function(cond) {
				message(paste("URL does not seem to exist:", link))
				message("Here's the original error message:")
				message(conditionMessage(cond))
				#log the error
				cat("ERROR-Problem with url: ", link, "\n", file = "log.txt", append = TRUE)
				NULL
			},
			warning = function(cond) {
				message(paste("URL caused a warning:", link))
				message("Here's the original warning message:")
				message(conditionMessage(cond))
				#log the warning
				cat("WARNING-Problem with url: ", link, "\n", file = "log.txt", append = TRUE)
				NULL
			}
		)
		#BE A GOOD CITIZEN: Don't overload server - pause after every request, failed or not
		Sys.sleep(1)
		if (is.null(metaSpeeches)) {
			next
		}

		#Scrape required elements by tag name
		id <- metaSpeeches %>% html_nodes("transcript-id") %>% html_text()
		#Without exactly one numeric id there is nothing usable to key the document on
		if (length(id) != 1L || !grepl("^[0-9]+$", trimws(id))) {
			message(paste("No usable transcript id found at:", link))
			cat("ERROR-No transcript id at url: ", link, "\n", file = "log.txt", append = TRUE)
			next
		}
		title <- metaSpeeches %>% html_nodes("title") %>% html_text()
		prime <- metaSpeeches %>% html_nodes("prime-minister") %>% html_text()
		#Period of service: split on " - " and rewrite each yyyymmdd as yyyy-mm-dd
		service <- metaSpeeches %>%
			html_nodes("period-of-service") %>%
			html_text() %>%
			strsplit(" - ") %>%
			unlist() %>%
			gsub("([0-9]{4})([0-9]{2})([0-9]{2})", "\\1-\\2-\\3", .)
		#Release date: reorder dd<sep>mm<sep>yyyy (any single separator) into yyyy-mm-dd
		releaseDate <- metaSpeeches %>%
			html_nodes("release-date") %>%
			html_text() %>%
			gsub("([0-9]{2}).([0-9]{2}).([0-9]{4})", "\\3-\\2-\\1", .)
		releaseType <- metaSpeeches %>% html_nodes("release-type") %>% html_text()
		#Transcript text with newlines, tabs and runs of whitespace replaced by spaces.
		#The alternation uses a plain "|": writing "\|" inside an R string is an
		#unrecognised escape and stops the file from parsing.
		transcript <- metaSpeeches %>%
			html_node("content") %>%
			html_text() %>%
			str_replace_all("(\n|\t|\\s+)", " ")

		#list to change to BSON format
		template <- list('_id' = as.integer(id), 'title' = title, 'primeMinister' = prime,
			'service' = service, 'releaseDate' = releaseDate,
			'releaseType' = releaseType, 'transcript' = transcript)
		#list to BSON
		outPut <- mongo.bson.from.list(template)
		#Insert into DB; mongo.insert() reports failure through its return value
		if (isTRUE(mongo.insert(mongo, "speeches.transcript", outPut))) {
			#Log completion to screen
			cat(id, ": complete", "\n\n")
		} else {
			message(paste("Insert failed for transcript:", id))
			cat("ERROR-Insert failed for transcript: ", id, "\n", file = "log.txt", append = TRUE)
		}
	}

	#### Example queries against the database ####
	## Fetch the first document matching a query
	# Build the filter up front, then ask for one transcript released as an interview
	interviewFilter <- mongo.bson.from.JSON('{"releaseType":"Interview"}')
	mongo.findOne(mongo, ns = "speeches.transcript", query = interviewFilter)

	## Fetch everything in a single batch
	# Quick to write, but offers less control than walking a cursor by hand
	matchAll <- mongo.bson.empty()
	idOnly <- mongo.bson.from.JSON('{"_id":1}')
	idCheck <- mongo.find.batch(mongo, ns = "speeches.transcript",
		query = matchAll, fields = idOnly)

	# Flatten the returned ids and show them in ascending order
	allIds <- unlist(idCheck)
	sort(allIds)

	## Longer find all plus regex example
	#Set up query with filter: releaseDate values (stored as text) that start with "19"
	reg_1 <- list("releaseDate" = list("$regex" = "^19.+"))
	#Set up cursor to move through documents
	cursor <- mongo.find(mongo, ns = "speeches.transcript",
		query = reg_1,
		fields = mongo.bson.from.JSON('{"primeMinister":1, "releaseDate":1, "title":1}')
	)
	#Walk the cursor and keep each document as an R list.
	#Documents are collected first and bound once afterwards, instead of calling
	#rbind() on the growing result inside the loop.
	rows <- list()
	while (mongo.cursor.next(cursor)) {
		rows[[length(rows) + 1L]] <- mongo.bson.to.list(mongo.cursor.value(cursor))
	}
	#kill the query
	err <- mongo.cursor.destroy(cursor)
	#One row per document, one column per returned field (a list-matrix, not a
	#data.frame); res is NULL when no document matched
	res <- do.call(rbind, rows)
	#Return the result
	res

	#### Clean Up ####
	# Close the connection held in `mongo` (created earlier with mongo.create())
	mongo.disconnect(mongo)
	# Destroy the connection object now that the script is finished with it
	mongo.destroy(mongo)