Created
August 29, 2012 07:23
-
-
Save vladiim/3507953 to your computer and use it in GitHub Desktop.
Scraping UK MP data
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Scrape the BBC table of UK MP expenses and compare the total expenses
# claimed per MP across the five parties with the most MPs.

library(XML)
library(lattice)

# URL of interest:
mps <- "http://news.bbc.co.uk/2/hi/uk_politics/8044207.stm"

# Parse the page into an R representation of the HTML document.
mps.doc <- htmlParse(mps)

# Read every table in the document; each list element is expected to be a
# data frame, but elements may be NULL or empty for tables that could not
# be converted, so they are checked before being indexed below.
mps.tabs <- readHTMLTable(mps.doc)

# The expenses table is recognised by its top-left and bottom-right cells.
first <- "Abbott, Ms Diane"
last <- "157,841"

is.expenses.table <- vapply(
  mps.tabs,
  function(tab) {
    # Guard against NULL / empty tables, which cannot be indexed by cell.
    if (!is.data.frame(tab) || nrow(tab) == 0L || ncol(tab) == 0L) {
      return(FALSE)
    }
    # isTRUE() turns an NA comparison into FALSE instead of an error.
    isTRUE(as.character(tab[1, 1]) == first) &&
      isTRUE(as.character(tab[nrow(tab), ncol(tab)]) == last)
  },
  logical(1)
)

matches <- unname(which(is.expenses.table))
if (length(matches) == 0L) {
  stop(
    "Could not find the MP expenses table (first cell '", first,
    "', last cell '", last, "') at ", mps,
    call. = FALSE
  )
}
# If several tables match, keep the last one (as a sequential scan that
# overwrites its result on every hit would).
tabi <- matches[length(matches)]

# Extract the relevant table and format it: the first three columns are
# kept as they are, every remaining column holds amounts written with
# thousands separators (e.g. "157,841") and is converted to numeric.
mps <- mps.tabs[[tabi]]
money <- mps[, -(1:3), drop = FALSE]  # drop = FALSE: stay a data frame
money[] <- lapply(
  money,
  function(x) as.numeric(gsub(",", "", as.character(x), fixed = TRUE))
)
mps2 <- cbind(mps[, 1:3], money)

# Which are the five biggest parties by number of MPs?
# table() counts correctly whether Party is a factor or a character column;
# order() is stable, so ties keep their alphabetical/level order.
party.counts <- table(mps2$Party)
nbig5 <- names(party.counts)[order(-as.integer(party.counts))]
nbig5 <- head(nbig5, 5)  # head() avoids NA names when fewer than 5 parties

# Subset of MPs belonging to the five biggest parties only.
big5 <- mps2[mps2$Party %in% nbig5, , drop = FALSE]
# bwplot() needs a factor on the x axis to draw one box per party; factor()
# also drops the levels of the parties that were filtered out.
big5$Party <- factor(big5$Party)

# Box-and-whisker plot of total expenses by party. print() makes sure the
# plot is drawn even when the script is run through source().
print(bwplot(Total ~ Party, data = big5, ylab = "Total expenses per MP (in £)"))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment