Skip to content

Instantly share code, notes, and snippets.

@paparaka
Created March 28, 2019 16:07
Show Gist options
  • Save paparaka/39f74dd5135f7ec659067fd13cb54866 to your computer and use it in GitHub Desktop.
Save paparaka/39f74dd5135f7ec659067fd13cb54866 to your computer and use it in GitHub Desktop.
Brexit alternative votes parsed in R
require(rvest)
require(stringr)
require(data.table)
require(tidyverse)
library(dplyr)
# URL of the Guardian page to be scraped.
url <- 'https://www.theguardian.com/uk-news/ng-interactive/2019/mar/27/how-did-your-mp-vote-in-the-indicative-votes'
# Download and parse the page's HTML.
webpage <- read_html(url)
# Collect the MP row nodes that live inside the table(s) holding the data.
# The XPath starts with ".//" so the search is relative to each `.int-table`
# node; a leading "//" would restart from the document root and make the
# `.int-table` selection above it a no-op.
rows <- webpage %>%
  html_nodes('.int-table') %>%
  html_nodes(xpath = './/div[@class="int-row int-row--mp"]')
#' Extract the name, constituency, party and votes from one table row.
#'
#' @param i A single row node, i.e. one element of the node set matched by
#'   `div[@class="int-row int-row--mp"]`.
#' @return A long-format data.frame with one row per vote node found in `i`
#'   and the columns `name`, `const`, `party`, `votes_id` ("V1", "V2", ...)
#'   and `vote`.
scrape_row <- function(i) {
  # Text of the child nodes of `i` matching `xpath`, with newlines removed.
  cell_text <- function(xpath) {
    i %>%
      html_nodes(xpath = xpath) %>%
      html_text() %>%
      str_replace_all("\n", "")
  }

  r_name <- cell_text('div[@class="int-cell int-cell--name"]')
  r_const <- cell_text('div[@class="int-cell int-cell--const"]')
  # The party label is read from the first child <div> of the row.
  r_party <- cell_text('div[1]')

  # Each vote is encoded in the attributes of the nodes nested inside the vote
  # cells; stripping the common "gv-vote-blob gv-" prefix leaves the vote
  # value itself.
  r_votes <- i %>%
    html_nodes(xpath = 'div[@class="int-cell int-cell--vote"]/node()/*') %>%
    html_attrs() %>%
    unlist() %>%
    str_replace("gv-vote-blob gv-", "")

  # Label the votes by position. Deriving the labels from the number of votes
  # actually found (instead of assuming exactly 8) prevents data.frame() from
  # silently recycling values when a row has a different number of vote cells.
  vote_ids <- paste0("V", seq_along(r_votes))

  data.frame(
    name = r_name,
    const = r_const,
    party = r_party,
    votes_id = vote_ids,
    vote = r_votes
  )
}
## Scrape every row node into its own long-format data.frame.
out_list <- lapply(rows, scrape_row)
# Stack the per-row frames into a single data.table in one pass.
DT <- rbindlist(out_list)
# Reshape to wide format: the values of `votes_id` become column names and the
# matching `vote` values fill them, leaving one row per name/const/party.
DT %>% pivot_wider(names_from = votes_id, values_from = vote)
# Unfinished idea, kept for reference: split `const` into three columns.
# NOTE(review): the pattern below is not a valid R string as written; "\w",
# "\d" and "\%" would need doubled backslashes ("\\w", "\\d", "%") and the
# parentheses look unbalanced. Confirm the intended pattern before enabling.
# DT %>% separate(col = const,
# into = c("const", "refer_position", "percent"),
# sep = "\w /(\w \d\%\)" )
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment