# HughParsonage/sportsbet_2016_election_scraper.R

## sportsbet_2016_election_scraper.R
library(rvest)
library(dplyr)
library(tidyr)
library(xml2)
library(data.table)

# Sportsbet market-group ids (the `ev_oc_grp_id` query parameter), keyed by
# state/territory abbreviation. Despite the name these are ids, not XPath
# expressions; the name is kept because other code indexes `xpaths`.
xpaths <- as.list(c(
  ACT = 1971028,
  NT  = 1971052,
  NSW = 1956097,
  QLD = 1971071,
  SA  = 1971079,
  TAS = 1971095,
  VIC = 1971122,
  WA  = 1971123
))

# Full outright-market URLs for the ACT and NT pages.
url_act_electorates_outrights <- paste0(
  "http://www.sportsbet.com.au/betting/politics/australian-federal-politics/outrights?ev_oc_grp_id=",
  xpaths[["ACT"]]
)
url_nt <- paste0(
  "http://www.sportsbet.com.au/betting/politics/australian-federal-politics/outrights?ev_oc_grp_id=",
  xpaths[["NT"]]
)


#' Scrape Sportsbet's per-division outright odds for one state or territory.
#'
#' Reads the file-level `xpaths` list (market-group ids keyed by state
#' abbreviation) and downloads the matching page from sportsbet.com.au.
#'
#' @param i Integer position in `xpaths` of the state or territory to scrape.
#' @return A data.table with one row per listed outcome per division and the
#'   columns `rowNumber` (position within the division's list, as character),
#'   `Division`, `char` (raw scraped text), `Party` and `Odds` (numeric).
state_odds_table <- function(i){
  state_url <-
    paste0("http://www.sportsbet.com.au/betting/politics/australian-federal-politics/outrights?ev_oc_grp_id=", xpaths[[i]])

  # Download the page once and keep only the text of the accordion list; the
  # division names and the odds are both parsed out of this one string.
  page_text <-
    state_url %>%
    read_html() %>%
    html_nodes(xpath = '//*[@id="accordion"]/ul') %>%
    xml_text()

  if (length(page_text) == 0L) {
    stop("No '//*[@id=\"accordion\"]/ul' node found at ", state_url,
         call. = FALSE)
  }

  # Every division's block starts with a header of the form
  # "dd:dd <non-digits> Markets (d)" followed by newlines.
  division_header <- "[0-9][0-9]:[0-9][0-9][^0-9]+Markets\\s+\\([0-9]\\)\n+"

  # first entry not a division
  tbl_odds_char <- strsplit(page_text, split = division_header)[[1]][-1]

  divisions <-
    page_text %>%
    stringr::str_extract_all(pattern = division_header) %>%
    unlist() %>%
    gsub("\n+", "", .) %>%
    gsub("[^A-Za-z]", "", .) %>%
    # Strip the trailing state label and "Markets". La Trobe is 'Vic'.
    gsub(paste0("^([A-Za-z]+)((Vic)|(", names(xpaths)[i], "))Markets$"), "\\1", .)

  # Normalise any spelling matching GEL+IBRAND to a single name.
  divisions <- ifelse(grepl("GEL+IBRAND", divisions, ignore.case = TRUE),
                      "GELLIBRAND",
                      divisions)

  # Within a division, entries are separated by runs of 3+ newlines.
  odds_nsw_list <- stringr::str_split(tbl_odds_char, pattern = "\n{3,}")
  names(odds_nsw_list) <- divisions

  # Keep only entries that end in a decimal price.
  odds_nsw_list <- lapply(odds_nsw_list, function(x){
    x[grepl("\\.[0-9][0-9][0-9]*$", x)] # need to include 1.001
  })

  maxRow <- max(vapply(odds_nsw_list, length, integer(1)))

  # Pad every division to the same length so they can sit side by side as
  # columns; the NA padding is dropped again when reshaping to long form.
  padded <- lapply(odds_nsw_list, function(x){
    length(x) <- maxRow
    x
  })

  odds_nsw_df <-
    do.call(cbind.data.frame, c(padded, list(stringsAsFactors = FALSE))) %>%
    mutate(rowNumber = as.character(rownames(.))) %>%
    gather(Division, char, -rowNumber, na.rm = TRUE) %>%
    filter(complete.cases(.)) %>%
    mutate(Party = stringr::str_extract(char, pattern = "((Labor)|(Xenophon)|(Liberal)|(Coalition)|(National)|(Green)|(Other)|(Independent))"),
           # The price is whatever follows the last run of newlines.
           Odds = as.numeric(gsub("^.*\n+(.*)$", "\\1", char))) %>%
    # Liberal and National are reported together as the Coalition.
    mutate(Party = ifelse(Party %in% c("National", "Liberal"), "Coalition", Party))

  as.data.table(odds_nsw_df)
}

# Scrape every state/territory listed in `xpaths` and stack the results.
national_odds <-
  lapply(seq_along(xpaths), state_odds_table) %>%
  rbindlist(use.names = TRUE, fill = TRUE)

# Convert odds to implied probabilities, normalised within each division.
# Plain assignment is used because the compound pipe `%<>%` belongs to
# magrittr, which this script does not attach.
national_odds <-
  national_odds %>%
  group_by(Division) %>%
  arrange(Odds) %>%
  mutate(Probability = (1 / Odds) / sum(1 / Odds),
         # Rows are sorted by Odds, so the first party has the shortest price.
         Favourite = first(Party)) %>%
  mutate(Time = Sys.time()) %>%
  group_by(Division, Party) %>%
  # Total probability per party (several rows can share a Party label).
  mutate(Probability2 = sum(Probability, na.rm = TRUE)) %>%
  # this will exclude some where the odds are even
  ungroup()

# One row per division: the entry that was listed first on the page.
# NOTE(review): presumably the first-listed entry is the favourite - confirm.
national_favourites <-
  national_odds %>%
  select(-char) %>%
  group_by(Division) %>%
  filter(rowNumber == "1") %>%
  ungroup() %>%
  mutate(ELECT_DIV2 = toupper(Division))

# A single, explicitly formatted timestamp: both files share the same prefix,
# and the name does not depend on how as.character() renders a POSIXct.
file_stamp <- format(Sys.time(), "%Y-%m-%d %H%M%S")

if (!dir.exists("data")) {
  dir.create("data")
}

# The output path is passed positionally because readr renamed `path` to
# `file`; the second positional argument works under either name.
readr::write_csv(national_odds,
                 file.path("data", paste0(file_stamp, "national_favourites.csv")))
readr::write_csv(national_favourites,
                 file.path("data", paste0(file_stamp, "the_national_favourites.csv")))
	library(rvest)
	library(dplyr)
	library(tidyr)
	library(xml2)
	library(data.table)

	# Sportsbet market-group ids (the `ev_oc_grp_id` query parameter), keyed by
	# state/territory abbreviation. These are ids rather than XPath expressions;
	# the name `xpaths` is kept because other code indexes it.
	xpaths <- as.list(c(
	  ACT = 1971028,
	  NT  = 1971052,
	  NSW = 1956097,
	  QLD = 1971071,
	  SA  = 1971079,
	  TAS = 1971095,
	  VIC = 1971122,
	  WA  = 1971123
	))

	# Full outright-market URLs for the ACT and NT pages.
	url_act_electorates_outrights <- paste0(
	  "http://www.sportsbet.com.au/betting/politics/australian-federal-politics/outrights?ev_oc_grp_id=",
	  xpaths[["ACT"]]
	)
	url_nt <- paste0(
	  "http://www.sportsbet.com.au/betting/politics/australian-federal-politics/outrights?ev_oc_grp_id=",
	  xpaths[["NT"]]
	)


	#' Scrape Sportsbet's per-division outright odds for one state or territory.
	#'
	#' Reads the file-level `xpaths` list (market-group ids keyed by state
	#' abbreviation) and downloads the matching page from sportsbet.com.au.
	#'
	#' @param i Integer position in `xpaths` of the state or territory to scrape.
	#' @return A data.table with one row per listed outcome per division and the
	#'   columns `rowNumber` (position within the division's list, as character),
	#'   `Division`, `char` (raw scraped text), `Party` and `Odds` (numeric).
	state_odds_table <- function(i){
	  state_url <-
	    paste0("http://www.sportsbet.com.au/betting/politics/australian-federal-politics/outrights?ev_oc_grp_id=", xpaths[[i]])

	  # Download the page once and keep only the text of the accordion list; the
	  # division names and the odds are both parsed out of this one string.
	  page_text <-
	    state_url %>%
	    read_html() %>%
	    html_nodes(xpath = '//*[@id="accordion"]/ul') %>%
	    xml_text()

	  if (length(page_text) == 0L) {
	    stop("No '//*[@id=\"accordion\"]/ul' node found at ", state_url,
	         call. = FALSE)
	  }

	  # Every division's block starts with a header of the form
	  # "dd:dd <non-digits> Markets (d)" followed by newlines.
	  division_header <- "[0-9][0-9]:[0-9][0-9][^0-9]+Markets\\s+\\([0-9]\\)\n+"

	  # first entry not a division
	  tbl_odds_char <- strsplit(page_text, split = division_header)[[1]][-1]

	  divisions <-
	    page_text %>%
	    stringr::str_extract_all(pattern = division_header) %>%
	    unlist() %>%
	    gsub("\n+", "", .) %>%
	    gsub("[^A-Za-z]", "", .) %>%
	    # Strip the trailing state label and "Markets". La Trobe is 'Vic'.
	    # The alternation is a bare `|`: "\|" is not a valid escape in an R string.
	    gsub(paste0("^([A-Za-z]+)((Vic)|(", names(xpaths)[i], "))Markets$"), "\\1", .)

	  # Normalise any spelling matching GEL+IBRAND to a single name.
	  divisions <- ifelse(grepl("GEL+IBRAND", divisions, ignore.case = TRUE),
	                      "GELLIBRAND",
	                      divisions)

	  # Within a division, entries are separated by runs of 3+ newlines.
	  odds_nsw_list <- stringr::str_split(tbl_odds_char, pattern = "\n{3,}")
	  names(odds_nsw_list) <- divisions

	  # Keep only entries that end in a decimal price.
	  odds_nsw_list <- lapply(odds_nsw_list, function(x){
	    x[grepl("\\.[0-9][0-9][0-9]*$", x)] # need to include 1.001
	  })

	  maxRow <- max(vapply(odds_nsw_list, length, integer(1)))

	  # Pad every division to the same length so they can sit side by side as
	  # columns; the NA padding is dropped again when reshaping to long form.
	  padded <- lapply(odds_nsw_list, function(x){
	    length(x) <- maxRow
	    x
	  })

	  odds_nsw_df <-
	    do.call(cbind.data.frame, c(padded, list(stringsAsFactors = FALSE))) %>%
	    mutate(rowNumber = as.character(rownames(.))) %>%
	    gather(Division, char, -rowNumber, na.rm = TRUE) %>%
	    filter(complete.cases(.)) %>%
	    mutate(Party = stringr::str_extract(char, pattern = "((Labor)|(Xenophon)|(Liberal)|(Coalition)|(National)|(Green)|(Other)|(Independent))"),
	           # The price is whatever follows the last run of newlines; the `*`
	           # quantifiers are required so multi-character prices are captured.
	           Odds = as.numeric(gsub("^.*\n+(.*)$", "\\1", char))) %>%
	    # Liberal and National are reported together as the Coalition.
	    mutate(Party = ifelse(Party %in% c("National", "Liberal"), "Coalition", Party))

	  as.data.table(odds_nsw_df)
	}

	# Scrape every state/territory listed in `xpaths` and stack the results.
	national_odds <-
	  lapply(seq_along(xpaths), state_odds_table) %>%
	  rbindlist(use.names = TRUE, fill = TRUE)

	# Convert odds to implied probabilities, normalised within each division.
	# Plain assignment is used because the compound pipe `%<>%` belongs to
	# magrittr, which this script does not attach.
	national_odds <-
	  national_odds %>%
	  group_by(Division) %>%
	  arrange(Odds) %>%
	  mutate(Probability = (1 / Odds) / sum(1 / Odds),
	         # Rows are sorted by Odds, so the first party has the shortest price.
	         Favourite = first(Party)) %>%
	  mutate(Time = Sys.time()) %>%
	  group_by(Division, Party) %>%
	  # Total probability per party (several rows can share a Party label).
	  mutate(Probability2 = sum(Probability, na.rm = TRUE)) %>%
	  # this will exclude some where the odds are even
	  ungroup()

	# One row per division: the entry that was listed first on the page.
	# NOTE(review): presumably the first-listed entry is the favourite - confirm.
	national_favourites <-
	  national_odds %>%
	  select(-char) %>%
	  group_by(Division) %>%
	  filter(rowNumber == "1") %>%
	  ungroup() %>%
	  mutate(ELECT_DIV2 = toupper(Division))

	# A single, explicitly formatted timestamp: both files share the same prefix,
	# and the name does not depend on how as.character() renders a POSIXct.
	file_stamp <- format(Sys.time(), "%Y-%m-%d %H%M%S")

	if (!dir.exists("data")) {
	  dir.create("data")
	}

	# The output path is passed positionally because readr renamed `path` to
	# `file`; the second positional argument works under either name.
	readr::write_csv(national_odds,
	                 file.path("data", paste0(file_stamp, "national_favourites.csv")))
	readr::write_csv(national_favourites,
	                 file.path("data", paste0(file_stamp, "the_national_favourites.csv")))