Skip to content

Instantly share code, notes, and snippets.

@psobczyk
Created June 19, 2018 06:35
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save psobczyk/7dadd9543e2336a46acb7867d621a051 to your computer and use it in GitHub Desktop.
Save psobczyk/7dadd9543e2336a46acb7867d621a051 to your computer and use it in GitHub Desktop.
from data scraping to World Cup winner predictions
We can make this file beautiful and searchable if this error is corrected: No commas found in this CSV file in line 0.
match;group.x;position.x;group.y;position.y
49;A;1;B;2
50;C;1;D;2
51;B;1;A;2
52;D;1;C;2
53;E;1;F;2
54;G;1;H;2
55;F;1;E;2
56;H;1;G;2
57;;49;;50
58;;53;;54
59;;55;;56
60;;51;;52
61;;57;;58
62;;59;;60
63;;61;;62
#### predictions based on elo ranking
library(rvest)
library(dplyr)
library(tidyr)
library(pbapply)
library(ggplot2)
library(ggthemes)
# get fixtures ------------------------------------------------------------
#https://github.com/openfootball/world-cup/blob/master/2018--russia/cup.txt
#please remove any results from txt file
cup_2018 <- readLines('data/cup_2018.txt', encoding = 'UTF-8')
match_lines <- grepl('^[ ]?\\(.*\\).*', cup_2018)
match_results <- cup_2018[match_lines]
match_results2 <- lapply(match_results, function(tmp){
c(trimws(gsub('.*00 (.*)-(.*)@.*', '\\1', tmp)),
trimws(gsub('.*00 (.*)-(.*)@.*', '\\2', tmp)),
as.numeric(trimws(gsub('^[ ]?\\(([0-9]*)\\).*', '\\1', tmp))))
})
fixtures_cup_2018 <- do.call(rbind, match_results2[-c(1:5)]) %>%
data.frame(stringsAsFactors = F) %>%
rename(team1=X1, team2=X2, match_no = X3) %>%
mutate(match_no = as.numeric(match_no)) %>%
arrange(match_no)
# manually created based on wikipedia
knock_off_fixtures <- read.csv2('data/knock_off_fixtures.csv', stringsAsFactors = F)
# scraping groups
group_lines <- grepl('^Group.*', cup_2018)
groups <- cup_2018[group_lines][1:8]
groups <- lapply(groups, function(x){
group <- gsub(pattern = '(.*)\\|.*', '\\1', x)
countries <- trimws(strsplit(gsub('[[:space:]]', ' ', trimws(gsub(pattern = '(.*)\\|(.*)', '\\2', x))), ' ')[[1]])
countries <- countries[countries!='']
data.frame(group = group, country = countries, stringsAsFactors = F)
}) %>% do.call(what = rbind)
# getting elo_ranking
# ranking seems to be created dynamically, so we need to download website
# https://www.eloratings.net/
url <- '~/Downloads/World Football Elo Ratings.htm'
main_page <- read_html(url)
main_page %>%
html_nodes(xpath = '//div[@id="maindiv"]//div[@class="maintable slickgrid_835274 ui-widget"]') %>%
html_text()
countries <- main_page %>%
html_nodes(xpath = '//div[@class="slick-cell l1 r1 team-cell narrow-layout"]//a') %>%
html_text()
rating <- main_page %>%
html_nodes(xpath = '//div[@class="slick-cell l2 r2 rating-cell narrow-layout"]') %>%
html_text()
elo_ranking <- data.frame(country = countries, rating = rating, stringsAsFactors = F)
elo_ranking <- elo_ranking %>%
mutate(rating = as.numeric(rating),
rating = ifelse(country == 'Russia', rating+100, rating))
# simulations -------------------------------------------------------------
n_sim <- 1000
# we set that 20% of all matches end up in draw. In 2014 it was almost 19%
sim_results_full_world_cup <- pblapply(1:n_sim, function(i){
probs <- fixtures_cup_2018 %>%
inner_join(elo_ranking, by = c('team1'='country')) %>%
inner_join(elo_ranking, by = c('team2'='country')) %>%
mutate(diff_rating = rating.x - rating.y,
probability.x = 1/(10^(-diff_rating/400) + 1), #according to elo rating
random = runif(n()),
points.x = ifelse(random < probability.x - 0.1, 3, ifelse(random > probability.x + 0.1, 0, 1)),
points.y = ifelse(points.x == 1, 1, 3 - points.x))
group_results <- probs %>%
unite(team_points.x, team1, points.x, sep = '_') %>%
unite(team_points.y, team2, points.y, sep = '_') %>%
tidyr::gather(key, team_points, team_points.x, team_points.y) %>%
separate(team_points, c("team", "points"), sep = '_') %>%
select(team, points) %>%
inner_join(groups, by =c('team'='country')) %>%
mutate(points = as.numeric(points)) %>%
group_by(group, team) %>%
summarise(points = sum(points)) %>%
arrange(group, desc(points), runif(n())) %>% #if teams have the same number of points, we randomly select the winner
mutate(position = row_number(),
iter = i) %>%
filter(position <= 2) %>% #only two team advance to next stage
ungroup() %>%
mutate(group = trimws(gsub('Group (.*)', '\\1', group)))
one_of_16_finals <- knock_off_fixtures %>%
inner_join(group_results, by=c('group.x'='group', 'position.x'='position')) %>%
inner_join(group_results, by=c('group.y'='group', 'position.y'='position')) %>%
inner_join(elo_ranking, by = c('team.x'='country')) %>%
inner_join(elo_ranking, by = c('team.y'='country')) %>%
mutate(diff_rating = rating.x - rating.y,
probability.x = 1/(10^(-diff_rating/400) + 1),
random = runif(n()),
team = ifelse(random < probability.x, team.x, team.y)) %>%
select(iter=iter.x, match, team)
quater_finals <- knock_off_fixtures %>%
inner_join(one_of_16_finals, by=c('position.x'='match')) %>%
inner_join(one_of_16_finals, by=c('position.y'='match')) %>%
inner_join(elo_ranking, by = c('team.x'='country')) %>%
inner_join(elo_ranking, by = c('team.y'='country')) %>%
mutate(diff_rating = rating.x - rating.y,
probability.x = 1/(10^(-diff_rating/400) + 1),
random = runif(n()),
team = ifelse(random < probability.x, team.x, team.y)) %>%
select(iter=iter.x, match, team)
semi_finals <- knock_off_fixtures %>%
inner_join(quater_finals, by=c('position.x'='match')) %>%
inner_join(quater_finals, by=c('position.y'='match')) %>%
inner_join(elo_ranking, by = c('team.x'='country')) %>%
inner_join(elo_ranking, by = c('team.y'='country')) %>%
mutate(diff_rating = rating.x - rating.y,
probability.x = 1/(10^(-diff_rating/400) + 1),
random = runif(n()),
team = ifelse(random < probability.x, team.x, team.y)) %>%
select(iter=iter.x, match, team)
winner <- knock_off_fixtures %>%
inner_join(semi_finals, by=c('position.x'='match')) %>%
inner_join(semi_finals, by=c('position.y'='match')) %>%
inner_join(elo_ranking, by = c('team.x'='country')) %>%
inner_join(elo_ranking, by = c('team.y'='country')) %>%
mutate(diff_rating = rating.x - rating.y,
probability.x = 1/(10^(-diff_rating/400) + 1),
random = runif(n()),
team = ifelse(random < probability.x, team.x, team.y)) %>%
select(iter=iter.x, match, team)
list(group_results, one_of_16_finals, quater_finals, semi_finals, winner)
})
# simulations results wrap up ---------------------------------------------
chances_knock_off <- lapply(sim_results_full_world_cup, function(x) x[[1]]) %>%
do.call(what = rbind) %>%
group_by(team) %>%
summarise(chance = n()/n_sim, group = group[1]) %>%
arrange(desc(chance))
chances_quater_final <- lapply(sim_results_full_world_cup, function(x) x[[2]]) %>%
do.call(what = rbind) %>%
group_by(team) %>%
summarise(chance = n()/n_sim) %>%
arrange(desc(chance))
chances_semi_final <- lapply(sim_results_full_world_cup, function(x) x[[3]]) %>%
do.call(what = rbind) %>%
group_by(team) %>%
summarise(chance = n()/n_sim) %>%
arrange(desc(chance))
chances_final <- lapply(sim_results_full_world_cup, function(x) x[[4]]) %>%
do.call(what = rbind) %>%
group_by(team) %>%
summarise(chance = n()/n_sim) %>%
arrange(desc(chance))
chances_winning <- lapply(sim_results_full_world_cup, function(x) x[[5]]) %>%
do.call(what = rbind) %>%
group_by(team) %>%
summarise(chance = n()/n_sim) %>%
arrange(desc(chance))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment