Created
June 19, 2018 06:35
-
-
Save psobczyk/7dadd9543e2336a46acb7867d621a051 to your computer and use it in GitHub Desktop.
from data scraping to World Cup winner predictions
We can make this file beautiful and searchable if this error is corrected: No commas found in this CSV file in line 0.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
match;group.x;position.x;group.y;position.y | |
49;A;1;B;2 | |
50;C;1;D;2 | |
51;B;1;A;2 | |
52;D;1;C;2 | |
53;E;1;F;2 | |
54;G;1;H;2 | |
55;F;1;E;2 | |
56;H;1;G;2 | |
57;;49;;50 | |
58;;53;;54 | |
59;;55;;56 | |
60;;51;;52 | |
61;;57;;58 | |
62;;59;;60 | |
63;;61;;62 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#### predictions based on elo ranking | |
library(rvest) | |
library(dplyr) | |
library(tidyr) | |
library(pbapply) | |
library(ggplot2) | |
library(ggthemes) | |
# get fixtures ------------------------------------------------------------ | |
#https://github.com/openfootball/world-cup/blob/master/2018--russia/cup.txt | |
#please remove any results from txt file | |
cup_2018 <- readLines('data/cup_2018.txt', encoding = 'UTF-8') | |
match_lines <- grepl('^[ ]?\\(.*\\).*', cup_2018) | |
match_results <- cup_2018[match_lines] | |
match_results2 <- lapply(match_results, function(tmp){ | |
c(trimws(gsub('.*00 (.*)-(.*)@.*', '\\1', tmp)), | |
trimws(gsub('.*00 (.*)-(.*)@.*', '\\2', tmp)), | |
as.numeric(trimws(gsub('^[ ]?\\(([0-9]*)\\).*', '\\1', tmp)))) | |
}) | |
fixtures_cup_2018 <- do.call(rbind, match_results2[-c(1:5)]) %>% | |
data.frame(stringsAsFactors = F) %>% | |
rename(team1=X1, team2=X2, match_no = X3) %>% | |
mutate(match_no = as.numeric(match_no)) %>% | |
arrange(match_no) | |
# manually created based on wikipedia | |
knock_off_fixtures <- read.csv2('data/knock_off_fixtures.csv', stringsAsFactors = F) | |
# scraping groups | |
group_lines <- grepl('^Group.*', cup_2018) | |
groups <- cup_2018[group_lines][1:8] | |
groups <- lapply(groups, function(x){ | |
group <- gsub(pattern = '(.*)\\|.*', '\\1', x) | |
countries <- trimws(strsplit(gsub('[[:space:]]', ' ', trimws(gsub(pattern = '(.*)\\|(.*)', '\\2', x))), ' ')[[1]]) | |
countries <- countries[countries!=''] | |
data.frame(group = group, country = countries, stringsAsFactors = F) | |
}) %>% do.call(what = rbind) | |
# getting elo_ranking | |
# ranking seems to be created dynamically, so we need to download website | |
# https://www.eloratings.net/ | |
url <- '~/Downloads/World Football Elo Ratings.htm' | |
main_page <- read_html(url) | |
main_page %>% | |
html_nodes(xpath = '//div[@id="maindiv"]//div[@class="maintable slickgrid_835274 ui-widget"]') %>% | |
html_text() | |
countries <- main_page %>% | |
html_nodes(xpath = '//div[@class="slick-cell l1 r1 team-cell narrow-layout"]//a') %>% | |
html_text() | |
rating <- main_page %>% | |
html_nodes(xpath = '//div[@class="slick-cell l2 r2 rating-cell narrow-layout"]') %>% | |
html_text() | |
elo_ranking <- data.frame(country = countries, rating = rating, stringsAsFactors = F) | |
elo_ranking <- elo_ranking %>% | |
mutate(rating = as.numeric(rating), | |
rating = ifelse(country == 'Russia', rating+100, rating)) | |
# simulations ------------------------------------------------------------- | |
n_sim <- 1000 | |
# we set that 20% of all matches end up in draw. In 2014 it was almost 19% | |
sim_results_full_world_cup <- pblapply(1:n_sim, function(i){ | |
probs <- fixtures_cup_2018 %>% | |
inner_join(elo_ranking, by = c('team1'='country')) %>% | |
inner_join(elo_ranking, by = c('team2'='country')) %>% | |
mutate(diff_rating = rating.x - rating.y, | |
probability.x = 1/(10^(-diff_rating/400) + 1), #according to elo rating | |
random = runif(n()), | |
points.x = ifelse(random < probability.x - 0.1, 3, ifelse(random > probability.x + 0.1, 0, 1)), | |
points.y = ifelse(points.x == 1, 1, 3 - points.x)) | |
group_results <- probs %>% | |
unite(team_points.x, team1, points.x, sep = '_') %>% | |
unite(team_points.y, team2, points.y, sep = '_') %>% | |
tidyr::gather(key, team_points, team_points.x, team_points.y) %>% | |
separate(team_points, c("team", "points"), sep = '_') %>% | |
select(team, points) %>% | |
inner_join(groups, by =c('team'='country')) %>% | |
mutate(points = as.numeric(points)) %>% | |
group_by(group, team) %>% | |
summarise(points = sum(points)) %>% | |
arrange(group, desc(points), runif(n())) %>% #if teams have the same number of points, we randomly select the winner | |
mutate(position = row_number(), | |
iter = i) %>% | |
filter(position <= 2) %>% #only two team advance to next stage | |
ungroup() %>% | |
mutate(group = trimws(gsub('Group (.*)', '\\1', group))) | |
one_of_16_finals <- knock_off_fixtures %>% | |
inner_join(group_results, by=c('group.x'='group', 'position.x'='position')) %>% | |
inner_join(group_results, by=c('group.y'='group', 'position.y'='position')) %>% | |
inner_join(elo_ranking, by = c('team.x'='country')) %>% | |
inner_join(elo_ranking, by = c('team.y'='country')) %>% | |
mutate(diff_rating = rating.x - rating.y, | |
probability.x = 1/(10^(-diff_rating/400) + 1), | |
random = runif(n()), | |
team = ifelse(random < probability.x, team.x, team.y)) %>% | |
select(iter=iter.x, match, team) | |
quater_finals <- knock_off_fixtures %>% | |
inner_join(one_of_16_finals, by=c('position.x'='match')) %>% | |
inner_join(one_of_16_finals, by=c('position.y'='match')) %>% | |
inner_join(elo_ranking, by = c('team.x'='country')) %>% | |
inner_join(elo_ranking, by = c('team.y'='country')) %>% | |
mutate(diff_rating = rating.x - rating.y, | |
probability.x = 1/(10^(-diff_rating/400) + 1), | |
random = runif(n()), | |
team = ifelse(random < probability.x, team.x, team.y)) %>% | |
select(iter=iter.x, match, team) | |
semi_finals <- knock_off_fixtures %>% | |
inner_join(quater_finals, by=c('position.x'='match')) %>% | |
inner_join(quater_finals, by=c('position.y'='match')) %>% | |
inner_join(elo_ranking, by = c('team.x'='country')) %>% | |
inner_join(elo_ranking, by = c('team.y'='country')) %>% | |
mutate(diff_rating = rating.x - rating.y, | |
probability.x = 1/(10^(-diff_rating/400) + 1), | |
random = runif(n()), | |
team = ifelse(random < probability.x, team.x, team.y)) %>% | |
select(iter=iter.x, match, team) | |
winner <- knock_off_fixtures %>% | |
inner_join(semi_finals, by=c('position.x'='match')) %>% | |
inner_join(semi_finals, by=c('position.y'='match')) %>% | |
inner_join(elo_ranking, by = c('team.x'='country')) %>% | |
inner_join(elo_ranking, by = c('team.y'='country')) %>% | |
mutate(diff_rating = rating.x - rating.y, | |
probability.x = 1/(10^(-diff_rating/400) + 1), | |
random = runif(n()), | |
team = ifelse(random < probability.x, team.x, team.y)) %>% | |
select(iter=iter.x, match, team) | |
list(group_results, one_of_16_finals, quater_finals, semi_finals, winner) | |
}) | |
# simulations results wrap up --------------------------------------------- | |
chances_knock_off <- lapply(sim_results_full_world_cup, function(x) x[[1]]) %>% | |
do.call(what = rbind) %>% | |
group_by(team) %>% | |
summarise(chance = n()/n_sim, group = group[1]) %>% | |
arrange(desc(chance)) | |
chances_quater_final <- lapply(sim_results_full_world_cup, function(x) x[[2]]) %>% | |
do.call(what = rbind) %>% | |
group_by(team) %>% | |
summarise(chance = n()/n_sim) %>% | |
arrange(desc(chance)) | |
chances_semi_final <- lapply(sim_results_full_world_cup, function(x) x[[3]]) %>% | |
do.call(what = rbind) %>% | |
group_by(team) %>% | |
summarise(chance = n()/n_sim) %>% | |
arrange(desc(chance)) | |
chances_final <- lapply(sim_results_full_world_cup, function(x) x[[4]]) %>% | |
do.call(what = rbind) %>% | |
group_by(team) %>% | |
summarise(chance = n()/n_sim) %>% | |
arrange(desc(chance)) | |
chances_winning <- lapply(sim_results_full_world_cup, function(x) x[[5]]) %>% | |
do.call(what = rbind) %>% | |
group_by(team) %>% | |
summarise(chance = n()/n_sim) %>% | |
arrange(desc(chance)) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment