Create a gist now

Instantly share code, notes, and snippets.

Embed
What would you like to do?
from data scraping to World Cup winner predictions
We can make this file beautiful and searchable if this error is corrected: No commas found in this CSV file
match;group.x;position.x;group.y;position.y
49;A;1;B;2
50;C;1;D;2
51;B;1;A;2
52;D;1;C;2
53;E;1;F;2
54;G;1;H;2
55;F;1;E;2
56;H;1;G;2
57;;49;;50
58;;53;;54
59;;55;;56
60;;51;;52
61;;57;;58
62;;59;;60
63;;61;;62
#### predictions based on elo ranking
library(rvest)
library(dplyr)
library(tidyr)
library(pbapply)
library(ggplot2)
library(ggthemes)
# get fixtures ------------------------------------------------------------
#https://github.com/openfootball/world-cup/blob/master/2018--russia/cup.txt
#please remove any results from txt file
cup_2018 <- readLines('data/cup_2018.txt', encoding = 'UTF-8')
match_lines <- grepl('^[ ]?\\(.*\\).*', cup_2018)
match_results <- cup_2018[match_lines]
match_results2 <- lapply(match_results, function(tmp){
c(trimws(gsub('.*00 (.*)-(.*)@.*', '\\1', tmp)),
trimws(gsub('.*00 (.*)-(.*)@.*', '\\2', tmp)),
as.numeric(trimws(gsub('^[ ]?\\(([0-9]*)\\).*', '\\1', tmp))))
})
fixtures_cup_2018 <- do.call(rbind, match_results2[-c(1:5)]) %>%
data.frame(stringsAsFactors = F) %>%
rename(team1=X1, team2=X2, match_no = X3) %>%
mutate(match_no = as.numeric(match_no)) %>%
arrange(match_no)
# manually created based on wikipedia
knock_off_fixtures <- read.csv2('data/knock_off_fixtures.csv', stringsAsFactors = F)
# scraping groups
group_lines <- grepl('^Group.*', cup_2018)
groups <- cup_2018[group_lines][1:8]
groups <- lapply(groups, function(x){
group <- gsub(pattern = '(.*)\\|.*', '\\1', x)
countries <- trimws(strsplit(gsub('[[:space:]]', ' ', trimws(gsub(pattern = '(.*)\\|(.*)', '\\2', x))), ' ')[[1]])
countries <- countries[countries!='']
data.frame(group = group, country = countries, stringsAsFactors = F)
}) %>% do.call(what = rbind)
# getting elo_ranking
# ranking seems to be created dynamically, so we need to download website
# https://www.eloratings.net/
url <- '~/Downloads/World Football Elo Ratings.htm'
main_page <- read_html(url)
main_page %>%
html_nodes(xpath = '//div[@id="maindiv"]//div[@class="maintable slickgrid_835274 ui-widget"]') %>%
html_text()
countries <- main_page %>%
html_nodes(xpath = '//div[@class="slick-cell l1 r1 team-cell narrow-layout"]//a') %>%
html_text()
rating <- main_page %>%
html_nodes(xpath = '//div[@class="slick-cell l2 r2 rating-cell narrow-layout"]') %>%
html_text()
elo_ranking <- data.frame(country = countries, rating = rating, stringsAsFactors = F)
elo_ranking <- elo_ranking %>%
mutate(rating = as.numeric(rating),
rating = ifelse(country == 'Russia', rating+100, rating))
# simulations -------------------------------------------------------------
n_sim <- 1000
# we set that 20% of all matches end up in draw. In 2014 it was almost 19%
sim_results_full_world_cup <- pblapply(1:n_sim, function(i){
probs <- fixtures_cup_2018 %>%
inner_join(elo_ranking, by = c('team1'='country')) %>%
inner_join(elo_ranking, by = c('team2'='country')) %>%
mutate(diff_rating = rating.x - rating.y,
probability.x = 1/(10^(-diff_rating/400) + 1), #according to elo rating
random = runif(n()),
points.x = ifelse(random < probability.x - 0.1, 3, ifelse(random > probability.x + 0.1, 0, 1)),
points.y = ifelse(points.x == 1, 1, 3 - points.x))
group_results <- probs %>%
unite(team_points.x, team1, points.x, sep = '_') %>%
unite(team_points.y, team2, points.y, sep = '_') %>%
tidyr::gather(key, team_points, team_points.x, team_points.y) %>%
separate(team_points, c("team", "points"), sep = '_') %>%
select(team, points) %>%
inner_join(groups, by =c('team'='country')) %>%
mutate(points = as.numeric(points)) %>%
group_by(group, team) %>%
summarise(points = sum(points)) %>%
arrange(group, desc(points), runif(n())) %>% #if teams have the same number of points, we randomly select the winner
mutate(position = row_number(),
iter = i) %>%
filter(position <= 2) %>% #only two team advance to next stage
ungroup() %>%
mutate(group = trimws(gsub('Group (.*)', '\\1', group)))
one_of_16_finals <- knock_off_fixtures %>%
inner_join(group_results, by=c('group.x'='group', 'position.x'='position')) %>%
inner_join(group_results, by=c('group.y'='group', 'position.y'='position')) %>%
inner_join(elo_ranking, by = c('team.x'='country')) %>%
inner_join(elo_ranking, by = c('team.y'='country')) %>%
mutate(diff_rating = rating.x - rating.y,
probability.x = 1/(10^(-diff_rating/400) + 1),
random = runif(n()),
team = ifelse(random < probability.x, team.x, team.y)) %>%
select(iter=iter.x, match, team)
quater_finals <- knock_off_fixtures %>%
inner_join(one_of_16_finals, by=c('position.x'='match')) %>%
inner_join(one_of_16_finals, by=c('position.y'='match')) %>%
inner_join(elo_ranking, by = c('team.x'='country')) %>%
inner_join(elo_ranking, by = c('team.y'='country')) %>%
mutate(diff_rating = rating.x - rating.y,
probability.x = 1/(10^(-diff_rating/400) + 1),
random = runif(n()),
team = ifelse(random < probability.x, team.x, team.y)) %>%
select(iter=iter.x, match, team)
semi_finals <- knock_off_fixtures %>%
inner_join(quater_finals, by=c('position.x'='match')) %>%
inner_join(quater_finals, by=c('position.y'='match')) %>%
inner_join(elo_ranking, by = c('team.x'='country')) %>%
inner_join(elo_ranking, by = c('team.y'='country')) %>%
mutate(diff_rating = rating.x - rating.y,
probability.x = 1/(10^(-diff_rating/400) + 1),
random = runif(n()),
team = ifelse(random < probability.x, team.x, team.y)) %>%
select(iter=iter.x, match, team)
winner <- knock_off_fixtures %>%
inner_join(semi_finals, by=c('position.x'='match')) %>%
inner_join(semi_finals, by=c('position.y'='match')) %>%
inner_join(elo_ranking, by = c('team.x'='country')) %>%
inner_join(elo_ranking, by = c('team.y'='country')) %>%
mutate(diff_rating = rating.x - rating.y,
probability.x = 1/(10^(-diff_rating/400) + 1),
random = runif(n()),
team = ifelse(random < probability.x, team.x, team.y)) %>%
select(iter=iter.x, match, team)
list(group_results, one_of_16_finals, quater_finals, semi_finals, winner)
})
# simulations results wrap up ---------------------------------------------
chances_knock_off <- lapply(sim_results_full_world_cup, function(x) x[[1]]) %>%
do.call(what = rbind) %>%
group_by(team) %>%
summarise(chance = n()/n_sim, group = group[1]) %>%
arrange(desc(chance))
chances_quater_final <- lapply(sim_results_full_world_cup, function(x) x[[2]]) %>%
do.call(what = rbind) %>%
group_by(team) %>%
summarise(chance = n()/n_sim) %>%
arrange(desc(chance))
chances_semi_final <- lapply(sim_results_full_world_cup, function(x) x[[3]]) %>%
do.call(what = rbind) %>%
group_by(team) %>%
summarise(chance = n()/n_sim) %>%
arrange(desc(chance))
chances_final <- lapply(sim_results_full_world_cup, function(x) x[[4]]) %>%
do.call(what = rbind) %>%
group_by(team) %>%
summarise(chance = n()/n_sim) %>%
arrange(desc(chance))
chances_winning <- lapply(sim_results_full_world_cup, function(x) x[[5]]) %>%
do.call(what = rbind) %>%
group_by(team) %>%
summarise(chance = n()/n_sim) %>%
arrange(desc(chance))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment