Skip to content

Instantly share code, notes, and snippets.

@dggoldst
Last active May 25, 2016 01:20
Show Gist options
  • Save dggoldst/f70e8d1a55bdc3869e7e to your computer and use it in GitHub Desktop.
Save dggoldst/f70e8d1a55bdc3869e7e to your computer and use it in GitHub Desktop.
library(dplyr)
library(ggplot2)
library(httr)
# All relative paths used below (the cached CSV and the saved PNG files)
# resolve against this project directory.
# NOTE(review): hard-coded, machine-specific path; setwd() stops the script on
# any machine where this directory does not exist.
setwd("C:/Dropbox/Projects/20160206_Soccer_Scores")
# Tell the user that no cached data file exists, so the data has to be fetched
# again. The trailing newline keeps this notice from running into the status
# line printed by the download step that follows.
if (!file.exists("20160206_Soccer_Scores.csv.gz")) {
  cat("must reread\n")
}
# Download and cache the match results when no local copy exists.
# For every season from STARTYEAR to STOPYEAR and each of the five codes
# E0, E1, E2, E3 and EC, the season file is requested from
# football-data.co.uk. The first six columns of every file that could be
# fetched are stacked, rows without a value in FTHG are dropped, and the result
# is written to a gzip-compressed CSV in the working directory.
if (!file.exists("20160206_Soccer_Scores.csv.gz")) {
  cat("Reading data from server\n")
  STARTYEAR <- 1993
  STOPYEAR <- 2015 # Stop at 2015 to get 2015-2016 data
  # Season codes as they appear in the URLs: the last two digits of the start
  # year followed by the last two digits of the next year (1993 -> "9394").
  years <- STARTYEAR:STOPYEAR
  yearstrings <- paste0(
    substr(as.character(years), start = 3, stop = 4),
    substr(as.character(years + 1), start = 3, stop = 4)
  )
  confstrings <- c("E0", "E1", "E2", "E3", "EC")
  # One slot per season/code combination; only the first n_ok slots are filled.
  reslist <- vector(mode = "list",
                    length = length(yearstrings) * length(confstrings))
  n_ok <- 0
  for (yearstring in yearstrings) {
    for (confstring in confstrings) {
      aurl <- paste0(
        "http://www.football-data.co.uk/mmz4281/",
        yearstring,
        "/",
        confstring,
        ".csv"
      )
      # NOTE(review): the file is requested twice (once for the status check,
      # once by read.csv); kept as-is so the parsing path is unchanged.
      if (http_status(GET(aurl))$category == "Success") {
        season <- read.csv(url(aurl))
        n_ok <- n_ok + 1
        reslist[[n_ok]] <- season
      } else {
        cat("Failed: ", aurl, "\n")
      }
    }
  }
  # Keep only the files that were actually downloaded, so a failed request can
  # never leave an empty slot for the column selection below to trip over.
  reslist <- reslist[seq_len(n_ok)]
  if (n_ok == 0) {
    stop("No season files could be downloaded", call. = FALSE)
  }
  # Only the first six columns are kept from each file; FTHG must be among
  # them because it is used in the filter below.
  reslist2 <- lapply(reslist, function(season) season[, 1:6])
  df <- do.call(rbind, reslist2)
  df <- subset(df, !is.na(FTHG))
  write.csv(df,
            file = gzfile("20160206_Soccer_Scores.csv.gz"),
            row.names = FALSE)
}
# Returns the score "high-low" together with every score that differs from it
# by one goal for exactly one side.
#
# Arguments:
#   low   goals of the lower-scoring side
#   high  goals of the higher-scoring side; must not be smaller than `low`
#
# Value: a data frame with one row per distinct neighboring score and the
# columns
#   neighbor_group  the reference score, formatted "high-low"
#   neighbor        a score within one goal of it (the reference included),
#                   formatted "larger-smaller"
# The result comes straight out of group_by() %>% summarise(), so depending on
# the dplyr version it may still be grouped by neighbor_group.
#
# Stops with an error when high < low.
get_neighbors <- function(low, high) {
  if (high < low) {
    # The message now states the condition that was actually violated.
    stop("high < low in get_neighbors")
  }
  # Candidates: vary the low side by -1/0/+1 with the high side fixed, then
  # vary the high side by -1/0/+1 with the low side fixed.
  rbind(
    expand.grid(a = (low - 1):(low + 1), b = high),
    expand.grid(a = low, b = (high - 1):(high + 1))
  ) %>%
    # Drop negative goal counts, and drop pairs whose "low" side would exceed
    # the "high" side. That only happens when low == high, and the same score
    # in larger-smaller order is produced by the other grid.
    filter(a >= 0 &
             b >= 0 &
             b >= a) %>%
    mutate(
      neighbor = paste(b, a, sep = "-"),
      neighbor_group = paste(high, low, sep = "-")
    ) %>%
    # Grouping followed by an empty summarise() collapses the duplicated
    # reference score into a single row.
    group_by(neighbor_group, neighbor) %>%
    summarise()
}
# Build the neighbor data frame ndf: for every score with 0 to 11 goals on the
# low side and at least as many (up to 11) on the high side, list the scores
# within one goal of it.
# expand.grid() varies its first column fastest, so after filtering the rows
# run through low = 0 with high = 0..11, then low = 1 with high = 1..11, etc.
score_grid <- expand.grid(high = 0:11, low = 0:11)
score_grid <- score_grid[score_grid$high >= score_grid$low, ]
reslist <- Map(get_neighbors, score_grid$low, score_grid$high)
ndf <- do.call("rbind", reslist)
# Number of neighbors (the score itself included) per reference score, with
# the largest groups first.
ndf_summ <- ndf %>%
  group_by(neighbor_group) %>%
  summarise(count = n()) %>%
  arrange(desc(count)) %>%
  ungroup()
# Load the cached results and derive, per match, the smaller goal count (Low),
# the larger goal count (High) and the total number of goals (Total) from the
# FTHG and FTAG columns.
df <- read.csv("20160206_Soccer_Scores.csv.gz")
df <- mutate(
  df,
  Low = pmin(FTHG, FTAG),
  High = pmax(FTHG, FTAG),
  Total = FTHG + FTAG
)
# Histogram of the total number of goals per match.
# The match count shown in the title is computed from the data instead of
# being typed in by hand, so it stays correct when the cache is rebuilt.
# Only matches with a known Total are counted.
n_matches <- sum(!is.na(df$Total))
p <- ggplot(df, aes(x = Total)) +
  geom_bar(
    stat = "bin",
    binwidth = 1,
    fill = "green",
    color = "black",
    width = 2,
    alpha = .2
  ) +
  # Breaks sit on the half-integers and the labels are shifted by one position
  # (first label blank) -- presumably to line the goal counts up with the bars;
  # confirm against the rendered plot before changing.
  scale_x_continuous(
    breaks = seq(-.5, 12.5, by = 1),
    labels = c("", 0:12)
  ) +
  coord_cartesian(xlim = c(0.1, 12)) +
  labs(
    x = "Total Goals",
    y = "Matches",
    title = paste0(
      "Distribution of Total Goals / Match \n(N = ",
      format(n_matches, big.mark = ","),
      " Matches)"
    )
  ) +
  theme(panel.grid = element_blank())
p
ggsave(
  plot = p,
  filename = "Total_Points_Scored.png",
  width = 4,
  height = 4
)
# Lookup of (Low, High) pairs flagged with Match = TRUE: the pairs with Low = 1
# and High in 1..3, plus the pairs with Low in 0..2 and High = 2. Row 5 of the
# stacked grids is the second copy of (1, 2) and is removed.
matcher <- rbind(
  expand.grid(1, 1:3),
  expand.grid(0:2, 2)
)[-5, ] %>%
  mutate(match = TRUE) %>%
  setNames(c("Low", "High", "Match"))
# Attach the flag (joining on the shared Low/High columns) and label every
# match with its score written as "High-Low".
df <- df %>%
  left_join(matcher) %>%
  mutate(Score = paste(High, Low, sep = "-"))
# Per score: number of matches and total goals, ordered from most to least
# frequent, with each score's share (Prop) and the running share (CumProp).
df_summ <- df %>%
  group_by(Score) %>%
  summarise(
    Count = n(),
    Goals = Low[1] + High[1]
  ) %>%
  ungroup() %>%
  arrange(desc(Count)) %>%
  mutate(
    CumProp = cumsum(Count) / sum(Count),
    Prop = Count / sum(Count)
  )
# Share of matches per total goal count, plus the running total of those
# shares in the row order produced by summarise().
df_summ2 <- df_summ %>%
  group_by(Goals) %>%
  summarise(Proportion = sum(Prop)) %>%
  ungroup() %>%
  mutate(TotalProp = cumsum(Proportion))
# Horizontal bar chart of the most common final scores: only scores whose
# running share (CumProp) is below 95% are shown, ordered by their share.
p <- ggplot(
  filter(df_summ, CumProp < .95),
  aes(y = 100 * Prop, x = reorder(Score, Prop))
)
p <- p + geom_bar(
  stat = "identity",
  fill = "green",
  color = "black",
  alpha = .2
)
p <- p + coord_flip()
p <- p + labs(
  x = "Score",
  y = "Percentage of Games",
  title = "Most Common Final Scores"
)
p <- p + theme(legend.position = "bottom") +
  theme(panel.grid.major = element_blank())
p
ggsave(
  plot = p,
  filename = "Most_Common_Scores.png",
  width = 4,
  height = 4
)
# Join the neighbor table (ndf) to the per-score summary (df_summ), matching
# ndf$neighbor against df_summ$Score.
# Bar chart of matches within one goal of a score: for every reference score,
# add up the counts and shares of all its neighbors, then keep the reference
# scores that cover more than 10% of matches, largest first.
plot_data <- ndf %>%
  left_join(df_summ, by = c("neighbor" = "Score")) %>%
  select(-CumProp) %>%
  group_by(neighbor_group) %>%
  summarise(
    group_count = sum(Count, na.rm = TRUE),
    group_prop = sum(Prop, na.rm = TRUE),
    obs = n()
  ) %>%
  arrange(desc(group_count)) %>%
  ungroup() %>%
  filter(group_count > 0, group_prop > .1) %>%
  # Factor levels follow the current row order so the bars are drawn from the
  # largest group to the smallest.
  mutate(score = factor(neighbor_group, levels = as.character(neighbor_group)))
p <- ggplot(plot_data, aes(x = score, y = 100 * group_prop)) +
  geom_bar(
    stat = "identity",
    fill = "green",
    color = "black",
    alpha = .2
  ) +
  labs(
    x = "Score",
    y = "% of Matches Within 1 Point of Score",
    title = "% of Matches Within 1 Point \nof Various Scores"
  ) +
  scale_y_continuous(breaks = seq(0, 55, by = 5)) +
  theme(panel.grid.major = element_blank())
p
ggsave(
  plot = p,
  filename = "Within_One_Point.png",
  width = 4,
  height = 4
)
# Show, for four reference scores, how often each score within one goal of
# them occurred -- one facet row per reference score.
plot_data <- ndf %>%
  left_join(df_summ, by = c("neighbor" = "Score")) %>%
  filter(neighbor_group %in% c("2-1", "2-0", "1-0", "1-1"))
# Explicit levels fix the top-to-bottom order of the facets.
plot_data$neighbor_group <- factor(
  plot_data$neighbor_group,
  levels = c("2-1", "2-0", "1-0", "1-1")
)
p <- ggplot(plot_data, aes(x = neighbor, y = Count)) +
  geom_bar(
    stat = "identity",
    fill = "green",
    color = "black",
    alpha = .2
  ) +
  labs(
    x = "Scores in Score Group",
    y = "Number of Matches",
    title = "Frequency of Scores Within Score Groups"
  ) +
  facet_grid(neighbor_group ~ .)
p
ggsave(
  plot = p,
  filename = "Breakdown.png",
  width = 4,
  height = 4
)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment