Skip to content

Instantly share code, notes, and snippets.

@jalapic
Created February 19, 2015 02:15
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save jalapic/9ed04504048785424f1a to your computer and use it in GitHub Desktop.
Save jalapic/9ed04504048785424f1a to your computer and use it in GitHub Desktop.
### Using tsne to map similarity of English Soccer Teams
library(engsoccerdata)
library(dplyr)
library(magrittr)
library(ggplot2)
library(tidyr)
library(tsne)
head(engsoccerdata2)
#get rid of 1939
df <- engsoccerdata2 %>%
filter(Season!=1939) %>%
filter(!((home=="Leeds City" | visitor=="Leeds City") & Season==1919)) %>%
filter(!((home=="Wigan Borough" | visitor=="Wigan Borough") & Season==1931)) %>%
filter(!((home=="Accrington Stanley" | visitor=="Accrington Stanley") & Season==1961)) %>%
filter(!((home=="Port Vale" | visitor=="Port Vale") & Season==1919))
#get final standings info in one summary dataframe
#assume 3pts for a win
#assume tie-breakers are pts, gd, gf
df <-
rbind(
df %>% select(Season, division, team=home, opp=visitor, gf=hgoal, ga=vgoal),
df %>% select(Season, division, team=visitor, opp=home, gf=vgoal, ga=hgoal)
) %>%
mutate(points=ifelse(gf>ga, 3, ifelse(gf<ga, 0, 1))) %>%
group_by(Season, team, division) %>%
summarise(totalpts = sum(points),
totalgf = sum(gf),
totalga = sum(ga)
) %>%
mutate(totalgd = totalgf-totalga) %>%
group_by(Season) %>%
arrange(division, desc(totalpts), desc(totalgd), desc(totalgf))
df
#function for ranking final standings
getranks <- function(df){
library(data.table)
dt = as.data.table(as.data.frame(df))
dt[order(-totalpts, -totalgd, -totalgf), rank.init := 1:.N, by = division]
dt[, div.clean := sub('(\\d+).*', '\\1', division)]
setorder(dt, div.clean, rank.init)
dt[, rank.final := mean(.I), by = .(div.clean, rank.init)]
setorder(dt, division, rank.final)
dt1 <- dt[,.(Season,team, rank.final)]
return(dt1)
}
#split df by seasons and apply function
#unevens: 1921,1922,1931
# e.g. 1931, 21 in 3n, 22 in 3s. teams ranked 21st in each (Rochdale/Gillingham) are ranked 85.5, whereas team 22nd in 3S are ranked 87th.
df %<>% ungroup() %$% split(., Season)
df.res <- lapply(df, getranks) #works
teamranks <- do.call("rbind", df.res)
### Example plot 1.
teamranks %>%
as.data.frame() %>%
filter(team=="Aston Villa") %$%
ggplot(., aes(Season, rank.final)) +
geom_line(color="maroon", lwd=1) +
scale_y_reverse(lim=c(95,1)) +
theme_bw()
teamranks %>%
as.data.frame() %>%
filter(team=="Aston Villa") %>%
filter(rank.final==1 | rank.final>40)
### Example plot 2.
teamranks %>%
as.data.frame() %>%
filter(team=="Preston North End") %$%
ggplot(., aes(Season, rank.final)) +
geom_line(color="black", lwd=1) +
scale_y_reverse(lim=c(95,1)) +
theme_bw()
teamranks %>%
as.data.frame() %>%
filter(team=="Preston North End") %>%
filter(rank.final<=3 | rank.final>70)
### Reshape data
teamranks.wide <-
teamranks %>%
as.data.frame() %>%
mutate(rank.final = 93 - rank.final) %>%
select(team, Season, rank.final) %>%
spread(Season, rank.final, fill=0)
rownames(teamranks.wide)<- teamranks.wide %>% .$team #set rownames
teamranks.wide %<>% select(-team) # remove name var.
teamranks.wide
### Identify some groups
# k-means clustering analysis
res.k <- kmeans(teamranks.wide, 7)
table(res.k$cluster)
group1 <- names(res.k$cluster[res.k$cluster==1])
group2 <- names(res.k$cluster[res.k$cluster==2])
group3 <- names(res.k$cluster[res.k$cluster==3])
group4 <- names(res.k$cluster[res.k$cluster==4])
group5 <- names(res.k$cluster[res.k$cluster==5])
group6 <- names(res.k$cluster[res.k$cluster==6])
group7 <- names(res.k$cluster[res.k$cluster==7])
## tsne
D <- dist(teamranks.wide) #create distance object
# creating dataframe for plotting colors and text on final plot
namesdf <- data.frame(name = c(group1, group2, group3, group4, group5, group6, group7),
group = c(rep(1, length(group1)), rep(2, length(group2)), rep(3, length(group3)),
rep(4, length(group4)), rep(5, length(group5)), rep(6, length(group6)),
rep(7, length(group7)))
)
namesdf %<>% arrange(name) #names in correct order to match rownames of teamranks.wide
colors = rainbow(7)
names(colors) = unique(namesdf$group)
#define function used in plotting
ecb = function(x,y){ plot(x,t='n'); text(x,labels=rownames(teamranks.wide), col=colors[namesdf$group], cex=1) }
#plot
tsne_D = tsne(D, k=2, epoch_callback = ecb, perplexity=50)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment