Skip to content

Instantly share code, notes, and snippets.

@nxrunning
Created February 19, 2020 09:15
Show Gist options
  • Save nxrunning/1d00a5b5633f669deb5c2443f36cb17a to your computer and use it in GitHub Desktop.
Save nxrunning/1d00a5b5633f669deb5c2443f36cb17a to your computer and use it in GitHub Desktop.
Web scraping EPL 2019/20 "goals for" and "goals against" data from Wikipedia
#Importing libraries
library(rvest) #for webscrapping
library(ggplot2) #for data visualisation
library(plyr) #for renaming values
library("ggrepel") #for avoiding overlapping of labels
# Address of the Wikipedia article that is scraped below
url <- "https://en.wikipedia.org/wiki/2019%E2%80%9320_Premier_League"
# Fetch and parse the page once; every later step queries this document
scraped_page <- read_html(url)
# Team names: text of each <td> that immediately follows a <th>, inside a
# .wikitable element that comes directly after an <h2>
Teams <- as.character(
  html_text(
    html_nodes(scraped_page, "h2+ .wikitable th+ td")
  )
)
# Preview the raw values (still containing newline characters)
head(Teams)
# Strip the "\n" characters from the scraped text
Teams <- gsub("\n", "", Teams)
# "Goals For": text of every 7th-child <td> in the table plus the <abbr> inside
# any 7th-child <th>, converted to numbers. Text that is not numeric becomes NA
# here and is removed further down.
Goals_for <- as.numeric(
  html_text(
    html_nodes(
      scraped_page,
      "h2+ .wikitable td:nth-child(7) , th:nth-child(7) abbr"
    )
  )
)
# Preview the converted values (NA entries still present)
head(Goals_for)
# Drop the NA entries so only the numeric goal counts remain
Goals_for <- na.omit(Goals_for)
# "Goals Against": text of every 8th-child <td> in the table, as numbers
Goals_against <- as.numeric(
  html_text(
    html_nodes(scraped_page, "h2+ .wikitable td:nth-child(8)")
  )
)
# Preview the converted values
head(Goals_against)
# Guard against silent mis-alignment: data.frame() recycles a shorter vector
# when the lengths divide evenly, which would pair teams with the wrong goals.
stopifnot(
  length(Teams) == length(Goals_for),
  length(Teams) == length(Goals_against)
)
# Combine the scraped vectors into one data frame, one row per team.
# stringsAsFactors = FALSE keeps Teams as plain text on every R version.
df <- data.frame(Teams, Goals_for, Goals_against, stringsAsFactors = FALSE)
# Clean the team names: strip a trailing status marker such as " (Q)" / " (C)"
# / " (R)" and footnote tags such as "[a]". A pattern is used instead of a
# fixed list so the clean-up keeps working when the markers on the page change
# (e.g. "Liverpool (Q)" later becoming "Liverpool (C)").
df$Teams <- trimws(
  gsub("\\s*(\\([A-Z]\\)|\\[[[:alnum:]]+\\])", "", df$Teams)
)
# Classify every team against the league averages (computed once, not per row).
mean_goals_for <- mean(df$Goals_for)
mean_goals_against <- mean(df$Goals_against)
# "Strong attack"  = scored more than the league average.
# "Strong defence" = conceded fewer than the league average.
# Each axis is judged independently, so a team sitting exactly on one average
# is still classified correctly on the other axis (the previous chain of strict
# comparisons sent such teams to "Poor attack / Poor defence").
strong_attack <- df$Goals_for > mean_goals_for
strong_defence <- df$Goals_against < mean_goals_against
# Cluster codes:
#   1 = strong attack, strong defence   2 = poor attack, strong defence
#   3 = strong attack, poor defence     4 = poor attack, poor defence
cluster_code <- ifelse(
  strong_attack,
  ifelse(strong_defence, 1, 3),
  ifelse(strong_defence, 2, 4)
)
# Store the cluster as a labelled factor; the labels are used as legend text
df$Cluster <- factor(
  cluster_code,
  levels = c(1, 2, 3, 4),
  labels = c(
    "Strong attack\nStrong defence",
    "Poor attack\nStrong defence",
    "Strong attack\nPoor defence",
    "Poor attack\nPoor defence"
  )
)
# Data visualisation: scatter plot of goals scored vs goals conceded, one point
# per team, coloured by cluster, with dashed lines at the two averages.
goal_plot <- ggplot(
  df,
  aes(x = Goals_for, y = Goals_against, label = Teams, colour = Cluster)
) +
  geom_point() +
  # Reference lines are computed from the plotted data frame itself, so they
  # always agree with the points and with the cluster assignment.
  geom_vline(
    xintercept = mean(df$Goals_for),
    linetype = "dashed", alpha = 0.4, colour = "red"
  ) +
  geom_hline(
    yintercept = mean(df$Goals_against),
    linetype = "dashed", alpha = 0.4, colour = "red"
  ) +
  # x, y and label are inherited from the ggplot() call; the fixed black colour
  # overrides the per-cluster colour for the text labels only.
  geom_text_repel(size = 2, colour = "black", fontface = "bold") +
  labs(
    title = "EPL 2019/2020: Goals For vs Goals Against",
    x = "Goals For", y = "Goals Against",
    colour = " "
  ) +
  theme(legend.position = "top")
# Display the plot (auto-printed at top level, as before)
goal_plot
# Save this specific plot object rather than relying on the implicit last plot
ggsave("output.jpeg", plot = goal_plot, dpi = 300)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment