Created
February 19, 2020 09:15
-
-
Save nxrunning/1d00a5b5633f669deb5c2443f36cb17a to your computer and use it in GitHub Desktop.
Web scraping EPL 2019/20 "goals for" and "goals against" data from Wikipedia
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#Importing libraries | |
library(rvest) #for webscrapping | |
library(ggplot2) #for data visualisation | |
library(plyr) #for renaming values | |
library("ggrepel") #for avoiding overlapping of labels | |
# Source page: Wikipedia article for the 2019/20 Premier League season
url <- "https://en.wikipedia.org/wiki/2019%E2%80%9320_Premier_League"

# Download and parse the article once; the scraping steps read from this object
scraped_page <- read_html(url)

# Team names: text of every <td> that directly follows a <th>, inside a
# .wikitable element that itself directly follows an <h2> heading
Teams <- as.character(
  html_text(
    html_nodes(scraped_page, "h2+ .wikitable th+ td")
  )
)

# Quick look at the raw values before cleaning
head(Teams)

# Strip the newline characters from the scraped names
Teams <- gsub("\n", "", Teams)
# Helper: text of every node matching `css` in `page`, converted to numeric.
# Text that is not a number becomes NA (as.numeric() warns when that happens).
scrape_numeric_cells <- function(page, css) {
  as.numeric(html_text(html_nodes(page, css)))
}

# "Goals For": 7th-column cells of the table, plus any <abbr> inside a
# 7th-position header cell (presumably the column label, which parses to NA)
Goals_for <- scrape_numeric_cells(
  scraped_page,
  "h2+ .wikitable td:nth-child(7) , th:nth-child(7) abbr"
)

# Quick look at the raw values
head(Goals_for)

# Discard the entries that could not be parsed as numbers
Goals_for <- na.omit(Goals_for)

# "Goals Against": 8th-column cells of the same table
Goals_against <- scrape_numeric_cells(
  scraped_page,
  "h2+ .wikitable td:nth-child(8)"
)

# Quick look at the raw values
head(Goals_against)
# Combine the scraped vectors into one data frame, one row per team.
# data.frame() silently recycles a shorter vector when its length divides the
# longest one, which would misalign teams and goals, so fail fast if the three
# scrapes do not line up.
stopifnot(
  length(Teams) == length(Goals_for),
  length(Teams) == length(Goals_against)
)
# stringsAsFactors = FALSE keeps Teams as character on every R version
# (the default was TRUE before R 4.0).
df <- data.frame(Teams, Goals_for, Goals_against, stringsAsFactors = FALSE)

# Clean the team names: remove any trailing status marker in parentheses
# (e.g. "Liverpool (Q)") or footnote marker in square brackets
# (e.g. "Manchester City[a]"), whichever markers the page currently shows,
# rather than renaming two hard-coded values.
df$Teams <- trimws(
  sub("(\\s*(\\([^)]*\\)|\\[[^]]*\\]))+\\s*$", "", df$Teams)
)
# Classify every team into one of four clusters by comparing its goals
# scored / conceded with the league means.
#   strong attack  = more goals scored than the mean
#   strong defence = fewer goals conceded than the mean
# A team exactly on a mean counts as "poor" on that dimension only; the other
# dimension is still judged on its own (previously such a team always fell
# through to "Poor attack / Poor defence").

# Missing values would make the comparisons NA; stop instead of mislabelling
stopifnot(!anyNA(df$Goals_for), !anyNA(df$Goals_against))

# Compute each mean once and compare whole columns instead of looping by row
strong_attack <- df$Goals_for > mean(df$Goals_for)
strong_defence <- df$Goals_against < mean(df$Goals_against)

# Cluster codes: 1 = strong/strong, 2 = poor attack/strong defence,
# 3 = strong attack/poor defence, 4 = poor/poor (the default)
df$Cluster <- rep(4, nrow(df))
df$Cluster[strong_attack & strong_defence] <- 1
df$Cluster[!strong_attack & strong_defence] <- 2
df$Cluster[strong_attack & !strong_defence] <- 3

# Turn the numeric codes into a labelled factor for the plot legend
df$Cluster <- factor(
  df$Cluster,
  levels = c(1, 2, 3, 4),
  labels = c(
    "Strong attack\nStrong defence",
    "Poor attack\nStrong defence",
    "Strong attack\nPoor defence",
    "Poor attack\nPoor defence"
  )
)
# Data visualisation: scatter plot of goals scored vs goals conceded, coloured
# by cluster, with dashed red reference lines at the two means and
# non-overlapping team labels.
goals_plot <- ggplot(
  df,
  aes(x = Goals_for, y = Goals_against, label = Teams, colour = Cluster)
) +
  geom_point() +
  # Reference lines are computed from the plotted data frame itself rather
  # than from the separate scraped vectors, so they always match the points
  geom_vline(
    xintercept = mean(df$Goals_for),
    linetype = "dashed", alpha = 0.4, colour = "red"
  ) +
  geom_hline(
    yintercept = mean(df$Goals_against),
    linetype = "dashed", alpha = 0.4, colour = "red"
  ) +
  # x, y and label are inherited from the plot-level aes(); the fixed black
  # colour overrides the cluster colouring for the text only
  geom_text_repel(size = 2, colour = "black", fontface = "bold") +
  labs(
    title = "EPL 2019/2020: Goals For vs Goals Against",
    x = "Goals For", y = "Goals Against",
    colour = " "
  ) +
  theme(legend.position = "top")

# Display the plot (auto-printed when run at top level)
goals_plot

# Save the plot explicitly instead of relying on the last plot displayed
ggsave("output.jpeg", plot = goals_plot, dpi = 300)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment