Skip to content

Instantly share code, notes, and snippets.

@imjakedaniels
Created August 23, 2018 19:43
Show Gist options
  • Save imjakedaniels/57ee17ba0bf5e4b322d6a1e6610aa87d to your computer and use it in GitHub Desktop.
Save imjakedaniels/57ee17ba0bf5e4b322d6a1e6610aa87d to your computer and use it in GitHub Desktop.
Arie sucks maybe?
library(rvest)
library(tidyverse)
library(magrittr)
library(scales)
library(knitr)
library(lubridate)
# Base address of the IMDb episode listing; a season number is appended to it.
url <- "https://www.imdb.com/title/tt0313038/episodes?season="
# Season numbers to scrape: 13 through 22 inclusive.
timevalues <- seq(from = 13L, to = 22L)
#' Build one episode-list URL per season.
#'
#' @param x Vector of season numbers to append to the base URL.
#' @param base_url Address prefix that each season number is appended to.
#'   Defaults to the script-level `url`, so existing one-argument calls
#'   behave as before.
#' @return Character vector with one URL per element of `x`
#'   (`character(0)` when `x` is empty).
unitedata <- function(x, base_url = url) {
  # paste0() recycles a zero-length argument to "", which would return the
  # bare base URL instead of "no URLs"; short-circuit that case.
  if (length(x) == 0) {
    return(character(0))
  }
  paste0(base_url, x)
}
# Expand the season numbers into the full list of page URLs to scrape.
finalurl <- unitedata(timevalues)
# Bare name: auto-prints the URLs at the console as a quick visual check.
finalurl
#' Scrape one season page into a table of episode names, ratings and details.
#'
#' @param x URL (or anything `read_html()` accepts) of a season episode page.
#' @param delay Seconds to pause after the page has been processed, so that
#'   consecutive calls do not hit the site back to back. Defaults to 5.
#' @return A tibble with character columns `Name`, `Rating` and `Details`,
#'   one row per matched episode node. `Rating` and `Details` hold the raw
#'   node text and are expected to be cleaned up by the caller.
imdbScrape <- function(x, delay = 5) {
  # Download and parse the page once, then run all three selectors on it.
  page <- read_html(x)
  name <- page %>% html_nodes('#episodes_content strong a') %>% html_text()
  rating <- page %>% html_nodes('.ipl-rating-widget') %>% html_text()
  details <- page %>% html_nodes('.zero-z-index div') %>% html_text()

  # The three selectors must line up row for row; fail loudly if they do not
  # rather than letting columns be recycled or misaligned.
  if (length(name) != length(rating) || length(name) != length(details)) {
    stop(
      "Selectors returned different numbers of nodes for ", x,
      " (name = ", length(name),
      ", rating = ", length(rating),
      ", details = ", length(details), ")",
      call. = FALSE
    )
  }

  chart <- tibble(Name = name, Rating = rating, Details = details)

  # Throttle before handing the result back; a sleep placed after return()
  # would never execute.
  Sys.sleep(delay)
  chart
}
# Scrape every season page and stack the per-page tables into one data frame.
bachelor <- map_df(finalurl, imdbScrape)

# Tidy the raw scraped text:
#  * Season / Episode: pull the digits out of the "S<digits>" and "Ep<digits>"
#    markers in Details, then drop Details.
#  * Rating: keep only a decimal number such as "7.4". The dot is escaped so it
#    matches a literal decimal point only, and the integer part may have more
#    than one digit (e.g. "10.0").
#  * Rows whose rating text contains no such number are dropped.
bachelor <- bachelor %>%
  mutate(
    Season = as.numeric(str_extract(str_extract(Details, "S[0-9]+"), "[0-9]+")),
    Episode = as.numeric(str_extract(str_extract(Details, "Ep[0-9]+"), "[0-9]+")),
    Rating = as.numeric(str_extract(Rating, "[0-9]+\\.[0-9]"))
  ) %>%
  select(-Details) %>%
  filter(!is.na(Rating))

# Axis labels for seasons 13 through 22, in season order.
bachelor_names <- c('Jason', 'Jake', 'Brad', 'Ben', 'Sean', 'Juan-Pablo', 'Chris', 'Ben', 'Nick', 'Arie')
# Average episode rating per season (one row per season).
season_ratings <- bachelor %>%
  group_by(Season) %>%
  summarise(Rating = mean(Rating))

# Mean of the per-season averages; drawn as the dashed reference line and used
# to position its "avg" label.
overall_avg <- mean(season_ratings$Rating)

ggplot(season_ratings) +
  geom_line(aes(x = Season, y = Rating), color = "Red", size = 1.1) +
  scale_x_continuous(breaks = 13:22, labels = bachelor_names, limits = c(13, 22)) +
  scale_y_continuous(breaks = 1:10, labels = 1:10, limits = c(1, 10)) +
  geom_hline(yintercept = overall_avg, linetype = 2, color = "Black") +
  # Anchor the label inside the x limits (13-22) and just above the dashed
  # line; a position outside the limits is censored and never drawn.
  annotate("text", x = 22, y = overall_avg, label = "avg",
           hjust = 1, vjust = -0.5, color = "black", size = 3) +
  theme_bw() +
  labs(title = "Is Arie the Worst Bachelor in Show History?",
       subtitle = "Average Episode Ratings by Season",
       caption = "Source: IMDB, August 2018",
       x = "Seasons 13-22",
       y = "Rating") +
  theme(plot.title = element_text(family = '', face = 'bold', colour = 'black', size = 20),
        plot.subtitle = element_text(family = '', face = 'italic', colour = 'black', size = 10),
        plot.caption = element_text(family = '', colour = 'black', size = 10),
        axis.title.x = element_text(family = '', face = 'bold', colour = 'black', size = 12),
        axis.title.y = element_text(family = '', colour = 'black', size = 12),
        plot.background = element_rect(fill = "white"))
@imjakedaniels
Copy link
Author

Since there were NAs, I made the selector for the rating broader so that I still got a result for every row, then extracted just the rating and dropped the rows where it was NA.

You're not gonna like the results for Arie though.....

@imjakedaniels
Copy link
Author

bachelor

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment