Skip to content

Instantly share code, notes, and snippets.

@erikgregorywebb
Created February 3, 2021 18:42
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save erikgregorywebb/5c6b0039bfe6fedffe0fc1914ce9d916 to your computer and use it in GitHub Desktop.
Save erikgregorywebb/5c6b0039bfe6fedffe0fc1914ce9d916 to your computer and use it in GitHub Desktop.
# import packages
library(tidyverse)
library(rvest)
library(lubridate)
library(scales)
# define list of shows
shows = tibble(
name = c('The Office', 'Parks & Recreation', 'Modern Family', 'Community', 'New Girl', 'The Good Place'),
imdb_id = c('tt0386676', 'tt1266020', 'tt1442437', 'tt1439629', 'tt1826940', 'tt4955642'),
no_seasons = c(9, 7, 11, 6, 7, 4),
)
# define scraper function
scrape_show = function(name, imdb_id, no_seasons) {
datalist = list()
n = 1
for (i in 1:no_seasons) {
Sys.sleep(3)
url = paste('https://www.imdb.com/title/', imdb_id, '/episodes?season=', i , sep = '')
page = read_html(url)
episodes = page %>% html_node('.list') %>% html_nodes('.info')
for (j in 1:length(episodes)) {
season = i
title = episodes[j] %>% html_node('a') %>% html_text()
airdate = episodes[j] %>% html_node('.airdate') %>% html_text()
rating = episodes[j] %>% html_node('.ipl-rating-star.small .ipl-rating-star__rating') %>% html_text()
votes = episodes[j] %>% html_node('.ipl-rating-star__total-votes') %>% html_text()
description = episodes[j] %>% html_node('.item_description') %>% html_text()
print(paste(title, airdate, rating, votes, description, sep = ' '))
row = c(name, season, title, airdate, rating, votes, description)
datalist[[n]] = row
n = n +1
}
}
raw = do.call(rbind, datalist)
return(raw)
}
# scrape show info
datalist = list()
for (i in 1:nrow(shows)) {
raw = scrape_show(name = shows$name[i], imdb_id = shows$imdb_id[i], no_seasons = shows$no_seasons[i])
datalist[[i]] = raw
}
raw_all_shows = do.call(rbind, datalist)
# clean
all_shows = raw_all_shows %>% as_tibble() %>%
rename(show = V1, season = V2, title = V3, airdate = V4, rating = V5, votes = V6, description = V7) %>%
mutate(airdate = str_trim(airdate), description = str_trim(description)) %>%
mutate(airdate = dmy(airdate), rating = as.numeric(rating)) %>%
mutate(votes = gsub("\\(","", votes)) %>% mutate(votes = gsub("\\)","", votes)) %>% mutate(votes = as.numeric(gsub(",","", votes)))
# check season and episode counts
all_shows %>% group_by(show, season) %>% count() %>% View()
# plot rating trends
rating_trends_plot = all_shows %>%
mutate(season = factor(season, levels = c(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11))) %>%
mutate(show = factor(show, levels = c('The Office', 'Parks & Recreation', 'Modern Family', 'Community', 'New Girl', 'The Good Place'))) %>%
ggplot(., aes(x = airdate, y = rating, col = show, group = season)) +
geom_line(alpha = .5) + geom_smooth(se = F, size = 2) +
theme(legend.position = 'none') +
labs(title = 'Ratings Trend by Show', subtitle = 'IMDb Episode Average Rating, All Seasons',
y = 'Average Rating', x = 'Year', fill = 'Season', caption = 'Data Source: IDMb.com | Author: @erikgregorywebb') +
facet_wrap(~show, scales = 'free') +
scale_y_continuous(limits = c(6, 10)) +
theme(text = element_text(size = 15))
rating_trends_plot
# define character count function
get_character_counts = function(df, show_name, character_name) {
total_episodes = df %>% filter(show == show_name) %>% nrow()
character_episodes = df %>%
filter(show == show_name) %>% filter(str_detect(description, character_name)) %>% nrow()
result = tibble(show = show_name, character_name = character_name,
total_episodes = total_episodes, character_episodes = character_episodes)
return(result)
}
# define looping function
get_all_character_counts = function(df, show_name, character_names) {
datalist = list()
for (i in 1:length(character_names)) {
datalist[[i]] = get_character_counts(all_shows, show_name, character_names[i])
}
raw = do.call(rbind, datalist)
return(raw)
}
# the office
names = c('Michael', 'Dwight', 'Jim', 'Pam', 'Ryan', 'Andy', 'Oscar', 'Stanley', 'Kevin', 'Meredith', 'Angela', 'Phyllis', 'Toby', 'Kelly', 'Darryl')
the_office = get_all_character_counts(all_shows, 'The Office', names)
# parks & recreation
names = c('Leslie', 'Ann', 'Tom', 'Ron', 'April', 'Andy', 'Ben', 'Chris')
parks_recreation = get_all_character_counts(all_shows, 'Parks & Recreation', names)
# modern family
names = c('Jay', 'Gloria', 'Manny', 'Claire', 'Phil', 'Haley', 'Alex', 'Luke', 'Mitchell', 'Cameron', 'Lily')
modern_family = get_all_character_counts(all_shows, 'Modern Family', names)
# community
names = c('Jeff', 'Britta', 'Abed', 'Shirley', 'Annie', 'Troy', 'Pierce')
community = get_all_character_counts(all_shows, 'Community', names)
# new girl
names = c('Jess', 'Nick', 'Winston', 'Schmidt', 'Coach', 'Cece')
new_girl = get_all_character_counts(all_shows, 'New Girl', names)
# the good place
names = c(' Eleanor', 'Chidi', 'Tahani', 'Janet', 'Jason', 'Michael', 'Shawn')
the_good_place = get_all_character_counts(all_shows, 'The Good Place', names)
# combine
all_characters = bind_rows(the_office, parks_recreation, modern_family, community, new_girl, the_good_place)
# define plotting function
plot_characters = function(df) {
df %>%
mutate(per = character_episodes / total_episodes) %>%
ggplot(., aes(x = reorder(character_name , per), y = per)) +
geom_point(size = 2) + geom_line() +
coord_flip() +
labs(y = 'Percent Present', x = 'Name') +
scale_y_continuous(labels = scales::percent_format(accuracy = 1))
}
# calculate character % present
character_importance_plot = all_characters %>%
mutate(show = factor(show, levels = c('The Office', 'Parks & Recreation', 'Modern Family', 'Community', 'New Girl', 'The Good Place'))) %>%
mutate(per = character_episodes / total_episodes) %>%
ggplot(., aes(x = reorder(character_name , per), y = per, col = show)) +
geom_point(size = 3) +
coord_flip() +
theme(legend.position = 'none') +
labs(title = 'Character Importance by Show', subtitle = 'IMDb Episode Descriptions, All Seasons',
y = 'Percent Present in Episode Descriptions', x = '', fill = 'Season', caption = 'Data Source: IDMb.com | Author: @erikgregorywebb') +
facet_wrap(~show, scales = 'free') +
scale_y_continuous(labels = scales::percent_format(accuracy = 1), limits = c(0, 1)) +
facet_wrap(~show, scales = 'free') +
theme(text = element_text(size = 15))
character_importance_plot
# export data
setwd("~/Projects/imdb")
write_csv(all_shows, 'all_shows.csv')
write_csv(all_characters, 'all_characters.csv')
# export charts
png(filename = 'rating_trends_plot.png', width = 12, height = 8, units = 'in', res = 500)
rating_trends_plot
dev.off()
png(filename = 'character_importance_plot.png', width = 12, height = 8, units = 'in', res = 500)
character_importance_plot
dev.off()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment