Created
February 3, 2021 18:42
-
-
Save erikgregorywebb/5c6b0039bfe6fedffe0fc1914ce9d916 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# import packages | |
library(tidyverse) | |
library(rvest) | |
library(lubridate) | |
library(scales) | |
# define list of shows | |
shows = tibble( | |
name = c('The Office', 'Parks & Recreation', 'Modern Family', 'Community', 'New Girl', 'The Good Place'), | |
imdb_id = c('tt0386676', 'tt1266020', 'tt1442437', 'tt1439629', 'tt1826940', 'tt4955642'), | |
no_seasons = c(9, 7, 11, 6, 7, 4), | |
) | |
# define scraper function | |
scrape_show = function(name, imdb_id, no_seasons) { | |
datalist = list() | |
n = 1 | |
for (i in 1:no_seasons) { | |
Sys.sleep(3) | |
url = paste('https://www.imdb.com/title/', imdb_id, '/episodes?season=', i , sep = '') | |
page = read_html(url) | |
episodes = page %>% html_node('.list') %>% html_nodes('.info') | |
for (j in 1:length(episodes)) { | |
season = i | |
title = episodes[j] %>% html_node('a') %>% html_text() | |
airdate = episodes[j] %>% html_node('.airdate') %>% html_text() | |
rating = episodes[j] %>% html_node('.ipl-rating-star.small .ipl-rating-star__rating') %>% html_text() | |
votes = episodes[j] %>% html_node('.ipl-rating-star__total-votes') %>% html_text() | |
description = episodes[j] %>% html_node('.item_description') %>% html_text() | |
print(paste(title, airdate, rating, votes, description, sep = ' ')) | |
row = c(name, season, title, airdate, rating, votes, description) | |
datalist[[n]] = row | |
n = n +1 | |
} | |
} | |
raw = do.call(rbind, datalist) | |
return(raw) | |
} | |
# scrape show info | |
datalist = list() | |
for (i in 1:nrow(shows)) { | |
raw = scrape_show(name = shows$name[i], imdb_id = shows$imdb_id[i], no_seasons = shows$no_seasons[i]) | |
datalist[[i]] = raw | |
} | |
raw_all_shows = do.call(rbind, datalist) | |
# clean | |
all_shows = raw_all_shows %>% as_tibble() %>% | |
rename(show = V1, season = V2, title = V3, airdate = V4, rating = V5, votes = V6, description = V7) %>% | |
mutate(airdate = str_trim(airdate), description = str_trim(description)) %>% | |
mutate(airdate = dmy(airdate), rating = as.numeric(rating)) %>% | |
mutate(votes = gsub("\\(","", votes)) %>% mutate(votes = gsub("\\)","", votes)) %>% mutate(votes = as.numeric(gsub(",","", votes))) | |
# check season and episode counts | |
all_shows %>% group_by(show, season) %>% count() %>% View() | |
# plot rating trends | |
rating_trends_plot = all_shows %>% | |
mutate(season = factor(season, levels = c(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11))) %>% | |
mutate(show = factor(show, levels = c('The Office', 'Parks & Recreation', 'Modern Family', 'Community', 'New Girl', 'The Good Place'))) %>% | |
ggplot(., aes(x = airdate, y = rating, col = show, group = season)) + | |
geom_line(alpha = .5) + geom_smooth(se = F, size = 2) + | |
theme(legend.position = 'none') + | |
labs(title = 'Ratings Trend by Show', subtitle = 'IMDb Episode Average Rating, All Seasons', | |
y = 'Average Rating', x = 'Year', fill = 'Season', caption = 'Data Source: IDMb.com | Author: @erikgregorywebb') + | |
facet_wrap(~show, scales = 'free') + | |
scale_y_continuous(limits = c(6, 10)) + | |
theme(text = element_text(size = 15)) | |
rating_trends_plot | |
# define character count function | |
get_character_counts = function(df, show_name, character_name) { | |
total_episodes = df %>% filter(show == show_name) %>% nrow() | |
character_episodes = df %>% | |
filter(show == show_name) %>% filter(str_detect(description, character_name)) %>% nrow() | |
result = tibble(show = show_name, character_name = character_name, | |
total_episodes = total_episodes, character_episodes = character_episodes) | |
return(result) | |
} | |
# define looping function | |
get_all_character_counts = function(df, show_name, character_names) { | |
datalist = list() | |
for (i in 1:length(character_names)) { | |
datalist[[i]] = get_character_counts(all_shows, show_name, character_names[i]) | |
} | |
raw = do.call(rbind, datalist) | |
return(raw) | |
} | |
# the office | |
names = c('Michael', 'Dwight', 'Jim', 'Pam', 'Ryan', 'Andy', 'Oscar', 'Stanley', 'Kevin', 'Meredith', 'Angela', 'Phyllis', 'Toby', 'Kelly', 'Darryl') | |
the_office = get_all_character_counts(all_shows, 'The Office', names) | |
# parks & recreation | |
names = c('Leslie', 'Ann', 'Tom', 'Ron', 'April', 'Andy', 'Ben', 'Chris') | |
parks_recreation = get_all_character_counts(all_shows, 'Parks & Recreation', names) | |
# modern family | |
names = c('Jay', 'Gloria', 'Manny', 'Claire', 'Phil', 'Haley', 'Alex', 'Luke', 'Mitchell', 'Cameron', 'Lily') | |
modern_family = get_all_character_counts(all_shows, 'Modern Family', names) | |
# community | |
names = c('Jeff', 'Britta', 'Abed', 'Shirley', 'Annie', 'Troy', 'Pierce') | |
community = get_all_character_counts(all_shows, 'Community', names) | |
# new girl | |
names = c('Jess', 'Nick', 'Winston', 'Schmidt', 'Coach', 'Cece') | |
new_girl = get_all_character_counts(all_shows, 'New Girl', names) | |
# the good place | |
names = c(' Eleanor', 'Chidi', 'Tahani', 'Janet', 'Jason', 'Michael', 'Shawn') | |
the_good_place = get_all_character_counts(all_shows, 'The Good Place', names) | |
# combine | |
all_characters = bind_rows(the_office, parks_recreation, modern_family, community, new_girl, the_good_place) | |
# define plotting function | |
plot_characters = function(df) { | |
df %>% | |
mutate(per = character_episodes / total_episodes) %>% | |
ggplot(., aes(x = reorder(character_name , per), y = per)) + | |
geom_point(size = 2) + geom_line() + | |
coord_flip() + | |
labs(y = 'Percent Present', x = 'Name') + | |
scale_y_continuous(labels = scales::percent_format(accuracy = 1)) | |
} | |
# calculate character % present | |
character_importance_plot = all_characters %>% | |
mutate(show = factor(show, levels = c('The Office', 'Parks & Recreation', 'Modern Family', 'Community', 'New Girl', 'The Good Place'))) %>% | |
mutate(per = character_episodes / total_episodes) %>% | |
ggplot(., aes(x = reorder(character_name , per), y = per, col = show)) + | |
geom_point(size = 3) + | |
coord_flip() + | |
theme(legend.position = 'none') + | |
labs(title = 'Character Importance by Show', subtitle = 'IMDb Episode Descriptions, All Seasons', | |
y = 'Percent Present in Episode Descriptions', x = '', fill = 'Season', caption = 'Data Source: IDMb.com | Author: @erikgregorywebb') + | |
facet_wrap(~show, scales = 'free') + | |
scale_y_continuous(labels = scales::percent_format(accuracy = 1), limits = c(0, 1)) + | |
facet_wrap(~show, scales = 'free') + | |
theme(text = element_text(size = 15)) | |
character_importance_plot | |
# export data | |
setwd("~/Projects/imdb") | |
write_csv(all_shows, 'all_shows.csv') | |
write_csv(all_characters, 'all_characters.csv') | |
# export charts | |
png(filename = 'rating_trends_plot.png', width = 12, height = 8, units = 'in', res = 500) | |
rating_trends_plot | |
dev.off() | |
png(filename = 'character_importance_plot.png', width = 12, height = 8, units = 'in', res = 500) | |
character_importance_plot | |
dev.off() |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment