erikgregorywebb/imdb-all.R

## imdb-all.R
# import packages
library(tidyverse)
library(rvest)
library(lubridate)
library(scales)

# define list of shows
shows = tibble(
  name = c('The Office', 'Parks & Recreation', 'Modern Family', 'Community', 'New Girl', 'The Good Place'),
  imdb_id = c('tt0386676', 'tt1266020', 'tt1442437', 'tt1439629', 'tt1826940', 'tt4955642'),
  no_seasons = c(9, 7, 11, 6, 7, 4),
)

# define scraper function
scrape_show = function(name, imdb_id, no_seasons) {
  datalist = list()
  n = 1
  for (i in 1:no_seasons) {
    Sys.sleep(3)
    url = paste('https://www.imdb.com/title/', imdb_id, '/episodes?season=', i , sep = '')
    page = read_html(url)
    episodes = page %>% html_node('.list') %>% html_nodes('.info')
    for (j in 1:length(episodes)) {
      season = i
      title = episodes[j] %>% html_node('a') %>% html_text()
      airdate = episodes[j] %>% html_node('.airdate') %>% html_text()
      rating = episodes[j] %>% html_node('.ipl-rating-star.small .ipl-rating-star__rating') %>% html_text()
      votes = episodes[j] %>% html_node('.ipl-rating-star__total-votes') %>% html_text()
      description = episodes[j] %>% html_node('.item_description') %>% html_text()
      print(paste(title, airdate, rating, votes, description, sep = ' '))
      row = c(name, season, title, airdate, rating, votes, description)
      datalist[[n]] = row
      n = n +1
    }
  }
  raw = do.call(rbind, datalist)
  return(raw)
}

# scrape show info
datalist = list()
for (i in 1:nrow(shows)) {
  raw = scrape_show(name = shows$name[i], imdb_id = shows$imdb_id[i], no_seasons = shows$no_seasons[i])
  datalist[[i]] = raw
}
raw_all_shows = do.call(rbind, datalist)

# clean
all_shows = raw_all_shows %>% as_tibble() %>%
  rename(show = V1, season = V2, title = V3, airdate = V4, rating = V5, votes = V6, description = V7) %>%
  mutate(airdate = str_trim(airdate), description = str_trim(description)) %>%
  mutate(airdate = dmy(airdate), rating = as.numeric(rating)) %>%
  mutate(votes = gsub("\\(","", votes)) %>% mutate(votes = gsub("\\)","", votes)) %>% mutate(votes = as.numeric(gsub(",","", votes)))

# check season and episode counts
all_shows %>% group_by(show, season) %>% count() %>% View()

# plot rating trends
rating_trends_plot = all_shows %>%
  mutate(season = factor(season, levels = c(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11))) %>%
  mutate(show = factor(show, levels = c('The Office', 'Parks & Recreation', 'Modern Family', 'Community', 'New Girl', 'The Good Place'))) %>%
  ggplot(., aes(x = airdate, y = rating, col = show, group = season)) +
  geom_line(alpha = .5) + geom_smooth(se = F, size = 2) +
  theme(legend.position = 'none') +
  labs(title = 'Ratings Trend by Show', subtitle = 'IMDb Episode Average Rating, All Seasons',
       y = 'Average Rating', x = 'Year', fill = 'Season', caption = 'Data Source: IDMb.com | Author: @erikgregorywebb') +
  facet_wrap(~show, scales = 'free') +
  scale_y_continuous(limits = c(6, 10)) +
  theme(text = element_text(size = 15))
rating_trends_plot

# define character count function
get_character_counts = function(df, show_name, character_name) {
  total_episodes = df %>% filter(show == show_name) %>% nrow()
  character_episodes = df %>%
    filter(show == show_name) %>% filter(str_detect(description, character_name)) %>% nrow()
  result = tibble(show = show_name, character_name = character_name,
                  total_episodes = total_episodes, character_episodes = character_episodes)
  return(result)
}

# define looping function
get_all_character_counts = function(df, show_name, character_names) {
  datalist = list()
  for (i in 1:length(character_names)) {
    datalist[[i]] = get_character_counts(all_shows, show_name, character_names[i])
  }
  raw = do.call(rbind, datalist)
  return(raw)
}

# the office
names = c('Michael', 'Dwight', 'Jim', 'Pam', 'Ryan', 'Andy', 'Oscar', 'Stanley', 'Kevin', 'Meredith', 'Angela', 'Phyllis', 'Toby', 'Kelly', 'Darryl')
the_office = get_all_character_counts(all_shows, 'The Office', names)

# parks & recreation
names = c('Leslie', 'Ann', 'Tom', 'Ron', 'April', 'Andy', 'Ben', 'Chris')
parks_recreation = get_all_character_counts(all_shows, 'Parks & Recreation', names)

# modern family
names = c('Jay', 'Gloria', 'Manny', 'Claire', 'Phil', 'Haley', 'Alex', 'Luke', 'Mitchell', 'Cameron', 'Lily')
modern_family = get_all_character_counts(all_shows, 'Modern Family', names)

# community
names = c('Jeff', 'Britta', 'Abed', 'Shirley', 'Annie', 'Troy', 'Pierce')
community = get_all_character_counts(all_shows, 'Community', names)

# new girl
names = c('Jess', 'Nick', 'Winston', 'Schmidt', 'Coach', 'Cece')
new_girl = get_all_character_counts(all_shows, 'New Girl', names)

# the good place
names = c(' Eleanor', 'Chidi', 'Tahani', 'Janet', 'Jason', 'Michael', 'Shawn')
the_good_place = get_all_character_counts(all_shows, 'The Good Place', names)

# combine
all_characters = bind_rows(the_office, parks_recreation, modern_family, community, new_girl, the_good_place)

# define plotting function
plot_characters = function(df) {
  df %>%
    mutate(per = character_episodes / total_episodes) %>%
    ggplot(., aes(x = reorder(character_name , per), y = per)) +
    geom_point(size = 2) + geom_line() +
    coord_flip() +
    labs(y = 'Percent Present', x = 'Name') +
    scale_y_continuous(labels = scales::percent_format(accuracy = 1))
}

# calculate character % present
character_importance_plot = all_characters %>%
  mutate(show = factor(show, levels = c('The Office', 'Parks & Recreation', 'Modern Family', 'Community', 'New Girl', 'The Good Place'))) %>%
  mutate(per = character_episodes / total_episodes) %>%
  ggplot(., aes(x = reorder(character_name , per), y = per, col = show)) +
  geom_point(size = 3) +
  coord_flip() +
  theme(legend.position = 'none') +
  labs(title = 'Character Importance by Show', subtitle = 'IMDb Episode Descriptions, All Seasons',
       y = 'Percent Present in Episode Descriptions', x = '', fill = 'Season', caption = 'Data Source: IDMb.com | Author: @erikgregorywebb') +
  facet_wrap(~show, scales = 'free') +
  scale_y_continuous(labels = scales::percent_format(accuracy = 1), limits = c(0, 1)) +
  facet_wrap(~show, scales = 'free') +
  theme(text = element_text(size = 15))
character_importance_plot

# export data
setwd("~/Projects/imdb")
write_csv(all_shows, 'all_shows.csv')
write_csv(all_characters, 'all_characters.csv')

# export charts
png(filename = 'rating_trends_plot.png', width = 12, height = 8, units = 'in', res = 500)
rating_trends_plot
dev.off()

png(filename = 'character_importance_plot.png', width = 12, height = 8, units = 'in', res = 500)
character_importance_plot
dev.off()
	# import packages
	library(tidyverse)
	library(rvest)
	library(lubridate)
	library(scales)

	# define list of shows
	shows = tibble(
	name = c('The Office', 'Parks & Recreation', 'Modern Family', 'Community', 'New Girl', 'The Good Place'),
	imdb_id = c('tt0386676', 'tt1266020', 'tt1442437', 'tt1439629', 'tt1826940', 'tt4955642'),
	no_seasons = c(9, 7, 11, 6, 7, 4),
	)

	# define scraper function
	scrape_show = function(name, imdb_id, no_seasons) {
	datalist = list()
	n = 1
	for (i in 1:no_seasons) {
	Sys.sleep(3)
	url = paste('https://www.imdb.com/title/', imdb_id, '/episodes?season=', i , sep = '')
	page = read_html(url)
	episodes = page %>% html_node('.list') %>% html_nodes('.info')
	for (j in 1:length(episodes)) {
	season = i
	title = episodes[j] %>% html_node('a') %>% html_text()
	airdate = episodes[j] %>% html_node('.airdate') %>% html_text()
	rating = episodes[j] %>% html_node('.ipl-rating-star.small .ipl-rating-star__rating') %>% html_text()
	votes = episodes[j] %>% html_node('.ipl-rating-star__total-votes') %>% html_text()
	description = episodes[j] %>% html_node('.item_description') %>% html_text()
	print(paste(title, airdate, rating, votes, description, sep = ' '))
	row = c(name, season, title, airdate, rating, votes, description)
	datalist[[n]] = row
	n = n +1
	}
	}
	raw = do.call(rbind, datalist)
	return(raw)
	}

	# scrape show info
	datalist = list()
	for (i in 1:nrow(shows)) {
	raw = scrape_show(name = shows$name[i], imdb_id = shows$imdb_id[i], no_seasons = shows$no_seasons[i])
	datalist[[i]] = raw
	}
	raw_all_shows = do.call(rbind, datalist)

	# clean
	all_shows = raw_all_shows %>% as_tibble() %>%
	rename(show = V1, season = V2, title = V3, airdate = V4, rating = V5, votes = V6, description = V7) %>%
	mutate(airdate = str_trim(airdate), description = str_trim(description)) %>%
	mutate(airdate = dmy(airdate), rating = as.numeric(rating)) %>%
	mutate(votes = gsub("\\(","", votes)) %>% mutate(votes = gsub("\\)","", votes)) %>% mutate(votes = as.numeric(gsub(",","", votes)))

	# check season and episode counts
	all_shows %>% group_by(show, season) %>% count() %>% View()

	# plot rating trends
	rating_trends_plot = all_shows %>%
	mutate(season = factor(season, levels = c(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11))) %>%
	mutate(show = factor(show, levels = c('The Office', 'Parks & Recreation', 'Modern Family', 'Community', 'New Girl', 'The Good Place'))) %>%
	ggplot(., aes(x = airdate, y = rating, col = show, group = season)) +
	geom_line(alpha = .5) + geom_smooth(se = F, size = 2) +
	theme(legend.position = 'none') +
	labs(title = 'Ratings Trend by Show', subtitle = 'IMDb Episode Average Rating, All Seasons',
	y = 'Average Rating', x = 'Year', fill = 'Season', caption = 'Data Source: IDMb.com \| Author: @erikgregorywebb') +
	facet_wrap(~show, scales = 'free') +
	scale_y_continuous(limits = c(6, 10)) +
	theme(text = element_text(size = 15))
	rating_trends_plot

	# define character count function
	get_character_counts = function(df, show_name, character_name) {
	total_episodes = df %>% filter(show == show_name) %>% nrow()
	character_episodes = df %>%
	filter(show == show_name) %>% filter(str_detect(description, character_name)) %>% nrow()
	result = tibble(show = show_name, character_name = character_name,
	total_episodes = total_episodes, character_episodes = character_episodes)
	return(result)
	}

	# define looping function
	get_all_character_counts = function(df, show_name, character_names) {
	datalist = list()
	for (i in 1:length(character_names)) {
	datalist[[i]] = get_character_counts(all_shows, show_name, character_names[i])
	}
	raw = do.call(rbind, datalist)
	return(raw)
	}

	# the office
	names = c('Michael', 'Dwight', 'Jim', 'Pam', 'Ryan', 'Andy', 'Oscar', 'Stanley', 'Kevin', 'Meredith', 'Angela', 'Phyllis', 'Toby', 'Kelly', 'Darryl')
	the_office = get_all_character_counts(all_shows, 'The Office', names)

	# parks & recreation
	names = c('Leslie', 'Ann', 'Tom', 'Ron', 'April', 'Andy', 'Ben', 'Chris')
	parks_recreation = get_all_character_counts(all_shows, 'Parks & Recreation', names)

	# modern family
	names = c('Jay', 'Gloria', 'Manny', 'Claire', 'Phil', 'Haley', 'Alex', 'Luke', 'Mitchell', 'Cameron', 'Lily')
	modern_family = get_all_character_counts(all_shows, 'Modern Family', names)

	# community
	names = c('Jeff', 'Britta', 'Abed', 'Shirley', 'Annie', 'Troy', 'Pierce')
	community = get_all_character_counts(all_shows, 'Community', names)

	# new girl
	names = c('Jess', 'Nick', 'Winston', 'Schmidt', 'Coach', 'Cece')
	new_girl = get_all_character_counts(all_shows, 'New Girl', names)

	# the good place
	names = c(' Eleanor', 'Chidi', 'Tahani', 'Janet', 'Jason', 'Michael', 'Shawn')
	the_good_place = get_all_character_counts(all_shows, 'The Good Place', names)

	# combine
	all_characters = bind_rows(the_office, parks_recreation, modern_family, community, new_girl, the_good_place)

	# define plotting function
	plot_characters = function(df) {
	df %>%
	mutate(per = character_episodes / total_episodes) %>%
	ggplot(., aes(x = reorder(character_name , per), y = per)) +
	geom_point(size = 2) + geom_line() +
	coord_flip() +
	labs(y = 'Percent Present', x = 'Name') +
	scale_y_continuous(labels = scales::percent_format(accuracy = 1))
	}

	# calculate character % present
	character_importance_plot = all_characters %>%
	mutate(show = factor(show, levels = c('The Office', 'Parks & Recreation', 'Modern Family', 'Community', 'New Girl', 'The Good Place'))) %>%
	mutate(per = character_episodes / total_episodes) %>%
	ggplot(., aes(x = reorder(character_name , per), y = per, col = show)) +
	geom_point(size = 3) +
	coord_flip() +
	theme(legend.position = 'none') +
	labs(title = 'Character Importance by Show', subtitle = 'IMDb Episode Descriptions, All Seasons',
	y = 'Percent Present in Episode Descriptions', x = '', fill = 'Season', caption = 'Data Source: IDMb.com \| Author: @erikgregorywebb') +
	facet_wrap(~show, scales = 'free') +
	scale_y_continuous(labels = scales::percent_format(accuracy = 1), limits = c(0, 1)) +
	facet_wrap(~show, scales = 'free') +
	theme(text = element_text(size = 15))
	character_importance_plot

	# export data
	setwd("~/Projects/imdb")
	write_csv(all_shows, 'all_shows.csv')
	write_csv(all_characters, 'all_characters.csv')

	# export charts
	png(filename = 'rating_trends_plot.png', width = 12, height = 8, units = 'in', res = 500)
	rating_trends_plot
	dev.off()

	png(filename = 'character_importance_plot.png', width = 12, height = 8, units = 'in', res = 500)
	character_importance_plot
	dev.off()