This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# calculate character % present | |
character_importance_plot = all_characters %>% | |
mutate(show = factor(show, levels = c('The Office', 'Parks & Recreation', 'Modern Family', 'Community', 'New Girl', 'The Good Place'))) %>% | |
mutate(per = character_episodes / total_episodes) %>% | |
ggplot(., aes(x = reorder(character_name , per), y = per, col = show)) + | |
geom_point(size = 3) + | |
coord_flip() + | |
theme(legend.position = 'none') + | |
labs(title = 'Character Importance by Show', subtitle = 'IMDb Episode Descriptions, All Seasons', | |
y = 'Percent Present in Episode Descriptions', x = '', fill = 'Season', caption = 'Data Source: IDMb.com | Author: @erikgregorywebb') + |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# define character count function
# Counts the rows of `df` whose `show` equals `show_name`, and how many of
# those rows have a `description` that matches `character_name`
# (str_detect() interprets the pattern as a regular expression by default).
# Returns a one-row tibble with the columns show, character_name,
# total_episodes and character_episodes.
get_character_counts = function(df, show_name, character_name) {
  # filter down to the requested show once and reuse it for both counts
  show_rows = df %>% filter(show == show_name)
  total_episodes = nrow(show_rows)
  character_episodes = show_rows %>%
    filter(str_detect(description, character_name)) %>%
    nrow()
  tibble(
    show = show_name,
    character_name = character_name,
    total_episodes = total_episodes,
    character_episodes = character_episodes
  )
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
rating_trends_plot = all_shows %>% | |
mutate(season = factor(season, levels = c(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11))) %>% | |
mutate(show = factor(show, levels = c('The Office', 'Parks & Recreation', 'Modern Family', 'Community', 'New Girl', 'The Good Place'))) %>% | |
ggplot(., aes(x = airdate, y = rating, col = show, group = season)) + | |
geom_line(alpha = .5) + geom_smooth(se = F, size = 2) + | |
theme(legend.position = 'none') + | |
labs(title = 'Ratings Trend by Show', subtitle = 'IMDb Episode Average Rating, All Seasons', | |
y = 'Average Rating', x = 'Year', fill = 'Season', caption = 'Data Source: IDMb.com | Author: @erikgregorywebb') + | |
facet_wrap(~show, scales = 'free') + | |
scale_y_continuous(limits = c(6, 10)) + |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# define scraper function | |
scrape_show = function(name, imdb_id, no_seasons) { | |
datalist = list() | |
n = 1 | |
for (i in 1:no_seasons) { | |
Sys.sleep(3) | |
url = paste('https://www.imdb.com/title/', imdb_id, '/episodes?season=', i , sep = '') | |
page = read_html(url) | |
episodes = page %>% html_node('.list') %>% html_nodes('.info') | |
for (j in 1:length(episodes)) { |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# define list of shows
# one row per show: display name, IMDb title id, number of seasons
shows = tribble(
  ~name,                ~imdb_id,    ~no_seasons,
  'The Office',         'tt0386676', 9,
  'Parks & Recreation', 'tt1266020', 7,
  'Modern Family',      'tt1442437', 11,
  'Community',          'tt1439629', 6,
  'New Girl',           'tt1826940', 7,
  'The Good Place',     'tt4955642', 4
)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# import packages | |
library(tidyverse) | |
library(rvest) | |
library(lubridate) | |
library(scales) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# import packages | |
library(tidyverse) | |
library(rvest) | |
library(lubridate) | |
library(scales) | |
# define list of shows | |
shows = tibble( | |
name = c('The Office', 'Parks & Recreation', 'Modern Family', 'Community', 'New Girl', 'The Good Place'), | |
imdb_id = c('tt0386676', 'tt1266020', 'tt1442437', 'tt1439629', 'tt1826940', 'tt4955642'), |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# import | |
library(rJava) | |
library(rMouse) | |
Sys.setenv(JAVA_HOME='C:/Program Files/Java/jre1.8.0_241') # for 64-bit version | |
# loop | |
condition = FALSE | |
while(condition == FALSE) { | |
Sys.sleep(30) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
library(rvest) | |
library(stringr) | |
# get list of gists | |
all_links = c() | |
for (i in 1:25) { | |
url = paste('https://gist.github.com/erikgregorywebb?page=', i, sep ='') | |
print(url) | |
Sys.sleep(1) | |
page = read_html(url) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
library(tidyverse) | |
library(rvest) | |
library(tools) | |
library(fuzzyjoin) | |
# extract list of programs by state (source: mastersindatascience.org)
url = 'https://www.mastersindatascience.org/schools/'
page = read_html(url)
# take the href attribute of every <a> node found inside a '.row' node
states = html_attr(html_nodes(html_nodes(page, '.row'), 'a'), 'href')