Skip to content

Instantly share code, notes, and snippets.

View erikgregorywebb's full-sized avatar
📈

Erik erikgregorywebb

📈
View GitHub Profile
# calculate character % present
character_importance_plot = all_characters %>%
mutate(show = factor(show, levels = c('The Office', 'Parks & Recreation', 'Modern Family', 'Community', 'New Girl', 'The Good Place'))) %>%
mutate(per = character_episodes / total_episodes) %>%
ggplot(., aes(x = reorder(character_name , per), y = per, col = show)) +
geom_point(size = 3) +
coord_flip() +
theme(legend.position = 'none') +
labs(title = 'Character Importance by Show', subtitle = 'IMDb Episode Descriptions, All Seasons',
y = 'Percent Present in Episode Descriptions', x = '', fill = 'Season', caption = 'Data Source: IDMb.com | Author: @erikgregorywebb') +
# define character count function
#
# df             : data frame with one row per episode; the `show` and
#                  `description` columns are read
# show_name      : value compared for equality against `show`
# character_name : pattern handed to str_detect() for matching `description`
#                  (stringr treats a plain string as a regular expression)
#
# returns a one-row tibble with show, character_name, total_episodes
# (episodes of that show) and character_episodes (those whose description
# matches the pattern)
get_character_counts <- function(df, show_name, character_name) {
  # all episodes belonging to the requested show
  show_rows <- filter(df, show == show_name)
  # the subset whose description matches the character pattern
  mention_rows <- filter(show_rows, str_detect(description, character_name))
  tibble(
    show = show_name,
    character_name = character_name,
    total_episodes = nrow(show_rows),
    character_episodes = nrow(mention_rows)
  )
}
# define scraper function
scrape_show = function(name, imdb_id, no_seasons) {
datalist = list()
n = 1
for (i in 1:no_seasons) {
Sys.sleep(3)
url = paste('https://www.imdb.com/title/', imdb_id, '/episodes?season=', i , sep = '')
page = read_html(url)
episodes = page %>% html_node('.list') %>% html_nodes('.info')
for (j in 1:length(episodes)) {
# define list of shows
# one row per show: display name, IMDb title id, number of seasons
shows <- tribble(
  ~name,                ~imdb_id,    ~no_seasons,
  'The Office',         'tt0386676', 9,
  'Parks & Recreation', 'tt1266020', 7,
  'Modern Family',      'tt1442437', 11,
  'Community',          'tt1439629', 6,
  'New Girl',           'tt1826940', 7,
  'The Good Place',     'tt4955642', 4
)
# import packages
library(tidyverse)
library(rvest)
library(lubridate)
library(scales)
# import packages
library(tidyverse)
library(rvest)
library(lubridate)
library(scales)
# define list of shows
shows = tibble(
name = c('The Office', 'Parks & Recreation', 'Modern Family', 'Community', 'New Girl', 'The Good Place'),
imdb_id = c('tt0386676', 'tt1266020', 'tt1442437', 'tt1439629', 'tt1826940', 'tt4955642'),
# import
# JAVA_HOME is set before rJava is attached: rJava looks up the Java runtime
# while the package is loading, so setting the variable afterwards comes too
# late to influence which installation is used
Sys.setenv(JAVA_HOME='C:/Program Files/Java/jre1.8.0_241') # for 64-bit version
library(rJava)
library(rMouse)
# loop
condition = FALSE
while(condition == FALSE) {
Sys.sleep(30)
library(rvest)
library(stringr)
# get list of gists
all_links = c()
for (i in 1:25) {
url = paste('https://gist.github.com/erikgregorywebb?page=', i, sep ='')
print(url)
Sys.sleep(1)
page = read_html(url)
library(tidyverse)
library(rvest)
library(tools)
library(fuzzyjoin)
# extract list of programs by state (source: mastersindatascience.org)
# download the schools index page, then collect the href attribute of every
# <a> element found beneath nodes with class `row`
url <- 'https://www.mastersindatascience.org/schools/'
page <- read_html(url)
states <- html_attr(html_nodes(html_nodes(page, '.row'), 'a'), 'href')