Erik erikgregorywebb

## imdb-character-importance-plot.R
  # calculate character % present
  character_importance_plot = all_characters %>%
    mutate(show = factor(show, levels = c('The Office', 'Parks & Recreation', 'Modern Family', 'Community', 'New Girl', 'The Good Place'))) %>%
    mutate(per = character_episodes / total_episodes) %>%
    ggplot(., aes(x = reorder(character_name , per), y = per, col = show)) +
    geom_point(size = 3) +
    coord_flip() +
    theme(legend.position = 'none') +
    labs(title = 'Character Importance by Show', subtitle = 'IMDb Episode Descriptions, All Seasons',
         y = 'Percent Present in Episode Descriptions', x = '', fill = 'Season', caption = 'Data Source: IDMb.com | Author: @erikgregorywebb') +

## imdb-character-importance.R
  # Count how many of a show's episode descriptions mention a character.
  #
  # df             : data frame with `show` and `description` columns
  # show_name      : value compared with `==` against the `show` column
  # character_name : pattern handed to str_detect() (a regular expression by
  #                  default, so it also matches inside longer words)
  #
  # Returns a tibble with columns show, character_name, total_episodes
  # (rows for the show) and character_episodes (rows for the show whose
  # description matches the pattern).
  get_character_counts = function(df, show_name, character_name) {
    # narrow to the requested show once, then narrow again to the mentions
    show_rows = filter(df, show == show_name)
    mention_rows = filter(show_rows, str_detect(description, character_name))
    tibble(
      show = show_name,
      character_name = character_name,
      total_episodes = nrow(show_rows),
      character_episodes = nrow(mention_rows)
    )
  }

## imdb-rating-trends-plot.R
  rating_trends_plot = all_shows %>%
    mutate(season = factor(season, levels = c(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11))) %>%
    mutate(show = factor(show, levels = c('The Office', 'Parks & Recreation', 'Modern Family', 'Community', 'New Girl', 'The Good Place'))) %>%
    ggplot(., aes(x = airdate, y = rating, col = show, group = season)) +
    geom_line(alpha = .5) + geom_smooth(se = F, size = 2) +
    theme(legend.position = 'none') +
    labs(title = 'Ratings Trend by Show', subtitle = 'IMDb Episode Average Rating, All Seasons',
         y = 'Average Rating', x = 'Year', fill = 'Season', caption = 'Data Source: IDMb.com | Author: @erikgregorywebb') +
    facet_wrap(~show, scales = 'free') +
    scale_y_continuous(limits = c(6, 10)) +

## imdb-scrape-clean.R
  # define scraper function
  scrape_show = function(name, imdb_id, no_seasons) {
    datalist = list()
    n = 1
    for (i in 1:no_seasons) {
      Sys.sleep(3)
      url = paste('https://www.imdb.com/title/', imdb_id, '/episodes?season=', i , sep = '')
      page = read_html(url)
      episodes = page %>% html_node('.list') %>% html_nodes('.info')
      for (j in 1:length(episodes)) {

## imdb-define-shows.R
  # shows table, one row per show: display name, IMDb title id and the
  # number of seasons (column names match the arguments of scrape_show())
  shows = tribble(
    ~name,                ~imdb_id,    ~no_seasons,
    'The Office',         'tt0386676',  9,
    'Parks & Recreation', 'tt1266020',  7,
    'Modern Family',      'tt1442437', 11,
    'Community',          'tt1439629',  6,
    'New Girl',           'tt1826940',  7,
    'The Good Place',     'tt4955642',  4
  )

## imdb-packages.R
  # import packages
  library(tidyverse)
  library(rvest)
  library(lubridate)
  library(scales)

## imdb-all.R
# import packages
library(tidyverse)
library(rvest)
library(lubridate)
library(scales)

# define list of shows
shows = tibble(
  name = c('The Office', 'Parks & Recreation', 'Modern Family', 'Community', 'New Girl', 'The Good Place'),
  imdb_id = c('tt0386676', 'tt1266020', 'tt1442437', 'tt1439629', 'tt1826940', 'tt4955642'),

## mouse-backup.R

# import
# JAVA_HOME must be in the environment before rJava is attached: rJava looks
# for the Java installation while it loads, so setting the variable after
# library(rJava) comes too late to influence which JVM is used.
Sys.setenv(JAVA_HOME='C:/Program Files/Java/jre1.8.0_241') # for 64-bit version
library(rJava)
library(rMouse)

# loop
condition = FALSE
while(condition == FALSE) {
  Sys.sleep(30)

## top-r-packages.R
library(rvest)
library(stringr)

# get list of gists
all_links = c()
for (i in 1:25) {
  url = paste('https://gist.github.com/erikgregorywebb?page=', i, sep ='')
  print(url)
  Sys.sleep(1)
  page = read_html(url)

## hbcus-scrape.R
library(tidyverse)
library(rvest)
library(tools)
library(fuzzyjoin)

# extract list of programs by state (source: mastersindatascience.org)
# index page that is scraped for links
url = 'https://www.mastersindatascience.org/schools/'
# download and parse the page
page = read_html(url)
# href attribute of every <a> nested inside an element with class "row";
# presumably one link per state page -- verify against the live markup
states = page %>% html_nodes('.row') %>% html_nodes('a') %>% html_attr('href')
	# calculate character % present
	character_importance_plot = all_characters %>%
	mutate(show = factor(show, levels = c('The Office', 'Parks & Recreation', 'Modern Family', 'Community', 'New Girl', 'The Good Place'))) %>%
	mutate(per = character_episodes / total_episodes) %>%
	ggplot(., aes(x = reorder(character_name , per), y = per, col = show)) +
	geom_point(size = 3) +
	coord_flip() +
	theme(legend.position = 'none') +
	labs(title = 'Character Importance by Show', subtitle = 'IMDb Episode Descriptions, All Seasons',
	y = 'Percent Present in Episode Descriptions', x = '', fill = 'Season', caption = 'Data Source: IDMb.com \| Author: @erikgregorywebb') +
	# Count how many of a show's episode descriptions mention a character.
	#
	# df             : data frame with `show` and `description` columns
	# show_name      : value compared with `==` against the `show` column
	# character_name : pattern handed to str_detect() (a regular expression by
	#                  default, so it also matches inside longer words)
	#
	# Returns a tibble with columns show, character_name, total_episodes
	# (rows for the show) and character_episodes (rows for the show whose
	# description matches the pattern).
	get_character_counts = function(df, show_name, character_name) {
	  # narrow to the requested show once, then narrow again to the mentions
	  show_rows = filter(df, show == show_name)
	  mention_rows = filter(show_rows, str_detect(description, character_name))
	  tibble(
	    show = show_name,
	    character_name = character_name,
	    total_episodes = nrow(show_rows),
	    character_episodes = nrow(mention_rows)
	  )
	}
	rating_trends_plot = all_shows %>%
	mutate(season = factor(season, levels = c(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11))) %>%
	mutate(show = factor(show, levels = c('The Office', 'Parks & Recreation', 'Modern Family', 'Community', 'New Girl', 'The Good Place'))) %>%
	ggplot(., aes(x = airdate, y = rating, col = show, group = season)) +
	geom_line(alpha = .5) + geom_smooth(se = F, size = 2) +
	theme(legend.position = 'none') +
	labs(title = 'Ratings Trend by Show', subtitle = 'IMDb Episode Average Rating, All Seasons',
	y = 'Average Rating', x = 'Year', fill = 'Season', caption = 'Data Source: IDMb.com \| Author: @erikgregorywebb') +
	facet_wrap(~show, scales = 'free') +
	scale_y_continuous(limits = c(6, 10)) +
	# define scraper function
	scrape_show = function(name, imdb_id, no_seasons) {
	datalist = list()
	n = 1
	for (i in 1:no_seasons) {
	Sys.sleep(3)
	url = paste('https://www.imdb.com/title/', imdb_id, '/episodes?season=', i , sep = '')
	page = read_html(url)
	episodes = page %>% html_node('.list') %>% html_nodes('.info')
	for (j in 1:length(episodes)) {
	# shows table, one row per show: display name, IMDb title id and the
	# number of seasons (column names match the arguments of scrape_show())
	shows = tribble(
	  ~name,                ~imdb_id,    ~no_seasons,
	  'The Office',         'tt0386676',  9,
	  'Parks & Recreation', 'tt1266020',  7,
	  'Modern Family',      'tt1442437', 11,
	  'Community',          'tt1439629',  6,
	  'New Girl',           'tt1826940',  7,
	  'The Good Place',     'tt4955642',  4
	)
	# import packages
	library(tidyverse)
	library(rvest)
	library(lubridate)
	library(scales)

	# import
	# JAVA_HOME must be in the environment before rJava is attached: rJava looks
	# for the Java installation while it loads, so setting the variable after
	# library(rJava) comes too late to influence which JVM is used.
	Sys.setenv(JAVA_HOME='C:/Program Files/Java/jre1.8.0_241') # for 64-bit version
	library(rJava)
	library(rMouse)

	# loop
	condition = FALSE
	while(condition == FALSE) {
	Sys.sleep(30)
	library(rvest)
	library(stringr)

	# get list of gists
	all_links = c()
	for (i in 1:25) {
	url = paste('https://gist.github.com/erikgregorywebb?page=', i, sep ='')
	print(url)
	Sys.sleep(1)
	page = read_html(url)
	library(tidyverse)
	library(rvest)
	library(tools)
	library(fuzzyjoin)

	# extract list of programs by state (source: mastersindatascience.org)
	# index page that is scraped for links
	url = 'https://www.mastersindatascience.org/schools/'
	# download and parse the page
	page = read_html(url)
	# href attribute of every <a> nested inside an element with class "row";
	# presumably one link per state page -- verify against the live markup
	states = page %>% html_nodes('.row') %>% html_nodes('a') %>% html_attr('href')