Skip to content

Instantly share code, notes, and snippets.

@TysonStanley
Last active October 31, 2019 07:10
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save TysonStanley/269abbacf2743e39da18e1d90400a0c1 to your computer and use it in GitHub Desktop.
Save TysonStanley/269abbacf2743e39da18e1d90400a0c1 to your computer and use it in GitHub Desktop.
# Tidy Tuesday (on Friday)
# 25 October 2019
# Tyson S. Barrett
# Packages ----
library(tidyverse) # CRAN
library(dtplyr) # CRAN
library(data.table) # CRAN
library(tidyfast) # remotes::install_github("tysonstanley/tidyfast")
# Data ----
url <- "https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2019/2019-10-22/horror_movies.csv"
horror_movies <- fread(url)
# for plots later on ----
library(ggbeeswarm)
theme_set(
theme_minimal() +
theme(panel.grid.major.x = element_blank())
)
# Data Dictionary ----
tibble::tribble(
~variable, ~class, ~description,
"title", "character", "Title of the movie",
"genres", "character", "Movie Generes",
"release_date", "character", "Movie release date - day-month-year",
"release_country", "character", "Release country",
"movie_rating", "character", "MPAA Rating",
"review_rating", "double", "Movie rating (0 - 10)",
"movie_run_time", "character", "Movie run time (minutes)",
"plot", "character", "Short plot description (raw text)",
"cast", "character", "Cast",
"language", "character", "Language",
"filming_locations", "character", "Filming location",
"budget", "character", "Budget (US Dollars)"
)
# dtplyr and tidyfast ----
horror <-
horror_movies %>%
lazy_dt() %>%
select(title, movie_rating, review_rating,
movie_run_time, plot, cast, filming_locations,
release_country, budget, release_date) %>%
mutate(usa = dt_case_when(release_country == "USA" ~ "USA",
release_country != "USA" ~ "Other Country")) %>%
mutate(movie_rating = fct_lump(movie_rating, 6)) %>%
mutate(budget = as.numeric(str_remove_all(budget, "\\$|,")),
budget_mil = budget/1e6) %>%
mutate(release_date = lubridate::dmy(release_date),
release_year = lubridate::year(release_date)) %>%
as.data.table()
# nested by movie rating ----
# adds column with lin-log regression model
# extracts and tidies the model
horror_nest <-
dt_nest(horror, movie_rating) %>%
lazy_dt() %>%
mutate(rate_budget = purrr::map(data, ~{
lm(review_rating ~ log(budget) + release_year, data = .x)
})) %>%
mutate(rate_budget_tidy = purrr::map(rate_budget, ~broom::tidy(.x))) %>%
as.data.table()
# unnest model estimates ----
# grab just the logged budget estimates
horror_coef <-
dt_unnest(horror_nest,
col = rate_budget_tidy,
id = movie_rating) %>%
lazy_dt() %>%
filter(term == "log(budget)") %>%
as_tibble()
# visualize model results ----
# estimates and confidence intervals
horror_coef %>%
ggplot(aes(x = movie_rating, y = estimate, color = movie_rating)) +
geom_point() +
geom_errorbar(aes(ymin = estimate - 1.96*std.error,
ymax = estimate + 1.96*std.error),
width = .3) +
labs(x = "",
y = "Rating Change per Additional\n1% Increase in Budget",
title = "") +
theme(legend.position = "none")
# Result: There does not seem to be a strong relationship
# between the amount of money spent on movie and
# the movie ratings for most movie ratings.
# That is, spending more doesn't consistently
# improve the rating of the movie. Only looks like
# it improves ratings (modestly) for PG-13 and R rated
# movies.
# additional visual of the same relationship
horror %>%
ggplot(aes(x = log(budget),
y = review_rating,
color = movie_rating)) +
geom_point(alpha = .3) +
geom_smooth(method = "lm") +
facet_wrap(~movie_rating, scales = "free") +
theme(legend.position = "none") +
labs(y = "Movie Review Rating",
x = "Budget (logged units)")
# Result: Same conclusions as before, where PG-13 and R
# appear to have a positive relationship but this
# doesn't hold for other ratings
# other explorations ----
# Movie Rating by Release Date ----
ggplot(horror, aes(x = release_date, y = review_rating)) +
geom_point(alpha = .7, color = "darkgrey") +
geom_smooth(color = "coral3", size = 2) +
labs(x = "",
y = "Movie Review Rating")
# Movie Rating by USA vs Other ----
ggplot(horror) +
aes(x = usa, y = review_rating, color = movie_rating) +
geom_beeswarm(alpha = .7) +
stat_summary(color = "red") +
facet_wrap(~movie_rating) +
labs(x = "",
y = "Movie Review Rating") +
theme(legend.position = "none")
@TysonStanley
Copy link
Author

Output figures

Figure 1

2019-10-22-fig1

Figure 2

2019-10-22-fig2

Figure 3

2019-10-22-fig3

Figure 4

2019-10-22-fig4

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment