Skip to content

Instantly share code, notes, and snippets.

@conormm
Created September 2, 2017 18:32
Show Gist options
  • Save conormm/7f3e039ab03cb0abc00754518af86e39 to your computer and use it in GitHub Desktop.
Save conormm/7f3e039ab03cb0abc00754518af86e39 to your computer and use it in GitHub Desktop.
library(tidyverse)
library(here)
library(broom)
library(corrr)
library(forcats)
library(stringr)
library(lubridate)
library(gridExtra)
# Read the movie metadata CSV (relative path) into `df`, then print a
# compact column-by-column overview of it.
df <- read_csv(file = "movie_metadata.csv")
glimpse(df)
# Summary statistics of `imdb_score` (min, max, mean, median, sd), reshaped
# to long form: one row per statistic, in columns `stat` and `value`.
imdb_score_summary <- gather(
  summarise_each(df, funs(min, max, mean, median, sd), imdb_score),
  stat,
  value
)
# Histogram of IMDB scores with the summary-statistics table drawn on the
# plot area as a grob (placed at x in [2.5, 4], y in [300, 400]).
ggplot(df, aes(x = imdb_score)) +
  geom_histogram(bins = 50, colour = "black", fill = "red", alpha = 0.4) +
  annotation_custom(
    tableGrob(imdb_score_summary, rows = NULL),
    xmin = 2.5, xmax = 4,
    ymin = 300, ymax = 400
  ) +
  labs(title = "IMDB score distribution") +
  theme_minimal()
# Average IMDB rating per release year, with a linear trend line and an
# error bar per year spanning the central 95% of that year's scores.
df %>%
  group_by(title_year) %>%
  summarise(
    mean_rating = mean(imdb_score),
    # Symmetric 2.5% / 97.5% quantiles (the lower bound was 0.0275, which
    # does not mirror the 0.975 upper bound).
    lower_rating = quantile(imdb_score, 0.025),
    upper_rating = quantile(imdb_score, 0.975)
  ) %>%
  ggplot(aes(title_year, mean_rating)) +
  geom_line(colour = "dodger blue") +
  geom_point(alpha = 0.5) +
  geom_smooth(method = "lm", colour = "red", alpha = 0.6, se = FALSE) +
  # ymin must be the lower bound and ymax the upper bound (they were swapped).
  geom_errorbar(aes(ymin = lower_rating, ymax = upper_rating)) +
  geom_rug() +
  theme_minimal() +
  labs(
    title = "Average IMDB movie rating per year",
    x = "Year",
    y = "IMDB rating"
  )
# Count missing values per column and print the one-row result.
glimpse(map_df(df, function(column) sum(is.na(column))))
# Replace every NA in `x` with the median of the non-missing values of `x`.
# Returns a vector of the same length as `x`.
replace_na_median <- function(x) {
  replace(x, is.na(x), median(x, na.rm = TRUE))
}
# Build the numeric-only modelling table: impute NAs in each numeric column
# with that column's median, then keep only the numeric columns.
# Uses base `is.numeric` instead of the deprecated `purrr::is_numeric`
# (which only inspects typeof), and `as_tibble` instead of the deprecated
# `as_data_frame`.
num_df <- df %>%
  map_if(is.numeric, replace_na_median) %>%
  as_tibble() %>%
  select_if(is.numeric)
# Swap the release year for the number of years elapsed up to 2017:
# add `years_since_release` and drop `title_year` in a single mutate.
num_df <- mutate(
  num_df,
  years_since_release = 2017 - title_year,
  title_year = NULL
)
# Pairwise correlations between all numeric variables, in long form:
# one row per (rowname, variable) pair, flagged "high" when the absolute
# correlation exceeds 0.5.
correlations <- corrr::correlate(num_df) %>%
  # Gather every column except the `rowname` key. The previous positional
  # range (2:16) depended on the exact number of columns and silently
  # dropped any variable column beyond position 16.
  gather(variable, correlation, -rowname) %>%
  select(rowname, variable, correlation) %>%
  mutate(
    high_correlation = ifelse(abs(correlation) > 0.50, "high", "not so high")
  )
# List the variable pairs whose correlation is stronger than 0.5 in either
# direction, sorted by the first variable's name.
correlations %>%
  filter(correlation > 0.5 | correlation < -0.5) %>%
  arrange(rowname)
# Heatmap of the pairwise correlations, each tile labelled with the rounded
# coefficient and the label coloured by the high / not-so-high flag.
correlations %>%
  ggplot(aes(
    # `correlate()` leaves the diagonal as NA by default, so the mean used
    # by reorder() must drop NAs; otherwise the ordering scores are NA and
    # the axes are not actually reordered.
    reorder(rowname, correlation, na.rm = TRUE),
    reorder(variable, correlation, na.rm = TRUE),
    fill = correlation
  )) +
  geom_tile(alpha = 0.6, colour = "black") +
  geom_text(aes(label = round(correlation, 2), colour = high_correlation)) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 70, hjust = 1)) +
  scale_fill_gradient()
# Drop five predictors from the modelling table: the three per-actor
# facebook-like counts, the vote count and the movie facebook-like count.
num_df <- select(
  num_df,
  -actor_1_facebook_likes,
  -actor_2_facebook_likes,
  -actor_3_facebook_likes,
  -num_voted_users,
  -movie_facebook_likes
)
# Standardise every column, draw 10 bootstrap replicates, fit a linear model
# of imdb_score on all remaining columns within each replicate, and collect
# the tidied coefficients (with confidence intervals). `sig_0005` flags
# coefficients whose p-value is below 0.005.
fit <- num_df %>%
  lapply(scale) %>%
  as.data.frame() %>%
  bootstrap(m = 10) %>%
  do(tidy(lm(imdb_score ~ ., data = .), conf.int = TRUE)) %>%
  mutate(sig_0005 = p.value < 0.005)
# One point per bootstrap replicate and term (intercept excluded), with the
# axes flipped so terms run down the vertical axis; the dashed blue line
# marks a zero estimate.
fit %>%
  filter(term != "(Intercept)") %>%
  group_by(term) %>%
  ggplot(aes(x = reorder(term, estimate), y = estimate, colour = sig_0005)) +
  geom_point(alpha = 0.4) +
  geom_hline(aes(yintercept = 0), linetype = "dashed", colour = "blue") +
  coord_flip() +
  labs(
    title = "Linear regression estimates",
    subtitle = "Dependent variable: IMDB score",
    x = "Independent variable",
    y = "estimate"
  ) +
  theme_minimal()
# Per-term summary across the bootstrap replicates (intercept excluded):
# mean estimate with the mean confidence bounds drawn as a horizontal error
# bar; the dashed blue line marks a zero estimate.
fit %>%
  filter(term != "(Intercept)") %>%
  group_by(term) %>%
  summarise(
    estimate = mean(estimate),
    conf.low = mean(conf.low),
    conf.high = mean(conf.high),
    # A term counts as significant when more than 6 of its replicates have
    # p < 0.005 (the fit above draws 10 replicates).
    sig_0005 = ifelse(
      sum(sig_0005) > 6,
      "significant (< 0.005)",
      "not significant"
    )
  ) %>%
  ggplot(aes(y = reorder(term, estimate), x = estimate, colour = sig_0005)) +
  geom_point() +
  geom_errorbarh(aes(xmax = conf.high, xmin = conf.low)) +
  theme_minimal() +
  geom_vline(aes(xintercept = 0), linetype = "dashed", colour = "blue") +
  # No coord_flip() here: x carries the estimate and y carries the term, so
  # the axis labels are the other way round from the flipped plot.
  labs(
    title = "Linear regression estimates",
    subtitle = "Dependent variable: IMDB score",
    x = "estimate",
    y = "Independent variable"
  )
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment