Created
December 8, 2016 18:26
-
-
Save dpwrussell/4c8c451b8f411e13e83fb74e38656c4e to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
library("mosaic")
library("Hmisc")

# Load the movies and keep only the US/UK observations.
# Empty strings and "NA" are both read as missing values.
# stringsAsFactors = TRUE is requested explicitly: since R 4.0.0 read.csv()
# leaves text columns as character by default, and the droplevels() calls
# later in this script have no method for character vectors.
allMovies <- read.csv(
  "movie_metadata.csv",
  na.strings = c("", "NA"),
  stringsAsFactors = TRUE
)
# %in% never yields NA, so rows with a missing country are simply excluded
subsetMovies <- subset(allMovies, country %in% c("USA", "UK"))
# Count the missing values in each variable of interest
# Missing budget
sum(is.na(subsetMovies$budget))
# Missing gross
sum(is.na(subsetMovies$gross))
# Missing imdb_score
sum(is.na(subsetMovies$imdb_score))
# Missing content_rating
sum(is.na(subsetMovies$content_rating))
# Rows missing at least one of the four variables
sum(!complete.cases(
  subsetMovies[, c("budget", "gross", "imdb_score", "content_rating")]
))
# Keep only the rows (observations) that have all four variables present
completeSubsetMovies <- subsetMovies[
  complete.cases(
    subsetMovies[, c("budget", "gross", "imdb_score", "content_rating")]
  ),
]
# Add the return-on-investment column: base-10 log of gross over budget
movies <- completeSubsetMovies
movies$roi <- log10(movies$gross / movies$budget)
# Discard factor levels that no longer occur after the filtering steps
movies[c("content_rating", "country")] <- lapply(
  movies[c("content_rating", "country")],
  droplevels
)
# Motivate the log10 transform: density of the raw gross/budget ratio ...
d <- density(with(movies, gross / budget))
pdf("figures/density_roi.pdf")
plot(d, xlab = "Gross / Budget", main = "")
dev.off()

# ... next to the density of the same ratio on a base-10 log scale
d <- density(with(movies, log10(gross / budget)))
pdf("figures/density_roi_log.pdf")
plot(d, xlab = "log 10(Gross/Budget)", main = "")
dev.off()
# Distribution of budgets
pdf("figures/density_budget.pdf")
d <- density(movies$budget)
plot(d, main = "", xlab = "Budget")
dev.off()

# ROI against budget (in millions) as points plus a fitted regression line.
# xyplot() returns a trellis object that is only drawn when it is printed;
# top-level auto-printing does not happen under source(), which would leave
# the PDF empty, so the plot is printed explicitly.
pdf("figures/xy_roi_budget.pdf")
print(
  xyplot(
    roi ~ budget / 1000000,
    data = movies,
    type = c("p", "r"),
    ylab = "Return On Investment (log 10)",
    xlab = "Budget (USD Millions)"
  )
)
dev.off()

# Correlation between ROI and budget, and R-squared of the linear fit of ROI
# on budget expressed in millions
budget_m <- movies$budget / 1000000
cor(roi ~ budget, data = movies)
roi_budget_model <- lm(movies$roi ~ budget_m)
rsquared(roi_budget_model)
# Distribution of IMDB scores
pdf("figures/density_imdb_score.pdf")
d <- density(movies$imdb_score)
plot(d, main = "", xlab = "IMDB Score")
dev.off()

# ROI against IMDB score as points plus a fitted regression line.
# The trellis object returned by xyplot() is printed explicitly so the figure
# is also written when this script is run through source(), where top-level
# results are not auto-printed.
pdf("figures/xy_roi_imdb_score.pdf")
print(
  xyplot(
    roi ~ imdb_score,
    data = movies,
    type = c("p", "r"),
    ylab = "Return On Investment (log 10)",
    xlab = "IMDB Score"
  )
)
dev.off()

# Correlation between ROI and IMDB score, and R-squared of the linear fit
cor(roi ~ imdb_score, data = movies)
roi_imdb_score_model <- lm(movies$roi ~ movies$imdb_score)
rsquared(roi_imdb_score_model)
# Overall counts and proportions of unprofitable (roi <= 0) and profitable
# (roi > 0) movies; these serve as the expected probabilities
profitability_ct <- table(movies$roi > 0)
profitability_ft <- prop.table(profitability_ct)
# Minimum number of observations a category needs for the chi-squared two-way
# test for independence: the smallest n for which the rarer outcome has an
# expected count of at least 5
profitability_min_obs <- ceiling(5 / min(profitability_ft))
# Categories that satisfy this criterion. The comparison is >= because a
# category with exactly the minimum number of observations already qualifies.
profitability_cats <- names(
  which(table(movies$content_rating) >= profitability_min_obs)
)
# Keep only the movies whose content rating satisfies the minimum criterion,
# then discard the levels that were filtered out
movies_cats <- subset(movies, content_rating %in% profitability_cats)
movies_cats$content_rating <- droplevels(movies_cats$content_rating)
# Contingency table of content_rating against profitability.
# The outcome is built as an explicit two-level factor, so both columns always
# exist and carry the right label; relabelling the columns by position would
# fail if only one outcome occurred in the data.
profitable_content_rating_ct <- table(
  movies_cats$content_rating,
  factor(
    movies_cats$roi > 0,
    levels = c(FALSE, TRUE),
    labels = c("Unprofitable", "Profitable")
  ),
  dnn = c("content_rating", "Profitability")
)

# Write the table as LaTeX. latex() is called with file = "" so that it emits
# the markup on standard output, which sink() diverts into the .tex file.
# The diversion is undone in `finally` so that a failure inside latex() does
# not leave all later console output redirected into the file.
sink("tables/profitable_content_rating_ct.tex")
#xtable(profitable_content_rating_ct)
tryCatch(
  latex(
    profitable_content_rating_ct,
    file = "",
    rowlabel = "Content Rating",
    cgroup = "Profitability",
    label = "tab:profitable_content_rating_ct",
    caption = "Contingency table of \\emph{content\\_rating} and \\emph{profitability}",
    caption.loc = "bottom"
  ),
  finally = sink()
)

# Chi-squared two-way test for independence
xchisq.test(profitable_content_rating_ct)
# Proportions within each content rating (row-wise), transposed so that every
# content rating becomes one stacked bar
profitable_proportion <- t(prop.table(profitable_content_rating_ct, 1))
# Flip the stacking order (TODO Must be a better way! Probably in bar plot):
# sorting the row names alphabetically puts "Profitable" first, i.e. at the
# bottom of each bar
profitable_proportion <- profitable_proportion[
  order(rownames(profitable_proportion)),
]

# Expected profitable proportion over all movies. Taken from the "TRUE" entry
# rather than max(), which would pick the unprofitable share whenever
# profitable movies are the minority.
expected_profitable <- if ("TRUE" %in% names(profitability_ft)) {
  as.numeric(profitability_ft["TRUE"])
} else {
  0
}

# Stacked bar plot showing the profitable proportion of each content rating,
# with the expected proportion drawn as a red reference line
pdf("figures/bar_profitable_content_rating.pdf")
barplot(
  profitable_proportion,
  las = 2,
  legend.text = TRUE,
  xlab = "Content Rating",
  ylab = "Profitable Proportion"
)
abline(h = expected_profitable, col = "red")
# NOTE(review): the label is anchored at a fixed x position of 6; check it
# still sits inside the plot if the number of bars changes
text(
  6,
  expected_profitable,
  paste0("Expected Proportion: ", round(expected_profitable, 2)),
  pos = 3,
  col = "red"
)
dev.off()
# Country counts
tally(~country, data = movies)

# Box plots showing the distribution of roi by country.
# bwplot() returns a trellis object that is only drawn when printed; it is
# printed explicitly so the PDF is not left empty when the script is run
# through source(), where top-level results are not auto-printed.
pdf("figures/box_roi_country.pdf")
print(bwplot(roi ~ country, data = movies, xlab = "Country", ylab = "roi"))
dev.off()

# Two-sample t-test of roi between the countries
t.test(roi ~ country, data = movies)
# Clean up: drop the listed intermediate data objects from the environment.
# Only these five names are removed; any other object this script created
# (fitted models, tables, thresholds) stays alongside movies.
rm(list = c(
  "d",
  "allMovies",
  "subsetMovies",
  "completeSubsetMovies",
  "movies_cats"
))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment