Skip to content

Instantly share code, notes, and snippets.

@dpwrussell

dpwrussell/movies.R

Created Dec 8, 2016
Embed
What would you like to do?
library("mosaic")
library("Hmisc")
# Read the raw movie metadata, treating empty strings as well as "NA" as
# missing, then keep only the observations whose country is USA or UK.
allMovies <- read.csv("movie_metadata.csv", na.strings = c("", "NA"))
subsetMovies <- allMovies[allMovies$country %in% c("USA", "UK"), , drop = FALSE]
# Count the missing values in each variable of interest.
# Observations without a budget
sum(is.na(subsetMovies$budget))
# Observations without a gross
sum(is.na(subsetMovies$gross))
# Observations without an imdb_score
sum(is.na(subsetMovies$imdb_score))
# Observations without a content_rating
sum(is.na(subsetMovies$content_rating))
# Observations missing at least one of the four variables
sum(!complete.cases(
  subsetMovies[, c("budget", "gross", "imdb_score", "content_rating")]
))
# Remove the incomplete rows: keep only the observations that have a value for
# all four variables of interest.
completeSubsetMovies <- subsetMovies[
  complete.cases(
    subsetMovies[, c("budget", "gross", "imdb_score", "content_rating")]
  ),
]
# Add the profit ratio column: roi is log10(gross / budget), so roi > 0 means
# the movie grossed more than its budget.
movies <- transform(completeSubsetMovies, roi = log(gross / budget, base = 10))
# Store the grouping variables as factors that hold only the levels still
# present. factor() is used instead of droplevels() because read.csv() returns
# character columns from R 4.0.0 on (stringsAsFactors = FALSE) and
# droplevels() has no method for character vectors; on a factor, factor()
# drops the unused levels just the same.
movies$content_rating <- factor(movies$content_rating)
movies$country <- factor(movies$country)
# Motivate the log10 transform of roi: plot the density of the raw
# gross/budget ratio, then of its log10, each into its own PDF.
d <- density(with(movies, gross / budget))
pdf("figures/density_roi.pdf")
plot(d, main = "", xlab = "Gross / Budget")
dev.off()
d <- density(with(movies, log(gross / budget, base = 10)))
pdf("figures/density_roi_log.pdf")
plot(d, main = "", xlab = "log 10(Gross/Budget)")
dev.off()
# Density of the budget itself
pdf("figures/density_budget.pdf")
d <- density(movies[["budget"]])
plot(d, main = "", xlab = "Budget")
dev.off()
# Scatter plot of roi against budget (in millions of USD) with a fitted
# regression line.
pdf("figures/xy_roi_budget.pdf")
# lattice functions only build a trellis object; it is drawn when printed.
# Printing explicitly keeps the PDF from coming out empty when this script is
# run through source(), where top-level values are not auto-printed.
print(xyplot(roi ~ budget / 1000000, data = movies, type = c("p", "r"),
             ylab = "Return On Investment (log 10)",
             xlab = "Budget (USD Millions)"))
dev.off()
# Correlation between roi and budget, then the R-squared of the linear fit of
# roi on budget expressed in millions.
budget_m <- movies$budget / 1000000
cor(roi ~ budget, data = movies)
roi_budget_model <- lm(movies$roi ~ budget_m)
rsquared(roi_budget_model)
# Density of the IMDB score, written to its own PDF
pdf(file = "figures/density_imdb_score.pdf")
d <- density(movies[["imdb_score"]])
plot(d, xlab = "IMDB Score", main = "")
dev.off()
# Scatter plot of roi against IMDB score with a fitted regression line.
pdf("figures/xy_roi_imdb_score.pdf")
# lattice functions only build a trellis object; it is drawn when printed.
# Printing explicitly keeps the PDF from coming out empty when this script is
# run through source(), where top-level values are not auto-printed.
print(xyplot(roi ~ imdb_score, data = movies, type = c("p", "r"),
             ylab = "Return On Investment (log 10)",
             xlab = "IMDB Score"))
dev.off()
# Correlation between roi and IMDB score, then the R-squared of the linear fit
cor(roi ~ imdb_score, data = movies)
roi_imdb_score_model <- lm(movies$roi ~ movies$imdb_score)
rsquared(roi_imdb_score_model)
# Observed counts and proportions of unprofitable (FALSE) and profitable
# (TRUE) movies, where profitable means roi > 0
profitability_ct <- table(movies$roi > 0)
profitability_ft <- prop.table(profitability_ct)
# Smallest category size n for which the rarer outcome still has an expected
# count of at least 5 (n * min proportion >= 5), as used for a chi-squared
# two-way test for independence
profitability_min_obs <- ceiling(5 / min(profitability_ft))
# Content ratings that satisfy this criterion. The comparison is >= because a
# category holding exactly the minimum number of observations qualifies.
profitability_cats <- names(
  which(table(movies$content_rating) >= profitability_min_obs)
)
# Subset movies to get only the movies whose content rating satisfies the
# minimum criterion
movies_cats <- subset(movies, content_rating %in% profitability_cats)
# Rebuild the factor so the excluded ratings are no longer levels. factor()
# also works when content_rating is a character column, unlike droplevels().
movies_cats$content_rating <- factor(movies_cats$content_rating)
# Contingency table of content_rating against profitability. The outcome is
# built as a factor with both levels declared, so the table always has an
# "Unprofitable" and a "Profitable" column in that order, even if one outcome
# does not occur in the data (renaming the columns afterwards would fail then).
profitable_content_rating_ct <- table(
  movies_cats$content_rating,
  factor(movies_cats$roi > 0, levels = c(FALSE, TRUE),
         labels = c("Unprofitable", "Profitable")),
  dnn = c("content_rating", "Profitability")
)
# Write the table as LaTeX. With file="" latex() emits the markup to the
# console, which sink() diverts into the .tex file; the finally clause restores
# console output even if latex() fails, so later output is not swallowed.
sink("tables/profitable_content_rating_ct.tex")
#xtable(profitable_content_rating_ct)
tryCatch(
  latex(profitable_content_rating_ct, file="", rowlabel = "Content Rating", cgroup ="Profitability", label="tab:profitable_content_rating_ct", caption="Contingency table of \\emph{content\\_rating} and \\emph{profitability}", caption.loc="bottom"),
  finally = sink()
)
# Chi-squared two-way test for independence
xchisq.test(profitable_content_rating_ct)
# Row-wise proportions, transposed so each content rating becomes one bar.
# Rows are then sorted alphabetically, which puts "Profitable" first and hence
# at the bottom of each stacked bar (TODO: there may be a neater way, probably
# in barplot itself). drop = FALSE keeps the matrix shape with a single rating.
profitable_proportion <- t(prop.table(profitable_content_rating_ct, 1))
profitable_proportion <- profitable_proportion[
  order(rownames(profitable_proportion)), , drop = FALSE
]
# Overall proportion of profitable movies (roi > 0). Taken from the "TRUE"
# entry rather than max(): max() is the profitable share only when profitable
# movies are the majority.
expected_profitable <- profitability_ft[["TRUE"]]
# Stacked bar chart showing the profitable proportion per content rating, with
# the overall profitable proportion drawn as a red reference line
pdf("figures/bar_profitable_content_rating.pdf")
barplot(profitable_proportion, las = 2, legend.text = TRUE,
        xlab = "Content Rating", ylab = "Profitable Proportion")
abline(h = expected_profitable, col = "red")
text(6, expected_profitable,
     paste0("Expected Proportion: ", round(expected_profitable, 2)),
     pos = 3, col = "red")
dev.off()
# Number of movies per country
tally(~country, data = movies)
# Boxplots showing distribution of roi by country
pdf("figures/box_roi_country.pdf")
# lattice functions only build a trellis object; it is drawn when printed.
# Printing explicitly keeps the PDF from coming out empty when this script is
# run through source(), where top-level values are not auto-printed.
print(bwplot(roi ~ country, data = movies, xlab = "Country", ylab = "roi"))
dev.off()
# Two-sample t-test of roi between the two countries
t.test(roi ~ country, data = movies)
# Drop the intermediate data frames and the density scratch variable from the
# environment. Only the objects named here are removed; movies is kept.
rm(list = c("d", "allMovies", "subsetMovies", "completeSubsetMovies",
            "movies_cats"))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment