sonamgupta1105/gist:f9f8005093d0e0282c2a63182184ea5e

## gistfile1.txt
# All the visualizations are for Title.type = Feature Films. I have used the same packages as mentioned in the example you had given.
library(ggplot2)
library(dplyr)
# Load the raw movie dataset from the CSV file in the working directory.
movie_data <- read.csv(file = "movie_data.csv")

# Keep only the rows whose Title.type equals "Feature Film".
dfMoviesFeatureFilm <- subset(movie_data, subset = Title.type == "Feature Film")

# Vis 1: bar chart of IMDb ratings for feature films, excluding missing values.
# Rows whose rating is NA or an empty string are removed first, and the plot is
# built from that filtered frame so the exclusion actually takes effect.
dfMoviesFeatureFilmRating <- subset(
  dfMoviesFeatureFilm,
  !is.na(IMDb.Rating) & IMDb.Rating != ""
)
# Map the column by name with aes(); aes_() is deprecated and passing a raw
# `df$col` vector bypasses the data frame given to ggplot().
plotRatingVis1 <- ggplot(dfMoviesFeatureFilmRating, aes(x = IMDb.Rating))
plotRatingVis1 + geom_bar(stat = "count", fill = "coral3")

#In the above plot, the notable point about the feature films is that most of them are rated between 6.0 and 8.0, with few below 6.0
#and few above 8.0, so the distribution is roughly bell-shaped, close to a normal distribution. We can therefore conclude that most
#feature films in the dataset fall within the 6.0-8.0 IMDb rating range.

# Vis 2: heatmap relating domestic earnings (binned) to IMDb ratings.
# Strip the thousands separators and the first "$" from column 4 and convert it to numeric.
# NOTE(review): column 4 is converted by position and then read back as `Domestic` by
# name -- presumably the same column; confirm against the CSV header.
dfMoviesFeatureFilm[, 4] <- as.numeric(
  sub("\\$", "", gsub(",", "", as.character(dfMoviesFeatureFilm[, 4])))
)
earningsDom <- dfMoviesFeatureFilm$Domestic
# Bin the earnings into 20 quantile-based intervals; NAs are ignored when the
# cut points are computed.
binsDomEarning <- 20
cutpointsDomEarnings <- quantile(
  earningsDom,
  (0:binsDomEarning) / binsDomEarning,
  na.rm = TRUE
)
binnedDomEarnings <- cut(earningsDom, cutpointsDomEarnings, include.lowest = TRUE)
# The rest of the script refers to the rating column as `IMDb.Rating`; use that
# name here too (the previous `IMDbRating` spelling matched no other reference).
imdbRatingDomEarning <- ggplot(
  dfMoviesFeatureFilm,
  aes(x = binnedDomEarnings, y = IMDb.Rating)
) +
  geom_tile(aes(fill = IMDb.Rating), color = "red") +
  scale_fill_gradient(
    name = "Relation of IMDb Ratings and Domestic Earnings for Feature films",
    low = "yellow",
    high = "red"
  ) +
  theme(
    axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5),
    axis.title.y = element_blank()
  )
imdbRatingDomEarning

# Vis 3: heatmap relating international earnings (binned) to IMDb ratings.
# Strip the thousands separators and the first "$" from column 5 and convert it to numeric.
# NOTE(review): column 5 is converted by position and then read back as `International`
# by name -- presumably the same column; confirm against the CSV header.
dfMoviesFeatureFilm[, 5] <- as.numeric(
  sub("\\$", "", gsub(",", "", as.character(dfMoviesFeatureFilm[, 5])))
)
earnings <- dfMoviesFeatureFilm$International
# Bin the earnings into 20 quantile-based intervals; NAs are ignored when the
# cut points are computed.
binsIntEarning <- 20
cutpointsIntEarnings <- quantile(
  earnings,
  (0:binsIntEarning) / binsIntEarning,
  na.rm = TRUE
)
binnedIntEarnings <- cut(earnings, cutpointsIntEarnings, include.lowest = TRUE)
# The rest of the script refers to the rating column as `IMDb.Rating`; use that
# name here too (the previous `IMDbRating` spelling matched no other reference).
imdbRatingInternationalEarning <- ggplot(
  dfMoviesFeatureFilm,
  aes(x = binnedIntEarnings, y = IMDb.Rating)
) +
  geom_tile(aes(fill = IMDb.Rating), color = "red") +
  scale_fill_gradient(
    name = "Relation of IMDb Ratings and international Earnings for Feature films",
    low = "yellow",
    high = "red"
  ) +
  theme(
    axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5),
    axis.title.y = element_blank()
  )
imdbRatingInternationalEarning
# For Vis 2 & 3, there are a lot of NAs in the domestic and international earnings columns, but the highly rated movies are more popular
# internationally than domestically. The movies that were rated around 7.0 did not really earn much internationally.

# Vis 4 This visualization has 2-3 iterations
# file name is ratingsVotesIteration1
# Iteration 1: heatmap of the raw (unbinned) number of IMDb votes against the IMDb rating.
# Columns are mapped by bare name so ggplot looks them up in the supplied data
# frame; `df$col` inside aes() is discouraged and leaks the frame name into labels.
imdbRatingVotes <- ggplot(
  dfMoviesFeatureFilm,
  aes(x = X..of.IMDb.votes, y = IMDb.Rating)
) +
  geom_tile(aes(fill = IMDb.Rating), color = "red") +
  scale_fill_gradient(
    name = "Relation of IMDb votes and ratings for Feature films",
    low = "yellow",
    high = "red"
  ) +
  theme(axis.title.y = element_blank())
imdbRatingVotes
#Plotting heatmap for finding relation between IMDb Rating and number of IMDb votes-iteration 1
#This is first attempt to generate a heat map, preferred over creating a scatter plot since there are lot of data points to be
#plotted for votes. The idea behind the map was to see if the relation between the IMDb ratings and votes is significant and appropriate.
#After looking at the plot, we can tell the number of votes are according to the ratings. This plot needed revisions since the votes
#needed to be binned as they couldn't fit on the X-axis. Referred: http://datascienceplus.com/building-heatmaps-in-r/

#Vis 4 -- iteration 2 -- file name is messedXaxisBinnedVotesRating
#Binning the number of votes with x-axis labels messed up
votes <- dfMoviesFeatureFilm$X..of.IMDb.votes
# Split the vote counts into 20 quantile-based bins; NAs are ignored when the
# cut points are computed.
binsVotes <- 20
cutpointsVotesRating <- quantile(votes, (0:binsVotes) / binsVotes, na.rm = TRUE)
binned <- cut(votes, cutpointsVotesRating, include.lowest = TRUE)
# y and fill both map the rating column by bare name (previously y used
# `df$col` while fill used the bare name). The x-axis text is intentionally
# left unrotated in this iteration.
imdbRatingVotes <- ggplot(dfMoviesFeatureFilm, aes(x = binned, y = IMDb.Rating)) +
  geom_tile(aes(fill = IMDb.Rating), color = "red") +
  xlab("Number of binned votes") +
  scale_fill_gradient(
    name = "Relation of IMDb votes and ratings for Feature films",
    low = "yellow",
    high = "red"
  ) +
  theme(axis.title.y = element_blank())
imdbRatingVotes
#This iteration then created a better-looking heatmap, which makes it visually clear that the films that have
#higher ratings also have a higher number of votes.

#Vis 4 -- iteration 3 -- file name is votesBinnedImprovedXaxis
# This revision was to fix the labels of X-axis
# (The line above was bare prose followed by an unclosed R Markdown chunk fence;
# neither is valid R, so they prevented the script from being parsed.)
#Binning the number of votes with 90degree labels for x-axis
votes <- dfMoviesFeatureFilm$X..of.IMDb.votes
# Split the vote counts into 20 quantile-based bins; NAs are ignored when the
# cut points are computed.
binsVotes <- 20
cutpointsVotesRating <- quantile(votes, (0:binsVotes) / binsVotes, na.rm = TRUE)
binned <- cut(votes, cutpointsVotesRating, include.lowest = TRUE)
# Map the rating column by bare name instead of `df$col`, and rotate the
# x-axis text by 90 degrees so the bin labels do not overlap.
imdbRatingVotes <- ggplot(dfMoviesFeatureFilm, aes(x = binned, y = IMDb.Rating)) +
  geom_tile(aes(fill = IMDb.Rating), color = "red") +
  scale_fill_gradient(
    name = "Relation of IMDb votes and ratings for Feature films",
    low = "yellow",
    high = "red"
  ) +
  theme(
    axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5),
    axis.title.y = element_blank()
  )
imdbRatingVotes

#Vis 5
# Scatter plot of the Rotten.Tom. column (x) against IMDb.Rating (y); points are
# coloured by IMDb.Rating and the x-axis text is rotated by 90 degrees.
ratingsIMDBTomato <- ggplot(
  data = dfMoviesFeatureFilm,
  mapping = aes(x = Rotten.Tom., y = IMDb.Rating)
) +
  geom_point(mapping = aes(color = IMDb.Rating)) +
  theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5))
ratingsIMDBTomato
#Plotting a scatter plot between Rotten tomato and IMDb ratings. The darker the points on the graph, lesser are the ratings.
#The dataset is very messy and has a lot of missing values for rotten tomato ratings as we see a straight line of points denoting that.
#The labels on the X-axis are not sorted but are in the order as they are in the dataset.
	# All the visualizations are for Title.type = Feature Films. I have used the same packages as mentioned in the example you had given.
	library(ggplot2)
	library(dplyr)
	# Load the raw movie dataset from the CSV file in the working directory.
	movie_data <- read.csv(file = "movie_data.csv")

	# Keep only the rows whose Title.type equals "Feature Film".
	dfMoviesFeatureFilm <- subset(movie_data, subset = Title.type == "Feature Film")

	# Vis 1: bar chart of IMDb ratings for feature films, excluding missing values.
	# Rows whose rating is NA or an empty string are removed first, and the plot is
	# built from that filtered frame so the exclusion actually takes effect.
	dfMoviesFeatureFilmRating <- subset(
	  dfMoviesFeatureFilm,
	  !is.na(IMDb.Rating) & IMDb.Rating != ""
	)
	# Map the column by name with aes(); aes_() is deprecated and passing a raw
	# `df$col` vector bypasses the data frame given to ggplot().
	plotRatingVis1 <- ggplot(dfMoviesFeatureFilmRating, aes(x = IMDb.Rating))
	plotRatingVis1 + geom_bar(stat = "count", fill = "coral3")

	#In the above plot, the notable point about the feature films is that most of them are rated between 6.0 and 8.0, with few below 6.0
	#and few above 8.0, so the distribution is roughly bell-shaped, close to a normal distribution. We can therefore conclude that most
	#feature films in the dataset fall within the 6.0-8.0 IMDb rating range.

	# Vis 2: heatmap relating domestic earnings (binned) to IMDb ratings.
	# Strip the thousands separators and the first "$" from column 4 and convert it to numeric.
	# NOTE(review): column 4 is converted by position and then read back as `Domestic` by
	# name -- presumably the same column; confirm against the CSV header.
	dfMoviesFeatureFilm[, 4] <- as.numeric(
	  sub("\\$", "", gsub(",", "", as.character(dfMoviesFeatureFilm[, 4])))
	)
	earningsDom <- dfMoviesFeatureFilm$Domestic
	# Bin the earnings into 20 quantile-based intervals; NAs are ignored when the
	# cut points are computed.
	binsDomEarning <- 20
	cutpointsDomEarnings <- quantile(
	  earningsDom,
	  (0:binsDomEarning) / binsDomEarning,
	  na.rm = TRUE
	)
	binnedDomEarnings <- cut(earningsDom, cutpointsDomEarnings, include.lowest = TRUE)
	# The rest of the script refers to the rating column as `IMDb.Rating`; use that
	# name here too (the previous `IMDbRating` spelling matched no other reference).
	imdbRatingDomEarning <- ggplot(
	  dfMoviesFeatureFilm,
	  aes(x = binnedDomEarnings, y = IMDb.Rating)
	) +
	  geom_tile(aes(fill = IMDb.Rating), color = "red") +
	  scale_fill_gradient(
	    name = "Relation of IMDb Ratings and Domestic Earnings for Feature films",
	    low = "yellow",
	    high = "red"
	  ) +
	  theme(
	    axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5),
	    axis.title.y = element_blank()
	  )
	imdbRatingDomEarning

	# Vis 3: heatmap relating international earnings (binned) to IMDb ratings.
	# Strip the thousands separators and the first "$" from column 5 and convert it to numeric.
	# NOTE(review): column 5 is converted by position and then read back as `International`
	# by name -- presumably the same column; confirm against the CSV header.
	dfMoviesFeatureFilm[, 5] <- as.numeric(
	  sub("\\$", "", gsub(",", "", as.character(dfMoviesFeatureFilm[, 5])))
	)
	earnings <- dfMoviesFeatureFilm$International
	# Bin the earnings into 20 quantile-based intervals; NAs are ignored when the
	# cut points are computed.
	binsIntEarning <- 20
	cutpointsIntEarnings <- quantile(
	  earnings,
	  (0:binsIntEarning) / binsIntEarning,
	  na.rm = TRUE
	)
	binnedIntEarnings <- cut(earnings, cutpointsIntEarnings, include.lowest = TRUE)
	# The rest of the script refers to the rating column as `IMDb.Rating`; use that
	# name here too (the previous `IMDbRating` spelling matched no other reference).
	imdbRatingInternationalEarning <- ggplot(
	  dfMoviesFeatureFilm,
	  aes(x = binnedIntEarnings, y = IMDb.Rating)
	) +
	  geom_tile(aes(fill = IMDb.Rating), color = "red") +
	  scale_fill_gradient(
	    name = "Relation of IMDb Ratings and international Earnings for Feature films",
	    low = "yellow",
	    high = "red"
	  ) +
	  theme(
	    axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5),
	    axis.title.y = element_blank()
	  )
	imdbRatingInternationalEarning
	# For Vis 2 & 3, there are a lot of NAs in the domestic and international earnings columns, but the highly rated movies are more popular
	# internationally than domestically. The movies that were rated around 7.0 did not really earn much internationally.

	# Vis 4 This visualization has 2-3 iterations
	# file name is ratingsVotesIteration1
	# Iteration 1: heatmap of the raw (unbinned) number of IMDb votes against the IMDb rating.
	# Columns are mapped by bare name so ggplot looks them up in the supplied data
	# frame; `df$col` inside aes() is discouraged and leaks the frame name into labels.
	imdbRatingVotes <- ggplot(
	  dfMoviesFeatureFilm,
	  aes(x = X..of.IMDb.votes, y = IMDb.Rating)
	) +
	  geom_tile(aes(fill = IMDb.Rating), color = "red") +
	  scale_fill_gradient(
	    name = "Relation of IMDb votes and ratings for Feature films",
	    low = "yellow",
	    high = "red"
	  ) +
	  theme(axis.title.y = element_blank())
	imdbRatingVotes
	#Plotting heatmap for finding relation between IMDb Rating and number of IMDb votes-iteration 1
	#This is first attempt to generate a heat map, preferred over creating a scatter plot since there are lot of data points to be
	#plotted for votes. The idea behind the map was to see if the relation between the IMDb ratings and votes is significant and appropriate.
	#After looking at the plot, we can tell the number of votes are according to the ratings. This plot needed revisions since the votes
	#needed to be binned as they couldn't fit on the X-axis. Referred: http://datascienceplus.com/building-heatmaps-in-r/

	#Vis 4 -- iteration 2 -- file name is messedXaxisBinnedVotesRating
	#Binning the number of votes with x-axis labels messed up
	votes <- dfMoviesFeatureFilm$X..of.IMDb.votes
	# Split the vote counts into 20 quantile-based bins; NAs are ignored when the
	# cut points are computed.
	binsVotes <- 20
	cutpointsVotesRating <- quantile(votes, (0:binsVotes) / binsVotes, na.rm = TRUE)
	binned <- cut(votes, cutpointsVotesRating, include.lowest = TRUE)
	# y and fill both map the rating column by bare name (previously y used
	# `df$col` while fill used the bare name). The x-axis text is intentionally
	# left unrotated in this iteration.
	imdbRatingVotes <- ggplot(dfMoviesFeatureFilm, aes(x = binned, y = IMDb.Rating)) +
	  geom_tile(aes(fill = IMDb.Rating), color = "red") +
	  xlab("Number of binned votes") +
	  scale_fill_gradient(
	    name = "Relation of IMDb votes and ratings for Feature films",
	    low = "yellow",
	    high = "red"
	  ) +
	  theme(axis.title.y = element_blank())
	imdbRatingVotes
	#This iteration then created a better-looking heatmap, which makes it visually clear that the films that have
	#higher ratings also have a higher number of votes.

	#Vis 4 -- iteration 3 -- file name is votesBinnedImprovedXaxis
	# This revision was to fix the labels of X-axis
	# (The line above was bare prose followed by an unclosed R Markdown chunk fence;
	# neither is valid R, so they prevented the script from being parsed.)
	#Binning the number of votes with 90degree labels for x-axis
	votes <- dfMoviesFeatureFilm$X..of.IMDb.votes
	# Split the vote counts into 20 quantile-based bins; NAs are ignored when the
	# cut points are computed.
	binsVotes <- 20
	cutpointsVotesRating <- quantile(votes, (0:binsVotes) / binsVotes, na.rm = TRUE)
	binned <- cut(votes, cutpointsVotesRating, include.lowest = TRUE)
	# Map the rating column by bare name instead of `df$col`, and rotate the
	# x-axis text by 90 degrees so the bin labels do not overlap.
	imdbRatingVotes <- ggplot(dfMoviesFeatureFilm, aes(x = binned, y = IMDb.Rating)) +
	  geom_tile(aes(fill = IMDb.Rating), color = "red") +
	  scale_fill_gradient(
	    name = "Relation of IMDb votes and ratings for Feature films",
	    low = "yellow",
	    high = "red"
	  ) +
	  theme(
	    axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5),
	    axis.title.y = element_blank()
	  )
	imdbRatingVotes

	#Vis 5
	# Scatter plot of the Rotten.Tom. column (x) against IMDb.Rating (y); points are
	# coloured by IMDb.Rating and the x-axis text is rotated by 90 degrees.
	ratingsIMDBTomato <- ggplot(
	  data = dfMoviesFeatureFilm,
	  mapping = aes(x = Rotten.Tom., y = IMDb.Rating)
	) +
	  geom_point(mapping = aes(color = IMDb.Rating)) +
	  theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5))
	ratingsIMDBTomato
	#Plotting a scatter plot between Rotten tomato and IMDb ratings. The darker the points on the graph, lesser are the ratings.
	#The dataset is very messy and has a lot of missing values for rotten tomato ratings as we see a straight line of points denoting that.
	#The labels on the X-axis are not sorted but are in the order as they are in the dataset.