Skip to content

Instantly share code, notes, and snippets.

@inkhorn
Created May 2, 2013 01:01
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save inkhorn/5499509 to your computer and use it in GitHub Desktop.
Save inkhorn/5499509 to your computer and use it in GitHub Desktop.
Casino Analysis
library(ff)
library(ffbase)
library(stringr)
library(ggplot2)
library(ggthemes)
library(reshape2)
library(RgoogleMaps)
# Load the survey once and keep an untouched copy of it: `casino` is recoded to
# numbers below, while `casino.orig` keeps the original text answers.
# stringsAsFactors = TRUE is spelled out because later steps index factor levels
# and convert the data frame with as.ffdf(); R >= 4.0.0 no longer turns strings
# into factors by default.
casino = read.csv("/home/inkhorn/Downloads/casino_survey_results20130325.csv", stringsAsFactors = TRUE)
casino.orig = casino
# Here's the dataset of canadian postal codes and latitude/longitude coordinates
# (it is read again, with the same arguments, right before the merge further down).
pcodes = read.csv.ffdf(file="/home/inkhorn/Downloads/zipcodeset.txt", first.rows=50000, next.rows=50000, colClasses=NA, header=FALSE)
# Numerical recoding of the text answers. Each answer scale is declared once as
# a named lookup vector; indexing it by the answer text returns the code, and
# any answer that is not listed (blank, missing, unexpected text) becomes NA.
recode_answers = function(answers, scale) {
  unname(scale[as.character(answers)])
}

favour.scale = c("Strongly Opposed" = 1,
                 "Somewhat Opposed" = 2,
                 "Neutral or Mixed Feelings" = 3,
                 "Somewhat in Favour" = 4,
                 "Strongly in Favour" = 5)
image.scale = c("Does Not Fit My Image At All" = 1,
                "Neutral / I am Not Sure" = 2,
                "Fits Image Somewhat" = 3,
                "Fits Image Perfectly" = 4)
importance.scale = c("Not Important At All" = 1,
                     "Somewhat Important" = 2,
                     "Very Important" = 3)
suitability.scale = c("Strongly Unsuitable" = 1,
                      "Somewhat Unsuitable" = 2,
                      "Neutral or Mixed Feelings" = 3,
                      "Somewhat Suitable" = 4,
                      "Highly Suitable" = 5)

casino$Q1_A = recode_answers(casino.orig$Q1_A, favour.scale)
casino$Q2_A = recode_answers(casino.orig$Q2_A, image.scale)
# NOTE(review): 17 columns (8:24) are recoded here, but the Q3 summary further
# down only reads 16 of them (8:23) -- confirm what column 24 holds.
for (i in 8:24) {
  casino[,i] = recode_answers(casino.orig[,i], importance.scale)
}
# Suitability ratings; the columns are addressed by position, so this depends on
# the column order of the CSV.
for (i in c(31:32,47,48,63,64)) {
  casino[,i] = recode_answers(casino.orig[,i], suitability.scale)
}
# Blank answers occur in the original dataset. For plots of the original text
# options they are either dropped (as in this section) or recoded to
# "Did not disclose" (as in the demographics section).
feeling.levels = c("Strongly Opposed","Somewhat Opposed","Neutral or Mixed Feelings","Somewhat in Favour","Strongly in Favour")
casino.orig$Q1_A[casino.orig$Q1_A == ""] = NA
casino.orig$Q1_A = factor(casino.orig$Q1_A, levels=feeling.levels)
# Overall feeling about a new casino: share of answers per option, with the
# percentage printed next to each bar.
ggplot(subset(casino.orig, !is.na(Q1_A)), aes(x=Q1_A, y=..count../sum(..count..))) +
  geom_bar(fill="forest green") +
  stat_bin(aes(label = sprintf("%.02f %%", ..count../sum(..count..)*100)), geom="text") +
  coord_flip() +
  scale_x_discrete(name="") +
  scale_y_continuous(labels=percent) +
  ggtitle("How do you feel about having a new casino in Toronto?") +
  theme_wsj() +
  theme(title=element_text(size=22), plot.title=element_text(hjust=.8))
# How a casino fits the respondent's image of Toronto (blank answers dropped)
ggplot(subset(casino.orig, Q2_A != ""), aes(x=Q2_A, y=..count../sum(..count..))) +
  geom_bar(fill="forest green") +
  stat_bin(aes(label = sprintf("%.02f %%", ..count../sum(..count..)*100)), geom="text") +
  coord_flip() +
  scale_x_discrete(name="") +
  scale_y_continuous(labels=percent) +
  ggtitle("How does a new casino in Toronto fit your image of the City of Toronto?") +
  theme_wsj() +
  theme(title=element_text(size=22), plot.title=element_text(hjust=.8))
# Preferred location if a casino is built (blank answers dropped)
ggplot(subset(casino.orig, Q6 != ""), aes(x=Q6, y=..count../sum(..count..))) +
  geom_bar(fill="forest green") +
  stat_bin(aes(label = sprintf("%.02f %%", ..count../sum(..count..)*100)), geom="text") +
  coord_flip() +
  scale_x_discrete(name="") +
  scale_y_continuous(labels=percent) +
  ggtitle("If a casino is built, where would you prefer to see it located?") +
  theme_wsj() +
  theme(title=element_text(size=22), plot.title=element_text(hjust=.8))
# Suitability ratings per location (downtown, Exhibition Place, Port Lands),
# compared between a standalone casino and an integrated entertainment complex.

# Order the downtown rating labels by their numeric codes so that they read from
# least to most suitable. NOTE(review): the blank answer has no code and is
# presumably sorted last, which is why rows 1:5 are used for downtown -- verify.
casino.orig$Q7_A_StandAlone = reorder(casino.orig$Q7_A_StandAlone, casino$Q7_A_StandAlone)
casino.orig$Q7_A_Integrated = reorder(casino.orig$Q7_A_Integrated, casino$Q7_A_Integrated)
suitability.levels = levels(casino.orig$Q7_A_StandAlone)[1:5]

# Share of each rating for both casino types, as a ratings x casino-type matrix.
# `keep` selects the table entries that hold the five real rating options.
rating_shares = function(standalone, integrated, keep) {
  shares = cbind(prop.table(as.matrix(table(standalone)[keep])),
                 prop.table(as.matrix(table(integrated)[keep])))
  colnames(shares) = c("Standalone Casino","Integrated Entertainment Complex")
  shares
}

# Long format for ggplot, with the ratings in the same explicit order for every
# location so that bars and legend line up across the three charts.
ratings_long = function(shares) {
  long = melt(shares, varnames=c("Rating","Casino Type"), value.name="Percentage")
  long$Rating = factor(long$Rating, levels=suitability.levels)
  long
}

# Dodged bar chart of the rating shares with a percentage label on each bar.
plot_suitability = function(ratings.long, title) {
  ggplot(ratings.long, aes(x=`Casino Type`, y=Percentage, fill=Rating)) +
    # the shares are already computed, so the bars must use the y values as-is
    geom_bar(stat="identity", position="dodge") +
    geom_text(aes(ymax=Percentage, label=sprintf("%.01f%%",Percentage*100)), hjust=.75, position = position_dodge(width=1), size=4) +
    coord_flip() +
    ggtitle(title) +
    scale_x_discrete(name="") +
    scale_y_continuous(labels=percent) +
    scale_fill_few(palette="light") +
    # theme_wsj() is a complete theme: it has to come first, otherwise it
    # replaces the title settings made in theme()
    theme_wsj() +
    theme(title=element_text(size=22), plot.title=element_text(hjust=.8))
}

# Downtown
stand.and.integrated.ratings.downtown = rating_shares(casino.orig$Q7_A_StandAlone, casino.orig$Q7_A_Integrated, 1:5)
stand.and.integrated.ratings.downtown.long = ratings_long(stand.and.integrated.ratings.downtown)
plot_suitability(stand.and.integrated.ratings.downtown.long, "Ratings of Casino Suitability \nin Downtown Toronto by Casino Type")

# Exhibition Place. These columns were not reordered, so entries 2:6 are used.
# NOTE(review): presumably entry 1 is the blank answer -- verify.
stand.and.integrated.ratings.exhibition = rating_shares(casino.orig$Q7_B_StandAlone, casino.orig$Q7_B_Integrated, 2:6)
stand.and.integrated.ratings.exhibition.long = ratings_long(stand.and.integrated.ratings.exhibition)
plot_suitability(stand.and.integrated.ratings.exhibition.long, "Ratings of Casino Suitability \nat Exhibition Place by Casino Type")

# Port Lands (same layout as Exhibition Place)
stand.and.integrated.ratings.portlands = rating_shares(casino.orig$Q7_C_StandAlone, casino.orig$Q7_C_Integrated, 2:6)
stand.and.integrated.ratings.portlands.long = ratings_long(stand.and.integrated.ratings.portlands)
plot_suitability(stand.and.integrated.ratings.portlands.long, "Ratings of Casino Suitability \nat Port Lands by Casino Type")
# This was the part in my analysis where I looked at postal codes (FSAs really) and their coordinates
# Sorry I'm not more linear in how I do my analysis vs. write about it :)
# You'll notice that I've imported the geocode file as ffdf. This led to faster merging with the
# original casino data set. This meant that I had to coerce the casino.orig data frame into ffdf format
# But I work with it every day at work, so I'm used to it by now, despite its idiosyncrasies.

# Upper-case the respondents' postal codes but keep the column a factor:
# toupper() returns character, and ff stores text only as factors, so a
# character column would get in the way of as.ffdf() below.
casino.orig$PostalCode = factor(toupper(casino.orig$PostalCode))
pcodes = read.csv.ffdf(file="/home/inkhorn/Downloads/zipcodeset.txt", first.rows=50000, next.rows=50000, colClasses=NA, header=FALSE)
names(pcodes) = c("Postal","Lat","Long","City","Prov")
# FSA = first three characters of the postal code, upper-cased; this is the merge key
pcodes$FSA = as.ff(as.factor(toupper(substr(pcodes[,"Postal"], 1,3))))
casino.orig = as.ffdf(casino.orig)
casino.orig$PostalCode = as.ff(as.factor(toupper(casino.orig[,"PostalCode"])))
# all.x=TRUE keeps respondents whose postal code has no match (their Lat/Long stay NA)
casino.orig = merge(casino.orig, pcodes, by.x="PostalCode", by.y="FSA", all.x=TRUE)
# Full map of every respondent that could be geocoded
casino.gc = casino.orig[which(!is.na(casino.orig[,"Lat"])),] # making sure only records with coordinates are included...
mymap = MapBackground(lat=casino.gc$Lat, lon=casino.gc$Long)
PlotOnStaticMap(mymap, casino.gc$Lat, casino.gc$Long, cex=1.5, pch=21, bg="orange")
# Build the list of cities, most frequent first, dropping unused factor levels
# and the blank city name; it is used to zoom the second map in.
cities = data.frame(table(casino.orig[,"City"]))
cities = cities[cities$Freq > 0 & cities$Var1 != "",]
cities = cities[order(cities$Freq, decreasing=TRUE),]
# Top cities (the cut-off of 28 is an arbitrary dividing line). head() avoids
# the all-NA rows that cities[1:28,] produces when fewer than 28 cities exist.
cities.filter = head(cities, 28)
names(cities.filter) = c("City","# Responses")
# Keep only respondents from those cities. The lookup has to use the renamed
# `City` column: `Var1` no longer exists after the rename above, so filtering
# on it would match no rows at all.
casino.top.so = casino.orig[which(casino.orig[,"City"] %in% cities.filter$City),]
# here's a transparency function that I used for the southern ontario map
addTrans <- function(color,trans)
{
  # Adds transparency to one or more colors.
  #
  # color: color specification(s) accepted by col2rgb() (names, hex strings, ...)
  # trans: opacity between 0 and 255; 0 is fully transparent, 255 fully visible.
  #        Fractional values are truncated to whole numbers.
  # color and trans must have the same length, or one of them must have
  # length 1, in which case it is repeated to match the other.
  #
  # Returns a character vector of "#RRGGBBAA" strings, one per color.
  # Stops with an error when the lengths do not fit or trans is not a number
  # within 0..255.
  if (length(color) != length(trans) && !any(c(length(color), length(trans)) == 1)) stop("Vector lengths not correct")
  # Values outside 0..255 cannot be written as two hex digits.
  if (!is.numeric(trans) || any(is.na(trans)) || any(trans < 0 | trans > 255)) stop("trans must be numeric and between 0 and 255")
  if (length(color) == 1 && length(trans) > 1) color <- rep(color, length(trans))
  if (length(trans) == 1 && length(color) > 1) trans <- rep(trans, length(color))

  # One column per color; rows are red, green, blue and alpha.
  channels <- rbind(col2rgb(color), trans)
  hex <- matrix(sprintf("%02X", as.integer(channels)), nrow = 4)
  paste0("#", apply(hex, 2, paste, collapse = ""))
}
# Southern Ontario map: the same two plotting calls as the full map, limited to
# respondents from the top cities and drawn with a nearly transparent orange
# fill (alpha 10 out of 255).
so.marker.fill = addTrans("orange",10)
mymap = MapBackground(lat=casino.top.so$Lat, lon=casino.top.so$Long)
PlotOnStaticMap(mymap, casino.top.so$Lat, casino.top.so$Long, cex=1.5, pch=21, bg=so.marker.fill)
# Question 3: issues of importance regarding the new casino. For every issue,
# compute the share of respondents who answered "Very Important" (code 3) and
# plot the issues from least to most often rated very important.
# NOTE(review): the labels are matched to columns 8:23 purely by position --
# confirm that the CSV column order agrees with this list.
q3.labels = c("Design of the facility",
              "Employment opportunities","Entertainment and cultural activities",
              "Expanded convention facilities", "Integration with surrounding areas",
              "New hotel accommodations","Problem gambling & health concerns",
              "Public safety and social concerns","Public space",
              "Restaurants","Retail","Revenue for the City","Support for local businesses",
              "Tourist attraction","Traffic concerns","Training and career development")
# Missing answers are left out of the denominator (na.rm=TRUE).
q3.share = vapply(casino[8:23], function(answers) mean(answers == 3, na.rm=TRUE), numeric(1))
q3.summary = data.frame("% Very Important" = unname(q3.share), row.names = q3.labels, check.names = FALSE)
q3.summary = q3.summary[order(q3.summary[,1], decreasing=FALSE), , drop=FALSE]
# Factor levels in sorted order, so the plot keeps this ordering on the axis
q3.summary$Concern = factor(rownames(q3.summary), levels=rownames(q3.summary))
ggplot(q3.summary, aes(x=Concern, y=`% Very Important`)) +
  geom_point(size=5, colour="forest green") +
  coord_flip() +
  ggtitle("Issues of Importance Surrounding\nthe New Casino") +
  scale_x_discrete(name="Issues of Importance") +
  scale_y_continuous(labels=percent) +
  # theme_wsj() is a complete theme: it has to come first, otherwise it
  # replaces the title settings made in theme()
  theme_wsj() +
  theme(title=element_text(size=22), plot.title=element_text(hjust=.8))
# This chunk of code deals with summarizing and plotting the questions surrounding
# what features people might want if a new Integrated Entertainment Complex is built.
# Every location has nine feature columns, in the order given by feature.labels.
# NOTE(review): labels are matched to columns by position, and the columns are
# presumably 0/1 indicators (their mean is plotted as a percentage) -- verify.
feature.labels = c("No Casino","Casino Only", "Convention Centre Space", "Cultural and Arts Facilities",
                   "Hotel","Nightclubs","Restaurants","Retail","Theatre")

# Mean of each feature column (missing answers ignored), sorted ascending, as a
# data frame with the columns "% Include" and `feature`.
feature_summary = function(columns) {
  share = vapply(casino[columns], mean, numeric(1), na.rm=TRUE)
  out = data.frame("% Include" = unname(share), row.names = feature.labels, check.names = FALSE)
  out = out[order(out[,1], decreasing=FALSE), , drop=FALSE]
  # Factor levels in sorted order, so the plot keeps this ordering on the axis
  out$feature = factor(rownames(out), levels=rownames(out))
  out
}

# Dot plot of the share of respondents wanting each feature.
plot_features = function(feature.summary, title) {
  ggplot(feature.summary, aes(x=feature, y=`% Include`)) +
    geom_point(size=5, colour="forest green") +
    coord_flip() +
    ggtitle(title) +
    scale_x_discrete(name="Features") +
    scale_y_continuous(labels=percent, name="% Wanting the Feature") +
    # theme_wsj() is a complete theme: it has to come first, otherwise it
    # replaces the title settings made in theme()
    theme_wsj() +
    theme(title=element_text(size=22), plot.title=element_text(hjust=.8))
}

# Downtown Toronto
q7a.summary = feature_summary(36:44)
plot_features(q7a.summary, "What People Would Want in an Integrated\nEntertainment Complex in Downtown Toronto")
# Exhibition Place
q7b.summary = feature_summary(52:60)
plot_features(q7b.summary, "What People Would Want in an Integrated\nEntertainment Complex at the Exhibition Place")
# Port Lands -- plotted from its own summary (the chart used to show the
# Exhibition Place values against the Port Lands feature labels)
q7c.summary = feature_summary(68:76)
plot_features(q7c.summary, "What People Would Want in an Integrated\nEntertainment Complex in Port Lands")
# A third copy of the survey, as a plain data frame: by now casino.orig is an
# ffdf that has been through a merge, so it is not used for the demographics.
# This copy has the same rows, in the same order, as `casino`.
casino.orig2 = read.csv("/home/inkhorn/Downloads/casino_survey_results20130325.csv")
# Gender and Age: keep the known answer options as they are and file everything
# else (blank or missing) under "Did not disclose".
known.genders = c("Female","Male","Transgendered")
casino$Gender = ifelse(casino.orig2$Gender %in% known.genders, as.character(casino.orig2$Gender), "Did not disclose")
casino$Gender = factor(casino$Gender, levels=c("Transgendered","Did not disclose","Female","Male"))
ggplot(casino, aes(x=Gender, y=..count../sum(..count..))) +
  geom_bar(fill="forest green") +
  coord_flip() +
  ggtitle("Gender Distribution of Respondents") +
  scale_x_discrete(name="") +
  theme_wsj() +
  theme(title=element_text(size=22), plot.title=element_text(hjust=.8)) +
  stat_bin(aes(label = sprintf("%.02f %%", ..count../sum(..count..)*100)), geom="text") +
  scale_y_continuous(labels=percent)
known.ages = c("Under 15","15-24","25-34","35-44","45-54","55-64","65 or older")
casino$Age = ifelse(casino.orig2$Age %in% known.ages, as.character(casino.orig2$Age), "Did not disclose")
casino$Age = factor(casino$Age, levels=c("Did not disclose", known.ages))
ggplot(casino, aes(x=Age, y=..count../sum(..count..))) +
  geom_bar(fill="forest green") +
  coord_flip() +
  ggtitle("Age Distribution of Respondents") +
  scale_x_discrete(name="") +
  theme_wsj() +
  theme(title=element_text(size=22), plot.title=element_text(hjust=.8)) +
  stat_bin(aes(label = sprintf("%.02f %%", ..count../sum(..count..)*100)), geom="text") +
  scale_y_continuous(labels=percent)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment