secret
Created

Chicago Data Visualization ggplot2 Tutorial 1 Script

  • Download Gist
ggplot2-tutorial-1.R (R)
# Install ggplot2 only when it is missing, so re-running the script does not
# download the package again. In an interactive session with no CRAN mirror
# configured, install.packages() asks you to pick a server.
if (!requireNamespace("ggplot2", quietly = TRUE)) {
  install.packages("ggplot2")
}

# Load ggplot2 for use by the rest of the script.
library(ggplot2)
 
# qplot() ("quick plot") examples ----
# NOTE(review): `movies` is not defined anywhere in this script; it is
# presumably a data set shipped with ggplot2 (newer releases may provide it
# separately, e.g. via the ggplot2movies package) -- confirm before running.

# Simple histogram of ratings using the default bins.
qplot(x = rating, data = movies, geom = "histogram")

# Other visual elements are controlled through options, e.g. the bin width.
qplot(x = rating, data = movies, geom = "histogram", binwidth = 0.1)

# A weighted histogram: each film contributes its number of votes.
qplot(
  x = rating, data = movies,
  weight = votes, geom = "histogram", binwidth = 0.1
)

# Two variables give a scatter plot.
qplot(x = mpaa, y = rating, data = movies)

# Jitter the points to reduce overplotting.
qplot(x = mpaa, y = rating, data = movies, geom = "jitter")

# Several geoms can be layered; here the boxplot is listed before the jitter.
qplot(x = mpaa, y = rating, data = movies, geom = c("boxplot", "jitter"))

# Same geoms listed in the other order: jitter first, then the boxplot.
qplot(x = mpaa, y = rating, data = movies, geom = c("jitter", "boxplot"))

# Add another dimension through color.
qplot(
  x = mpaa, y = rating, data = movies,
  geom = "jitter", color = factor(Action)
)

# Another data set, this time with continuous data on both axes.
qplot(x = wt, y = mpg, data = mtcars, color = factor(cyl))
 
# qplot() is quick but limited in options; ggplot() gives full control.
# Same weight-vs-mpg scatter plot as the last qplot() call, built with ggplot().
ggplot(mtcars, aes(x = wt, y = mpg, color = factor(cyl))) + geom_point()

# aes() controls the aesthetic mappings. Shape needs a discrete variable
# (ggplot2 refuses to map a continuous column such as qsec to shape), so the
# number of gears is used as a factor here.
ggplot(
  mtcars,
  aes(x = wt, y = mpg, color = factor(cyl), shape = factor(gear))
) +
  geom_point()

# ggplot2 is built around the grammar of graphics: a plot is built from layers.
p <- ggplot(mtcars, aes(x = wt, y = mpg))
p + geom_point(aes(color = factor(cyl), shape = factor(gear)))

# p holds the data and the x/y mapping but no layers of its own; the user adds
# the graphical elements. Here a second, vertically jittered copy of the points
# is layered on top of the first.
p +
  geom_point(aes(color = factor(cyl), shape = factor(gear))) +
  geom_jitter(position = position_jitter(height = 5))

# Overlay a smoother (stat_smooth() with its default method) on the points.
p + geom_point(aes(color = factor(cyl))) + stat_smooth()

# p has no point layer, so adding only the smoother draws the line by itself.
p + stat_smooth()

# A linear model can be used for the fit instead.
p + stat_smooth(method = "lm") + geom_point()

# Making the line very ugly: opaque blue ribbon and a thick line.
p + stat_smooth(fill = "blue", size = 2, alpha = 1)

# Overlay a least-squares regression for each cyl group. Mapping color to
# factor(cyl) in the base plot makes every layer split by cylinder count.
# (Named cyl.graph rather than `c` so that base c() is not masked.)
cyl.graph <- ggplot(mtcars, aes(y = wt, x = mpg, color = factor(cyl)))
cyl.graph + stat_smooth(method = "lm") + geom_point()

# Faceting breaks the graph into a grid of panels, one column per cyl value.
p + geom_point() + facet_grid(. ~ cyl)

# A least-squares regression can be overlaid within each facet.
p + geom_point() + stat_smooth(method = "lm") + facet_grid(. ~ cyl)

# Add some color: the colored per-cylinder plot, points drawn first.
cyl.graph + geom_point() + stat_smooth(method = "lm")

# Extrapolate the fitted lines across the full plot range, with a faint ribbon.
cyl.graph +
  geom_point() +
  stat_smooth(method = "lm", fullrange = TRUE, alpha = 0.1)
 
##########################
# SOME MEANINGFUL EXAMPLES

# NOTE: this working directory is specific to the original author's machine.
# Point it at the folder that contains the "Data" directory before running.
setwd("C:\\Users\\tls573\\Dropbox\\Chicago Data Visualization\\ggplot2")
# file.path() builds the relative path with a separator that works on every
# platform, instead of a hard-coded Windows backslash.
crime <- read.csv(file.path("Data", "Crimes_-_2011.csv"))

# FIX DATA
str(crime) # Show data structures
# Parse the timestamps and store them as POSIXct. strptime() returns POSIXlt,
# a list-based class that is not suited to being a data frame column.
crime$Date <- as.POSIXct(strptime(crime$Date, "%m/%d/%Y %H:%M"))
crime$Ward <- as.factor(crime$Ward) # Ward isn't continuous, it's distinct sets
crime$Beat <- as.factor(crime$Beat) # Beat isn't continuous, it's distinct sets

# Base plot with Date on the x-axis, reused by the examples that follow.
date.graph <- ggplot(crime, aes(x = Date))
# Histogram of crime dates: shows the frequency of crime over time.
date.graph + geom_histogram()

# The same histogram with an explicit bin width of 1/30 of the observed date
# range. Date is POSIXct, so the width is expressed in seconds.
date.binwidth <- diff(range(as.numeric(crime$Date), na.rm = TRUE)) / 30
date.graph + geom_histogram(binwidth = date.binwidth)

# qplot() equivalent. The column is `Date`; lower-case `date` would pick up
# the base R function of that name instead of the data.
qplot(Date, data = crime, geom = "histogram")
 
# ggplot2 offers many geoms for displaying data. Some basic ones:
#
#   geom_bar()        bar graph
#   geom_histogram()  histogram
#   geom_density()    smoothed density curve, a continuous cousin of the histogram
#   geom_line()       line graph
#   geom_point()      scatterplot
#   geom_boxplot()    boxplot

# Density examples, all built on date.graph (Date on the x-axis) ----

# Distribution of crimes over time with the default settings.
date.graph +
  geom_density()
# The default density curve once more.
date.graph +
  geom_density()
# adjust = 1/2: rougher curve.
date.graph +
  geom_density(adjust = 0.5)
# adjust = 3: smoother curve.
date.graph +
  geom_density(adjust = 3)
# A blue filled plot.
date.graph +
  geom_density(fill = "blue")
# Blue and transparent.
date.graph +
  geom_density(fill = "blue", alpha = 0.2)
# Density and histogram together; the histogram's y must be put on the density
# scale to be comparable. Here the histogram is drawn over the density line.
date.graph +
  geom_density(size = 2) +
  geom_histogram(aes(y = ..density..))
# Same layers in the other order, so the density line is drawn on top.
date.graph +
  geom_histogram(aes(y = ..density..)) +
  geom_density(size = 2)
 
# EXAMPLES OF THE BAR CHART
# Column graph of crimes by primary type of crime.
crime.type <- qplot(Primary.Type, data = crime, geom = "bar")
crime.type + coord_flip() # Horizontal bar chart

# Some typos in the data create duplicate categories. Let's fix this.
# levels() only works on a factor, and read.csv() returns character columns by
# default since R 4.0, so make sure the column is a factor first (this is a
# no-op when it already is one).
crime$Primary.Type <- as.factor(crime$Primary.Type)
levels(crime$Primary.Type)
# Assigning a label that already exists merges the two levels.
# NOTE(review): positions 11 and 22 are assumed to be the misspelled levels in
# the listing printed above -- verify against that output for your data file.
levels(crime$Primary.Type)[11] <- "INTERFERENCE WITH PUBLIC OFFICER"
levels(crime$Primary.Type)[22] <- "OTHER OFFENSE"

# Same horizontal bar chart as above, built with ggplot() on the cleaned data.
ggplot(crime, aes(x = Primary.Type)) + geom_bar() + coord_flip()
 
# Ward and arrests ----
# Bar chart of the number of crimes per ward: which ward has the most?
ward.graph <- qplot(Ward, data = crime, geom = "bar")
ward.graph +
  coord_flip() # horizontal bars
# Same bars, filled according to the Arrest column.
ward.arrests <- ggplot(crime, aes(x = Ward, fill = Arrest))
ward.arrests +
  geom_bar() +
  coord_flip()

# Ward against crime type ----
ward.crime.type <- ggplot(crime, aes(x = Ward, y = Primary.Type))
# Plain points: lots of overplotting.
ward.crime.type +
  geom_point()
# Jittered points: better, but not perfect.
ward.crime.type +
  geom_point(position = "jitter")

# Ward, crime type, and whether there was an arrest ----
ward.crime.arrest <- ggplot(crime, aes(x = Ward, y = Primary.Type))
# Color encodes the Arrest column.
ward.crime.arrest +
  geom_point(aes(color = Arrest), position = "jitter")
# Shape additionally encodes the Domestic column... but the plot is getting a
# bit thick.
ward.crime.arrest +
  geom_point(aes(color = Arrest, shape = Domestic), position = "jitter")

# Do certain crimes happen during certain periods? ----
# Histogram of dates in a grid of panels: rows by Arrest, columns by Domestic.
arrest.time <- ggplot(crime, aes(x = Date))
arrest.time +
  geom_histogram() +
  facet_grid(Arrest ~ Domestic)
 
 
 
 
# NOW LET'S WORK WITH THE PROPOSED CHICAGO BUDGET DATA FOR 2013.
 
 
 
 
# IMPORT BUDGET DATA
# The path is relative to the current working directory; file.path() keeps the
# separator portable across platforms.
budget <- read.csv(
  file.path("Data", "Budget_-_2013_Budget_Recommendations_-_Appropriations.csv")
)

# FIX DATA

# Convert currency text such as "$1,234.50" to a number. Every dollar sign and
# thousands separator is stripped; text that still is not numeric becomes NA
# (with the usual as.numeric() warning).
parse_dollars <- function(x) {
  as.numeric(gsub("[$,]", "", x))
}

# Rename the misspelled column by name rather than by position, so the fix
# does not depend on the column order of the file.
names(budget)[names(budget) == "X2012.APPROPRATION"] <- "X2012.APPROPRIATION"

# The three money columns are read as text because of the "$" prefix.
money.cols <- c(
  "X2012.APPROPRIATION",
  "X2012.REVISED.APPROPRIATION",
  "X2013.RECOMMENDATION"
)
stopifnot(all(money.cols %in% names(budget)))
budget[money.cols] <- lapply(budget[money.cols], parse_dollars)

# Department numbers are identifiers, not quantities.
budget$DEPARTMENT.NUMBER <- as.factor(budget$DEPARTMENT.NUMBER)

# Drop five rows by their position in the file, in a single step.
# NOTE(review): the script does not say why these rows are removed
# (presumably summary or malformed records) -- confirm against the data file.
budget <- budget[-c(2576, 4033, 4380, 4381, 4382), ]
 
# DISTRIBUTION OF THE 2013 RECOMMENDED AMOUNTS FOR EACH DEPARTMENT
# One boxplot per department, flipped so the department names are readable.
budget.distribution <- ggplot(
  budget,
  aes(x = DEPARTMENT.DESCRIPTION, y = X2013.RECOMMENDATION)
)
budget.distribution + geom_boxplot() + coord_flip()

# CHANGE IN AMOUNTS
# 2013 recommendation on the x-axis against the revised 2012 appropriation on
# the y-axis.
budget.comparisons <- ggplot(
  budget,
  aes(x = X2013.RECOMMENDATION, y = X2012.REVISED.APPROPRIATION)
)
budget.comparisons + geom_point()
# The diagonal y = x marks "no change": points below the line have a 2013
# recommendation larger than the revised 2012 amount, i.e. an increase.
budget.comparisons + geom_point() + geom_abline(intercept = 0, slope = 1)

# LET'S LOOK HOW THE DISTRIBUTION OF FUNDS HAS CHANGED FOR SALARIES
budget.salaries <- subset(
  budget,
  APPROPRIATION.ACCOUNT.DESCRIPTION == "SALARIES AND WAGES - ON PAYROLL"
)
budget.account <- ggplot(budget.salaries, aes(x = X2013.RECOMMENDATION))
# Zoom the y-axis to 0-20 with coord_cartesian(). Setting the limits on the
# scale instead would discard every bar taller than 20 and leave gaps in the
# histogram; zooming keeps those bars and simply cuts them off at the top.
budget.account +
  geom_histogram(binwidth = 100000) +
  coord_cartesian(ylim = c(0, 20))

Please sign in to comment on this gist.

Something went wrong with that request. Please try again.