Last active
December 21, 2015 22:19
-
-
Save adamwespiser/6374323 to your computer and use it in GitHub Desktop.
ggplot2 intro (Aug 2013)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
################################## | |
# INTRO | |
# | |
################################# | |
## ggplot2 tutorial by Adam Wespiser | |
# August 29, 2013 | |
# download R from: http://www.r-project.org/ | |
# ggplot2 documentation: http://docs.ggplot2.org/ | |
# R blog aggregator: r-bloggers.com | |
# | |
# R intro book : "R Coobook" by Paul Teetor | |
# R language design: "Software for Data Analysis: Programming with R" by John Chambers | |
# R data guide: "Data Manipulation with R" by Phil Spector | |
# ggplot2 book: "ggplot2: Elegant Graphics for Data Analysis" by Hadley Wickham | |
# add in this an an example: | |
## http://www.r-bloggers.com/2d-plot-with-histograms-for-each-dimension-2013-edition/ | |
################################## | |
# INSTALL LIBRARIES | |
# | |
################################# | |
if (! "ggplot2" %in% rownames(installed.packages())){ | |
install.packages("ggplot2") | |
} | |
library(ggplot2) | |
if (! "plyr" %in% rownames(installed.packages())){ | |
install.packages("plyr") | |
} | |
library(plyr) | |
if (! "grid" %in% rownames(installed.packages())){ | |
install.packages("grid") | |
} | |
library(grid) | |
if (! "reshape2" %in% rownames(installed.packages())){ | |
install.packages("reshape2") | |
} | |
library(reshape2) | |
if (! "gridExtra" %in% rownames(installed.packages())){ | |
install.packages("gridExtra") | |
} | |
library(gridExtra) | |
################################## | |
# R Basics | |
# | |
################################# | |
# to start a session, just invode R on the command line | |
# $ R | |
# Rstudio is the defacto IDE for R programming | |
# Once in a session, you can set and change the working directory | |
# > setwd("/Users/adam/sandbox/presentation/") | |
current.dir = getwd() | |
setwd(current.dir) | |
# to quit an R session, just type | |
# > quit() | |
# or without being prompted to save your history, | |
# > quit("no") | |
# view your command history | |
history() | |
# to see all the variables: | |
ls() | |
# remove variables | |
x = 1 | |
x <- 1 | |
rm(x) | |
# remove all variable | |
rm(list=unlist(ls())) | |
# find information about a function | |
?ggplot2 | |
# search documentation | |
??color | |
# run a script | |
# source("myScript.R") | |
# run a script from the cmd line | |
# R CMD BATCH myScript.R output.txt | |
################################## | |
# Data Frames | |
# | |
################################# | |
## Data.frames | |
## Stores arrays of numerical, factor, and string data | |
# read in a file | |
if (file.exists("file.csv")){ | |
dataFrame = read.csv(file="file.csv") | |
} | |
# create a data.frame | |
basic.df = data.frame(time=1:100, | |
value=runif(100)) | |
# where 1:100 = c(1,2,3,...,100) | |
# runif(n) returns a vector of n uniform random floats between 0 and 1 | |
# view data.frames | |
mtcars | |
# view information on data.frames | |
str(mtcars) | |
# get dimensions | |
dim(mtcars) | |
# get row/col names | |
rownames(mtcars) | |
colnames(mtcars) | |
# select data.frame row: | |
# mtcars[row,col] | |
# first 10 row, all columns | |
mtcars[1:10,] | |
# selection single column | |
mtcars$mpg # return vector | |
mtcars[["mpg"]] # retrun vector | |
mtcars[,"mpg"] # return vector | |
# select multiple cols | |
mtcars[c("mpg","cyl")] # return data.frame | |
# select specific columns | |
# use 'which' function to get vector of index values | |
iris[which(iris$Species== "setosa"),] | |
################################## | |
# ggplot2 basics | |
# plot types: density, bar, scatter, boxplot | |
################################# | |
## now on to ggplot | |
## every function as at least three things: | |
# 1) data ... iris | |
# 2) aesthetic mapping ... Sepal.Width vs. Sepal.Length | |
# 3) geometric object ... scatter plot (point) | |
## Scatter Plot | |
ggplot(iris,aes(x=Sepal.Width,y=Sepal.Length)) + geom_point() | |
## save a file: 1) enter ggplot2 commands 2) ggsave | |
# note the extension determines the file encoding | |
ggsave("fisherIrisData.pdf",height=6,width=5) | |
ggsave("fisherIrisData.png",height=6,width=5) | |
ggsave("fisherIrisData.jpg",height=6,width=5) | |
## Aesthetic Mappings: | |
# we can map: color, shape to data.frames or arbritrary values | |
ggplot(iris,aes(x=Sepal.Width,y=Sepal.Length,shape=Species)) + geom_point() | |
ggplot(iris,aes(x=Sepal.Width,y=Sepal.Length,color=Species)) + geom_point() | |
# set aesthetics to fixed values: | |
ggplot(iris,aes(x=Sepal.Width,y=Sepal.Length)) + geom_point(color="red",size=4) | |
ggplot(diamonds,aes(x=carat,y=price))+ geom_point(alpha=I(0.4)) | |
ggplot(diamonds,aes(x=carat,y=price))+ geom_point(alpha=I(0.1)) | |
# geoms can be layered | |
ggplot(iris,aes(x=Sepal.Width,y=Sepal.Length,color=Species,shape=Species)) + | |
geom_point(size=4) + | |
geom_point(size=2,color="grey") | |
## Line Plot | |
# note, the aesthetic mappings work the same | |
ggplot(economics,aes(x=date,y=uempmed))+geom_line() | |
# add a vertical line: | |
ggplot(economics,aes(x=date,y=uempmed))+ geom_line() + | |
geom_vline(x=as.numeric(as.Date("1986-09-12"))) | |
ggplot(economics,aes(x=date,y=uempmed * pop))+ geom_line() + | |
geom_vline(x=as.numeric(as.Date("1986-09-12"))) | |
## Bar Plot | |
ggplot(mtcars,aes(x=cyl))+geom_bar(binwidth=1) | |
# that looks weird...what's happening? | |
# ans = x scale is numeric, when we want categorical data. | |
# for categories, R uses "factors" | |
ggplot(mtcars,aes(x=factor(cyl)))+geom_bar() | |
# color the bars by type | |
ggplot(mtcars,aes(x=factor(cyl),fill=factor(gear)))+geom_bar() | |
# another, better example: | |
ggplot(diamonds, aes(clarity, fill=cut)) + geom_bar() | |
## Density Plot | |
ggplot(movies, aes(length))+geom_density() | |
# we need to "zoom in" to get a resonable view | |
# create distributions of short vs. non-short movies: | |
ggplot(movies, aes(x=length,fill=factor(Short)))+geom_density() + xlim(0,200) | |
# Set transperancy of distro | |
ggplot(iris, aes(x=Sepal.Width,fill=Species))+geom_density(alpha=I(0.4)) | |
## FreqPoly | |
# like density, except not normalized to area=1 | |
ggplot(iris, aes(x=Sepal.Width,fill=Species))+geom_freqpoly() | |
# let's adjust the bin and line thickness | |
ggplot(iris, aes(x=Sepal.Width,color=Species))+geom_freqpoly(bin=0.2,size=2) | |
## boxplot | |
ggplot(movies[movies$year > 1990,],aes(x=factor(year),y=rating))+geom_boxplot() | |
ggplot(movies[movies$year > 1990,],aes(x=factor(year),y=rating,fill=factor(Short)))+geom_boxplot() | |
## Path | |
# instead of a line, we can follow each subsequent point to the next... | |
# create a random walk: | |
steps = 500 | |
# set our reduce function, so we don't have to use loops... | |
reduceFn = function(x,y)c(x[1]+runif(1)*2 - 1,x) | |
# create a function that returns a data.frame with a random walk of "steps" steps | |
randomWalk = function(steps)data.frame(x= Reduce(f=reduceFn,1:steps,init=c(1)), | |
y= Reduce(f=reduceFn,1:steps,init=c(1)), | |
index=1:(steps + 1)) | |
# if you use a line plot, notice that the x values are ordered... | |
ggplot(randomWalk(1000),aes(x,y,color=index))+geom_line() | |
ggplot(randomWalk(1000),aes(x,y,color=index))+geom_path() + geom_point() | |
################################## | |
# Data Labels and Titles | |
# | |
################################# | |
# ggtitle, xlab, ylab | |
ggplot(iris,aes(x=Sepal.Width,y=Sepal.Length))+geom_point()+ | |
ggtitle("Measurement of Flowers")+ | |
xlab("width (cm)")+ | |
ylab("length (cm)") | |
################################## | |
# Themes | |
# | |
################################# | |
# adjust the background color, tick marks, etc | |
#store what we have so far to a variable | |
iris.plot = ggplot(iris,aes(x=Sepal.Width,y=Sepal.Length,color=Species,shape=Species)) + | |
geom_point(size=4) + | |
geom_point(size=2,color="grey")+ | |
ggtitle("Measurement of Flowers")+ | |
xlab("width (cm)")+ | |
ylab("length (cm)") | |
# plot with... | |
iris.plot | |
# change the background to black and white | |
iris.plot + theme_bw() | |
# some other themes to consider: | |
iris.plot + theme_classic() | |
iris.plot + theme_minimal() | |
# you can customize the theme and specify features... | |
iris.plot + | |
theme(plot.background = element_rect(fill = "white"), | |
panel.background = element_rect(fill = "white"), | |
legend.key = element_rect(fill = "white"), | |
legend.background = element_rect(colour = "black"), | |
panel.grid.minor = element_blank(), | |
panel.grid.major = element_line(colour = "lightgrey", linetype = "dotted"), | |
axis.line = element_line(), | |
axis.text = element_text(colour = "black")) | |
################################## | |
# Legend Positioning (theme subtopic?) | |
# | |
################################# | |
# what options do we have for the position of the legend? | |
# examples borrowed from: http://www.r-bloggers.com/ggplot2-cheatsheet-for-scatterplots/ | |
g1<-ggplot(mtc, aes(x = hp, y = mpg)) + geom_point(aes(color=factor(vs))) | |
#move legend to the inside | |
g2 <- g1 + theme(legend.position=c(1,1),legend.justification=c(1,1)) | |
# move legend to the bottom | |
g3 <- g1 + theme(legend.position = "bottom") | |
# change labels | |
g4 <- g1 + scale_color_discrete(name ="Engine", | |
labels=c("V-engine", "Straight engine")) | |
grid.arrange(g2, g3, g4, nrow=1) | |
################################## | |
# Facets | |
# | |
################################# | |
# Let's just look at PG, R, and PG-13 rated movies... | |
mov.df = movies[as.character(movies$mpaa) %in% c("PG","R","PG-13"),] | |
mov.df$mpaa = mov.df$mpaa[,drop=TRUE] | |
ggplot(mov.df,aes(x=rating,fill=factor(Short))) + | |
geom_density(alpha=I(0.3)) + theme_bw() + | |
facet_wrap(~mpaa) | |
# seperate movies by decade, rating: view rating distribution | |
mov.df$decade = mov.df$year %/% 10 * 10 | |
mov50plus.df = mov.df[mov.df$decade >= 1950,] | |
# facet_grid( y ~ x ) | |
ggplot(mov50plus.df, aes(x=rating,fill=factor(Drama))) + | |
geom_density(alpha=I(0.4))+ theme_bw()+ | |
facet_grid(decade ~ mpaa ) | |
################################## | |
# Grids | |
# | |
################################# | |
# Set up two different plots of iris data for two species, and view them | |
# side by side. | |
irisSmall.df <- subset(iris, Species != "versicolor") | |
p1 <- ggplot(irisSmall.df, aes(x= Sepal.Width, y = Sepal.Length)) + geom_point(aes(color=factor(Species))) | |
p2 <- ggplot(irisSmall.df, aes(x = Petal.Width, y = Petal.Length)) + geom_point(aes(color=factor(Species))) | |
# all plots in one row | |
grid.arrange(p1,p2,nrow=1) | |
# all plots in one col | |
grid.arrange(p1,p2,ncol=1) | |
################################## | |
# Coordinates | |
# | |
################################# | |
## Coordinates | |
# coord_flip | |
# flip the coords, easier to read x labels... | |
ggplot(mov50plus.df,aes(x=factor(year),y=length))+geom_boxplot()+theme_bw() | |
ggplot(mov50plus.df,aes(x=factor(year),y=length))+geom_boxplot()+theme_bw() + coord_flip() | |
# coord_polar | |
# How many teams do baseball players play for? | |
playerTeam.df = ddply(baseball,.(id),function(df)length(unique(df$team))) | |
teamsPerPlayer.df = as.data.frame(table(playerTeam.df$V1)) | |
colnames(teamsPerPlayer.df) = c("teamsPlayedFor","numberOfPlayers") | |
teamsPerPlayer.df$teamsPlayedFor = as.numeric(teamsPerPlayer.df) | |
ggplot(teamsPerPlayer.df,aes(x=factor(1),y=numberOfPlayers,fill=factor(teamsPlayedFor))) + | |
geom_bar(stat="identity") + | |
theme_bw() + | |
coord_polar(theta="y") | |
# coord_trans | |
# apply a transform to your coordinates, simpler than applying it to the data... | |
ct.plot = ggplot(diamonds, aes(carat,price))+geom_point(alpha=I(0.1),size=1)+ | |
theme_bw() | |
ct.plot | |
ct.plot + scale_y_log10() | |
ct.plot + scale_x_log10() | |
ct.plot + coord_trans(x="log10",y="log10") | |
## change the axis ticks w/ labels | |
ct.plot + scale_y_continuous(breaks= seq(0,20000,by=2500))+ | |
scale_x_continuous(breaks= seq(0,5,by=0.5)) | |
################################## | |
# Color/Fill | |
# | |
################################# | |
# note: different scales are used for discrete and continuous data | |
# scale_color_manual for discrete data | |
scm.plot = ggplot(iris,aes(x=Sepal.Length,y=Sepal.Width,color=factor(Species))) + geom_point(size=3)+ | |
theme_bw() | |
# automatic assignment | |
scm.plot + scale_color_manual(values=c("black","red","blue")) | |
# specify colors and assignment to factor values | |
scm.plot + scale_color_manual(values=c("setosa"= "black","versicolor"= "red","virginica"= "blue")) | |
## scale_colour_gradient for continuous data | |
ggplot(baseball,aes(x=hr,y=rbi,color=h))+ geom_point()+ | |
theme_bw() + | |
scale_colour_gradient(low="red",high="green") | |
## scale_colour_gradientn provides smoothing over multiple color values... | |
sin.df <-data.frame(x=1:1000,y=sin(1:1000),rr=1:1000 - (2*pi)*(1:1000 %/% 2*pi)) | |
ggplot(sin.df,aes(x,y,color=rr))+geom_point(size=2)+theme_bw()+ | |
scale_colour_gradientn(colours=c("black","yellow","blue")) | |
scgn.plot = ggplot(baseball,aes(x=hr,y=rbi,color=h))+ geom_point()+ theme_bw() | |
scgn.plot + scale_color_gradientn(colours=rainbow(7)) | |
# you can cycle through the colors for better resolution | |
scgn.plot + scale_color_gradientn(colours=rep(rainbow(10),2)) | |
## scale_colour_brewer, offers a set of nice palettes for discrete coloring: | |
ggplot(iris,aes(x=Sepal.Length,y=Sepal.Width,color=Species))+geom_point(size=3)+scale_colour_brewer(palette="Set1")+theme_bw() | |
## scale_fill_* works the same as the scale_colour_*, and is used when the geom is "filled" instead of "colored" | |
# (note this change from factor to integer is improper) | |
teamsPerPlayer.df$teamsPlayedForInt = 1:12 | |
ggplot(teamsPerPlayer.df,aes(x=factor(1),y=numberOfPlayers,fill=teamsPlayedForInt)) + | |
geom_bar(stat="identity") + | |
theme_bw() + | |
coord_polar(theta="y")+ | |
scale_fill_gradientn(colours=rainbow(10)) | |
# we can try again using discrete values to get proper labels... | |
ggplot(teamsPerPlayer.df,aes(x=factor(1),y=numberOfPlayers,fill=factor(teamsPlayedFor))) + | |
geom_bar(stat="identity") + | |
theme_bw() + | |
coord_polar(theta="y")+ | |
scale_fill_manual(values=rainbow(12),guide= guide_legend(ncol = 4,byrow=TRUE))+ | |
theme(legend.position = "top", | |
panel.grid.minor.y = element_line(colour = "grey"), | |
panel.grid.major.y = element_line(colour = "grey"), | |
axis.text.x = element_text(size=rel(1.5)), | |
axis.text.y = element_blank(), | |
panel.border = element_blank(), | |
axis.ticks.y = element_blank()) + | |
scale_y_discrete(limits= seq(0,1228,by=100))+ | |
xlab("") | |
##################### | |
# Advanced | |
# Examples | |
#################### | |
# 1 | |
# objective: look at the variance and mean of samples of discrete data | |
# approach: create a function that initiates the dataset, then returns a | |
# function whose arguments set the plot window | |
makePlotFunction <-function(size,fun,mean,range,rangeLower=2){ | |
df<-as.data.frame(t(as.matrix( | |
sapply(floor(runif(size)*(range))+ rangeLower, | |
function(x){vec<-sapply(fun(x,mean),floor); | |
c(mean(vec),var(vec),length(vec))}))))#end of data.frame | |
colnames(df)<-c("mean","variance","n"); | |
function(pMin,pMax,xVec,yVec){ | |
df$pSize = as.numeric(df$n) | |
ggplot(df,aes(x=mean,y=variance,color=factor(n),size=pSize))+ | |
geom_point()+ | |
scale_size(range = c(pMin,pMax))+ | |
theme_bw()+ | |
xlim(xVec)+ | |
ylim(yVec)}} | |
# set the function | |
f1= makePlotFunction(size=10000,fun=rnorm,mean=2,range=80,rangeLower=100) | |
f1(xVec=c(1,2),yVec=c(0,2),pMin=2.2,pMax=0.2) | |
f1(xVec=c(1.4,1.6),yVec=c(1.1,1.4),pMin=2.2,pMax=0.2) | |
## Mosiac Plot | |
library(MASS) | |
data(caith) | |
# melt function :: wide data.frame -> long data.frame | |
caith.melt= melt( | |
cbind(as.data.frame(caith), | |
row=rownames(caith)), | |
id.var=c("row")) | |
# rename the columns | |
colnames(caith.melt) = c("eyeColor","hairColor","proportion") | |
ggplot(caith.melt ,aes(x=eyeColor,y=proportion,fill=hairColor,color=hairColor)) + | |
geom_bar(position="fill",stat="identity",size=1) + | |
scale_fill_manual(values=c(topo.colors(4),topo.colors(1))) + | |
theme_bw() + | |
scale_color_manual(values=rep("black",6)) | |
### Mandelbrot set | |
normIter3 = function(n,z,P=2,N=200){n - log( (log(abs(z),base=N)),base=P) } | |
iterate.until.escape <- function(z, c, trans, cond, max=50, response=dwell) { | |
#we iterate all active points in the same array operation, | |
#and keeping track of which points are still iterating. | |
active <- seq_along(z) | |
dwell <- z | |
dwell[] <- 0 | |
for (i in 1:max) { | |
z[active] <- trans(z[active], c[active]); | |
survived <- cond(z[active]) | |
dwell[active[!survived]] <- normIter3(i, z[active[!survived]]) | |
#dwell[active[!survived]] <- i | |
active <- active[survived] | |
if (length(active) == 0) break | |
} | |
eval(substitute(response)) | |
} | |
getArrayZoom <- function(re.vec=c(-2,1),im.vec=c(-1.5,1.5),max=25,len=500){ | |
re = seq(re.vec[1], re.vec[2], len=len) | |
im = seq(im.vec[1], im.vec[2], len=len) | |
c <- outer(re, im, function(x,y) complex(real=x, imaginary=y)) | |
x <- iterate.until.escape(array(0, dim(c)), c, | |
function(z,c)z^2+c, | |
function(z)abs(z) <= 2, | |
max=max) # was 100 | |
melt(x) | |
} | |
xRel = c(-2,0.5) | |
yImg = c(-1.5,1.5) | |
ggplot(getArrayZoom(re.vec=xRel, | |
im.vec=yImg), | |
aes(Var1,Var2,fill=as.numeric(value))) + | |
geom_raster() + | |
scale_fill_gradientn(colours=c("black",(rainbow(7)),na.value="black")) + | |
theme_minimal()+ | |
xlab("pixels of Real values")+ | |
ylab("imaginary values") | |
ggplot(getArrayZoom(re.vec=c(-0.11,-.09),im.vec=c(0.95,0.965)), aes(Var1,Var2,fill=as.numeric(value)^(1/1))) + | |
geom_raster() + | |
scale_fill_gradientn(colours=rainbow(7),na.value="black") + | |
theme_minimal() | |
#ggsave("ms-r=-.1_i=0.95_depth=25.pdf") | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment