Skip to content

Instantly share code, notes, and snippets.

@adamwespiser
Last active December 21, 2015 22:19
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save adamwespiser/6374323 to your computer and use it in GitHub Desktop.
Save adamwespiser/6374323 to your computer and use it in GitHub Desktop.
ggplot2 intro (Aug 2013)
##################################
# INTRO
#
#################################
## ggplot2 tutorial by Adam Wespiser
# August 29, 2013
# download R from: http://www.r-project.org/
# ggplot2 documentation: http://docs.ggplot2.org/
# R blog aggregator: r-bloggers.com
#
# R intro book : "R Coobook" by Paul Teetor
# R language design: "Software for Data Analysis: Programming with R" by John Chambers
# R data guide: "Data Manipulation with R" by Phil Spector
# ggplot2 book: "ggplot2: Elegant Graphics for Data Analysis" by Hadley Wickham
# add in this an an example:
## http://www.r-bloggers.com/2d-plot-with-histograms-for-each-dimension-2013-edition/
##################################
# INSTALL LIBRARIES
#
#################################
if (! "ggplot2" %in% rownames(installed.packages())){
install.packages("ggplot2")
}
library(ggplot2)
if (! "plyr" %in% rownames(installed.packages())){
install.packages("plyr")
}
library(plyr)
if (! "grid" %in% rownames(installed.packages())){
install.packages("grid")
}
library(grid)
if (! "reshape2" %in% rownames(installed.packages())){
install.packages("reshape2")
}
library(reshape2)
if (! "gridExtra" %in% rownames(installed.packages())){
install.packages("gridExtra")
}
library(gridExtra)
##################################
# R Basics
#
#################################
# to start a session, just invode R on the command line
# $ R
# Rstudio is the defacto IDE for R programming
# Once in a session, you can set and change the working directory
# > setwd("/Users/adam/sandbox/presentation/")
current.dir = getwd()
setwd(current.dir)
# to quit an R session, just type
# > quit()
# or without being prompted to save your history,
# > quit("no")
# view your command history
history()
# to see all the variables:
ls()
# remove variables
x = 1
x <- 1
rm(x)
# remove all variable
rm(list=unlist(ls()))
# find information about a function
?ggplot2
# search documentation
??color
# run a script
# source("myScript.R")
# run a script from the cmd line
# R CMD BATCH myScript.R output.txt
##################################
# Data Frames
#
#################################
## Data.frames
## Stores arrays of numerical, factor, and string data
# read in a file
if (file.exists("file.csv")){
dataFrame = read.csv(file="file.csv")
}
# create a data.frame
basic.df = data.frame(time=1:100,
value=runif(100))
# where 1:100 = c(1,2,3,...,100)
# runif(n) returns a vector of n uniform random floats between 0 and 1
# view data.frames
mtcars
# view information on data.frames
str(mtcars)
# get dimensions
dim(mtcars)
# get row/col names
rownames(mtcars)
colnames(mtcars)
# select data.frame row:
# mtcars[row,col]
# first 10 row, all columns
mtcars[1:10,]
# selection single column
mtcars$mpg # return vector
mtcars[["mpg"]] # retrun vector
mtcars[,"mpg"] # return vector
# select multiple cols
mtcars[c("mpg","cyl")] # return data.frame
# select specific columns
# use 'which' function to get vector of index values
iris[which(iris$Species== "setosa"),]
##################################
# ggplot2 basics
# plot types: density, bar, scatter, boxplot
#################################
## now on to ggplot
## every function as at least three things:
# 1) data ... iris
# 2) aesthetic mapping ... Sepal.Width vs. Sepal.Length
# 3) geometric object ... scatter plot (point)
## Scatter Plot
ggplot(iris,aes(x=Sepal.Width,y=Sepal.Length)) + geom_point()
## save a file: 1) enter ggplot2 commands 2) ggsave
# note the extension determines the file encoding
ggsave("fisherIrisData.pdf",height=6,width=5)
ggsave("fisherIrisData.png",height=6,width=5)
ggsave("fisherIrisData.jpg",height=6,width=5)
## Aesthetic Mappings:
# we can map: color, shape to data.frames or arbritrary values
ggplot(iris,aes(x=Sepal.Width,y=Sepal.Length,shape=Species)) + geom_point()
ggplot(iris,aes(x=Sepal.Width,y=Sepal.Length,color=Species)) + geom_point()
# set aesthetics to fixed values:
ggplot(iris,aes(x=Sepal.Width,y=Sepal.Length)) + geom_point(color="red",size=4)
ggplot(diamonds,aes(x=carat,y=price))+ geom_point(alpha=I(0.4))
ggplot(diamonds,aes(x=carat,y=price))+ geom_point(alpha=I(0.1))
# geoms can be layered
ggplot(iris,aes(x=Sepal.Width,y=Sepal.Length,color=Species,shape=Species)) +
geom_point(size=4) +
geom_point(size=2,color="grey")
## Line Plot
# note, the aesthetic mappings work the same
ggplot(economics,aes(x=date,y=uempmed))+geom_line()
# add a vertical line:
ggplot(economics,aes(x=date,y=uempmed))+ geom_line() +
geom_vline(x=as.numeric(as.Date("1986-09-12")))
ggplot(economics,aes(x=date,y=uempmed * pop))+ geom_line() +
geom_vline(x=as.numeric(as.Date("1986-09-12")))
## Bar Plot
ggplot(mtcars,aes(x=cyl))+geom_bar(binwidth=1)
# that looks weird...what's happening?
# ans = x scale is numeric, when we want categorical data.
# for categories, R uses "factors"
ggplot(mtcars,aes(x=factor(cyl)))+geom_bar()
# color the bars by type
ggplot(mtcars,aes(x=factor(cyl),fill=factor(gear)))+geom_bar()
# another, better example:
ggplot(diamonds, aes(clarity, fill=cut)) + geom_bar()
## Density Plot
ggplot(movies, aes(length))+geom_density()
# we need to "zoom in" to get a resonable view
# create distributions of short vs. non-short movies:
ggplot(movies, aes(x=length,fill=factor(Short)))+geom_density() + xlim(0,200)
# Set transperancy of distro
ggplot(iris, aes(x=Sepal.Width,fill=Species))+geom_density(alpha=I(0.4))
## FreqPoly
# like density, except not normalized to area=1
ggplot(iris, aes(x=Sepal.Width,fill=Species))+geom_freqpoly()
# let's adjust the bin and line thickness
ggplot(iris, aes(x=Sepal.Width,color=Species))+geom_freqpoly(bin=0.2,size=2)
## boxplot
ggplot(movies[movies$year > 1990,],aes(x=factor(year),y=rating))+geom_boxplot()
ggplot(movies[movies$year > 1990,],aes(x=factor(year),y=rating,fill=factor(Short)))+geom_boxplot()
## Path
# instead of a line, we can follow each subsequent point to the next...
# create a random walk:
steps = 500
# set our reduce function, so we don't have to use loops...
reduceFn = function(x,y)c(x[1]+runif(1)*2 - 1,x)
# create a function that returns a data.frame with a random walk of "steps" steps
randomWalk = function(steps)data.frame(x= Reduce(f=reduceFn,1:steps,init=c(1)),
y= Reduce(f=reduceFn,1:steps,init=c(1)),
index=1:(steps + 1))
# if you use a line plot, notice that the x values are ordered...
ggplot(randomWalk(1000),aes(x,y,color=index))+geom_line()
ggplot(randomWalk(1000),aes(x,y,color=index))+geom_path() + geom_point()
##################################
# Data Labels and Titles
#
#################################
# ggtitle, xlab, ylab
ggplot(iris,aes(x=Sepal.Width,y=Sepal.Length))+geom_point()+
ggtitle("Measurement of Flowers")+
xlab("width (cm)")+
ylab("length (cm)")
##################################
# Themes
#
#################################
# adjust the background color, tick marks, etc
#store what we have so far to a variable
iris.plot = ggplot(iris,aes(x=Sepal.Width,y=Sepal.Length,color=Species,shape=Species)) +
geom_point(size=4) +
geom_point(size=2,color="grey")+
ggtitle("Measurement of Flowers")+
xlab("width (cm)")+
ylab("length (cm)")
# plot with...
iris.plot
# change the background to black and white
iris.plot + theme_bw()
# some other themes to consider:
iris.plot + theme_classic()
iris.plot + theme_minimal()
# you can customize the theme and specify features...
iris.plot +
theme(plot.background = element_rect(fill = "white"),
panel.background = element_rect(fill = "white"),
legend.key = element_rect(fill = "white"),
legend.background = element_rect(colour = "black"),
panel.grid.minor = element_blank(),
panel.grid.major = element_line(colour = "lightgrey", linetype = "dotted"),
axis.line = element_line(),
axis.text = element_text(colour = "black"))
##################################
# Legend Positioning (theme subtopic?)
#
#################################
# what options do we have for the position of the legend?
# examples borrowed from: http://www.r-bloggers.com/ggplot2-cheatsheet-for-scatterplots/
g1<-ggplot(mtc, aes(x = hp, y = mpg)) + geom_point(aes(color=factor(vs)))
#move legend to the inside
g2 <- g1 + theme(legend.position=c(1,1),legend.justification=c(1,1))
# move legend to the bottom
g3 <- g1 + theme(legend.position = "bottom")
# change labels
g4 <- g1 + scale_color_discrete(name ="Engine",
labels=c("V-engine", "Straight engine"))
grid.arrange(g2, g3, g4, nrow=1)
##################################
# Facets
#
#################################
# Let's just look at PG, R, and PG-13 rated movies...
mov.df = movies[as.character(movies$mpaa) %in% c("PG","R","PG-13"),]
mov.df$mpaa = mov.df$mpaa[,drop=TRUE]
ggplot(mov.df,aes(x=rating,fill=factor(Short))) +
geom_density(alpha=I(0.3)) + theme_bw() +
facet_wrap(~mpaa)
# seperate movies by decade, rating: view rating distribution
mov.df$decade = mov.df$year %/% 10 * 10
mov50plus.df = mov.df[mov.df$decade >= 1950,]
# facet_grid( y ~ x )
ggplot(mov50plus.df, aes(x=rating,fill=factor(Drama))) +
geom_density(alpha=I(0.4))+ theme_bw()+
facet_grid(decade ~ mpaa )
##################################
# Grids
#
#################################
# Set up two different plots of iris data for two species, and view them
# side by side.
irisSmall.df <- subset(iris, Species != "versicolor")
p1 <- ggplot(irisSmall.df, aes(x= Sepal.Width, y = Sepal.Length)) + geom_point(aes(color=factor(Species)))
p2 <- ggplot(irisSmall.df, aes(x = Petal.Width, y = Petal.Length)) + geom_point(aes(color=factor(Species)))
# all plots in one row
grid.arrange(p1,p2,nrow=1)
# all plots in one col
grid.arrange(p1,p2,ncol=1)
##################################
# Coordinates
#
#################################
## Coordinates
# coord_flip
# flip the coords, easier to read x labels...
ggplot(mov50plus.df,aes(x=factor(year),y=length))+geom_boxplot()+theme_bw()
ggplot(mov50plus.df,aes(x=factor(year),y=length))+geom_boxplot()+theme_bw() + coord_flip()
# coord_polar
# How many teams do baseball players play for?
playerTeam.df = ddply(baseball,.(id),function(df)length(unique(df$team)))
teamsPerPlayer.df = as.data.frame(table(playerTeam.df$V1))
colnames(teamsPerPlayer.df) = c("teamsPlayedFor","numberOfPlayers")
teamsPerPlayer.df$teamsPlayedFor = as.numeric(teamsPerPlayer.df)
ggplot(teamsPerPlayer.df,aes(x=factor(1),y=numberOfPlayers,fill=factor(teamsPlayedFor))) +
geom_bar(stat="identity") +
theme_bw() +
coord_polar(theta="y")
# coord_trans
# apply a transform to your coordinates, simpler than applying it to the data...
ct.plot = ggplot(diamonds, aes(carat,price))+geom_point(alpha=I(0.1),size=1)+
theme_bw()
ct.plot
ct.plot + scale_y_log10()
ct.plot + scale_x_log10()
ct.plot + coord_trans(x="log10",y="log10")
## change the axis ticks w/ labels
ct.plot + scale_y_continuous(breaks= seq(0,20000,by=2500))+
scale_x_continuous(breaks= seq(0,5,by=0.5))
##################################
# Color/Fill
#
#################################
# note: different scales are used for discrete and continuous data
# scale_color_manual for discrete data
scm.plot = ggplot(iris,aes(x=Sepal.Length,y=Sepal.Width,color=factor(Species))) + geom_point(size=3)+
theme_bw()
# automatic assignment
scm.plot + scale_color_manual(values=c("black","red","blue"))
# specify colors and assignment to factor values
scm.plot + scale_color_manual(values=c("setosa"= "black","versicolor"= "red","virginica"= "blue"))
## scale_colour_gradient for continuous data
ggplot(baseball,aes(x=hr,y=rbi,color=h))+ geom_point()+
theme_bw() +
scale_colour_gradient(low="red",high="green")
## scale_colour_gradientn provides smoothing over multiple color values...
sin.df <-data.frame(x=1:1000,y=sin(1:1000),rr=1:1000 - (2*pi)*(1:1000 %/% 2*pi))
ggplot(sin.df,aes(x,y,color=rr))+geom_point(size=2)+theme_bw()+
scale_colour_gradientn(colours=c("black","yellow","blue"))
scgn.plot = ggplot(baseball,aes(x=hr,y=rbi,color=h))+ geom_point()+ theme_bw()
scgn.plot + scale_color_gradientn(colours=rainbow(7))
# you can cycle through the colors for better resolution
scgn.plot + scale_color_gradientn(colours=rep(rainbow(10),2))
## scale_colour_brewer, offers a set of nice palettes for discrete coloring:
ggplot(iris,aes(x=Sepal.Length,y=Sepal.Width,color=Species))+geom_point(size=3)+scale_colour_brewer(palette="Set1")+theme_bw()
## scale_fill_* works the same as the scale_colour_*, and is used when the geom is "filled" instead of "colored"
# (note this change from factor to integer is improper)
teamsPerPlayer.df$teamsPlayedForInt = 1:12
ggplot(teamsPerPlayer.df,aes(x=factor(1),y=numberOfPlayers,fill=teamsPlayedForInt)) +
geom_bar(stat="identity") +
theme_bw() +
coord_polar(theta="y")+
scale_fill_gradientn(colours=rainbow(10))
# we can try again using discrete values to get proper labels...
ggplot(teamsPerPlayer.df,aes(x=factor(1),y=numberOfPlayers,fill=factor(teamsPlayedFor))) +
geom_bar(stat="identity") +
theme_bw() +
coord_polar(theta="y")+
scale_fill_manual(values=rainbow(12),guide= guide_legend(ncol = 4,byrow=TRUE))+
theme(legend.position = "top",
panel.grid.minor.y = element_line(colour = "grey"),
panel.grid.major.y = element_line(colour = "grey"),
axis.text.x = element_text(size=rel(1.5)),
axis.text.y = element_blank(),
panel.border = element_blank(),
axis.ticks.y = element_blank()) +
scale_y_discrete(limits= seq(0,1228,by=100))+
xlab("")
#####################
# Advanced
# Examples
####################
# 1
# objective: look at the variance and mean of samples of discrete data
# approach: create a function that initiates the dataset, then returns a
# function whose arguments set the plot window
makePlotFunction <-function(size,fun,mean,range,rangeLower=2){
df<-as.data.frame(t(as.matrix(
sapply(floor(runif(size)*(range))+ rangeLower,
function(x){vec<-sapply(fun(x,mean),floor);
c(mean(vec),var(vec),length(vec))}))))#end of data.frame
colnames(df)<-c("mean","variance","n");
function(pMin,pMax,xVec,yVec){
df$pSize = as.numeric(df$n)
ggplot(df,aes(x=mean,y=variance,color=factor(n),size=pSize))+
geom_point()+
scale_size(range = c(pMin,pMax))+
theme_bw()+
xlim(xVec)+
ylim(yVec)}}
# set the function
f1= makePlotFunction(size=10000,fun=rnorm,mean=2,range=80,rangeLower=100)
f1(xVec=c(1,2),yVec=c(0,2),pMin=2.2,pMax=0.2)
f1(xVec=c(1.4,1.6),yVec=c(1.1,1.4),pMin=2.2,pMax=0.2)
## Mosiac Plot
library(MASS)
data(caith)
# melt function :: wide data.frame -> long data.frame
caith.melt= melt(
cbind(as.data.frame(caith),
row=rownames(caith)),
id.var=c("row"))
# rename the columns
colnames(caith.melt) = c("eyeColor","hairColor","proportion")
ggplot(caith.melt ,aes(x=eyeColor,y=proportion,fill=hairColor,color=hairColor)) +
geom_bar(position="fill",stat="identity",size=1) +
scale_fill_manual(values=c(topo.colors(4),topo.colors(1))) +
theme_bw() +
scale_color_manual(values=rep("black",6))
### Mandelbrot set
normIter3 = function(n,z,P=2,N=200){n - log( (log(abs(z),base=N)),base=P) }
iterate.until.escape <- function(z, c, trans, cond, max=50, response=dwell) {
#we iterate all active points in the same array operation,
#and keeping track of which points are still iterating.
active <- seq_along(z)
dwell <- z
dwell[] <- 0
for (i in 1:max) {
z[active] <- trans(z[active], c[active]);
survived <- cond(z[active])
dwell[active[!survived]] <- normIter3(i, z[active[!survived]])
#dwell[active[!survived]] <- i
active <- active[survived]
if (length(active) == 0) break
}
eval(substitute(response))
}
getArrayZoom <- function(re.vec=c(-2,1),im.vec=c(-1.5,1.5),max=25,len=500){
re = seq(re.vec[1], re.vec[2], len=len)
im = seq(im.vec[1], im.vec[2], len=len)
c <- outer(re, im, function(x,y) complex(real=x, imaginary=y))
x <- iterate.until.escape(array(0, dim(c)), c,
function(z,c)z^2+c,
function(z)abs(z) <= 2,
max=max) # was 100
melt(x)
}
xRel = c(-2,0.5)
yImg = c(-1.5,1.5)
ggplot(getArrayZoom(re.vec=xRel,
im.vec=yImg),
aes(Var1,Var2,fill=as.numeric(value))) +
geom_raster() +
scale_fill_gradientn(colours=c("black",(rainbow(7)),na.value="black")) +
theme_minimal()+
xlab("pixels of Real values")+
ylab("imaginary values")
ggplot(getArrayZoom(re.vec=c(-0.11,-.09),im.vec=c(0.95,0.965)), aes(Var1,Var2,fill=as.numeric(value)^(1/1))) +
geom_raster() +
scale_fill_gradientn(colours=rainbow(7),na.value="black") +
theme_minimal()
#ggsave("ms-r=-.1_i=0.95_depth=25.pdf")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment