adamwespiser/ggplot2Presentation.R

## ggplot2Presentation.R
##################################
#    INTRO
#
#################################


## ggplot2 tutorial by Adam Wespiser
# August 29, 2013
# download R from: http://www.r-project.org/
# ggplot2 documentation: http://docs.ggplot2.org/
# R blog aggregator: r-bloggers.com
#
# R intro book : "R Coobook" by Paul Teetor
# R language design: "Software for Data Analysis: Programming with R" by John Chambers
# R data guide: "Data Manipulation with R" by Phil Spector
# ggplot2 book: "ggplot2: Elegant Graphics for Data Analysis" by Hadley Wickham


# add in this an an example:
##  http://www.r-bloggers.com/2d-plot-with-histograms-for-each-dimension-2013-edition/

##################################
#    INSTALL LIBRARIES
#
#################################

if (! "ggplot2" %in% rownames(installed.packages())){
install.packages("ggplot2")
}
library(ggplot2)

if (! "plyr" %in% rownames(installed.packages())){
install.packages("plyr")
}
library(plyr)

if (! "grid" %in% rownames(installed.packages())){
install.packages("grid")
}
library(grid)

if (! "reshape2" %in% rownames(installed.packages())){
install.packages("reshape2")
}
library(reshape2)

if (! "gridExtra" %in% rownames(installed.packages())){
install.packages("gridExtra")
}
library(gridExtra)


##################################
#    R Basics
#
#################################

# to start a session, just invode R on the command line
# $ R
# Rstudio is the defacto IDE for R programming

# Once in a session, you can set and change the working directory
# > setwd("/Users/adam/sandbox/presentation/")

current.dir = getwd()
setwd(current.dir)

# to quit an R session, just type
# > quit()
# or without being prompted to save your history,
# > quit("no")

# view your command history
history()

# to see all the variables:
ls()

# remove variables
x = 1
x <- 1
rm(x)

# remove all variable
rm(list=unlist(ls()))

# find information about a function
?ggplot2

# search documentation
??color

# run a script
# source("myScript.R")

# run a script from the cmd line
# R CMD BATCH myScript.R output.txt


##################################
#    Data Frames
#
#################################

## Data.frames
## Stores arrays of numerical, factor, and string data
# read in a file
if (file.exists("file.csv")){
dataFrame = read.csv(file="file.csv")
}

# create a data.frame
basic.df = data.frame(time=1:100,
                      value=runif(100))
# where 1:100 = c(1,2,3,...,100)
# runif(n) returns a vector of n uniform random floats between 0 and 1

# view data.frames
mtcars

# view information on data.frames
str(mtcars)

# get dimensions
dim(mtcars)

# get row/col names
rownames(mtcars)
colnames(mtcars)

# select data.frame row:
# mtcars[row,col]
# first 10 row, all columns
mtcars[1:10,]

# selection single column
mtcars$mpg # return vector
mtcars[["mpg"]] # retrun vector
mtcars[,"mpg"] # return vector

# select multiple cols
mtcars[c("mpg","cyl")] # return data.frame


# select specific columns
# use 'which' function to get vector of index values
iris[which(iris$Species==  "setosa"),]


##################################
#    ggplot2 basics
#    plot types: density, bar, scatter, boxplot
#################################

## now on to ggplot
## every function as at least three things:
# 1) data   ... iris
# 2) aesthetic mapping  ... Sepal.Width vs. Sepal.Length
# 3) geometric object ...  scatter plot (point)

## Scatter Plot

ggplot(iris,aes(x=Sepal.Width,y=Sepal.Length)) + geom_point()

## save a file: 1) enter ggplot2 commands 2) ggsave
# note the extension determines the file encoding
ggsave("fisherIrisData.pdf",height=6,width=5)
ggsave("fisherIrisData.png",height=6,width=5)
ggsave("fisherIrisData.jpg",height=6,width=5)


## Aesthetic Mappings:
# we can map: color, shape to data.frames or arbritrary values

ggplot(iris,aes(x=Sepal.Width,y=Sepal.Length,shape=Species)) + geom_point()
ggplot(iris,aes(x=Sepal.Width,y=Sepal.Length,color=Species)) + geom_point()

# set aesthetics to fixed values:
ggplot(iris,aes(x=Sepal.Width,y=Sepal.Length)) + geom_point(color="red",size=4)
ggplot(diamonds,aes(x=carat,y=price))+ geom_point(alpha=I(0.4))
ggplot(diamonds,aes(x=carat,y=price))+ geom_point(alpha=I(0.1))


# geoms can be layered
ggplot(iris,aes(x=Sepal.Width,y=Sepal.Length,color=Species,shape=Species)) +
    geom_point(size=4) +
    geom_point(size=2,color="grey")

## Line Plot
# note, the aesthetic mappings work the same
ggplot(economics,aes(x=date,y=uempmed))+geom_line()

# add a vertical line:
ggplot(economics,aes(x=date,y=uempmed))+ geom_line() +
    geom_vline(x=as.numeric(as.Date("1986-09-12")))

ggplot(economics,aes(x=date,y=uempmed * pop))+ geom_line() +
    geom_vline(x=as.numeric(as.Date("1986-09-12")))

## Bar Plot
ggplot(mtcars,aes(x=cyl))+geom_bar(binwidth=1)

# that looks weird...what's happening?
# ans = x scale is numeric, when we want categorical data.
# for categories, R uses "factors"

ggplot(mtcars,aes(x=factor(cyl)))+geom_bar()

# color the bars by type
ggplot(mtcars,aes(x=factor(cyl),fill=factor(gear)))+geom_bar()

# another, better example:
ggplot(diamonds, aes(clarity, fill=cut)) + geom_bar()

## Density Plot
ggplot(movies, aes(length))+geom_density()

# we need to "zoom in" to get a resonable view
# create distributions of short vs. non-short movies:
ggplot(movies, aes(x=length,fill=factor(Short)))+geom_density() + xlim(0,200)

# Set transperancy of distro
ggplot(iris, aes(x=Sepal.Width,fill=Species))+geom_density(alpha=I(0.4))

## FreqPoly
# like density, except not normalized to area=1
ggplot(iris, aes(x=Sepal.Width,fill=Species))+geom_freqpoly()

# let's adjust the bin and line thickness
ggplot(iris, aes(x=Sepal.Width,color=Species))+geom_freqpoly(bin=0.2,size=2)

## boxplot
ggplot(movies[movies$year > 1990,],aes(x=factor(year),y=rating))+geom_boxplot()
ggplot(movies[movies$year > 1990,],aes(x=factor(year),y=rating,fill=factor(Short)))+geom_boxplot()

## Path
# instead of a line, we can follow each subsequent point to the next...

# create a random walk:
steps = 500
# set our reduce function, so we don't have to use loops...
reduceFn = function(x,y)c(x[1]+runif(1)*2 - 1,x)
# create a function that returns a data.frame with a random walk of "steps" steps
randomWalk  = function(steps)data.frame(x= Reduce(f=reduceFn,1:steps,init=c(1)),
                        y=  Reduce(f=reduceFn,1:steps,init=c(1)),
                        index=1:(steps + 1))

# if you use a line plot, notice that the x values are ordered...
ggplot(randomWalk(1000),aes(x,y,color=index))+geom_line()


ggplot(randomWalk(1000),aes(x,y,color=index))+geom_path() + geom_point()


##################################
#    Data Labels and Titles
#
#################################

# ggtitle, xlab, ylab
ggplot(iris,aes(x=Sepal.Width,y=Sepal.Length))+geom_point()+
   ggtitle("Measurement of Flowers")+
   xlab("width (cm)")+
   ylab("length (cm)")


##################################
#    Themes
#
#################################

# adjust the background color, tick marks, etc

#store what we have so far to a variable
iris.plot = ggplot(iris,aes(x=Sepal.Width,y=Sepal.Length,color=Species,shape=Species)) +
   geom_point(size=4) +
   geom_point(size=2,color="grey")+
   ggtitle("Measurement of Flowers")+
   xlab("width (cm)")+
   ylab("length (cm)")

# plot with...
iris.plot

# change the background to black and white
iris.plot + theme_bw()

# some other themes to consider:
iris.plot + theme_classic()
iris.plot + theme_minimal()

# you can customize the theme and specify features...

iris.plot +
     theme(plot.background = element_rect(fill = "white"),
           panel.background = element_rect(fill = "white"),
           legend.key = element_rect(fill = "white"),
           legend.background = element_rect(colour = "black"),
           panel.grid.minor = element_blank(),
           panel.grid.major = element_line(colour = "lightgrey", linetype = "dotted"),
           axis.line = element_line(),
           axis.text = element_text(colour = "black"))


##################################
#  Legend Positioning (theme subtopic?)
#
#################################
# what options do we have for the position of the legend?
# examples borrowed from: http://www.r-bloggers.com/ggplot2-cheatsheet-for-scatterplots/

g1<-ggplot(mtc, aes(x = hp, y = mpg)) + geom_point(aes(color=factor(vs)))
#move legend to the inside
g2 <- g1 + theme(legend.position=c(1,1),legend.justification=c(1,1))
# move legend to the bottom
g3 <- g1 + theme(legend.position = "bottom")
# change labels
g4 <- g1 + scale_color_discrete(name ="Engine",
                                labels=c("V-engine", "Straight engine"))

grid.arrange(g2, g3, g4, nrow=1)


##################################
#    Facets
#
#################################

# Let's just look at PG, R, and PG-13 rated movies...
mov.df = movies[as.character(movies$mpaa) %in% c("PG","R","PG-13"),]
mov.df$mpaa = mov.df$mpaa[,drop=TRUE]
ggplot(mov.df,aes(x=rating,fill=factor(Short))) +
    geom_density(alpha=I(0.3)) + theme_bw() +
    facet_wrap(~mpaa)

# seperate movies by decade, rating: view rating distribution
mov.df$decade = mov.df$year %/% 10 * 10
mov50plus.df = mov.df[mov.df$decade >= 1950,]


# facet_grid( y ~ x )
ggplot(mov50plus.df, aes(x=rating,fill=factor(Drama))) +
    geom_density(alpha=I(0.4))+ theme_bw()+
    facet_grid(decade ~ mpaa )

##################################
#   Grids
#
#################################

# Set up two different plots of iris data for two species, and view them
# side by side.
irisSmall.df <- subset(iris, Species != "versicolor")

p1 <- ggplot(irisSmall.df, aes(x= Sepal.Width, y = Sepal.Length)) + geom_point(aes(color=factor(Species)))
p2 <- ggplot(irisSmall.df, aes(x = Petal.Width, y = Petal.Length)) + geom_point(aes(color=factor(Species)))

# all plots in one row
grid.arrange(p1,p2,nrow=1)

# all plots in one col
grid.arrange(p1,p2,ncol=1)


##################################
#    Coordinates
#
#################################

## Coordinates

# coord_flip
# flip the coords, easier to read x labels...
ggplot(mov50plus.df,aes(x=factor(year),y=length))+geom_boxplot()+theme_bw()
ggplot(mov50plus.df,aes(x=factor(year),y=length))+geom_boxplot()+theme_bw() + coord_flip()

# coord_polar
# How many teams do baseball players play for?
playerTeam.df = ddply(baseball,.(id),function(df)length(unique(df$team)))
teamsPerPlayer.df = as.data.frame(table(playerTeam.df$V1))

colnames(teamsPerPlayer.df) = c("teamsPlayedFor","numberOfPlayers")
teamsPerPlayer.df$teamsPlayedFor = as.numeric(teamsPerPlayer.df)
ggplot(teamsPerPlayer.df,aes(x=factor(1),y=numberOfPlayers,fill=factor(teamsPlayedFor))) +
    geom_bar(stat="identity") +
    theme_bw() +
    coord_polar(theta="y")

# coord_trans
# apply a transform to your coordinates, simpler than applying it to the data...
ct.plot = ggplot(diamonds, aes(carat,price))+geom_point(alpha=I(0.1),size=1)+
              theme_bw()
ct.plot
ct.plot + scale_y_log10()
ct.plot + scale_x_log10()
ct.plot + coord_trans(x="log10",y="log10")

## change the axis ticks w/ labels
ct.plot + scale_y_continuous(breaks= seq(0,20000,by=2500))+
           scale_x_continuous(breaks= seq(0,5,by=0.5))


##################################
#    Color/Fill
#
#################################

# note: different scales are used for discrete and continuous data

# scale_color_manual for discrete data
scm.plot = ggplot(iris,aes(x=Sepal.Length,y=Sepal.Width,color=factor(Species))) + geom_point(size=3)+
    theme_bw()

# automatic assignment
scm.plot + scale_color_manual(values=c("black","red","blue"))
# specify colors and assignment to factor values
scm.plot + scale_color_manual(values=c("setosa"= "black","versicolor"= "red","virginica"= "blue"))

## scale_colour_gradient for continuous data
ggplot(baseball,aes(x=hr,y=rbi,color=h))+ geom_point()+
   theme_bw() +
   scale_colour_gradient(low="red",high="green")

## scale_colour_gradientn provides smoothing over multiple color values...
sin.df <-data.frame(x=1:1000,y=sin(1:1000),rr=1:1000 - (2*pi)*(1:1000 %/% 2*pi))
ggplot(sin.df,aes(x,y,color=rr))+geom_point(size=2)+theme_bw()+
   scale_colour_gradientn(colours=c("black","yellow","blue"))

scgn.plot = ggplot(baseball,aes(x=hr,y=rbi,color=h))+ geom_point()+ theme_bw()
scgn.plot + scale_color_gradientn(colours=rainbow(7))

# you can cycle through the colors for better resolution
scgn.plot + scale_color_gradientn(colours=rep(rainbow(10),2))

## scale_colour_brewer, offers a set of nice palettes for discrete coloring:
ggplot(iris,aes(x=Sepal.Length,y=Sepal.Width,color=Species))+geom_point(size=3)+scale_colour_brewer(palette="Set1")+theme_bw()

## scale_fill_* works the same as the scale_colour_*, and is used when the geom is "filled" instead of "colored"

# (note this change from factor to integer is improper)
teamsPerPlayer.df$teamsPlayedForInt = 1:12

ggplot(teamsPerPlayer.df,aes(x=factor(1),y=numberOfPlayers,fill=teamsPlayedForInt)) +
    geom_bar(stat="identity") +
    theme_bw() +
    coord_polar(theta="y")+
    scale_fill_gradientn(colours=rainbow(10))


# we can try again using discrete values to get proper labels...
ggplot(teamsPerPlayer.df,aes(x=factor(1),y=numberOfPlayers,fill=factor(teamsPlayedFor))) +
    geom_bar(stat="identity") +
    theme_bw() +
    coord_polar(theta="y")+
    scale_fill_manual(values=rainbow(12),guide= guide_legend(ncol = 4,byrow=TRUE))+
    theme(legend.position = "top",
          panel.grid.minor.y = element_line(colour = "grey"),
          panel.grid.major.y = element_line(colour = "grey"),
          axis.text.x = element_text(size=rel(1.5)),
          axis.text.y = element_blank(),
          panel.border = element_blank(),
          axis.ticks.y = element_blank()) +
    scale_y_discrete(limits= seq(0,1228,by=100))+
    xlab("")


#####################
#  Advanced
#  Examples
####################

# 1
# objective: look at the variance and mean of samples of discrete data
# approach: create a function that initiates the dataset, then returns a
# function whose arguments set the plot window
makePlotFunction <-function(size,fun,mean,range,rangeLower=2){
    df<-as.data.frame(t(as.matrix(
                     sapply(floor(runif(size)*(range))+ rangeLower,
                     function(x){vec<-sapply(fun(x,mean),floor);
                                c(mean(vec),var(vec),length(vec))}))))#end of data.frame
    colnames(df)<-c("mean","variance","n");
         function(pMin,pMax,xVec,yVec){
                df$pSize = as.numeric(df$n)
                ggplot(df,aes(x=mean,y=variance,color=factor(n),size=pSize))+
                geom_point()+
                scale_size(range = c(pMin,pMax))+
                theme_bw()+
                xlim(xVec)+
                ylim(yVec)}}

# set the function
f1= makePlotFunction(size=10000,fun=rnorm,mean=2,range=80,rangeLower=100)
f1(xVec=c(1,2),yVec=c(0,2),pMin=2.2,pMax=0.2)
f1(xVec=c(1.4,1.6),yVec=c(1.1,1.4),pMin=2.2,pMax=0.2)

## Mosiac Plot
library(MASS)
data(caith)

# melt function :: wide data.frame -> long data.frame
caith.melt= melt(
                 cbind(as.data.frame(caith),
                       row=rownames(caith)),
                 id.var=c("row"))

# rename the columns
colnames(caith.melt) = c("eyeColor","hairColor","proportion")

ggplot(caith.melt ,aes(x=eyeColor,y=proportion,fill=hairColor,color=hairColor)) +
    geom_bar(position="fill",stat="identity",size=1) +
    scale_fill_manual(values=c(topo.colors(4),topo.colors(1))) +
    theme_bw() +
    scale_color_manual(values=rep("black",6))

### Mandelbrot set
normIter3 = function(n,z,P=2,N=200){n - log( (log(abs(z),base=N)),base=P) }
iterate.until.escape <- function(z, c, trans, cond, max=50, response=dwell) {
    #we iterate all active points in the same array operation,
    #and keeping track of which points are still iterating.
    active <- seq_along(z)
    dwell <- z
    dwell[] <- 0
    for (i in 1:max) {
        z[active] <- trans(z[active], c[active]);
        survived <- cond(z[active])
        dwell[active[!survived]] <- normIter3(i, z[active[!survived]])
        #dwell[active[!survived]] <- i
        active <- active[survived]
        if (length(active) == 0) break
    }
    eval(substitute(response))
}


getArrayZoom <- function(re.vec=c(-2,1),im.vec=c(-1.5,1.5),max=25,len=500){
        re = seq(re.vec[1], re.vec[2], len=len)
        im = seq(im.vec[1], im.vec[2], len=len)
        c <- outer(re, im, function(x,y) complex(real=x, imaginary=y))
        x <- iterate.until.escape(array(0, dim(c)), c,
                                  function(z,c)z^2+c,
                                  function(z)abs(z) <= 2,
                                  max=max) # was 100
        melt(x)
}

xRel = c(-2,0.5)
yImg = c(-1.5,1.5)

ggplot(getArrayZoom(re.vec=xRel,
                    im.vec=yImg),
       aes(Var1,Var2,fill=as.numeric(value))) +
       geom_raster() +
       scale_fill_gradientn(colours=c("black",(rainbow(7)),na.value="black")) +
       theme_minimal()+
       xlab("pixels of Real values")+
       ylab("imaginary values")

  ggplot(getArrayZoom(re.vec=c(-0.11,-.09),im.vec=c(0.95,0.965)), aes(Var1,Var2,fill=as.numeric(value)^(1/1))) +
          geom_raster() +
          scale_fill_gradientn(colours=rainbow(7),na.value="black") +
          theme_minimal()
         #ggsave("ms-r=-.1_i=0.95_depth=25.pdf")
	##################################
	# INTRO
	#
	#################################


	## ggplot2 tutorial by Adam Wespiser
	# August 29, 2013
	# download R from: http://www.r-project.org/
	# ggplot2 documentation: http://docs.ggplot2.org/
	# R blog aggregator: r-bloggers.com
	#
	# R intro book : "R Coobook" by Paul Teetor
	# R language design: "Software for Data Analysis: Programming with R" by John Chambers
	# R data guide: "Data Manipulation with R" by Phil Spector
	# ggplot2 book: "ggplot2: Elegant Graphics for Data Analysis" by Hadley Wickham


	# add in this an an example:
	## http://www.r-bloggers.com/2d-plot-with-histograms-for-each-dimension-2013-edition/

	##################################
	# INSTALL LIBRARIES
	#
	#################################

	if (! "ggplot2" %in% rownames(installed.packages())){
	install.packages("ggplot2")
	}
	library(ggplot2)

	if (! "plyr" %in% rownames(installed.packages())){
	install.packages("plyr")
	}
	library(plyr)

	if (! "grid" %in% rownames(installed.packages())){
	install.packages("grid")
	}
	library(grid)

	if (! "reshape2" %in% rownames(installed.packages())){
	install.packages("reshape2")
	}
	library(reshape2)

	if (! "gridExtra" %in% rownames(installed.packages())){
	install.packages("gridExtra")
	}
	library(gridExtra)


	##################################
	# R Basics
	#
	#################################

	# to start a session, just invode R on the command line
	# $ R
	# Rstudio is the defacto IDE for R programming

	# Once in a session, you can set and change the working directory
	# > setwd("/Users/adam/sandbox/presentation/")

	current.dir = getwd()
	setwd(current.dir)

	# to quit an R session, just type
	# > quit()
	# or without being prompted to save your history,
	# > quit("no")

	# view your command history
	history()

	# to see all the variables:
	ls()

	# remove variables
	x = 1
	x <- 1
	rm(x)

	# remove all variable
	rm(list=unlist(ls()))

	# find information about a function
	?ggplot2

	# search documentation
	??color

	# run a script
	# source("myScript.R")

	# run a script from the cmd line
	# R CMD BATCH myScript.R output.txt



	##################################
	# Data Frames
	#
	#################################

	## Data.frames
	## Stores arrays of numerical, factor, and string data
	# read in a file
	if (file.exists("file.csv")){
	dataFrame = read.csv(file="file.csv")
	}

	# create a data.frame
	basic.df = data.frame(time=1:100,
	value=runif(100))
	# where 1:100 = c(1,2,3,...,100)
	# runif(n) returns a vector of n uniform random floats between 0 and 1

	# view data.frames
	mtcars

	# view information on data.frames
	str(mtcars)

	# get dimensions
	dim(mtcars)

	# get row/col names
	rownames(mtcars)
	colnames(mtcars)

	# select data.frame row:
	# mtcars[row,col]
	# first 10 row, all columns
	mtcars[1:10,]

	# selection single column
	mtcars$mpg # return vector
	mtcars[["mpg"]] # retrun vector
	mtcars[,"mpg"] # return vector

	# select multiple cols
	mtcars[c("mpg","cyl")] # return data.frame


	# select specific columns
	# use 'which' function to get vector of index values
	iris[which(iris$Species== "setosa"),]



	##################################
	# ggplot2 basics
	# plot types: density, bar, scatter, boxplot
	#################################

	## now on to ggplot
	## every function as at least three things:
	# 1) data ... iris
	# 2) aesthetic mapping ... Sepal.Width vs. Sepal.Length
	# 3) geometric object ... scatter plot (point)

	## Scatter Plot

	ggplot(iris,aes(x=Sepal.Width,y=Sepal.Length)) + geom_point()

	## save a file: 1) enter ggplot2 commands 2) ggsave
	# note the extension determines the file encoding
	ggsave("fisherIrisData.pdf",height=6,width=5)
	ggsave("fisherIrisData.png",height=6,width=5)
	ggsave("fisherIrisData.jpg",height=6,width=5)


	## Aesthetic Mappings:
	# we can map: color, shape to data.frames or arbritrary values

	ggplot(iris,aes(x=Sepal.Width,y=Sepal.Length,shape=Species)) + geom_point()
	ggplot(iris,aes(x=Sepal.Width,y=Sepal.Length,color=Species)) + geom_point()

	# set aesthetics to fixed values:
	ggplot(iris,aes(x=Sepal.Width,y=Sepal.Length)) + geom_point(color="red",size=4)
	ggplot(diamonds,aes(x=carat,y=price))+ geom_point(alpha=I(0.4))
	ggplot(diamonds,aes(x=carat,y=price))+ geom_point(alpha=I(0.1))


	# geoms can be layered
	ggplot(iris,aes(x=Sepal.Width,y=Sepal.Length,color=Species,shape=Species)) +
	geom_point(size=4) +
	geom_point(size=2,color="grey")

	## Line Plot
	# note, the aesthetic mappings work the same
	ggplot(economics,aes(x=date,y=uempmed))+geom_line()

	# add a vertical line:
	ggplot(economics,aes(x=date,y=uempmed))+ geom_line() +
	geom_vline(x=as.numeric(as.Date("1986-09-12")))

	ggplot(economics,aes(x=date,y=uempmed * pop))+ geom_line() +
	geom_vline(x=as.numeric(as.Date("1986-09-12")))

	## Bar Plot
	ggplot(mtcars,aes(x=cyl))+geom_bar(binwidth=1)

	# that looks weird...what's happening?
	# ans = x scale is numeric, when we want categorical data.
	# for categories, R uses "factors"

	ggplot(mtcars,aes(x=factor(cyl)))+geom_bar()

	# color the bars by type
	ggplot(mtcars,aes(x=factor(cyl),fill=factor(gear)))+geom_bar()

	# another, better example:
	ggplot(diamonds, aes(clarity, fill=cut)) + geom_bar()

	## Density Plot
	ggplot(movies, aes(length))+geom_density()

	# we need to "zoom in" to get a resonable view
	# create distributions of short vs. non-short movies:
	ggplot(movies, aes(x=length,fill=factor(Short)))+geom_density() + xlim(0,200)

	# Set transperancy of distro
	ggplot(iris, aes(x=Sepal.Width,fill=Species))+geom_density(alpha=I(0.4))

	## FreqPoly
	# like density, except not normalized to area=1
	ggplot(iris, aes(x=Sepal.Width,fill=Species))+geom_freqpoly()

	# let's adjust the bin and line thickness
	ggplot(iris, aes(x=Sepal.Width,color=Species))+geom_freqpoly(bin=0.2,size=2)

	## boxplot
	ggplot(movies[movies$year > 1990,],aes(x=factor(year),y=rating))+geom_boxplot()
	ggplot(movies[movies$year > 1990,],aes(x=factor(year),y=rating,fill=factor(Short)))+geom_boxplot()

	## Path
	# instead of a line, we can follow each subsequent point to the next...

	# create a random walk:
	steps = 500
	# set our reduce function, so we don't have to use loops...
	reduceFn = function(x,y)c(x[1]+runif(1)*2 - 1,x)
	# create a function that returns a data.frame with a random walk of "steps" steps
	randomWalk = function(steps)data.frame(x= Reduce(f=reduceFn,1:steps,init=c(1)),
	y= Reduce(f=reduceFn,1:steps,init=c(1)),
	index=1:(steps + 1))

	# if you use a line plot, notice that the x values are ordered...
	ggplot(randomWalk(1000),aes(x,y,color=index))+geom_line()


	ggplot(randomWalk(1000),aes(x,y,color=index))+geom_path() + geom_point()



	##################################
	# Data Labels and Titles
	#
	#################################

	# ggtitle, xlab, ylab
	ggplot(iris,aes(x=Sepal.Width,y=Sepal.Length))+geom_point()+
	ggtitle("Measurement of Flowers")+
	xlab("width (cm)")+
	ylab("length (cm)")


	##################################
	# Themes
	#
	#################################

	# adjust the background color, tick marks, etc

	#store what we have so far to a variable
	iris.plot = ggplot(iris,aes(x=Sepal.Width,y=Sepal.Length,color=Species,shape=Species)) +
	geom_point(size=4) +
	geom_point(size=2,color="grey")+
	ggtitle("Measurement of Flowers")+
	xlab("width (cm)")+
	ylab("length (cm)")

	# plot with...
	iris.plot

	# change the background to black and white
	iris.plot + theme_bw()

	# some other themes to consider:
	iris.plot + theme_classic()
	iris.plot + theme_minimal()

	# you can customize the theme and specify features...

	iris.plot +
	theme(plot.background = element_rect(fill = "white"),
	panel.background = element_rect(fill = "white"),
	legend.key = element_rect(fill = "white"),
	legend.background = element_rect(colour = "black"),
	panel.grid.minor = element_blank(),
	panel.grid.major = element_line(colour = "lightgrey", linetype = "dotted"),
	axis.line = element_line(),
	axis.text = element_text(colour = "black"))


	##################################
	# Legend Positioning (theme subtopic?)
	#
	#################################
	# what options do we have for the position of the legend?
	# examples borrowed from: http://www.r-bloggers.com/ggplot2-cheatsheet-for-scatterplots/

	g1<-ggplot(mtc, aes(x = hp, y = mpg)) + geom_point(aes(color=factor(vs)))
	#move legend to the inside
	g2 <- g1 + theme(legend.position=c(1,1),legend.justification=c(1,1))
	# move legend to the bottom
	g3 <- g1 + theme(legend.position = "bottom")
	# change labels
	g4 <- g1 + scale_color_discrete(name ="Engine",
	labels=c("V-engine", "Straight engine"))

	grid.arrange(g2, g3, g4, nrow=1)




	##################################
	# Facets
	#
	#################################

	# Let's just look at PG, R, and PG-13 rated movies...
	mov.df = movies[as.character(movies$mpaa) %in% c("PG","R","PG-13"),]
	mov.df$mpaa = mov.df$mpaa[,drop=TRUE]
	ggplot(mov.df,aes(x=rating,fill=factor(Short))) +
	geom_density(alpha=I(0.3)) + theme_bw() +
	facet_wrap(~mpaa)

	# seperate movies by decade, rating: view rating distribution
	mov.df$decade = mov.df$year %/% 10 * 10
	mov50plus.df = mov.df[mov.df$decade >= 1950,]


	# facet_grid( y ~ x )
	ggplot(mov50plus.df, aes(x=rating,fill=factor(Drama))) +
	geom_density(alpha=I(0.4))+ theme_bw()+
	facet_grid(decade ~ mpaa )

	##################################
	# Grids
	#
	#################################

	# Set up two different plots of iris data for two species, and view them
	# side by side.
	irisSmall.df <- subset(iris, Species != "versicolor")

	p1 <- ggplot(irisSmall.df, aes(x= Sepal.Width, y = Sepal.Length)) + geom_point(aes(color=factor(Species)))
	p2 <- ggplot(irisSmall.df, aes(x = Petal.Width, y = Petal.Length)) + geom_point(aes(color=factor(Species)))

	# all plots in one row
	grid.arrange(p1,p2,nrow=1)

	# all plots in one col
	grid.arrange(p1,p2,ncol=1)





	##################################
	# Coordinates
	#
	#################################

	## Coordinates

	# coord_flip
	# flip the coords, easier to read x labels...
	ggplot(mov50plus.df,aes(x=factor(year),y=length))+geom_boxplot()+theme_bw()
	ggplot(mov50plus.df,aes(x=factor(year),y=length))+geom_boxplot()+theme_bw() + coord_flip()

	# coord_polar
	# How many teams do baseball players play for?
	playerTeam.df = ddply(baseball,.(id),function(df)length(unique(df$team)))
	teamsPerPlayer.df = as.data.frame(table(playerTeam.df$V1))

	colnames(teamsPerPlayer.df) = c("teamsPlayedFor","numberOfPlayers")
	teamsPerPlayer.df$teamsPlayedFor = as.numeric(teamsPerPlayer.df)
	ggplot(teamsPerPlayer.df,aes(x=factor(1),y=numberOfPlayers,fill=factor(teamsPlayedFor))) +
	geom_bar(stat="identity") +
	theme_bw() +
	coord_polar(theta="y")

	# coord_trans
	# apply a transform to your coordinates, simpler than applying it to the data...
	ct.plot = ggplot(diamonds, aes(carat,price))+geom_point(alpha=I(0.1),size=1)+
	theme_bw()
	ct.plot
	ct.plot + scale_y_log10()
	ct.plot + scale_x_log10()
	ct.plot + coord_trans(x="log10",y="log10")

	## change the axis ticks w/ labels
	ct.plot + scale_y_continuous(breaks= seq(0,20000,by=2500))+
	scale_x_continuous(breaks= seq(0,5,by=0.5))



	##################################
	# Color/Fill
	#
	#################################

	# note: different scales are used for discrete and continuous data

	# scale_color_manual for discrete data
	scm.plot = ggplot(iris,aes(x=Sepal.Length,y=Sepal.Width,color=factor(Species))) + geom_point(size=3)+
	theme_bw()

	# automatic assignment
	scm.plot + scale_color_manual(values=c("black","red","blue"))
	# specify colors and assignment to factor values
	scm.plot + scale_color_manual(values=c("setosa"= "black","versicolor"= "red","virginica"= "blue"))

	## scale_colour_gradient for continuous data
	ggplot(baseball,aes(x=hr,y=rbi,color=h))+ geom_point()+
	theme_bw() +
	scale_colour_gradient(low="red",high="green")

	## scale_colour_gradientn provides smoothing over multiple color values...
	sin.df <-data.frame(x=1:1000,y=sin(1:1000),rr=1:1000 - (2pi)(1:1000 %/% 2*pi))
	ggplot(sin.df,aes(x,y,color=rr))+geom_point(size=2)+theme_bw()+
	scale_colour_gradientn(colours=c("black","yellow","blue"))

	scgn.plot = ggplot(baseball,aes(x=hr,y=rbi,color=h))+ geom_point()+ theme_bw()
	scgn.plot + scale_color_gradientn(colours=rainbow(7))

	# you can cycle through the colors for better resolution
	scgn.plot + scale_color_gradientn(colours=rep(rainbow(10),2))

	## scale_colour_brewer, offers a set of nice palettes for discrete coloring:
	ggplot(iris,aes(x=Sepal.Length,y=Sepal.Width,color=Species))+geom_point(size=3)+scale_colour_brewer(palette="Set1")+theme_bw()

	## scale_fill_* works the same as the scale_colour_*, and is used when the geom is "filled" instead of "colored"

	# (note this change from factor to integer is improper)
	teamsPerPlayer.df$teamsPlayedForInt = 1:12

	ggplot(teamsPerPlayer.df,aes(x=factor(1),y=numberOfPlayers,fill=teamsPlayedForInt)) +
	geom_bar(stat="identity") +
	theme_bw() +
	coord_polar(theta="y")+
	scale_fill_gradientn(colours=rainbow(10))


	# we can try again using discrete values to get proper labels...
	ggplot(teamsPerPlayer.df,aes(x=factor(1),y=numberOfPlayers,fill=factor(teamsPlayedFor))) +
	geom_bar(stat="identity") +
	theme_bw() +
	coord_polar(theta="y")+
	scale_fill_manual(values=rainbow(12),guide= guide_legend(ncol = 4,byrow=TRUE))+
	theme(legend.position = "top",
	panel.grid.minor.y = element_line(colour = "grey"),
	panel.grid.major.y = element_line(colour = "grey"),
	axis.text.x = element_text(size=rel(1.5)),
	axis.text.y = element_blank(),
	panel.border = element_blank(),
	axis.ticks.y = element_blank()) +
	scale_y_discrete(limits= seq(0,1228,by=100))+
	xlab("")



	#####################
	# Advanced
	# Examples
	####################

	# 1
	# objective: look at the variance and mean of samples of discrete data
	# approach: create a function that initiates the dataset, then returns a
	# function whose arguments set the plot window
	makePlotFunction <-function(size,fun,mean,range,rangeLower=2){
	df<-as.data.frame(t(as.matrix(
	sapply(floor(runif(size)*(range))+ rangeLower,
	function(x){vec<-sapply(fun(x,mean),floor);
	c(mean(vec),var(vec),length(vec))}))))#end of data.frame
	colnames(df)<-c("mean","variance","n");
	function(pMin,pMax,xVec,yVec){
	df$pSize = as.numeric(df$n)
	ggplot(df,aes(x=mean,y=variance,color=factor(n),size=pSize))+
	geom_point()+
	scale_size(range = c(pMin,pMax))+
	theme_bw()+
	xlim(xVec)+
	ylim(yVec)}}

	# set the function
	f1= makePlotFunction(size=10000,fun=rnorm,mean=2,range=80,rangeLower=100)
	f1(xVec=c(1,2),yVec=c(0,2),pMin=2.2,pMax=0.2)
	f1(xVec=c(1.4,1.6),yVec=c(1.1,1.4),pMin=2.2,pMax=0.2)

	## Mosiac Plot
	library(MASS)
	data(caith)

	# melt function :: wide data.frame -> long data.frame
	caith.melt= melt(
	cbind(as.data.frame(caith),
	row=rownames(caith)),
	id.var=c("row"))

	# rename the columns
	colnames(caith.melt) = c("eyeColor","hairColor","proportion")

	ggplot(caith.melt ,aes(x=eyeColor,y=proportion,fill=hairColor,color=hairColor)) +
	geom_bar(position="fill",stat="identity",size=1) +
	scale_fill_manual(values=c(topo.colors(4),topo.colors(1))) +
	theme_bw() +
	scale_color_manual(values=rep("black",6))

	### Mandelbrot set
	normIter3 = function(n,z,P=2,N=200){n - log( (log(abs(z),base=N)),base=P) }
	iterate.until.escape <- function(z, c, trans, cond, max=50, response=dwell) {
	#we iterate all active points in the same array operation,
	#and keeping track of which points are still iterating.
	active <- seq_along(z)
	dwell <- z
	dwell[] <- 0
	for (i in 1:max) {
	z[active] <- trans(z[active], c[active]);
	survived <- cond(z[active])
	dwell[active[!survived]] <- normIter3(i, z[active[!survived]])
	#dwell[active[!survived]] <- i
	active <- active[survived]
	if (length(active) == 0) break
	}
	eval(substitute(response))
	}


	getArrayZoom <- function(re.vec=c(-2,1),im.vec=c(-1.5,1.5),max=25,len=500){
	re = seq(re.vec[1], re.vec[2], len=len)
	im = seq(im.vec[1], im.vec[2], len=len)
	c <- outer(re, im, function(x,y) complex(real=x, imaginary=y))
	x <- iterate.until.escape(array(0, dim(c)), c,
	function(z,c)z^2+c,
	function(z)abs(z) <= 2,
	max=max) # was 100
	melt(x)
	}

	xRel = c(-2,0.5)
	yImg = c(-1.5,1.5)

	ggplot(getArrayZoom(re.vec=xRel,
	im.vec=yImg),
	aes(Var1,Var2,fill=as.numeric(value))) +
	geom_raster() +
	scale_fill_gradientn(colours=c("black",(rainbow(7)),na.value="black")) +
	theme_minimal()+
	xlab("pixels of Real values")+
	ylab("imaginary values")

	ggplot(getArrayZoom(re.vec=c(-0.11,-.09),im.vec=c(0.95,0.965)), aes(Var1,Var2,fill=as.numeric(value)^(1/1))) +
	geom_raster() +
	scale_fill_gradientn(colours=rainbow(7),na.value="black") +
	theme_minimal()
	#ggsave("ms-r=-.1_i=0.95_depth=25.pdf")