# jstaf/biomaRt_tutorial.R

## biomaRt_tutorial.R
# biomaRt is a data mining tool that allows you to extract massive amounts of
# data from online databases with minimal effort.

# This tutorial will use biomaRt to find the average distance between
# transcription factor binding sites and the genes they are known to act on in
# Drosophila (let's say we want to know how large an area we need to search
# up/down from a gene while looking for new binding sites). Interestingly, I
# don't think there's actually a published value for this, so we're doing
# something new here.

# Jeff Stafford

# Read the tab-delimited Oreganno extract; character columns are kept as
# plain strings instead of being converted to factors.
known_enhancers <- read.delim("oreganno_dmel_full.txt", stringsAsFactors = FALSE)

# Provenance: built from oreganno_FULL_08Nov10.txt.gz, downloaded from
# http://www.oreganno.org/oregano/Dump.jsp. Oreganno is a free and open
# regulatory element database, and this extract holds every Drosophila-related
# record in it.
str(object = known_enhancers)

# Bash commands used to extract the Drosophila genes and generate the above file:
# gunzip oreganno_FULL_08Nov10.txt.gz
# grep -i 'drosophila melanogaster' oreganno_FULL_08Nov10.txt > oreganno_dmel_data.txt
# head -n 1 oreganno_FULL_08Nov10.txt > oreganno_dmel_full.txt
# cat oreganno_dmel_data.txt >> oreganno_dmel_full.txt

# We are going to try to calculate the average distance between transcription
# factors and their targets. To do this, we need the start and end positions of
# each target.

# The enhancer table already carries each site's target gene and start
# coordinate; peek at the first six rows side by side.
head(x = cbind(known_enhancers$Gene.name, known_enhancers$chromStart), n = 6L)

# Now, let's try and retrieve all the information that would normally be in a
# .bed file for each gene (the gene start and end).

library(biomaRt)

# A "mart" is one of the databases reachable through biomaRt, and each is
# generally maintained by a separate organization. Ensembl is used here because
# it generally has the most variety in terms of species.
martList <- listMarts()
head(x = martList)
# Connect to the Ensembl mart without choosing a dataset yet, so that its
# datasets can be listed.
ensembl <- useMart(biomart = "ensembl")

# Browse the datasets offered by that mart, then reconnect with the
# Drosophila melanogaster gene dataset selected.
datasets <- listDatasets(mart = ensembl)
head(x = datasets)
ensembl <- useMart(dataset = "dmelanogaster_gene_ensembl", biomart = "ensembl")
# Important: this tutorial was written with both the Oreganno data and the
# Ensembl mart on the BDGP5 annotation (BDGP6 being the newer release). A lot
# changed between the two versions, so mixing annotations would be BAD.

# List the attributes (retrievable fields) available in the chosen dataset.
attributes <- listAttributes(mart = ensembl)
head(x = attributes)

# Query Ensembl for several identifiers of each target gene plus the
# chromosome it lies on and its start and end coordinates.
bedmap <- getBM(
  mart = ensembl, # the mart/dataset being searched
  filters = "flybasename_gene", # the field we search by
  values = known_enhancers$Gene.name, # the values looked up in that field
  attributes = c(
    "ensembl_gene_id", # FBgn number - changes often, but is highly specific
    "flybasename_gene", # actual gene name
    "flybasecgid_gene", # CG number - like FBgn, but not all datasets have it
    "chromosome_name",
    "start_position",
    "end_position"
  )
)

# If something goes wrong, or if Ensembl ever updates its mart to BDGP6, we can
# also load the data this way:
# bedmap <- read.csv("TFbedmap.csv", as.is = TRUE)

# Sanity check: compare the number of distinct target-gene identifiers in the
# enhancer table with the number of distinct gene names biomaRt returned.
# NOTE(review): this compares Gene.ID against flybasename_gene even though the
# query was made with Gene.name -- confirm which column is intended.
length(unique(known_enhancers$Gene.ID)) == length(unique(bedmap$flybasename_gene))

# Much of the mismatch comes from entries that are not transcription factor
# binding sites at all: the dataset also includes regulatory regions not
# associated with any gene. List the entry types, then keep only the binding
# sites.
unique(known_enhancers$Type)
sub <- subset(known_enhancers, known_enhancers$Type == "TRANSCRIPTION FACTOR BINDING SITE")
# Fraction of binding-site rows whose target gene has no match in bedmap.
# Each Gene.ID is looked up in bedmap (not the other way round) so that the
# numerator and the denominator both count binding-site rows.
mean(!(sub$Gene.ID %in% bedmap$flybasecgid_gene))
# The tutorial originally reported gene start/end information missing for
# about 11 percent of genes; that figure came from a lookup with the match()
# arguments reversed, so re-check it against the value printed above. The gap
# is likely due to an annotation mismatch that is not investigated here.

# For every enhancer row, find the bedmap row describing its target gene
# (NA where the gene is absent from bedmap).
tf_idx <- match(x = known_enhancers$Gene.ID, table = bedmap$flybasecgid_gene)

# Use that index to copy each target gene's coordinates from Ensembl onto the
# enhancer table; unmatched rows receive NA.
known_enhancers[["gene_start"]] <- bedmap$start_position[tf_idx]
known_enhancers[["gene_end"]] <- bedmap$end_position[tf_idx]

# Okay so now we know where all of the genes in our TF database start. Lets
# calculate distances.
# Replace negative distances with NA.
#
# Args:
#   bp: signed distance(s) in base pairs -- a single value or a whole vector;
#       may contain NA.
# Returns:
#   `bp` with every negative element set to NA. Non-negative elements and
#   existing NAs are passed through unchanged.
removeNegative <- function(bp) {
  # is.na() is tested first so that NA elements never reach the comparison.
  bp[!is.na(bp) & bp < 0] <- NA
  bp
}

# Signed offset of each enhancer start from its target gene's start
# coordinate; removeNegative() is applied row by row to the one-column matrix.
known_enhancers[["dist_from_start"]] <- apply(
  X = as.matrix(known_enhancers$chromStart - known_enhancers$gene_start),
  MARGIN = 1,
  FUN = removeNegative
)
# Same calculation relative to the gene's end coordinate.
known_enhancers[["dist_from_end"]] <- apply(
  X = as.matrix(known_enhancers$chromStart - known_enhancers$gene_end),
  MARGIN = 1,
  FUN = removeNegative
)

# Mean of the surviving offsets, NAs ignored.
mean(x = known_enhancers$dist_from_start, na.rm = TRUE)
mean(x = known_enhancers$dist_from_end, na.rm = TRUE)

# Smallest and largest surviving offsets.
range(known_enhancers$dist_from_start, na.rm = TRUE)
range(known_enhancers$dist_from_end, na.rm = TRUE)

# That's huge, but it looks like there are two TFs with absurdly distant enhancers...
# I'm going to arbitrarily exclude outliers over 50kb as a result (the stuff
# we'd probably want to do for downstream analysis won't even go that far
# anyways).

# Keep enhancers whose offset from the gene start is under 50 kb (rows with an
# NA offset drop out) and plot them in 1 kb bins.
start_subset <- known_enhancers[which(known_enhancers$dist_from_start < 50000), ]
hist(start_subset$dist_from_start, breaks = seq(from = 0, to = 50000, by = 1000))
# Same filter and histogram for the offset from the gene end.
end_subset <- known_enhancers[which(known_enhancers$dist_from_end < 50000), ]
hist(end_subset$dist_from_end, breaks = seq(from = 0, to = 50000, by = 1000))

# Mean and standard deviation of each filtered offset distribution.
start_mean <- mean(start_subset[["dist_from_start"]], na.rm = TRUE)
start_sd <- sd(start_subset[["dist_from_start"]], na.rm = TRUE)

end_mean <- mean(end_subset[["dist_from_end"]], na.rm = TRUE)
end_sd <- sd(end_subset[["dist_from_end"]], na.rm = TRUE)

# One standard deviation above each mean.
start_mean + start_sd
end_mean + end_sd
# It looks like enhancers are generally within 26.5kb of genes in Drosophila.

# Ratio of the number of rows retained by the two filters.
nrow(start_subset) / nrow(end_subset)
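# About twice as many known enhancers start within 50kb after a gene's start
# coordinate as within 50kb after its end coordinate. Note that strand is not
# taken into account here, and sites located before the gene start were set to
# NA above, so this is not a true upstream-versus-downstream comparison.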

# Again, these are very rough estimates and should be taken with a MASSIVE grain
# of salt (this is absurdly biased towards genes/enhancers that have been
# studied more). But hey, we answered our initial question and learned to use
# biomaRt, right?


## ggplot2_tutorial.R
# ggplot2 is a plotting framework that is relatively easy to use, powerful, AND
# it looks good.

# Jeff Stafford

library(ggplot2)

# The example data, msleep, ships with ggplot2.
str(object = msleep)
# Looks like it's sleep-related data of some kind.
data <- msleep

# A ggplot2 object bundles three things: your data, instructions for how to
# interpret it, and the geometry used to draw it.

# Creating ggplot2 objects:

# The initial call accepts as much or as little information as you like. Each
# call below is valid, but only the last one names both the data and the
# variables to plot.
ref <- ggplot()
ref <- ggplot(data = data)
ref <- ggplot(data = data, mapping = aes(x = bodywt, y = sleep_total))

# Assigning the object to a name lets you extend it later or draw it on
# demand; evaluating the name on its own displays the plot (provided enough
# information has been supplied).
ref
# As you can see, we haven't specified everything we need yet.

# IMPORTANT: There are 3 components to making a plot with a ggplot object: your
# data, the aesthetic mappings of your data, and the geometry. If you are
# missing one, you won't get a functional plot.

# Your data should be a dataframe with everything you want to plot. Note that it
# is possible to put data from multiple sources (ie. different dataframes) in
# the same plot, but it's easier if everything is in the same 2-dimensional
# dataframe.
ref <- ggplot(data = data)

# Aesthetic mappings tell ggplot2 how to read the data: which columns supply
# the x-values, the y-values, the colors, and so on.
ref <- ggplot(data = data, mapping = aes(x = bodywt, y = sleep_total))

# Geometry is what actually gets drawn. Any geometry can be used so long as
# the values it needs have been supplied: once the required aesthetics (x, y,
# etc.) are mapped, you only have to ask ggplot2 for a geometry -- a
# scatterplot, say.

# Add the geometry you want to the object; here, points for a scatterplot.
ref <- ggplot(data = data, mapping = aes(x = bodywt, y = sleep_total)) +
  geom_point()
ref

# Changing a plot or adding information to it is just a matter of adding more
# elements. Put the x axis on a logarithmic scale.
ref <- ref + scale_x_log10()
ref

# Overlay a smoothed mean.
ref + geom_smooth()

# Aesthetics can also be given inside the call that creates a geometry.
ref <- ggplot(data = data) +
  geom_point(mapping = aes(x = bodywt, y = sleep_total)) +
  scale_x_log10()
ref
ref <- ref + geom_smooth()
ref
# Why didn't that work? When we specify aesthetics inside a call to a
# geometry, they apply only to that layer (only geom_point got the x and y
# values). The only aesthetics passed to every geometry call are those
# specified in the initial creation of the ggplot object.

# To make that work, every layer has to be given the mappings itself:
ggplot(data = data) +
  scale_x_log10() +
  geom_point(mapping = aes(x = bodywt, y = sleep_total)) +
  geom_smooth(mapping = aes(x = bodywt, y = sleep_total))

# Note that a geometry automatically picks up every aesthetic mapping it
# understands and ignores the rest, so putting as many usable mappings as
# possible into the initial call saves work.

# Like this:
ggplot(data = data, mapping = aes(x = bodywt, y = sleep_total)) +
  scale_x_log10() +
  geom_point() +
  geom_smooth()

# Let's follow up with a few very common plot/geometry types and mappings you
# might be interested in:

# Shared base for the plots below: the x and y mappings plus a log-scaled
# x axis.
plot <- ggplot(data = data, mapping = aes(x = bodywt, y = sleep_total)) +
  scale_x_log10()

# Color the points by what each animal eats; a legend is added automatically.
plot + geom_point(mapping = aes(color = vore))
# Color can follow a continuous variable just as well as a categorical one.
plot + geom_point(mapping = aes(color = brainwt))
# Adding a different color scale changes the colors used.
plot +
  geom_point(mapping = aes(color = brainwt)) +
  scale_color_gradient2()
# Pick the gradient colors, its midpoint, and the color for missing values.
plot +
  geom_point(mapping = aes(color = log(brainwt))) +
  scale_color_gradient2(
    low = "green", mid = "yellow", high = "red",
    midpoint = -4, na.value = "purple"
  )

# Map point size to a variable.
plot + geom_point(mapping = aes(size = sleep_rem))
# Or transparency (alpha), adding axis labels, a title and a legend name too.
plot +
  geom_point(mapping = aes(alpha = sleep_rem)) +
  xlab("this is our x axis") +
  ylab("this is our y axis") +
  ggtitle("title") +
  scale_alpha("our legend")

# To set a property such as marker size or color to a fixed value instead of
# mapping it to data, pass it outside the call to aes().
plot + geom_point(mapping = aes(shape = vore), size = 6, color = "orange")

# Split the plot into one panel per level of a variable.
plot + geom_point() + facet_wrap(~vore)

# Let's put it all together...

library(scales)
# oob specifies what to do with out of bounds values for any scale (normally the
# value gets changed to NA), "squish" sets them to scale max or min, to use
# squish you need the "scales" package.

# Everything at once: size and color mapped to data, named axes, one panel per
# diet in a single row, and a color scale limited to 0-4 whose out-of-bounds
# values are squished onto the nearest limit.
ggplot(
  data = data,
  mapping = aes(
    x = bodywt, y = sleep_total,
    size = log(brainwt), color = sleep_rem
  )
) +
  scale_x_log10(name = "Body weight") +
  scale_y_continuous(name = "Total sleep (hours)") +
  geom_point() +
  facet_wrap(~vore, nrow = 1, ncol = 5) +
  scale_color_gradient(
    low = "firebrick1", na.value = "green",
    limits = c(0, 4), oob = squish
  )

# Note that we were manipulating aesthetic mappings that geom_point()
# understands. To see what it understands, check out either the help for
# ?geom_point or its documentation (with examples) at
# http://docs.ggplot2.org/current/

# Now for a few other types of plots:

# Boxplot: the summary statistics are computed for you (more on that later).
ggplot(data = data, mapping = aes(x = vore, y = sleep_total)) +
  geom_boxplot()
ggplot(data = data, mapping = aes(x = vore, y = sleep_total, fill = vore)) +
  geom_boxplot()

# One-dimensional density, one semi-transparent curve per diet.
ggplot(data = data, mapping = aes(x = sleep_total, fill = vore)) +
  geom_density(alpha = 0.5)
# Two-dimensional density contours.
ggplot(data = data, mapping = aes(x = sleep_total, y = sleep_rem)) +
  geom_density2d()
# Violin plot.
ggplot(data = data, mapping = aes(x = vore, y = sleep_total)) +
  geom_violin()
# Scatterplot with the points jittered horizontally.
ggplot(data = data, mapping = aes(x = vore, y = sleep_total)) +
  geom_jitter(position = position_jitter(width = 0.2))
# A second way to jitter points, this time drawn over a violin plot.
ggplot(data = data, mapping = aes(x = vore, y = sleep_total)) +
  geom_violin() +
  geom_point(position = "jitter")
# Bar plot.
ggplot(data = data, mapping = aes(x = vore)) +
  geom_bar()
# Note that it automatically is binning the number of values in "vore".

# Bars come out in alphabetical order by default (reportedly a "feature", not
# a bug). To choose the order yourself, build a factor with the levels listed
# in the order you want:
reordered <- factor(
  x = data$vore,
  levels = c("herbi", "omni", "carni", "insecti", NA)
)
# Reordering a factor this way also controls the order of color labels, etc.
ggplot() +
  geom_bar(mapping = aes(x = reordered))

# Let's graph mean sleep per category instead of the raw number of animals in
# each category. First drop the animals whose diet ("vore") is missing.
sub <- subset(data, !is.na(vore))
categories <- unique(sub$vore)
# Total-sleep values grouped by diet, in order of first appearance.
sleep_by_vore <- split(sub$sleep_total, sub$vore)[categories]
sleepMeans <- vapply(sleep_by_vore, mean, numeric(1))
# Standard error of the mean: each group's own standard deviation divided by
# the square root of that group's size. (Using the standard deviation of the
# whole dataset here would give every bar the wrong error estimate.)
sleepSEM <- vapply(
  sleep_by_vore,
  function(group_sleep) sd(group_sleep) / sqrt(length(group_sleep)),
  numeric(1)
)
ggplot() + geom_bar(aes(x = sleepMeans, fill = names(sleepMeans)))
# What happened? By default geom_bar() counts the observations at each x
# position instead of plotting the values you hand it, so here it counts one
# observation for each mean.

# To plot the values themselves, map them to y and tell geom_bar() to use
# stat = "identity".
ggplot() +
  geom_bar(
    aes(x = names(sleepMeans), y = sleepMeans, fill = names(sleepMeans)),
    stat = "identity"
  )

# Put the means in a one-column data frame (row names = categories) so they
# are easier to work with later.
sleep <- data.frame(means = sleepMeans)

# Add error bars from the standard error of the mean calculated earlier.
plot <- ggplot(
  data = sleep,
  mapping = aes(
    x = rownames(sleep), y = means, fill = rownames(sleep),
    ymin = means - sleepSEM, ymax = means + sleepSEM
  )
) +
  geom_bar(stat = "identity")
plot + geom_errorbar()
# Narrower error bars:
plot + geom_errorbar(width = 0.5)


# Let's do an in-depth example (all of this can be applied to other plot types):

# Reorder bars in descending order of their value: the factor's levels are
# the row names sorted by decreasing mean.
idx <- order(sleep$means, decreasing = TRUE)
sleep$name <- factor(rownames(sleep), levels = rownames(sleep)[idx])

# Create a custom color palette with RColorBrewer
library(RColorBrewer)
display.brewer.all()
# Request twice as many "Spectral" colors as there are bars and keep every
# other one, so neighbouring bars get clearly different colors. The index is
# derived from the number of bars rather than hard-coded, so it stays correct
# if the number of categories changes (brewer.pal documents a maximum of 11
# colors for "Spectral", i.e. at most 5 bars).
palette <- brewer.pal(n = 2 * nrow(sleep), name = "Spectral")[
  seq.int(1, 2 * nrow(sleep), 2)
]
names(palette) <- levels(sleep$name)

# Notice that it's just using hexadecimal color codes. You can use a vector of
# any R colors/hex codes you can think of.
palette

# Bar chart of mean sleep per food type with SEM error bars, bars ordered by
# the levels of `name` and filled from the custom palette.
example <- ggplot(
  sleep,
  aes(
    x = name, y = means, fill = name,
    ymin = means - sleepSEM, ymax = means + sleepSEM
  )
) +
  geom_bar(stat = "identity") +
  geom_errorbar(width = 0.5) +
  # Leave headroom above the tallest bar.
  scale_y_continuous(limits = c(0, max(sleep$means) * 1.5)) +
  xlab("Food type") +
  ylab("Average sleep per night (hours)") +
  scale_fill_manual(values = palette) +
  # Drop the redundant fill legend. "none" is used because passing FALSE to
  # guides() is deprecated in current ggplot2.
  guides(fill = "none")
example

# Turn the panel background and both sets of grid lines white.
example +
  theme(
    panel.grid.minor = element_line(colour = "white"),
    panel.grid.major = element_line(colour = "white"),
    panel.background = element_rect(fill = "white")
  )
# Or switch many graphical elements at once by applying a complete theme:
example + theme_bw()

# ggthemes also has an excellent selection of themes to choose from. See
# what's available at: https://github.com/jrnold/ggthemes
library(ggthemes)
example + theme_wsj()

# ggsave() writes a plot to disk. It saves the last plot made unless one is
# named with "plot = plotName"; the file type follows the filename's
# extension.
ggsave("example.png", units = "cm", height = 10, width = 10)

# The author recommends the Cairo package when exporting, as it performs
# antialiasing. The difference is only visible in plots with lots of tiny
# datapoints or complex shapes (i.e. not a bar plot).
library(Cairo)
ggsave("example-cairo.png", units = "cm", height = 10, width = 10, type = "cairo-png")

# So yeah, ggplot2 is a pretty powerful package. To see what's possible, read
# the documentation at: http://docs.ggplot2.org/current/

# Also helpful:
# http://zevross.com/blog/2014/08/04/beautiful-plotting-in-r-a-ggplot2-cheatsheet-3/
	# biomaRt is a data mining tool that allows you to extract massive amounts of
	# data from online databases with minimal effort.

	# This tutorial will use biomaRt to find the average distance between
	# transcription factor binding sites and the genes they are known to act on in
	# Drosophila (let's say we want to know how large an area we need to search
	# up/down from a gene while looking for new binding sites). Interestingly, I
	# don't think there's actually a published value for this, so we're doing
	# something new here.

	# Jeff Stafford

	# Read the tab-delimited Oreganno extract; character columns are kept as
	# plain strings instead of being converted to factors.
	known_enhancers <- read.delim("oreganno_dmel_full.txt", stringsAsFactors = FALSE)

	# Provenance: built from oreganno_FULL_08Nov10.txt.gz, downloaded from
	# http://www.oreganno.org/oregano/Dump.jsp. Oreganno is a free and open
	# regulatory element database, and this extract holds every Drosophila-related
	# record in it.
	str(object = known_enhancers)

	# Bash commands used to extract the Drosophila genes and generate the above file:
	# gunzip oreganno_FULL_08Nov10.txt.gz
	# grep -i 'drosophila melanogaster' oreganno_FULL_08Nov10.txt > oreganno_dmel_data.txt
	# head -n 1 oreganno_FULL_08Nov10.txt > oreganno_dmel_full.txt
	# cat oreganno_dmel_data.txt >> oreganno_dmel_full.txt

	# We are going to try to calculate the average distance between transcription
	# factors and their targets. To do this, we need the start and end positions of
	# each target.

	# The enhancer table already carries each site's target gene and start
	# coordinate; peek at the first six rows side by side.
	head(x = cbind(known_enhancers$Gene.name, known_enhancers$chromStart), n = 6L)

	# Now, let's try and retrieve all the information that would normally be in a
	# .bed file for each gene (the gene start and end).

	library(biomaRt)

	# A "mart" is one of the databases reachable through biomaRt, and each is
	# generally maintained by a separate organization. Ensembl is used here because
	# it generally has the most variety in terms of species.
	martList <- listMarts()
	head(x = martList)
	# Connect to the Ensembl mart without choosing a dataset yet, so that its
	# datasets can be listed.
	ensembl <- useMart(biomart = "ensembl")

	# Browse the datasets offered by that mart, then reconnect with the
	# Drosophila melanogaster gene dataset selected.
	datasets <- listDatasets(mart = ensembl)
	head(x = datasets)
	ensembl <- useMart(dataset = "dmelanogaster_gene_ensembl", biomart = "ensembl")
	# Important: this tutorial was written with both the Oreganno data and the
	# Ensembl mart on the BDGP5 annotation (BDGP6 being the newer release). A lot
	# changed between the two versions, so mixing annotations would be BAD.

	# List the attributes (retrievable fields) available in the chosen dataset.
	attributes <- listAttributes(mart = ensembl)
	head(x = attributes)

	# Query Ensembl for several identifiers of each target gene plus the
	# chromosome it lies on and its start and end coordinates.
	bedmap <- getBM(
	  mart = ensembl, # the mart/dataset being searched
	  filters = "flybasename_gene", # the field we search by
	  values = known_enhancers$Gene.name, # the values looked up in that field
	  attributes = c(
	    "ensembl_gene_id", # FBgn number - changes often, but is highly specific
	    "flybasename_gene", # actual gene name
	    "flybasecgid_gene", # CG number - like FBgn, but not all datasets have it
	    "chromosome_name",
	    "start_position",
	    "end_position"
	  )
	)

	# If something goes wrong, or if Ensembl ever updates its mart to BDGP6, we can
	# also load the data this way:
	# bedmap <- read.csv("TFbedmap.csv", as.is = TRUE)

	# Sanity check: compare the number of distinct target-gene identifiers in the
	# enhancer table with the number of distinct gene names biomaRt returned.
	# NOTE(review): this compares Gene.ID against flybasename_gene even though the
	# query was made with Gene.name -- confirm which column is intended.
	length(unique(known_enhancers$Gene.ID)) == length(unique(bedmap$flybasename_gene))

	# Much of the mismatch comes from entries that are not transcription factor
	# binding sites at all: the dataset also includes regulatory regions not
	# associated with any gene. List the entry types, then keep only the binding
	# sites.
	unique(known_enhancers$Type)
	sub <- subset(known_enhancers, known_enhancers$Type == "TRANSCRIPTION FACTOR BINDING SITE")
	# Fraction of binding-site rows whose target gene has no match in bedmap.
	# Each Gene.ID is looked up in bedmap (not the other way round) so that the
	# numerator and the denominator both count binding-site rows.
	mean(!(sub$Gene.ID %in% bedmap$flybasecgid_gene))
	# The tutorial originally reported gene start/end information missing for
	# about 11 percent of genes; that figure came from a lookup with the match()
	# arguments reversed, so re-check it against the value printed above. The gap
	# is likely due to an annotation mismatch that is not investigated here.

	# For every enhancer row, find the bedmap row describing its target gene
	# (NA where the gene is absent from bedmap).
	tf_idx <- match(x = known_enhancers$Gene.ID, table = bedmap$flybasecgid_gene)

	# Use that index to copy each target gene's coordinates from Ensembl onto the
	# enhancer table; unmatched rows receive NA.
	known_enhancers[["gene_start"]] <- bedmap$start_position[tf_idx]
	known_enhancers[["gene_end"]] <- bedmap$end_position[tf_idx]

	# Okay so now we know where all of the genes in our TF database start. Lets
	# calculate distances.
	# Replace negative distances with NA.
	#
	# Args:
	#   bp: signed distance(s) in base pairs -- a single value or a whole vector;
	#       may contain NA.
	# Returns:
	#   `bp` with every negative element set to NA. Non-negative elements and
	#   existing NAs are passed through unchanged.
	removeNegative <- function(bp) {
	  # is.na() is tested first so that NA elements never reach the comparison.
	  bp[!is.na(bp) & bp < 0] <- NA
	  bp
	}

	# Signed offset of each enhancer start from its target gene's start
	# coordinate; removeNegative() is applied row by row to the one-column matrix.
	known_enhancers[["dist_from_start"]] <- apply(
	  X = as.matrix(known_enhancers$chromStart - known_enhancers$gene_start),
	  MARGIN = 1,
	  FUN = removeNegative
	)
	# Same calculation relative to the gene's end coordinate.
	known_enhancers[["dist_from_end"]] <- apply(
	  X = as.matrix(known_enhancers$chromStart - known_enhancers$gene_end),
	  MARGIN = 1,
	  FUN = removeNegative
	)

	# Mean of the surviving offsets, NAs ignored.
	mean(x = known_enhancers$dist_from_start, na.rm = TRUE)
	mean(x = known_enhancers$dist_from_end, na.rm = TRUE)

	# Smallest and largest surviving offsets.
	range(known_enhancers$dist_from_start, na.rm = TRUE)
	range(known_enhancers$dist_from_end, na.rm = TRUE)

	# That's huge, but it looks like there are two TFs with absurdly distant enhancers...
	# I'm going to arbitrarily exclude outliers over 50kb as a result (the stuff
	# we'd probably want to do for downstream analysis won't even go that far
	# anyways).

	# Keep enhancers whose offset from the gene start is under 50 kb (rows with an
	# NA offset drop out) and plot them in 1 kb bins.
	start_subset <- known_enhancers[which(known_enhancers$dist_from_start < 50000), ]
	hist(start_subset$dist_from_start, breaks = seq(from = 0, to = 50000, by = 1000))
	# Same filter and histogram for the offset from the gene end.
	end_subset <- known_enhancers[which(known_enhancers$dist_from_end < 50000), ]
	hist(end_subset$dist_from_end, breaks = seq(from = 0, to = 50000, by = 1000))

	# Mean and standard deviation of each filtered offset distribution.
	start_mean <- mean(start_subset[["dist_from_start"]], na.rm = TRUE)
	start_sd <- sd(start_subset[["dist_from_start"]], na.rm = TRUE)

	end_mean <- mean(end_subset[["dist_from_end"]], na.rm = TRUE)
	end_sd <- sd(end_subset[["dist_from_end"]], na.rm = TRUE)

	# One standard deviation above each mean.
	start_mean + start_sd
	end_mean + end_sd
	# It looks like enhancers are generally within 26.5kb of genes in Drosophila.

	# Ratio of the number of rows retained by the two filters.
	nrow(start_subset) / nrow(end_subset)
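	# About twice as many known enhancers start within 50kb after a gene's start
	# coordinate as within 50kb after its end coordinate. Note that strand is not
	# taken into account here, and sites located before the gene start were set to
	# NA above, so this is not a true upstream-versus-downstream comparison.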

	# Again, these are very rough estimates and should be taken with a MASSIVE grain
	# of salt (this is absurdly biased towards genes/enhancers that have been
	# studied more). But hey, we answered our initial question and learned to use
	# biomaRt, right?
	# ggplot2 is a plotting framework that is relatively easy to use, powerful, AND
	# it looks good.

	# Jeff Stafford

	library(ggplot2)

	# The example data, msleep, ships with ggplot2.
	str(object = msleep)
	# Looks like it's sleep-related data of some kind.
	data <- msleep

	# A ggplot2 object bundles three things: your data, instructions for how to
	# interpret it, and the geometry used to draw it.

	# Creating ggplot2 objects:

	# The initial call accepts as much or as little information as you like. Each
	# call below is valid, but only the last one names both the data and the
	# variables to plot.
	ref <- ggplot()
	ref <- ggplot(data = data)
	ref <- ggplot(data = data, mapping = aes(x = bodywt, y = sleep_total))

	# Assigning the object to a name lets you extend it later or draw it on
	# demand; evaluating the name on its own displays the plot (provided enough
	# information has been supplied).
	ref
	# As you can see, we haven't specified everything we need yet.

	# IMPORTANT: There are 3 components to making a plot with a ggplot object: your
	# data, the aesthetic mappings of your data, and the geometry. If you are
	# missing one, you won't get a functional plot.

	# Your data should be a dataframe with everything you want to plot. Note that it
	# is possible to put data from multiple sources (ie. different dataframes) in
	# the same plot, but it's easier if everything is in the same 2-dimensional
	# dataframe.
	ref <- ggplot(data = data)

	# Aesthetic mappings tell ggplot2 how to read the data: which columns supply
	# the x-values, the y-values, the colors, and so on.
	ref <- ggplot(data = data, mapping = aes(x = bodywt, y = sleep_total))

	# Geometry is what actually gets drawn. Any geometry can be used so long as
	# the values it needs have been supplied: once the required aesthetics (x, y,
	# etc.) are mapped, you only have to ask ggplot2 for a geometry -- a
	# scatterplot, say.

	# Add the geometry you want to the object; here, points for a scatterplot.
	ref <- ggplot(data = data, mapping = aes(x = bodywt, y = sleep_total)) +
	  geom_point()
	ref

	# Changing a plot or adding information to it is just a matter of adding more
	# elements. Put the x axis on a logarithmic scale.
	ref <- ref + scale_x_log10()
	ref

	# Overlay a smoothed mean.
	ref + geom_smooth()

	# Aesthetics can also be given inside the call that creates a geometry.
	ref <- ggplot(data = data) +
	  geom_point(mapping = aes(x = bodywt, y = sleep_total)) +
	  scale_x_log10()
	ref
	ref <- ref + geom_smooth()
	ref
	# Why didn't that work? When we specify aesthetics inside a call to a
	# geometry, they apply only to that layer (only geom_point got the x and y
	# values). The only aesthetics passed to every geometry call are those
	# specified in the initial creation of the ggplot object.

	# To make that work, every layer has to be given the mappings itself:
	ggplot(data = data) +
	  scale_x_log10() +
	  geom_point(mapping = aes(x = bodywt, y = sleep_total)) +
	  geom_smooth(mapping = aes(x = bodywt, y = sleep_total))

	# Note that a geometry automatically picks up every aesthetic mapping it
	# understands and ignores the rest, so putting as many usable mappings as
	# possible into the initial call saves work.

	# Like this:
	ggplot(data = data, mapping = aes(x = bodywt, y = sleep_total)) +
	  scale_x_log10() +
	  geom_point() +
	  geom_smooth()

	# Let's follow up with a few very common plot/geometry types and mappings you
	# might be interested in:

	# Shared base for the plots below: the x and y mappings plus a log-scaled
	# x axis.
	plot <- ggplot(data = data, mapping = aes(x = bodywt, y = sleep_total)) +
	  scale_x_log10()

	# Color the points by what each animal eats; a legend is added automatically.
	plot + geom_point(mapping = aes(color = vore))
	# Color can follow a continuous variable just as well as a categorical one.
	plot + geom_point(mapping = aes(color = brainwt))
	# Adding a different color scale changes the colors used.
	plot +
	  geom_point(mapping = aes(color = brainwt)) +
	  scale_color_gradient2()
	# Pick the gradient colors, its midpoint, and the color for missing values.
	plot +
	  geom_point(mapping = aes(color = log(brainwt))) +
	  scale_color_gradient2(
	    low = "green", mid = "yellow", high = "red",
	    midpoint = -4, na.value = "purple"
	  )

	# Map point size to a variable.
	plot + geom_point(mapping = aes(size = sleep_rem))
	# Or transparency (alpha), adding axis labels, a title and a legend name too.
	plot +
	  geom_point(mapping = aes(alpha = sleep_rem)) +
	  xlab("this is our x axis") +
	  ylab("this is our y axis") +
	  ggtitle("title") +
	  scale_alpha("our legend")

	# To set a property such as marker size or color to a fixed value instead of
	# mapping it to data, pass it outside the call to aes().
	plot + geom_point(mapping = aes(shape = vore), size = 6, color = "orange")

	# Split the plot into one panel per level of a variable.
	plot + geom_point() + facet_wrap(~vore)

	# Let's put it all together...

	library(scales)
	# oob specifies what to do with out of bounds values for any scale (normally the
	# value gets changed to NA), "squish" sets them to scale max or min, to use
	# squish you need the "scales" package.

	# Everything at once: size and color mapped to data, named axes, one panel per
	# diet in a single row, and a color scale limited to 0-4 whose out-of-bounds
	# values are squished onto the nearest limit.
	ggplot(
	  data = data,
	  mapping = aes(
	    x = bodywt, y = sleep_total,
	    size = log(brainwt), color = sleep_rem
	  )
	) +
	  scale_x_log10(name = "Body weight") +
	  scale_y_continuous(name = "Total sleep (hours)") +
	  geom_point() +
	  facet_wrap(~vore, nrow = 1, ncol = 5) +
	  scale_color_gradient(
	    low = "firebrick1", na.value = "green",
	    limits = c(0, 4), oob = squish
	  )

	# Note that we were manipulating aesthetic mappings that geom_point()
	# understands. To see what it understands, check out either the help for
	# ?geom_point or its documentation (with examples) at
	# http://docs.ggplot2.org/current/

	# Now for a few other types of plots:

	# Boxplot: the summary statistics are computed for you (more on that later).
	ggplot(data = data, mapping = aes(x = vore, y = sleep_total)) +
	  geom_boxplot()
	ggplot(data = data, mapping = aes(x = vore, y = sleep_total, fill = vore)) +
	  geom_boxplot()

	# One-dimensional density, one semi-transparent curve per diet.
	ggplot(data = data, mapping = aes(x = sleep_total, fill = vore)) +
	  geom_density(alpha = 0.5)
	# Two-dimensional density contours.
	ggplot(data = data, mapping = aes(x = sleep_total, y = sleep_rem)) +
	  geom_density2d()
	# Violin plot.
	ggplot(data = data, mapping = aes(x = vore, y = sleep_total)) +
	  geom_violin()
	# Scatterplot with the points jittered horizontally.
	ggplot(data = data, mapping = aes(x = vore, y = sleep_total)) +
	  geom_jitter(position = position_jitter(width = 0.2))
	# A second way to jitter points, this time drawn over a violin plot.
	ggplot(data = data, mapping = aes(x = vore, y = sleep_total)) +
	  geom_violin() +
	  geom_point(position = "jitter")
	# Bar plot.
	ggplot(data = data, mapping = aes(x = vore)) +
	  geom_bar()
	# Note that it automatically is binning the number of values in "vore".

	# Bars come out in alphabetical order by default (reportedly a "feature", not
	# a bug). To choose the order yourself, build a factor with the levels listed
	# in the order you want:
	reordered <- factor(
	  x = data$vore,
	  levels = c("herbi", "omni", "carni", "insecti", NA)
	)
	# Reordering a factor this way also controls the order of color labels, etc.
	ggplot() +
	  geom_bar(mapping = aes(x = reordered))

	# Let's graph mean sleep per category instead of the raw number of animals in
	# each category. First drop the animals whose diet ("vore") is missing.
	sub <- subset(data, !is.na(vore))
	categories <- unique(sub$vore)
	# Total-sleep values grouped by diet, in order of first appearance.
	sleep_by_vore <- split(sub$sleep_total, sub$vore)[categories]
	sleepMeans <- vapply(sleep_by_vore, mean, numeric(1))
	# Standard error of the mean: each group's own standard deviation divided by
	# the square root of that group's size. (Using the standard deviation of the
	# whole dataset here would give every bar the wrong error estimate.)
	sleepSEM <- vapply(
	  sleep_by_vore,
	  function(group_sleep) sd(group_sleep) / sqrt(length(group_sleep)),
	  numeric(1)
	)
	ggplot() + geom_bar(aes(x = sleepMeans, fill = names(sleepMeans)))
	# What happened? By default geom_bar() counts the observations at each x
	# position instead of plotting the values you hand it, so here it counts one
	# observation for each mean.

	# To plot the values themselves, map them to y and tell geom_bar() to use
	# stat = "identity".
	ggplot() +
	  geom_bar(
	    aes(x = names(sleepMeans), y = sleepMeans, fill = names(sleepMeans)),
	    stat = "identity"
	  )

	# Put the means in a one-column data frame (row names = categories) so they
	# are easier to work with later.
	sleep <- data.frame(means = sleepMeans)

	# Add error bars from the standard error of the mean calculated earlier.
	plot <- ggplot(
	  data = sleep,
	  mapping = aes(
	    x = rownames(sleep), y = means, fill = rownames(sleep),
	    ymin = means - sleepSEM, ymax = means + sleepSEM
	  )
	) +
	  geom_bar(stat = "identity")
	plot + geom_errorbar()
	# Narrower error bars:
	plot + geom_errorbar(width = 0.5)


	# Let's do an in-depth example (all of this can be applied to other plot types):

	# Reorder bars in descending order of their value: the factor's levels are
	# the row names sorted by decreasing mean.
	idx <- order(sleep$means, decreasing = TRUE)
	sleep$name <- factor(rownames(sleep), levels = rownames(sleep)[idx])

	# Create a custom color palette with RColorBrewer
	library(RColorBrewer)
	display.brewer.all()
	# Request twice as many "Spectral" colors as there are bars and keep every
	# other one, so neighbouring bars get clearly different colors. The index is
	# derived from the number of bars rather than hard-coded, so it stays correct
	# if the number of categories changes (brewer.pal documents a maximum of 11
	# colors for "Spectral", i.e. at most 5 bars).
	palette <- brewer.pal(n = 2 * nrow(sleep), name = "Spectral")[
	  seq.int(1, 2 * nrow(sleep), 2)
	]
	names(palette) <- levels(sleep$name)

	# Notice that it's just using hexadecimal color codes. You can use a vector of
	# any R colors/hex codes you can think of.
	palette

	# Bar chart of mean sleep per food type with SEM error bars, bars ordered by
	# the levels of `name` and filled from the custom palette.
	example <- ggplot(
	  sleep,
	  aes(
	    x = name, y = means, fill = name,
	    ymin = means - sleepSEM, ymax = means + sleepSEM
	  )
	) +
	  geom_bar(stat = "identity") +
	  geom_errorbar(width = 0.5) +
	  # Leave headroom above the tallest bar.
	  scale_y_continuous(limits = c(0, max(sleep$means) * 1.5)) +
	  xlab("Food type") +
	  ylab("Average sleep per night (hours)") +
	  scale_fill_manual(values = palette) +
	  # Drop the redundant fill legend. "none" is used because passing FALSE to
	  # guides() is deprecated in current ggplot2.
	  guides(fill = "none")
	example

	# Turn the panel background and both sets of grid lines white.
	example +
	  theme(
	    panel.grid.minor = element_line(colour = "white"),
	    panel.grid.major = element_line(colour = "white"),
	    panel.background = element_rect(fill = "white")
	  )
	# Or switch many graphical elements at once by applying a complete theme:
	example + theme_bw()

	# ggthemes also has an excellent selection of themes to choose from. See
	# what's available at: https://github.com/jrnold/ggthemes
	library(ggthemes)
	example + theme_wsj()

	# ggsave() writes a plot to disk. It saves the last plot made unless one is
	# named with "plot = plotName"; the file type follows the filename's
	# extension.
	ggsave("example.png", units = "cm", height = 10, width = 10)

	# The author recommends the Cairo package when exporting, as it performs
	# antialiasing. The difference is only visible in plots with lots of tiny
	# datapoints or complex shapes (i.e. not a bar plot).
	library(Cairo)
	ggsave("example-cairo.png", units = "cm", height = 10, width = 10, type = "cairo-png")

	# So yeah, ggplot2 is a pretty powerful package. To see what's possible, read
	# the documentation at: http://docs.ggplot2.org/current/

	# Also helpful:
	# http://zevross.com/blog/2014/08/04/beautiful-plotting-in-r-a-ggplot2-cheatsheet-3/