Created
September 10, 2011 13:39
-
-
Save sckott/1208321 to your computer and use it in GitHub Desktop.
A brief introduction to R.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Intro Tutorial to R | |
# Scott Chamberlain <myrmecocystus@gmail.com> | |
# w/ code edited from Nathan Kraft and Dan Bunker | |
########## HELP ########## | |
# Getting Help with R on listserves online, etc. | |
# Go here for correct way to make reproducible examples to submit to help lists: | |
# https://github.com/hadley/devtools/wiki/Reproducibility | |
# Code writing style: | |
# Google style guide: http://google-styleguide.googlecode.com/svn/trunk/google-r-style.html#generallayout | |
# Henrik Bengtsson style guide: http://www1.maths.lth.se/help/R/RCC/ | |
# Hadley Wickham's style guide: https://github.com/hadley/devtools/wiki/Style | |
# Generally use <- instead of = when assigning an object to a name | |
########## Installing R ########## | |
# go to www.r-project.org, and download a 'precompiled binary' | |
# for your system from the 'cran' site nearest you | |
# follow links to 'download' | |
# follow the directions to install | |
# starting R: double click the R icon, or double click a .R file (a script) | |
########## R is case sensitive ########## | |
myvector <- c(8, 9, 5) | |
myvector # OK | |
Myvector # wrong case, no result | |
########## R as a calculator and simple operations ########## | |
# try these simple commands by typing them into the 'console' | |
2 + 2 | |
c(1, 2, 3) | |
x = c(1, 2, 3) | |
x | |
x = c(1:10, 13) | |
y = x^2 | |
y | |
ls() # this lists all objects you have created that are still in memory | |
rm(x) # Use the rm command to remove objects after you're finished. | |
rm(list = ls()) # removes everything!!! | |
ls() | |
########## Assignments ########## | |
# everything in R is an 'object'. That means that R knows something | |
# about each object you create, and that there are 'methods' | |
# available to work with objects of various classes. | |
########## Scalars ########## | |
a = 10; b <- 10 | |
# print to console | |
b | |
########## Vectors ########## | |
x = c(3, 5, 2, 8, 5, 6, 8, 10) | |
y = c(1, 3, 4, 5, 3, 9, 8, 3) | |
x; y | |
########### Lists ########## | |
# lists are arbitrary collections of variables | |
mylist = c(dd = "foo", ee = 66) | |
mylist | |
########## Character ########## | |
char1 = "this is some text" | |
char1 | |
########## Factors ########## | |
# factors are like character vectors but only contain values in predefined 'levels' | |
fac1 = factor(rep(c("a", "b"), 4)) | |
fac1 | |
########## Matrices ########## | |
xx = matrix(10, 2, 3) | |
xx | |
########## Dataframes ########## | |
# dataframes are special objects in R that we will use a lot. | |
# They are special because each column of a dataframe can | |
# itself be of a different class (UNLIKE MATRICES). | |
df = data.frame(x, y, fac1) | |
df # print data frame | |
summary(df) # basic stats on each column | |
str(df) # description of columns and format style | |
head(df) # top 6 rows | |
tail(df) # bottom 6 rows | |
df[ , 3] # pick columns | |
df[1 , ] # pick rows | |
########## Immutable data frames ########## | |
# Special case of a dataframe used in the plyr package created by Hadley | |
# Wickham. When summarising data in plyr, can use immutable data frames to | |
# do calculations faster, e.g., see the example below | |
library(plyr) | |
bigdf <- data.frame(a = rnorm(10000), b = rnorm(10000)) | |
bigdf_i <- idata.frame(bigdf) # make an immutable data frame | |
system.time( replicate(100, (ddply(bigdf, .(), mean))) ) | |
system.time( replicate(100, (ddply(bigdf_i, .(), mean))) ) # faster! | |
########## Transforming variables ########## | |
# Many options: log10, log, log2, sqrt, asin, etc. | |
df$lny=log10(df$y) | |
df | |
########## Indexing ########## | |
# indexing dataframes | |
df[1,2] # returns row 1, column 2 | |
df[1, 'y'] # same | |
df[,2] # returns column y as a vector | |
df[,'y'] # same | |
df$y # same | |
df[1:4,] # rows 1-4 | |
df[df$fac1=="a",] # rows where fac1 is 'a' | |
df[df$x>mean(df$x),] # rows where x is above the mean for x | |
df[order(df$x), ] # returns whole dataframe sorted by x | |
df[df$y%in%c(3:7),] # rows where 'y' matches a list of values | |
# indexing lists | |
mylist | |
mylist[1] # item 1 in list, with label | |
mylist[[1]] # the value of item 1 | |
########## Sequences/regular patterns ########## | |
####Sequence, or seq(), is useful for making vectors in regular patterns: | |
seq(from=1, to=10, by=1) | |
##can also be written as '1:10' in some contexts: | |
print(1:10) | |
#you can change the interval between elements: | |
seq(from=-10, to=30, by=5) | |
##same as: | |
seq(-10, 30, 5) | |
####repeat, or rep(), also useful for regular patterns | |
#repeats something as many times as you want: | |
rep(5, times=5) | |
##same as: | |
rep(5,5) | |
##more complex: | |
rep(c(1,4,5,2), 3) | |
##also works with text (aka "strings"): | |
rep(c("red", "blue", "gray"), 3) | |
## vector function | |
vector(mode="character", length=5) | |
############ BOOLEANS (aka "TRUE" and "FALSE") and tests ########## | |
########## Booleans ########## | |
# Booleans are a special class in R, and include "TRUE" and "FALSE". | |
# Note that they need to be typed in all caps. In an R editor they should | |
# turn yellow (or a different color) when typed in. On the Mac, | |
# the built-in editor works fine for this; on a PC, you can use an | |
# editing program like TINN-R (http://www.sciviews.org/Tinn-R/). | |
# You used to be able to abbreviate with T and F, but this is now | |
# discouraged and will cause errors in some cases. | |
##Many functions include boolean arguments: | |
vector <- as.integer(c(seq(1:10), NA)) | |
sum(vector) ##in this case "sum" fails because of an NA in the data | |
?sum ## see that there is an argument "na.rm" that we can set to TRUE | |
sum(vector, na.rm=TRUE) ##this removes the NA and sums the remaining 10 integers | |
##equivilently you can use the na.omit() command: | |
sum(na.omit(vector)) | |
##Many tests in R return booleans: | |
vector > 5 ##this returns true for each part of the vector greater than 5 | |
vector[which(vector<3)] ##which() can be used to get the indices of an object where the test is TRUE | |
vector >= 5 ##etc | |
vector == 7 ## this returns true for the element in the vector equal to 7 | |
##Note that two equals signs ("==") are used for testing equivelence, while one ("=") is used to assign something to a variable. | |
##there are a bunch of commands that start with "is." that are very useful: | |
is.na(vector) ##shows us that element 11 is an NA | |
is.finite(vector) ## checks if values are finite numbers | |
##Booleans can be added: TRUE=1,FALSE=0 | |
sum(is.na(vector)) ##means that there is one NA in the vector | |
sum(vector<=3, na.rm=TRUE) ## shows that there are 3 elements of the vector <= to 3. | |
## the "!" (known as a "bang" to people with a background in older | |
## programming languages) is used as a "not" to reverse TRUE and FALSE: | |
is.na(vector) | |
!is.na(vector) | |
vector != 10 ##which part of the vector is not equal to 10?? | |
########## "for" loops: ########## | |
# for loops are a very powerful way of repeating commands in a simulation | |
# or working through each part of an object. | |
# the part after "for" sets up a counter variable 'i' (can be called anything), | |
# which starts (in this case) at 1 and increments by 1 to 10. Each time through | |
# the loop it does whatever is in the { } | |
for(i in 1:10){ | |
print("hello") | |
} | |
##you can refer to the counter variable in the loop: | |
for(j in 1:10){ | |
print(j) | |
} | |
##you can use loops to work through each element of a list or a vector: | |
colors <- c("red", "blue", "green", "silver") | |
colors | |
length(colors) | |
#this loops adds some text to each element of the 'colors' vector | |
for(k in 1:length(colors)){ | |
print(colors[k]) | |
colors[k] <- paste(colors[k], "is a color") | |
} | |
colors | |
# You can also use for loops to run through any arbitrary set of values: | |
for (k in colors){ | |
print(k) | |
} | |
# compare this 'colors' loop with the previous one- here the colors | |
# themselves (not the numbers used to index the colors vector) are what | |
# the loop moves over. | |
## your loop does not have to be sequential from 1: | |
for (k in seq(2,10,2)) { | |
print(k) | |
} | |
################ "If/ else" statments ################ | |
# "if" statements are similar in structure to for loops, and are useful for | |
# doing something if a certain condition is met | |
##lets set up a boolean (true/ false) variable: | |
friendly <- TRUE | |
##if statements just do whatever is in the {} if the statement in () | |
# following "if" is TRUE: | |
if(friendly){ | |
print("hello") | |
} | |
##input the entire statment- you should get a greeting | |
##lets change "friendly" to be false: | |
friendly<-FALSE | |
if(friendly){ | |
print("hello") | |
} | |
##now you should get no greeting at all | |
## you can follow up the "if" with an "else" statement that gives an | |
## alternative if the first test is false: | |
if(friendly){ | |
print("hello") | |
}else{ | |
print("grrrrr...") | |
} | |
##switch "friendly" and run it again: | |
friendly <-!friendly ##remember the "!" means "not" | |
if(friendly){ | |
print("hello") | |
}else{ | |
print("grrrrr...") | |
} | |
#The statement in the parentheses can also be a comparison that generates a | |
# true or false result. In this case, remember that the double equals is used | |
# to make a comparison, single equals to assign a value: | |
x <- 1 | |
if (x==1) print('x = 1') | |
## notice that you can omit the {} if you only have one command to perform after | |
## the if(test...) | |
########## Functions ############### | |
# Basic example | |
greet=function(){ | |
print("hello there!") | |
} | |
# this is the format of function writing: a function name of your choosing- | |
# "greet" in this case- (action verbs are recommended to keep them distinct | |
# from variables), followed by the "=function" plus a () containing any | |
# arguments that the function requires. Here the function doesn't require | |
# an argument. The code inside the {} is performed whenever the function | |
# is called. | |
# You need to copy and paste the entire text of the function into the | |
# console before you can use the function. There are other ways to do this | |
# we'll cover in a little while. Once this is input, type: | |
greet() | |
## and you should see a message in the console. | |
##Here is an example with an argument: | |
square=function(x){ | |
x*x | |
} | |
##as with function names, you can call arguments anything you want. When you call functions, you can refer to the arguments explictly: | |
square(x=2) ## should tell you "4" | |
## this should also work without a hitch: | |
square(2) | |
##once a function is input, you can always see the code for it by typing it in without the () | |
square | |
##you can write functions with multiple arguments: | |
divide=function(x,y){ | |
x/y | |
} | |
##try it out: | |
divide(x=8, y=2) | |
##same as: | |
divide(8,2) | |
##not the same as: | |
divide(2,8) | |
## But if you explictly name the arguments, the order of the arguments does not matter: | |
divide(y=2, x=8) | |
##You need to specify something for all arguments in this kind of function. For example, this should fail: | |
divide(4) | |
##You can however specify defaults for certain arguments when you write a function. Here is an example: | |
increase=function(x, factor=2){ | |
x*factor | |
} | |
##input the function then run these: | |
increase(x=3, factor=2) | |
##is the same as | |
increase(x=3) | |
##because we specified a default for 'factor' | |
##same as: | |
increase(3) | |
##but you can also modify the argument "factor" when you call the function even though it has a default: | |
increase(3, factor=10) | |
##same as | |
increase(3,10) | |
##the "return" command is very useful in functions for "outputting" what you want from the function. Here we've modified the "increase" function to assign to output of the operation to a variable called "output" | |
increase_V2=function(x, factor=2){ | |
output <- (x*factor) | |
} | |
##now run: | |
increase_V2(50) | |
##whoops! No result! | |
##try calling the "output" object that we created: | |
output ##(this should fail if you've run the script as is) | |
##Just as with "for" loops, anything created in the function (such as the 'output' variable) vanishes when it is done running. | |
## here is a third version that has added a line of code telling the function to "return" the output variable- that is- send the value of it out to the world once it is done: | |
increase_V3=function(x, factor=2){ | |
output <- (x*factor) | |
return(output) | |
} | |
##now run: | |
increase_V3(50) | |
##now we have the value of the "output" object to play with again | |
##notice that we still do not have a variable named output: | |
output ##should fail again | |
## Assign the returned value to a variable for further use: | |
result <- increase_V3(50) | |
result | |
##Here's one final example. It is very common to use booleans (true/ false) as arguments with defaults in a function. Lets say you want a function that returns the range of values in a vector, and sometimes you want the absolute minimum and maximum numbers, but sometime you want to know how far apart the extremes are. You could put both options into one function: | |
get_range=function(vector, minMax=FALSE){ | |
vector_min <- min(vector) | |
vector_max <- max(vector) | |
if(minMax){ | |
temp <- data.frame(vector_min, vector_max) | |
return(temp) | |
}else{ | |
range <- vector_max-vector_min | |
return(range) | |
} | |
} | |
##you actually *don't* need the "else" here- but lets keep it in for clarity. | |
##once it's input, try it out: | |
##here is a practice vector: | |
v <- seq(5,20) | |
v | |
get_range(vector=v, minMax=FALSE) | |
##same as: | |
get_range(v) | |
##different from: | |
get_range(v, minMax=TRUE) | |
# notice that if minMax is FALSE the function returns a number, but if it is | |
# TRUE the function returns a dataframe. This might cause errors in other | |
# things you might be doing with the output of this function, so you might | |
# want to avoid things like this when writing functions for your own use. | |
# as a final note, you can place functions in .r text file and then | |
# import them into your workspace with the source() command, such as: | |
# source("my_favorite_functions.r"). This can be faster than copying and | |
# pasting the functions you always use into the start of a script. | |
############ Error catching ########## | |
# try | |
try() | |
# tryCatch | |
tryCatch() | |
# traceback | |
foo <- function(x) { print(1); bar(2) } | |
bar <- function(x) { x + a.variable.which.does.not.exist } | |
## Not run: | |
foo(2) # gives a strange error | |
traceback() | |
## End(Not run) | |
## 2: bar(2) | |
## 1: foo(2) | |
bar ## Ah, this is the culprit ... | |
############ Data Manipulation ########## | |
# The following are five packages that do data manipulation | |
library(plyr); library(data.table); library(doBy); library(sqldf) | |
library(reshape2) | |
# 'Pivot tables' | |
dataset <- data.frame(var1 = rep(c("a","b","c","d","e","f"), each = 4), | |
var2 = rep(c("level1","level1","level2","level2"), 6), | |
var3 = rep(c("h","m"), 12), meas = rep(1:12)) | |
dataset_d <- dcast(dataset, var1 ~ var2 + var3) # pivot table | |
melt(dataset_d, id = 1) # pivot table back, not ideal, but works | |
# reshape and summarise the dataframe | |
data(iris) | |
iris_m <- melt(iris, id="Species", na.rm=TRUE) | |
head(iris_m) | |
dcast(iris_m, Species ~ variable, length) # oops, just length of data vector | |
dcast(iris_m, Species ~ variable, identity) # that's better | |
# Summarise by grouping levels of variables | |
ddply(dataset, .(var1, var2), summarise, | |
mean = mean(meas), | |
sd = sd(meas)) | |
# Good way to go from lists to dataframes | |
mylist <- list(var1 = c(1,2,3,4), | |
var2 = c(5,6,7,8)) | |
library(plyr) # load plyr package | |
ldply(mylist) # makes a dataframe from the list | |
####### Looking at your data - pairwise comparison plots ####### | |
require(lattice) | |
require(ggplot2) | |
require(car) | |
# base graphics | |
pairs(iris[1:4]) | |
# lattice | |
splom(~iris[1:4]) | |
# ggplot2 | |
plotmatrix(iris[1:4]) | |
# other | |
source("/Users/ScottMac/Dropbox/R Group/Week1_R-Intro/ggcorplot.R") | |
ggcorplot( | |
data = iris[1:4], | |
var_text_size = 5, | |
cor_text_limits = c(5,10)) | |
# other2 | |
panel.cor <- function(x, y, digits=2, prefix="", cex.cor) | |
{ | |
usr <- par("usr"); on.exit(par(usr)) | |
par(usr = c(0, 1, 0, 1)) | |
r <- abs(cor(x, y)) | |
txt <- format(c(r, 0.123456789), digits=digits)[1] | |
txt <- paste(prefix, txt, sep="") | |
if(missing(cex.cor)) cex <- 0.8/strwidth(txt) | |
test <- cor.test(x,y) | |
# borrowed from printCoefmat | |
Signif <- symnum(test$p.value, corr = FALSE, na = FALSE, | |
cutpoints = c(0, 0.001, 0.01, 0.05, 0.1, 1), | |
symbols = c("***", "**", "*", ".", " ")) | |
text(0.5, 0.5, txt, cex = cex * r) | |
text(.8, .8, Signif, cex=cex, col=2) | |
} | |
pairs(iris[1:4], lower.panel=panel.smooth, upper.panel=panel.cor) | |
# car package, function scatterplotMatrix (spm abbreviation) | |
spm(iris[1:4]) | |
# compare run times for different methods | |
system.time(pairs(iris[1:4])) | |
system.time(splom(~iris[1:4])) | |
system.time(plotmatrix(iris[1:4])) | |
system.time(ggcorplot( | |
data = iris[1:4], | |
var_text_size = 5, | |
cor_text_limits = c(5,10))) | |
system.time(pairs(iris[1:4], lower.panel=panel.smooth, upper.panel=panel.cor)) | |
system.time(spm(iris[1:4])) | |
########## Comparison of speed of data summarisation packages ######### | |
# Modified from the Recipes, scripts, and genomics blog | |
library(sqldf) | |
library(doBy) | |
library(plyr) | |
library(data.table) | |
n<-10000 | |
grp1<-sample(1:750, n, replace=T) | |
grp2<-sample(1:750, n, replace=T) | |
d<-data.frame(x=rnorm(n), y=rnorm(n), grp1=grp1, grp2=grp2, n, | |
replace=T) | |
# sqldf | |
rsqldf <- system.time(sqldf("select grp1, grp2, sum(x), sum(y) from d | |
group by grp1, grp2")) | |
#doBy | |
rdoby <- system.time(summaryBy(x+y~grp1+grp2, data=d, FUN=c(sum))) | |
#aggregate | |
raggregate <- system.time(aggregate(d, list(d$grp1, d$grp2), | |
function(x)sum(x))) | |
#plyr | |
rplyr <- system.time(ddply(d, .(grp1, grp2), summarise, avx = sum(x), | |
avy=sum(y))) | |
#data.table | |
DT = data.table(d) | |
rdataT <- system.time(DT[,list(sum(x),sum(y)),by=list(grp1,grp2)]) | |
rsqldf | |
rdoby | |
raggregate | |
rplyr | |
rdataT | |
install.packages("gplots") | |
library(gplots) | |
x <- c(rdataT[3],rsqldf[3],rdoby[3],raggregate[3],rplyr[3]) | |
balloonplot( rep("time.elapsed",5),c("data.table","sqldf","doBy","aggregate","plyr"), | |
round(x,1), ylab ="Method", xlab="",sorted=F,dotcolor=rev(heat.colors(5)), | |
main="time.elapsed for different methods of grouping") | |
##### Comparison of lattice and ggplot2 graphics ######## | |
library(lattice) | |
library(ggplot2) | |
head(iris) | |
# Base graphics | |
# Not sure how to do the specific example here? | |
# lattice graphics | |
pl <- densityplot(~Sepal.Length, data = iris, groups = Species, | |
plot.points = FALSE, ref = TRUE) | |
print(pl) | |
# ggplot2 graphics | |
pg <- ggplot(iris, aes(Sepal.Length)) + | |
stat_density(geom = "path", position = "identity", | |
aes(colour = factor(Species))) | |
print(pg) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment