Skip to content

Instantly share code, notes, and snippets.

@sckott
Created September 10, 2011 13:39
Show Gist options
  • Save sckott/1208321 to your computer and use it in GitHub Desktop.
Save sckott/1208321 to your computer and use it in GitHub Desktop.
A brief introduction to R.
# Intro Tutorial to R
# Scott Chamberlain <myrmecocystus@gmail.com>
# w/ code edited from Nathan Kraft and Dan Bunker
########## HELP ##########
# Getting Help with R on listserves online, etc.
# Go here for correct way to make reproducible examples to submit to help lists:
# https://github.com/hadley/devtools/wiki/Reproducibility
# Code writing style:
# Google style guide: http://google-styleguide.googlecode.com/svn/trunk/google-r-style.html#generallayout
# Henrik Bengtsson style guide: http://www1.maths.lth.se/help/R/RCC/
# Hadley Wickham's style guide: https://github.com/hadley/devtools/wiki/Style
# Generally use <- instead of = when assigning an object to a name
########## Installing R ##########
# go to www.r-project.org, and download a 'precompiled binary'
# for your system from the 'cran' site nearest you
# follow links to 'download'
# follow the directions to install
# starting R: double click the R icon, or double click a .R file (a script)
########## R is case sensitive ##########
myvector <- c(8, 9, 5)
myvector # OK
Myvector # wrong case, no result
########## R as a calculator and simple operations ##########
# try these simple commands by typing them into the 'console'
2 + 2
c(1, 2, 3)
x = c(1, 2, 3)
x
x = c(1:10, 13)
y = x^2
y
ls() # this lists all objects you have created that are still in memory
rm(x) # Use the rm command to remove objects after you're finished.
rm(list = ls()) # removes everything!!!
ls()
########## Assignments ##########
# everything in R is an 'object'. That means that R knows something
# about each object you create, and that there are 'methods'
# available to work with objects of various classes.
########## Scalars ##########
a = 10; b <- 10
# print to console
b
########## Vectors ##########
x = c(3, 5, 2, 8, 5, 6, 8, 10)
y = c(1, 3, 4, 5, 3, 9, 8, 3)
x; y
########### Lists ##########
# lists are arbitrary collections of variables
mylist = c(dd = "foo", ee = 66)
mylist
########## Character ##########
char1 = "this is some text"
char1
########## Factors ##########
# factors are like character vectors but only contain values in predefined 'levels'
fac1 = factor(rep(c("a", "b"), 4))
fac1
########## Matrices ##########
xx = matrix(10, 2, 3)
xx
########## Dataframes ##########
# dataframes are special objects in R that we will use a lot.
# They are special because each column of a dataframe can
# itself be of a different class (UNLIKE MATRICES).
df = data.frame(x, y, fac1)
df # print data frame
summary(df) # basic stats on each column
str(df) # description of columns and format style
head(df) # top 6 rows
tail(df) # bottom 6 rows
df[ , 3] # pick columns
df[1 , ] # pick rows
########## Immutable data frames ##########
# Special case of a dataframe used in the plyr package created by Hadley
# Wickham. When summarising data in plyr, can use immutable data frames to
# do calculations faster, e.g., see the example below
library(plyr)
bigdf <- data.frame(a = rnorm(10000), b = rnorm(10000))
bigdf_i <- idata.frame(bigdf) # make an immutable data frame
system.time( replicate(100, (ddply(bigdf, .(), mean))) )
system.time( replicate(100, (ddply(bigdf_i, .(), mean))) ) # faster!
########## Transforming variables ##########
# Many options: log10, log, log2, sqrt, asin, etc.
df$lny=log10(df$y)
df
########## Indexing ##########
# indexing dataframes
df[1,2] # returns row 1, column 2
df[1, 'y'] # same
df[,2] # returns column y as a vector
df[,'y'] # same
df$y # same
df[1:4,] # rows 1-4
df[df$fac1=="a",] # rows where fac1 is 'a'
df[df$x>mean(df$x),] # rows where x is above the mean for x
df[order(df$x), ] # returns whole dataframe sorted by x
df[df$y%in%c(3:7),] # rows where 'y' matches a list of values
# indexing lists
mylist
mylist[1] # item 1 in list, with label
mylist[[1]] # the value of item 1
########## Sequences/regular patterns ##########
####Sequence, or seq(), is useful for making vectors in regular patterns:
seq(from=1, to=10, by=1)
##can also be written as '1:10' in some contexts:
print(1:10)
#you can change the interval between elements:
seq(from=-10, to=30, by=5)
##same as:
seq(-10, 30, 5)
####repeat, or rep(), also useful for regular patterns
#repeats something as many times as you want:
rep(5, times=5)
##same as:
rep(5,5)
##more complex:
rep(c(1,4,5,2), 3)
##also works with text (aka "strings"):
rep(c("red", "blue", "gray"), 3)
## vector function
vector(mode="character", length=5)
     ############ BOOLEANS (aka "TRUE" and "FALSE") and tests ##########
########## Booleans ##########
# Booleans are a special class in R, and include "TRUE" and "FALSE".
# Note that they need to be typed in all caps. In an R editor they should
# turn yellow (or a different color) when typed in. On the Mac,
# the built-in editor works fine for this; on a PC, you can use an
# editing program like TINN-R (http://www.sciviews.org/Tinn-R/).
# You used to be able to abbreviate with T and F, but this is now
# discouraged and will cause errors in some cases.
##Many functions include boolean arguments:
vector <- as.integer(c(seq(1:10), NA))
sum(vector) ##in this case "sum" fails because of an NA in the data
?sum ## see that there is an argument "na.rm" that we can set to TRUE
sum(vector, na.rm=TRUE) ##this removes the NA and sums the remaining 10 integers
##equivilently you can use the na.omit() command:
sum(na.omit(vector))
##Many tests in R return booleans:
vector > 5 ##this returns true for each part of the vector greater than 5
vector[which(vector<3)] ##which() can be used to get the indices of an object where the test is TRUE
vector >= 5 ##etc
vector == 7 ## this returns true for the element in the vector equal to 7
##Note that two equals signs ("==") are used for testing equivelence, while one ("=") is used to assign something to a variable.
##there are a bunch of commands that start with "is." that are very useful:
is.na(vector) ##shows us that element 11 is an NA
is.finite(vector) ## checks if values are finite numbers
##Booleans can be added: TRUE=1,FALSE=0
sum(is.na(vector)) ##means that there is one NA in the vector
sum(vector<=3, na.rm=TRUE) ## shows that there are 3 elements of the vector <= to 3.
## the "!" (known as a "bang" to people with a background in older
## programming languages) is used as a "not" to reverse TRUE and FALSE:
is.na(vector)
!is.na(vector)
vector != 10 ##which part of the vector is not equal to 10??
########## "for" loops: ##########
# for loops are a very powerful way of repeating commands in a simulation
# or working through each part of an object.
# the part after "for" sets up a counter variable 'i' (can be called anything),
# which starts (in this case) at 1 and increments by 1 to 10. Each time through
# the loop it does whatever is in the { }
for(i in 1:10){
print("hello")
}
##you can refer to the counter variable in the loop:
for(j in 1:10){
print(j)
}
##you can use loops to work through each element of a list or a vector:
colors <- c("red", "blue", "green", "silver")
colors
length(colors)
#this loops adds some text to each element of the 'colors' vector
for(k in 1:length(colors)){
print(colors[k])
colors[k] <- paste(colors[k], "is a color")
}
colors
# You can also use for loops to run through any arbitrary set of values:
for (k in colors){
print(k)
}
# compare this 'colors' loop with the previous one- here the colors
# themselves (not the numbers used to index the colors vector) are what
# the loop moves over.
## your loop does not have to be sequential from 1:
for (k in seq(2,10,2)) {
print(k)
}
################ "If/ else" statments ################
# "if" statements are similar in structure to for loops, and are useful for
# doing something if a certain condition is met
##lets set up a boolean (true/ false) variable:
friendly <- TRUE
##if statements just do whatever is in the {} if the statement in ()
# following "if" is TRUE:
if(friendly){
print("hello")
}
##input the entire statment- you should get a greeting
##lets change "friendly" to be false:
friendly<-FALSE
if(friendly){
print("hello")
}
##now you should get no greeting at all
## you can follow up the "if" with an "else" statement that gives an
## alternative if the first test is false:
if(friendly){
print("hello")
}else{
print("grrrrr...")
}
##switch "friendly" and run it again:
friendly <-!friendly ##remember the "!" means "not"
if(friendly){
print("hello")
}else{
print("grrrrr...")
}
#The statement in the parentheses can also be a comparison that generates a
# true or false result. In this case, remember that the double equals is used
# to make a comparison, single equals to assign a value:
x <- 1
if (x==1) print('x = 1')
## notice that you can omit the {} if you only have one command to perform after
## the if(test...)
########## Functions ###############
# Basic example
greet=function(){
print("hello there!")
}
# this is the format of function writing: a function name of your choosing-
# "greet" in this case- (action verbs are recommended to keep them distinct
# from variables), followed by the "=function" plus a () containing any
# arguments that the function requires. Here the function doesn't require
# an argument. The code inside the {} is performed whenever the function
# is called.
# You need to copy and paste the entire text of the function into the
# console before you can use the function. There are other ways to do this
# we'll cover in a little while. Once this is input, type:
greet()
## and you should see a message in the console.
##Here is an example with an argument:
square=function(x){
x*x
}
##as with function names, you can call arguments anything you want. When you call functions, you can refer to the arguments explictly:
square(x=2) ## should tell you "4"
## this should also work without a hitch:
square(2)
##once a function is input, you can always see the code for it by typing it in without the ()
square
##you can write functions with multiple arguments:
divide=function(x,y){
x/y
}
##try it out:
divide(x=8, y=2)
##same as:
divide(8,2)
##not the same as:
divide(2,8)
## But if you explictly name the arguments, the order of the arguments does not matter:
divide(y=2, x=8)
##You need to specify something for all arguments in this kind of function. For example, this should fail:
divide(4)
##You can however specify defaults for certain arguments when you write a function. Here is an example:
increase=function(x, factor=2){
x*factor
}
##input the function then run these:
increase(x=3, factor=2)
##is the same as
increase(x=3)
##because we specified a default for 'factor'
##same as:
increase(3)
##but you can also modify the argument "factor" when you call the function even though it has a default:
increase(3, factor=10)
##same as
increase(3,10)
##the "return" command is very useful in functions for "outputting" what you want from the function. Here we've modified the "increase" function to assign to output of the operation to a variable called "output"
increase_V2=function(x, factor=2){
output <- (x*factor)
}
##now run:
increase_V2(50)
##whoops! No result!
##try calling the "output" object that we created:
output ##(this should fail if you've run the script as is)
##Just as with "for" loops, anything created in the function (such as the 'output' variable) vanishes when it is done running.
## here is a third version that has added a line of code telling the function to "return" the output variable- that is- send the value of it out to the world once it is done:
increase_V3=function(x, factor=2){
output <- (x*factor)
return(output)
}
##now run:
increase_V3(50)
##now we have the value of the "output" object to play with again
##notice that we still do not have a variable named output:
output ##should fail again
## Assign the returned value to a variable for further use:
result <- increase_V3(50)
result
##Here's one final example. It is very common to use booleans (true/ false) as arguments with defaults in a function. Lets say you want a function that returns the range of values in a vector, and sometimes you want the absolute minimum and maximum numbers, but sometime you want to know how far apart the extremes are. You could put both options into one function:
get_range=function(vector, minMax=FALSE){
vector_min <- min(vector)
vector_max <- max(vector)
if(minMax){
temp <- data.frame(vector_min, vector_max)
return(temp)
}else{
range <- vector_max-vector_min
return(range)
}
}
##you actually *don't* need the "else" here- but lets keep it in for clarity.
##once it's input, try it out:
##here is a practice vector:
v <- seq(5,20)
v
get_range(vector=v, minMax=FALSE)
##same as:
get_range(v)
##different from:
get_range(v, minMax=TRUE)
# notice that if minMax is FALSE the function returns a number, but if it is
# TRUE the function returns a dataframe. This might cause errors in other
# things you might be doing with the output of this function, so you might
# want to avoid things like this when writing functions for your own use.
# as a final note, you can place functions in .r text file and then
# import them into your workspace with the source() command, such as:
# source("my_favorite_functions.r"). This can be faster than copying and
# pasting the functions you always use into the start of a script.
############ Error catching ##########
# try
try()
# tryCatch
tryCatch()
# traceback
foo <- function(x) { print(1); bar(2) }
bar <- function(x) { x + a.variable.which.does.not.exist }
## Not run:
foo(2) # gives a strange error
traceback()
## End(Not run)
## 2: bar(2)
## 1: foo(2)
bar ## Ah, this is the culprit ...
############ Data Manipulation ##########
# The following are five packages that do data manipulation
library(plyr); library(data.table); library(doBy); library(sqldf)
library(reshape2)
# 'Pivot tables'
dataset <- data.frame(var1 = rep(c("a","b","c","d","e","f"), each = 4),
var2 = rep(c("level1","level1","level2","level2"), 6),
var3 = rep(c("h","m"), 12), meas = rep(1:12))
dataset_d <- dcast(dataset, var1 ~ var2 + var3) # pivot table
melt(dataset_d, id = 1) # pivot table back, not ideal, but works
# reshape and summarise the dataframe
data(iris)
iris_m <- melt(iris, id="Species", na.rm=TRUE)
head(iris_m)
dcast(iris_m, Species ~ variable, length) # oops, just length of data vector
dcast(iris_m, Species ~ variable, identity) # that's better
# Summarise by grouping levels of variables
ddply(dataset, .(var1, var2), summarise,
mean = mean(meas),
sd = sd(meas))
# Good way to go from lists to dataframes
mylist <- list(var1 = c(1,2,3,4),
var2 = c(5,6,7,8))
library(plyr) # load plyr package
ldply(mylist) # makes a dataframe from the list
####### Looking at your data - pairwise comparison plots #######
require(lattice)
require(ggplot2)
require(car)
# base graphics
pairs(iris[1:4])
# lattice
splom(~iris[1:4])
# ggplot2
plotmatrix(iris[1:4])
# other
source("/Users/ScottMac/Dropbox/R Group/Week1_R-Intro/ggcorplot.R")
ggcorplot(
data = iris[1:4],
var_text_size = 5,
cor_text_limits = c(5,10))
# other2
panel.cor <- function(x, y, digits=2, prefix="", cex.cor)
{
usr <- par("usr"); on.exit(par(usr))
par(usr = c(0, 1, 0, 1))
r <- abs(cor(x, y))
txt <- format(c(r, 0.123456789), digits=digits)[1]
txt <- paste(prefix, txt, sep="")
if(missing(cex.cor)) cex <- 0.8/strwidth(txt)
test <- cor.test(x,y)
# borrowed from printCoefmat
Signif <- symnum(test$p.value, corr = FALSE, na = FALSE,
cutpoints = c(0, 0.001, 0.01, 0.05, 0.1, 1),
symbols = c("***", "**", "*", ".", " "))
text(0.5, 0.5, txt, cex = cex * r)
text(.8, .8, Signif, cex=cex, col=2)
}
pairs(iris[1:4], lower.panel=panel.smooth, upper.panel=panel.cor)
# car package, function scatterplotMatrix (spm abbreviation)
spm(iris[1:4])
# compare run times for different methods
system.time(pairs(iris[1:4]))
system.time(splom(~iris[1:4]))
system.time(plotmatrix(iris[1:4]))
system.time(ggcorplot(
data = iris[1:4],
var_text_size = 5,
cor_text_limits = c(5,10)))
system.time(pairs(iris[1:4], lower.panel=panel.smooth, upper.panel=panel.cor))
system.time(spm(iris[1:4]))
########## Comparison of speed of data summarisation packages #########
# Modified from the Recipes, scripts, and genomics blog
library(sqldf)
library(doBy)
library(plyr)
library(data.table)
n<-10000
grp1<-sample(1:750, n, replace=T)
grp2<-sample(1:750, n, replace=T)
d<-data.frame(x=rnorm(n), y=rnorm(n), grp1=grp1, grp2=grp2, n,
replace=T)
# sqldf
rsqldf <- system.time(sqldf("select grp1, grp2, sum(x), sum(y) from d
group by grp1, grp2"))
#doBy
rdoby <- system.time(summaryBy(x+y~grp1+grp2, data=d, FUN=c(sum)))
#aggregate
raggregate <- system.time(aggregate(d, list(d$grp1, d$grp2),
function(x)sum(x)))
#plyr
rplyr <- system.time(ddply(d, .(grp1, grp2), summarise, avx = sum(x),
avy=sum(y)))
#data.table
DT = data.table(d)
rdataT <- system.time(DT[,list(sum(x),sum(y)),by=list(grp1,grp2)])
rsqldf
rdoby
raggregate
rplyr
rdataT
install.packages("gplots")
library(gplots)
x <- c(rdataT[3],rsqldf[3],rdoby[3],raggregate[3],rplyr[3])
balloonplot( rep("time.elapsed",5),c("data.table","sqldf","doBy","aggregate","plyr"),
round(x,1), ylab ="Method", xlab="",sorted=F,dotcolor=rev(heat.colors(5)),
main="time.elapsed for different methods of grouping")
##### Comparison of lattice and ggplot2 graphics ########
library(lattice)
library(ggplot2)
head(iris)
# Base graphics
# Not sure how to do the specific example here?
# lattice graphics
pl <- densityplot(~Sepal.Length, data = iris, groups = Species,
plot.points = FALSE, ref = TRUE)
print(pl)
# ggplot2 graphics
pg <- ggplot(iris, aes(Sepal.Length)) +
stat_density(geom = "path", position = "identity",
aes(colour = factor(Species)))
print(pg)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment