sckott/Rintrotutorial.R

## Rintrotutorial.R
# Intro Tutorial to R
# Scott Chamberlain <myrmecocystus@gmail.com>
# w/ code edited from Nathan Kraft and Dan Bunker

##########    HELP     ##########
# Getting Help with R on listserves online, etc.
# Go here for correct way to make reproducible examples to submit to help lists:
#   https://github.com/hadley/devtools/wiki/Reproducibility

# Code writing style:
  # Google style guide: http://google-styleguide.googlecode.com/svn/trunk/google-r-style.html#generallayout
  # Henrik Bengtsson style guide: http://www1.maths.lth.se/help/R/RCC/
  # Hadley Wickham's style guide: https://github.com/hadley/devtools/wiki/Style

  # Generally use <- instead of = when assigning an object to a name

##########    Installing R     ##########
#  go to www.r-project.org, and download a 'precompiled binary'
#	for your system from the 'cran' site nearest you
#  follow links to 'download'
#  follow the directions to install

# starting R: double click the R icon, or double click a .R file (a script)


##########    R is case sensitive     ##########
myvector <- c(8, 9, 5)
myvector # OK
Myvector # wrong case, no result


##########     R as a calculator and simple operations ##########
# try these simple commands by typing them into the 'console'
2 + 2
c(1, 2, 3)
x = c(1, 2, 3)

x
x = c(1:10, 13)
y = x^2
y


ls()  # this lists all objects you have created that are still in memory
rm(x) # Use the rm command to remove objects after you're finished.
rm(list = ls())  # removes everything!!!
ls()


##########   Assignments   ##########
# everything in R is an 'object'.  That means that R knows something
# 	about each object you create, and that there are 'methods'
# 	available to work with objects of various classes.

##########   Scalars   ##########
a = 10; b <- 10
# print to console
b

##########   Vectors   ##########
x = c(3, 5, 2, 8, 5, 6, 8, 10)
y = c(1, 3, 4, 5, 3, 9, 8, 3)
x; y

###########   Lists   ##########
# lists are arbitrary collections of variables
mylist = c(dd = "foo", ee = 66)
mylist

##########   Character   ##########
char1 =  "this is some text"
char1

##########   Factors   ##########
# factors are like character vectors but only contain values in predefined 'levels'
fac1 = factor(rep(c("a", "b"), 4))
fac1


##########   Matrices   ##########
xx = matrix(10, 2, 3)
xx

##########   Dataframes   ##########
# dataframes are special objects in R that we will use a lot.
# They are special because each column of a dataframe can
#	itself be of a different class (UNLIKE MATRICES).
df = data.frame(x, y, fac1)
df # print data frame
summary(df) # basic stats on each column
str(df) # description of columns and format style
head(df) # top 6 rows
tail(df) # bottom 6 rows
df[ , 3] # pick columns
df[1 , ] # pick rows


##########   Immutable data frames   ##########
# Special case of a dataframe used in the plyr package created by Hadley
# Wickham. When summarising data in plyr, can use immutable data frames to
# do calculations faster, e.g., see the example below
library(plyr)
bigdf <- data.frame(a = rnorm(10000), b = rnorm(10000))
bigdf_i <- idata.frame(bigdf) # make an immutable data frame
system.time( replicate(100, (ddply(bigdf, .(), mean))) )
system.time( replicate(100, (ddply(bigdf_i, .(), mean))) ) # faster!


##########   Transforming variables  ##########
# Many options: log10, log, log2, sqrt, asin, etc.
df$lny=log10(df$y)
df


##########   Indexing  ##########
# indexing dataframes
df[1,2]  #  returns row 1, column 2

df[1, 'y']  # same

df[,2]  # returns column y  as a vector

df[,'y'] # same

df$y  # same

df[1:4,]  # rows 1-4

df[df$fac1=="a",]  # rows where fac1 is 'a'

df[df$x>mean(df$x),]  # rows where x is above the mean for x

df[order(df$x), ]  # returns whole dataframe sorted by x

df[df$y%in%c(3:7),]  # rows where 'y' matches a list of values

# indexing lists
mylist

mylist[1] #  item 1 in list, with label

mylist[[1]] # the value of item 1


##########   Sequences/regular patterns   ##########
####Sequence, or seq(), is useful for making vectors in regular patterns:
seq(from=1, to=10, by=1)

##can also be written as '1:10' in some contexts:
print(1:10)

#you can change the interval between elements:
seq(from=-10, to=30, by=5)

##same as:
seq(-10, 30, 5)


####repeat, or rep(), also useful for regular patterns
#repeats something as many times as you want:

rep(5, times=5)
##same as:
rep(5,5)

##more complex:
rep(c(1,4,5,2), 3)

##also works with text (aka "strings"):
rep(c("red", "blue", "gray"), 3)


## vector function
vector(mode="character", length=5)


                                                                                                                                                                                                                                                 ############     BOOLEANS (aka "TRUE" and "FALSE") and tests  ##########
##########   Booleans   ##########
# Booleans are a special class in R, and include "TRUE" and "FALSE".
# Note that they need to be typed in all caps. In an R editor they should
# turn yellow (or a different color) when typed in. On the Mac,
# the built-in editor works fine for this; on a PC, you can use an
# editing program like TINN-R (http://www.sciviews.org/Tinn-R/).
# You used to be able to abbreviate with T and F, but this is now
# discouraged and will cause errors in some cases.

##Many functions include boolean arguments:
vector <- as.integer(c(seq(1:10), NA))

sum(vector) ##in this case "sum" fails because of an NA in the data

?sum  ## see that there is an argument "na.rm" that we can set to TRUE

sum(vector, na.rm=TRUE)  ##this removes the NA and sums the remaining 10 integers

##equivilently you can use the na.omit() command:
sum(na.omit(vector))

##Many tests in R return booleans:
vector > 5 ##this returns true for each part of the vector greater than 5
vector[which(vector<3)] ##which() can be used to get the indices of an object where the test is TRUE

vector >= 5 ##etc

vector == 7 ## this returns true for the element in the vector equal to 7
##Note that two equals signs ("==") are used for testing equivelence, while one ("=") is used to assign something to a variable.

##there are a bunch of commands that start with "is." that are very useful:

is.na(vector) ##shows us that element 11 is an NA

is.finite(vector) ## checks if values are finite numbers


##Booleans can be added: TRUE=1,FALSE=0
sum(is.na(vector)) ##means that there is one NA in the vector
sum(vector<=3, na.rm=TRUE) ## shows that there are 3 elements of the vector <= to 3.

## the "!" (known as a "bang" to people with a background in older
## programming languages) is used as a "not" to reverse TRUE and FALSE:

is.na(vector)
!is.na(vector)

vector != 10 ##which part of the vector is not equal to 10??


##########   "for" loops:     ##########

# for loops are a very powerful way of repeating commands in a simulation
# or working through each part of an object.

# the part after "for" sets up a counter variable 'i' (can be called anything),
# which starts (in this case) at 1 and increments by 1 to 10. Each time through
# the loop it does whatever is in the { }

for(i in 1:10){
	print("hello")
	}

##you can refer to the counter variable in the loop:
for(j in 1:10){
	print(j)
	}

##you can use loops to work through each element of a list or a vector:
colors <- c("red", "blue", "green", "silver")

colors

length(colors)

#this loops adds some text to each element of the 'colors' vector

for(k in 1:length(colors)){
	print(colors[k])
	colors[k] <- paste(colors[k], "is a color")
	}

colors

# You can also use for loops to run through any arbitrary set of values:
for (k in colors){
	print(k)
	}
# compare this 'colors' loop with the previous one- here the colors
# themselves (not the numbers used to index the colors vector) are what
# the loop moves over.


## your loop does not have to be sequential from 1:
for (k in seq(2,10,2)) {
	print(k)
	}


################  "If/ else" statments   ################
# "if" statements are similar in structure to for loops, and are useful for
# doing something if a certain condition is met

##lets set up a boolean (true/ false) variable:
friendly <- TRUE

##if statements just do whatever is in the {} if the statement in ()
# following "if" is TRUE:

if(friendly){
	print("hello")
	}
##input the entire statment- you should get a greeting

##lets change "friendly" to be false:
friendly<-FALSE

if(friendly){
	print("hello")
	}

##now you should get no greeting at all

## you can follow up the "if" with an "else" statement that gives an
## alternative if the first test is false:

if(friendly){
	print("hello")
	}else{
		print("grrrrr...")
		}

##switch "friendly" and run it again:

friendly <-!friendly        ##remember the "!" means "not"

if(friendly){
	print("hello")
	}else{
		print("grrrrr...")
		}

#The statement in the parentheses can also be a comparison that generates a
# true or false result. In this case, remember that the double equals is used
# to make a comparison, single equals to assign a value:

x <- 1
if (x==1) print('x = 1')

## notice that you can omit the {} if you only have one command to perform after
## the if(test...)


##########    Functions   ###############
# Basic example
greet=function(){
	print("hello there!")
	}


# this is the format of function writing: a function name of your choosing-
# "greet" in this case- (action verbs are recommended to keep them distinct
# from variables), followed by the "=function" plus a () containing any
# arguments that the function requires.  Here the function doesn't require
# an argument.  The code inside the {} is performed whenever the function
# is called.

# You need to copy and paste the entire text of the function into the
# console before you can use the function.  There are other ways to do this
# we'll cover in a little while. Once this is input, type:

greet()

## and you should see a message in the console.

##Here is an example with an argument:

square=function(x){
	x*x
	}

##as with function names, you can call arguments anything you want.  When you call functions, you can refer to the arguments explictly:

square(x=2) ## should tell you "4"

## this should also work without a hitch:

square(2)

##once a function is input, you can always see the code for it by typing it in without the ()

square

##you can write functions with multiple arguments:

divide=function(x,y){
	x/y
	}

##try it out:
divide(x=8, y=2)

##same as:
divide(8,2)

##not the same as:
divide(2,8)

## But if you explictly name the arguments, the order of the arguments does not matter:
divide(y=2, x=8)

##You need to specify something for all arguments in this kind of function.  For example, this should fail:
divide(4)

##You can however specify defaults for certain arguments when you write a function. Here is an example:

increase=function(x, factor=2){
	x*factor
	}

##input the function then run these:

increase(x=3, factor=2)

##is the same as
increase(x=3)
##because we specified a default for 'factor'

##same as:
increase(3)

##but you can also modify the argument "factor" when you call the function even though it has a default:

increase(3, factor=10)

##same as
increase(3,10)

##the "return" command is very useful in functions for "outputting" what you want from the function.  Here we've modified the "increase" function to assign to output of the operation to a variable called "output"

increase_V2=function(x, factor=2){
	output <- (x*factor)
	}

##now run:

increase_V2(50)

##whoops!  No result!

##try calling the "output" object that we created:

output  ##(this should fail if you've run the script as is)

##Just as with "for" loops, anything created in the function (such as the 'output' variable) vanishes when it is done running.

## here is a third version that has added a line of code telling the function to "return" the output variable- that is- send the value of it out to the world once it is done:
increase_V3=function(x, factor=2){
	output <- (x*factor)
	return(output)
	}

##now run:
increase_V3(50)

##now we have the value of the "output" object to play with again

##notice that we still do not have a variable named output:

output  ##should fail again

## Assign the returned value to a variable for further use:
result <- increase_V3(50)
result

##Here's one final example.  It is very common to use booleans (true/ false) as arguments with defaults in a function.  Lets say you want a function that returns the range of values in a vector, and sometimes you want the absolute minimum and maximum numbers, but sometime you want to know how far apart the extremes are.  You could put both options into one function:

get_range=function(vector, minMax=FALSE){
	vector_min <- min(vector)
	vector_max <- max(vector)

	if(minMax){
		temp <- data.frame(vector_min, vector_max)
		return(temp)
		}else{
			range <- vector_max-vector_min
			return(range)
			}

	}

##you actually *don't* need the "else" here- but lets keep it in for clarity.

##once it's input, try it out:

##here is a practice vector:
v <- seq(5,20)
v

get_range(vector=v, minMax=FALSE)

##same as:
get_range(v)

##different from:
get_range(v, minMax=TRUE)

# notice that if minMax is FALSE the function returns a number, but if it is
# TRUE the function returns a dataframe.  This might cause errors in other
# things you might be doing with the output of this function, so you might
# want to avoid things like this when writing functions for your own use.

# as a final note, you can place functions in .r text file and then
# import them into your workspace with the source() command, such as:
# source("my_favorite_functions.r").  This can be faster than  copying and
# pasting the functions you always use into the start of a script.


############    Error catching   ##########
# try
try()

# tryCatch
tryCatch()

# traceback
foo <- function(x) { print(1); bar(2) }
bar <- function(x) { x + a.variable.which.does.not.exist }
## Not run:
foo(2) # gives a strange error
traceback()
## End(Not run)
## 2: bar(2)
## 1: foo(2)
bar ## Ah, this is the culprit ...


############    Data Manipulation   ##########
# The following are five packages that do data manipulation
library(plyr); library(data.table); library(doBy); library(sqldf)
library(reshape2)

# 'Pivot tables'
dataset <- data.frame(var1 = rep(c("a","b","c","d","e","f"), each = 4),
 var2 = rep(c("level1","level1","level2","level2"), 6),
 var3 = rep(c("h","m"), 12), meas = rep(1:12))

dataset_d <- dcast(dataset, var1 ~ var2 + var3) # pivot table
melt(dataset_d, id = 1) # pivot table back, not ideal, but works


# reshape and summarise the dataframe
data(iris)
iris_m <- melt(iris, id="Species", na.rm=TRUE)
head(iris_m)
dcast(iris_m, Species ~ variable, length) # oops, just length of data vector
dcast(iris_m, Species ~ variable, identity) # that's better

# Summarise by grouping levels of variables
ddply(dataset, .(var1, var2), summarise,
    mean = mean(meas),
    sd = sd(meas))

# Good way to go from lists to dataframes
mylist <- list(var1 = c(1,2,3,4),
  var2 = c(5,6,7,8))
library(plyr) # load plyr package
ldply(mylist) # makes a dataframe from the list


#######   Looking at your data - pairwise comparison plots  #######
require(lattice)
require(ggplot2)
require(car)

# base graphics
pairs(iris[1:4])

# lattice
splom(~iris[1:4])

# ggplot2
plotmatrix(iris[1:4])

# other
source("/Users/ScottMac/Dropbox/R Group/Week1_R-Intro/ggcorplot.R")
ggcorplot(
  data = iris[1:4],
  var_text_size = 5,
  cor_text_limits = c(5,10))

# other2
panel.cor <- function(x, y, digits=2, prefix="", cex.cor)
{
    usr <- par("usr"); on.exit(par(usr))
    par(usr = c(0, 1, 0, 1))
    r <- abs(cor(x, y))
    txt <- format(c(r, 0.123456789), digits=digits)[1]
    txt <- paste(prefix, txt, sep="")
    if(missing(cex.cor)) cex <- 0.8/strwidth(txt)

    test <- cor.test(x,y)
    # borrowed from printCoefmat
    Signif <- symnum(test$p.value, corr = FALSE, na = FALSE,
                  cutpoints = c(0, 0.001, 0.01, 0.05, 0.1, 1),
                  symbols = c("***", "**", "*", ".", " "))

    text(0.5, 0.5, txt, cex = cex * r)
    text(.8, .8, Signif, cex=cex, col=2)
}

pairs(iris[1:4], lower.panel=panel.smooth, upper.panel=panel.cor)

# car package, function scatterplotMatrix (spm abbreviation)
spm(iris[1:4])


# compare run times for different methods
system.time(pairs(iris[1:4]))

system.time(splom(~iris[1:4]))

system.time(plotmatrix(iris[1:4]))

system.time(ggcorplot(
  data = iris[1:4],
  var_text_size = 5,
  cor_text_limits = c(5,10)))

system.time(pairs(iris[1:4], lower.panel=panel.smooth, upper.panel=panel.cor))

system.time(spm(iris[1:4]))


########## Comparison of speed of data summarisation packages  #########
# Modified from the Recipes, scripts, and genomics blog
library(sqldf)
library(doBy)
library(plyr)
library(data.table)

n<-10000
grp1<-sample(1:750, n, replace=T)
grp2<-sample(1:750, n, replace=T)
d<-data.frame(x=rnorm(n), y=rnorm(n), grp1=grp1, grp2=grp2, n,
replace=T)

# sqldf
rsqldf <- system.time(sqldf("select grp1, grp2, sum(x), sum(y) from d
  group by grp1, grp2"))

#doBy
rdoby <- system.time(summaryBy(x+y~grp1+grp2, data=d, FUN=c(sum)))

#aggregate
raggregate <- system.time(aggregate(d, list(d$grp1, d$grp2),
  function(x)sum(x)))

#plyr
rplyr <- system.time(ddply(d, .(grp1, grp2), summarise, avx = sum(x),
  avy=sum(y)))

#data.table
DT = data.table(d)
rdataT <- system.time(DT[,list(sum(x),sum(y)),by=list(grp1,grp2)])

rsqldf
rdoby
raggregate
rplyr
rdataT

install.packages("gplots")
library(gplots)

x <- c(rdataT[3],rsqldf[3],rdoby[3],raggregate[3],rplyr[3])
balloonplot( rep("time.elapsed",5),c("data.table","sqldf","doBy","aggregate","plyr"),
  round(x,1), ylab ="Method", xlab="",sorted=F,dotcolor=rev(heat.colors(5)),
  main="time.elapsed for different methods of grouping")


#####    Comparison of lattice and ggplot2 graphics   ########
library(lattice)
library(ggplot2)
head(iris)

# Base graphics
  # Not sure how to do the specific example here?

# lattice graphics
pl <- densityplot(~Sepal.Length, data = iris, groups = Species,
   plot.points = FALSE, ref = TRUE)
print(pl)

# ggplot2 graphics
pg <- ggplot(iris, aes(Sepal.Length)) +
    stat_density(geom = "path", position = "identity",
      aes(colour = factor(Species)))
print(pg)
	# Intro Tutorial to R
	# Scott Chamberlain <myrmecocystus@gmail.com>
	# w/ code edited from Nathan Kraft and Dan Bunker

	########## HELP ##########
	# Getting Help with R on listserves online, etc.
	# Go here for correct way to make reproducible examples to submit to help lists:
	# https://github.com/hadley/devtools/wiki/Reproducibility

	# Code writing style:
	# Google style guide: http://google-styleguide.googlecode.com/svn/trunk/google-r-style.html#generallayout
	# Henrik Bengtsson style guide: http://www1.maths.lth.se/help/R/RCC/
	# Hadley Wickham's style guide: https://github.com/hadley/devtools/wiki/Style

	# Generally use <- instead of = when assigning an object to a name

	########## Installing R ##########
	# go to www.r-project.org, and download a 'precompiled binary'
	# for your system from the 'cran' site nearest you
	# follow links to 'download'
	# follow the directions to install

	# starting R: double click the R icon, or double click a .R file (a script)



	########## R is case sensitive ##########
	myvector <- c(8, 9, 5)
	myvector # OK
	Myvector # wrong case, no result


	########## R as a calculator and simple operations ##########
	# try these simple commands by typing them into the 'console'
	2 + 2
	c(1, 2, 3)
	x = c(1, 2, 3)

	x
	x = c(1:10, 13)
	y = x^2
	y



	ls() # this lists all objects you have created that are still in memory
	rm(x) # Use the rm command to remove objects after you're finished.
	rm(list = ls()) # removes everything!!!
	ls()


	########## Assignments ##########
	# everything in R is an 'object'. That means that R knows something
	# about each object you create, and that there are 'methods'
	# available to work with objects of various classes.

	########## Scalars ##########
	a = 10; b <- 10
	# print to console
	b

	########## Vectors ##########
	x = c(3, 5, 2, 8, 5, 6, 8, 10)
	y = c(1, 3, 4, 5, 3, 9, 8, 3)
	x; y

	########### Lists ##########
	# lists are arbitrary collections of variables
	mylist = c(dd = "foo", ee = 66)
	mylist

	########## Character ##########
	char1 = "this is some text"
	char1

	########## Factors ##########
	# factors are like character vectors but only contain values in predefined 'levels'
	fac1 = factor(rep(c("a", "b"), 4))
	fac1


	########## Matrices ##########
	xx = matrix(10, 2, 3)
	xx

	########## Dataframes ##########
	# dataframes are special objects in R that we will use a lot.
	# They are special because each column of a dataframe can
	# itself be of a different class (UNLIKE MATRICES).
	df = data.frame(x, y, fac1)
	df # print data frame
	summary(df) # basic stats on each column
	str(df) # description of columns and format style
	head(df) # top 6 rows
	tail(df) # bottom 6 rows
	df[ , 3] # pick columns
	df[1 , ] # pick rows


	########## Immutable data frames ##########
	# Special case of a dataframe used in the plyr package created by Hadley
	# Wickham. When summarising data in plyr, can use immutable data frames to
	# do calculations faster, e.g., see the example below
	library(plyr)
	bigdf <- data.frame(a = rnorm(10000), b = rnorm(10000))
	bigdf_i <- idata.frame(bigdf) # make an immutable data frame
	system.time( replicate(100, (ddply(bigdf, .(), mean))) )
	system.time( replicate(100, (ddply(bigdf_i, .(), mean))) ) # faster!


	########## Transforming variables ##########
	# Many options: log10, log, log2, sqrt, asin, etc.
	df$lny=log10(df$y)
	df


	########## Indexing ##########
	# indexing dataframes
	df[1,2] # returns row 1, column 2

	df[1, 'y'] # same

	df[,2] # returns column y as a vector

	df[,'y'] # same

	df$y # same

	df[1:4,] # rows 1-4

	df[df$fac1=="a",] # rows where fac1 is 'a'

	df[df$x>mean(df$x),] # rows where x is above the mean for x

	df[order(df$x), ] # returns whole dataframe sorted by x

	df[df$y%in%c(3:7),] # rows where 'y' matches a list of values

	# indexing lists
	mylist

	mylist[1] # item 1 in list, with label

	mylist[[1]] # the value of item 1



	########## Sequences/regular patterns ##########
	####Sequence, or seq(), is useful for making vectors in regular patterns:
	seq(from=1, to=10, by=1)

	##can also be written as '1:10' in some contexts:
	print(1:10)

	#you can change the interval between elements:
	seq(from=-10, to=30, by=5)

	##same as:
	seq(-10, 30, 5)


	####repeat, or rep(), also useful for regular patterns
	#repeats something as many times as you want:

	rep(5, times=5)
	##same as:
	rep(5,5)

	##more complex:
	rep(c(1,4,5,2), 3)

	##also works with text (aka "strings"):
	rep(c("red", "blue", "gray"), 3)


	## vector function
	vector(mode="character", length=5)


	############ BOOLEANS (aka "TRUE" and "FALSE") and tests ##########
	########## Booleans ##########
	# Booleans are a special class in R, and include "TRUE" and "FALSE".
	# Note that they need to be typed in all caps. In an R editor they should
	# turn yellow (or a different color) when typed in. On the Mac,
	# the built-in editor works fine for this; on a PC, you can use an
	# editing program like TINN-R (http://www.sciviews.org/Tinn-R/).
	# You used to be able to abbreviate with T and F, but this is now
	# discouraged and will cause errors in some cases.

	##Many functions include boolean arguments:
	vector <- as.integer(c(seq(1:10), NA))

	sum(vector) ##in this case "sum" fails because of an NA in the data

	?sum ## see that there is an argument "na.rm" that we can set to TRUE

	sum(vector, na.rm=TRUE) ##this removes the NA and sums the remaining 10 integers

	##equivilently you can use the na.omit() command:
	sum(na.omit(vector))

	##Many tests in R return booleans:
	vector > 5 ##this returns true for each part of the vector greater than 5
	vector[which(vector<3)] ##which() can be used to get the indices of an object where the test is TRUE

	vector >= 5 ##etc

	vector == 7 ## this returns true for the element in the vector equal to 7
	##Note that two equals signs ("==") are used for testing equivelence, while one ("=") is used to assign something to a variable.

	##there are a bunch of commands that start with "is." that are very useful:

	is.na(vector) ##shows us that element 11 is an NA

	is.finite(vector) ## checks if values are finite numbers


	##Booleans can be added: TRUE=1,FALSE=0
	sum(is.na(vector)) ##means that there is one NA in the vector
	sum(vector<=3, na.rm=TRUE) ## shows that there are 3 elements of the vector <= to 3.

	## the "!" (known as a "bang" to people with a background in older
	## programming languages) is used as a "not" to reverse TRUE and FALSE:

	is.na(vector)
	!is.na(vector)

	vector != 10 ##which part of the vector is not equal to 10??





	########## "for" loops: ##########

	# for loops are a very powerful way of repeating commands in a simulation
	# or working through each part of an object.

	# the part after "for" sets up a counter variable 'i' (can be called anything),
	# which starts (in this case) at 1 and increments by 1 to 10. Each time through
	# the loop it does whatever is in the { }

	for(i in 1:10){
	print("hello")
	}

	##you can refer to the counter variable in the loop:
	for(j in 1:10){
	print(j)
	}

	##you can use loops to work through each element of a list or a vector:
	colors <- c("red", "blue", "green", "silver")

	colors

	length(colors)

	#this loops adds some text to each element of the 'colors' vector

	for(k in 1:length(colors)){
	print(colors[k])
	colors[k] <- paste(colors[k], "is a color")
	}

	colors

	# You can also use for loops to run through any arbitrary set of values:
	for (k in colors){
	print(k)
	}
	# compare this 'colors' loop with the previous one- here the colors
	# themselves (not the numbers used to index the colors vector) are what
	# the loop moves over.


	## your loop does not have to be sequential from 1:
	for (k in seq(2,10,2)) {
	print(k)
	}



	################ "If/ else" statments ################
	# "if" statements are similar in structure to for loops, and are useful for
	# doing something if a certain condition is met

	##lets set up a boolean (true/ false) variable:
	friendly <- TRUE

	##if statements just do whatever is in the {} if the statement in ()
	# following "if" is TRUE:

	if(friendly){
	print("hello")
	}
	##input the entire statment- you should get a greeting

	##lets change "friendly" to be false:
	friendly<-FALSE

	if(friendly){
	print("hello")
	}

	##now you should get no greeting at all

	## you can follow up the "if" with an "else" statement that gives an
	## alternative if the first test is false:

	if(friendly){
	print("hello")
	}else{
	print("grrrrr...")
	}

	##switch "friendly" and run it again:

	friendly <-!friendly ##remember the "!" means "not"

	if(friendly){
	print("hello")
	}else{
	print("grrrrr...")
	}

	#The statement in the parentheses can also be a comparison that generates a
	# true or false result. In this case, remember that the double equals is used
	# to make a comparison, single equals to assign a value:

	x <- 1
	if (x==1) print('x = 1')

	## notice that you can omit the {} if you only have one command to perform after
	## the if(test...)





	########## Functions ###############
	# Basic example
	greet=function(){
	print("hello there!")
	}


	# this is the format of function writing: a function name of your choosing-
	# "greet" in this case- (action verbs are recommended to keep them distinct
	# from variables), followed by the "=function" plus a () containing any
	# arguments that the function requires. Here the function doesn't require
	# an argument. The code inside the {} is performed whenever the function
	# is called.

	# You need to copy and paste the entire text of the function into the
	# console before you can use the function. There are other ways to do this
	# we'll cover in a little while. Once this is input, type:

	greet()

	## and you should see a message in the console.

	##Here is an example with an argument:

	square=function(x){
	x*x
	}

	##as with function names, you can call arguments anything you want. When you call functions, you can refer to the arguments explictly:

	square(x=2) ## should tell you "4"

	## this should also work without a hitch:

	square(2)

	##once a function is input, you can always see the code for it by typing it in without the ()

	square

	##you can write functions with multiple arguments:

	divide=function(x,y){
	x/y
	}

	##try it out:
	divide(x=8, y=2)

	##same as:
	divide(8,2)

	##not the same as:
	divide(2,8)

	## But if you explictly name the arguments, the order of the arguments does not matter:
	divide(y=2, x=8)

	##You need to specify something for all arguments in this kind of function. For example, this should fail:
	divide(4)

	##You can however specify defaults for certain arguments when you write a function. Here is an example:

	increase=function(x, factor=2){
	x*factor
	}

	##input the function then run these:

	increase(x=3, factor=2)

	##is the same as
	increase(x=3)
	##because we specified a default for 'factor'

	##same as:
	increase(3)

	##but you can also modify the argument "factor" when you call the function even though it has a default:

	increase(3, factor=10)

	##same as
	increase(3,10)

	##the "return" command is very useful in functions for "outputting" what you want from the function. Here we've modified the "increase" function to assign to output of the operation to a variable called "output"

	increase_V2=function(x, factor=2){
	output <- (x*factor)
	}

	##now run:

	increase_V2(50)

	##whoops! No result!

	##try calling the "output" object that we created:

	output ##(this should fail if you've run the script as is)

	##Just as with "for" loops, anything created in the function (such as the 'output' variable) vanishes when it is done running.

	## here is a third version that has added a line of code telling the function to "return" the output variable- that is- send the value of it out to the world once it is done:
	increase_V3=function(x, factor=2){
	output <- (x*factor)
	return(output)
	}

	##now run:
	increase_V3(50)

	##now we have the value of the "output" object to play with again

	##notice that we still do not have a variable named output:

	output ##should fail again

	## Assign the returned value to a variable for further use:
	result <- increase_V3(50)
	result

	##Here's one final example. It is very common to use booleans (true/ false) as arguments with defaults in a function. Lets say you want a function that returns the range of values in a vector, and sometimes you want the absolute minimum and maximum numbers, but sometime you want to know how far apart the extremes are. You could put both options into one function:

	get_range=function(vector, minMax=FALSE){
	vector_min <- min(vector)
	vector_max <- max(vector)

	if(minMax){
	temp <- data.frame(vector_min, vector_max)
	return(temp)
	}else{
	range <- vector_max-vector_min
	return(range)
	}

	}

	##you actually don't need the "else" here- but lets keep it in for clarity.

	##once it's input, try it out:

	##here is a practice vector:
	v <- seq(5,20)
	v

	get_range(vector=v, minMax=FALSE)

	##same as:
	get_range(v)

	##different from:
	get_range(v, minMax=TRUE)

	# notice that if minMax is FALSE the function returns a number, but if it is
	# TRUE the function returns a dataframe. This might cause errors in other
	# things you might be doing with the output of this function, so you might
	# want to avoid things like this when writing functions for your own use.

	# as a final note, you can place functions in .r text file and then
	# import them into your workspace with the source() command, such as:
	# source("my_favorite_functions.r"). This can be faster than copying and
	# pasting the functions you always use into the start of a script.






	############ Error catching ##########
	# try
	try()

	# tryCatch
	tryCatch()

	# traceback
	foo <- function(x) { print(1); bar(2) }
	bar <- function(x) { x + a.variable.which.does.not.exist }
	## Not run:
	foo(2) # gives a strange error
	traceback()
	## End(Not run)
	## 2: bar(2)
	## 1: foo(2)
	bar ## Ah, this is the culprit ...







	############ Data Manipulation ##########
	# The following are five packages that do data manipulation
	library(plyr); library(data.table); library(doBy); library(sqldf)
	library(reshape2)

	# 'Pivot tables'
	dataset <- data.frame(var1 = rep(c("a","b","c","d","e","f"), each = 4),
	var2 = rep(c("level1","level1","level2","level2"), 6),
	var3 = rep(c("h","m"), 12), meas = rep(1:12))

	dataset_d <- dcast(dataset, var1 ~ var2 + var3) # pivot table
	melt(dataset_d, id = 1) # pivot table back, not ideal, but works


	# reshape and summarise the dataframe
	data(iris)
	iris_m <- melt(iris, id="Species", na.rm=TRUE)
	head(iris_m)
	dcast(iris_m, Species ~ variable, length) # oops, just length of data vector
	dcast(iris_m, Species ~ variable, identity) # that's better

	# Summarise by grouping levels of variables
	ddply(dataset, .(var1, var2), summarise,
	mean = mean(meas),
	sd = sd(meas))

	# Good way to go from lists to dataframes
	mylist <- list(var1 = c(1,2,3,4),
	var2 = c(5,6,7,8))
	library(plyr) # load plyr package
	ldply(mylist) # makes a dataframe from the list








	####### Looking at your data - pairwise comparison plots #######
	require(lattice)
	require(ggplot2)
	require(car)

	# base graphics
	pairs(iris[1:4])

	# lattice
	splom(~iris[1:4])

	# ggplot2
	plotmatrix(iris[1:4])

	# other
	source("/Users/ScottMac/Dropbox/R Group/Week1_R-Intro/ggcorplot.R")
	ggcorplot(
	data = iris[1:4],
	var_text_size = 5,
	cor_text_limits = c(5,10))

	# other2
	panel.cor <- function(x, y, digits=2, prefix="", cex.cor)
	{
	usr <- par("usr"); on.exit(par(usr))
	par(usr = c(0, 1, 0, 1))
	r <- abs(cor(x, y))
	txt <- format(c(r, 0.123456789), digits=digits)[1]
	txt <- paste(prefix, txt, sep="")
	if(missing(cex.cor)) cex <- 0.8/strwidth(txt)

	test <- cor.test(x,y)
	# borrowed from printCoefmat
	Signif <- symnum(test$p.value, corr = FALSE, na = FALSE,
	cutpoints = c(0, 0.001, 0.01, 0.05, 0.1, 1),
	symbols = c("*", "", "*", ".", " "))

	text(0.5, 0.5, txt, cex = cex * r)
	text(.8, .8, Signif, cex=cex, col=2)
	}

	pairs(iris[1:4], lower.panel=panel.smooth, upper.panel=panel.cor)

	# car package, function scatterplotMatrix (spm abbreviation)
	spm(iris[1:4])


	# compare run times for different methods
	system.time(pairs(iris[1:4]))

	system.time(splom(~iris[1:4]))

	system.time(plotmatrix(iris[1:4]))

	system.time(ggcorplot(
	data = iris[1:4],
	var_text_size = 5,
	cor_text_limits = c(5,10)))

	system.time(pairs(iris[1:4], lower.panel=panel.smooth, upper.panel=panel.cor))

	system.time(spm(iris[1:4]))









	########## Comparison of speed of data summarisation packages #########
	# Modified from the Recipes, scripts, and genomics blog
	library(sqldf)
	library(doBy)
	library(plyr)
	library(data.table)

	n<-10000
	grp1<-sample(1:750, n, replace=T)
	grp2<-sample(1:750, n, replace=T)
	d<-data.frame(x=rnorm(n), y=rnorm(n), grp1=grp1, grp2=grp2, n,
	replace=T)

	# sqldf
	rsqldf <- system.time(sqldf("select grp1, grp2, sum(x), sum(y) from d
	group by grp1, grp2"))

	#doBy
	rdoby <- system.time(summaryBy(x+y~grp1+grp2, data=d, FUN=c(sum)))

	#aggregate
	raggregate <- system.time(aggregate(d, list(d$grp1, d$grp2),
	function(x)sum(x)))

	#plyr
	rplyr <- system.time(ddply(d, .(grp1, grp2), summarise, avx = sum(x),
	avy=sum(y)))

	#data.table
	DT = data.table(d)
	rdataT <- system.time(DT[,list(sum(x),sum(y)),by=list(grp1,grp2)])

	rsqldf
	rdoby
	raggregate
	rplyr
	rdataT

	install.packages("gplots")
	library(gplots)

	x <- c(rdataT[3],rsqldf[3],rdoby[3],raggregate[3],rplyr[3])
	balloonplot( rep("time.elapsed",5),c("data.table","sqldf","doBy","aggregate","plyr"),
	round(x,1), ylab ="Method", xlab="",sorted=F,dotcolor=rev(heat.colors(5)),
	main="time.elapsed for different methods of grouping")








	##### Comparison of lattice and ggplot2 graphics ########
	library(lattice)
	library(ggplot2)
	head(iris)

	# Base graphics
	# Not sure how to do the specific example here?

	# lattice graphics
	pl <- densityplot(~Sepal.Length, data = iris, groups = Species,
	plot.points = FALSE, ref = TRUE)
	print(pl)

	# ggplot2 graphics
	pg <- ggplot(iris, aes(Sepal.Length)) +
	stat_density(geom = "path", position = "identity",
	aes(colour = factor(Species)))
	print(pg)