aammd/loopfree.R

## loopfree.R
# by Andrew MacDonald Nov 2011
## please add your own tips and tricks!

## the idea here is to demonstrate how for loops are used
## and then immediately demonstrate how they can be avoided.

## we'll begin by creating a fake dataset.
## This experiment is a 3x3 factorial with 2 replicates of each treatment combination.
## The scientist applied nitrogen and phosphorous to the soil
## then she measured two responses: plant growth and seed production.
## both responses increase with nutrient addition.  There's no interaction.

####################################################################################################
###################### simulating the dataset ######################################################
####################################################################################################

## a vector of treatment levels
treatments <- c("low","med","high")
## generate a dataframe with two factors.
## rep makes copies of the 'treatment' vector.  for the factor 'nitrogen', the whole vector 'treatments' is repeated six times
## while for phosp, each element of 'treatments' is repeated six times.  the result is two fully-crossed factors, with every combination repeated twice (ie n=2 for each treatment combination)
enrich <- data.frame(nitrogen=factor(rep(treatments,6),levels=c("low","med","high")),
                     phosp=factor(rep(treatments,rep(6,3)),levels=c("low","med","high"))
                     )

## to generate response variables, we just make up an equation that describes how the mean of a distribution
## depends on the values of our treatments (ie the numerical vectors in the two lines below are just my imagination)
## R has the ability to generate random numbers from all the common statistical distributions.
## to generate data with a plausible amount of noise, fill in these equations for the value of the means
enrich$growth<-with(enrich,rnorm(n=dim(enrich)[1],mean=c(2,3,4)[nitrogen]+c(3,5,7)[phosp]))
## while the normal distribution might describe growth (continuous), the poisson will make plausible seed counts (discrete)
enrich$seeds <- with(enrich,rpois(n=dim(enrich)[1],lambda=c(15,20,25)[nitrogen]+c(20,30,40)[phosp]))


####################################################################################################
########################## operations on factor levels #############################################
####################################################################################################

## QUESTION: what is the mean growth of plants in the different phosphorus treatments?

## with a (totally needless!) for-loop
x<-numeric(0)
for (i in levels(enrich$phosp)) {
  x[i]<-mean(enrich$growth[which(enrich$phosp==i)])
}
x

## TAPPLY is useful: it finds all the values of a response variable that match the levels of a factor
## then it runs a function of your choice on those values.
## the function can be a standard on (mean, sd) or one you make up.
with(enrich,tapply(growth,phosp,mean))
with(enrich,tapply(seeds,phosp,mean))

## tapply can do more than one factor at once:
with(enrich,tapply(growth,list(phosp,nitrogen),mean))

## confirm there are two replicates in each group
with(enrich,tapply(growth,list(phosp,nitrogen),length))

## AGGREGATE is an extremely powerful function.
## Like tapply it can work with as many factors as you want, and can run any function you like
## Unlike tapply, it can handle multiple response variables and produces a dataframe, not a matrix.
## take the average of all replicates in the same treatment combination:
with(enrich,aggregate(enrich[,3:4],by=list(p=phosp,n=nitrogen),mean))


####################################################################################################
############################# operations on columns and rows #######################################
####################################################################################################

## QUESTION: what is the total growth in each treatment level?

## lets begin with the result of tapply as above
growth.resp <- with(enrich,tapply(growth,list(phosp,nitrogen),mean))

## for-loop
x<-numeric(0)
for (i in 1:dim(growth.resp)[2]){
  x[i] <- sum(growth.resp[,i])
}
x

## The above can be done in a single function:
apply(growth.resp,2,sum)

## colSums is much faster, in addition to being easier to read!
colSums(growth.resp)

## all three of the techniques above could be done on rows as well.  I leave this as an exercise for the reader (sorry to be so smug; I've just always wanted to say that!)


####################################################################################################
###################### operations on lists #########################################################
####################################################################################################

## you can begin to save a great deal of time by using lists.
## for example, you can collect similar objects in one list and then apply a function to each with a single line of code.

# plot each response variable as a function of a fertilizer treatment
par(mfrow=c(1,2))
for (i in 3:4){
  with(enrich,plot(x=phosp,y=enrich[,i],main=names(enrich)[i],xlab="phosp"))
}

## lapply becomes very flexible when you write your own special function for it to run.

## this draws histograms of each variable
par(mfrow=c(1,2))
lapply(enrich[,3:4],hist,col='green')  #notice how additional arguments can be applied.

## this reproduces our for-loop from above...
lapply(enrich[,3:4],function(resp) plot(x=enrich$phosp,y=resp))

## ... and the next line does even better! Here we are running lapply on the *names*, not the actual data.
## this is nicer because it lets us use the name itself, for example to label the plots with the name of the response variable.
## remember: a dataframe is a kind of list!
boxplots <- lapply(names(enrich)[3:4],function(resp) plot(cbind(enrich["phosp"],enrich[resp]),main=resp))
## the function could become as complex as you like; in which case you should probably define it as a function
## outside of lapply - i.e., give it a nice name.

## Question CLOSED (THANKS LAURA!!) these are equivalent ways of making it work.
lapply(names(enrich)[3:4],function(resp) plot(x=enrich["phosp"],y=enrich[[resp]],main=resp))
lapply(names(enrich)[3:4],function(resp) plot(enrich[,c("phosp",resp)],main=resp))

## notice that "plot" actually produces some information (in this case about the breakpoints used to make the boxplots etc)
## by saving this information to a list, we can save it for easy extraction later:

sapply(boxplots,function(x) x["conf"])

## lapply and sapply are the same, except that sapply "S"implifies the resulting list into a vector or array
## sometimes this makes no sense; other times it is convenient:
lapply(enrich[,3:4],mean)  #returns a list
sapply(enrich[,3:4],mean)  # a vector
	# by Andrew MacDonald Nov 2011
	## please add your own tips and tricks!

	## the idea here is to demonstrate how for loops are used
	## and then immediately demonstrate how they can be avoided.

	## we'll begin by creating a fake dataset.
	## This experiment is a 3x3 factorial with 2 replicates of each treatment combination.
	## The scientist applied nitrogen and phosphorous to the soil
	## then she measured two responses: plant growth and seed production.
	## both responses increase with nutrient addition. There's no interaction.

	####################################################################################################
	###################### simulating the dataset ######################################################
	####################################################################################################

	## a vector of treatment levels
	treatments <- c("low","med","high")
	## generate a dataframe with two factors.
	## rep makes copies of the 'treatment' vector. for the factor 'nitrogen', the whole vector 'treatments' is repeated six times
	## while for phosp, each element of 'treatments' is repeated six times. the result is two fully-crossed factors, with every combination repeated twice (ie n=2 for each treatment combination)
	enrich <- data.frame(nitrogen=factor(rep(treatments,6),levels=c("low","med","high")),
	phosp=factor(rep(treatments,rep(6,3)),levels=c("low","med","high"))
	)

	## to generate response variables, we just make up an equation that describes how the mean of a distribution
	## depends on the values of our treatments (ie the numerical vectors in the two lines below are just my imagination)
	## R has the ability to generate random numbers from all the common statistical distributions.
	## to generate data with a plausible amount of noise, fill in these equations for the value of the means
	enrich$growth<-with(enrich,rnorm(n=dim(enrich)[1],mean=c(2,3,4)[nitrogen]+c(3,5,7)[phosp]))
	## while the normal distribution might describe growth (continuous), the poisson will make plausible seed counts (discrete)
	enrich$seeds <- with(enrich,rpois(n=dim(enrich)[1],lambda=c(15,20,25)[nitrogen]+c(20,30,40)[phosp]))


	####################################################################################################
	########################## operations on factor levels #############################################
	####################################################################################################

	## QUESTION: what is the mean growth of plants in the different phosphorus treatments?

	## with a (totally needless!) for-loop
	x<-numeric(0)
	for (i in levels(enrich$phosp)) {
	x[i]<-mean(enrich$growth[which(enrich$phosp==i)])
	}
	x

	## TAPPLY is useful: it finds all the values of a response variable that match the levels of a factor
	## then it runs a function of your choice on those values.
	## the function can be a standard on (mean, sd) or one you make up.
	with(enrich,tapply(growth,phosp,mean))
	with(enrich,tapply(seeds,phosp,mean))

	## tapply can do more than one factor at once:
	with(enrich,tapply(growth,list(phosp,nitrogen),mean))

	## confirm there are two replicates in each group
	with(enrich,tapply(growth,list(phosp,nitrogen),length))

	## AGGREGATE is an extremely powerful function.
	## Like tapply it can work with as many factors as you want, and can run any function you like
	## Unlike tapply, it can handle multiple response variables and produces a dataframe, not a matrix.
	## take the average of all replicates in the same treatment combination:
	with(enrich,aggregate(enrich[,3:4],by=list(p=phosp,n=nitrogen),mean))


	####################################################################################################
	############################# operations on columns and rows #######################################
	####################################################################################################

	## QUESTION: what is the total growth in each treatment level?

	## lets begin with the result of tapply as above
	growth.resp <- with(enrich,tapply(growth,list(phosp,nitrogen),mean))

	## for-loop
	x<-numeric(0)
	for (i in 1:dim(growth.resp)[2]){
	x[i] <- sum(growth.resp[,i])
	}
	x

	## The above can be done in a single function:
	apply(growth.resp,2,sum)

	## colSums is much faster, in addition to being easier to read!
	colSums(growth.resp)

	## all three of the techniques above could be done on rows as well. I leave this as an exercise for the reader (sorry to be so smug; I've just always wanted to say that!)


	####################################################################################################
	###################### operations on lists #########################################################
	####################################################################################################

	## you can begin to save a great deal of time by using lists.
	## for example, you can collect similar objects in one list and then apply a function to each with a single line of code.

	# plot each response variable as a function of a fertilizer treatment
	par(mfrow=c(1,2))
	for (i in 3:4){
	with(enrich,plot(x=phosp,y=enrich[,i],main=names(enrich)[i],xlab="phosp"))
	}

	## lapply becomes very flexible when you write your own special function for it to run.

	## this draws histograms of each variable
	par(mfrow=c(1,2))
	lapply(enrich[,3:4],hist,col='green') #notice how additional arguments can be applied.

	## this reproduces our for-loop from above...
	lapply(enrich[,3:4],function(resp) plot(x=enrich$phosp,y=resp))

	## ... and the next line does even better! Here we are running lapply on the names, not the actual data.
	## this is nicer because it lets us use the name itself, for example to label the plots with the name of the response variable.
	## remember: a dataframe is a kind of list!
	boxplots <- lapply(names(enrich)[3:4],function(resp) plot(cbind(enrich["phosp"],enrich[resp]),main=resp))
	## the function could become as complex as you like; in which case you should probably define it as a function
	## outside of lapply - i.e., give it a nice name.

	## Question CLOSED (THANKS LAURA!!) these are equivalent ways of making it work.
	lapply(names(enrich)[3:4],function(resp) plot(x=enrich["phosp"],y=enrich[[resp]],main=resp))
	lapply(names(enrich)[3:4],function(resp) plot(enrich[,c("phosp",resp)],main=resp))

	## notice that "plot" actually produces some information (in this case about the breakpoints used to make the boxplots etc)
	## by saving this information to a list, we can save it for easy extraction later:

	sapply(boxplots,function(x) x["conf"])

	## lapply and sapply are the same, except that sapply "S"implifies the resulting list into a vector or array
	## sometimes this makes no sense; other times it is convenient:
	lapply(enrich[,3:4],mean) #returns a list
	sapply(enrich[,3:4],mean) # a vector