sjackman/stat540-seminar5.R

## stat540-seminar5.R
# STAT 540 Seminar 5
# Shaun Jackman
# 2013-01-30

# Brief pause to load the photoRec data and the lattice package
library(lattice)  # if you don't already have this loaded ...
prDat <- read.table("../data/photoRec/GSE4051_data.txt")  # the whole enchilada
str(prDat, max.level = 0)
load("../data/photoRec/GSE4051_design.robj")  # load exp design as 'prDes'
str(prDes)

# Write a function to prepare a mini-dataset for a small number of genes
luckyGenes <- c("1419655_at", "1438815_at")

library(reshape2)
prepareData <- function(genes)
  melt(cbind(prDes, t(prDat[genes,])),
    id.vars=colnames(prDes),
    variable.name='gene', value.name='gExp')

jDat <- prepareData(luckyGenes)
str(jDat)
head(jDat)
tail(jDat)

stripplot(gExp ~ devStage | gene, jDat, group = gType,
  jitter.data = TRUE, auto.key = TRUE,
  type = c("p", "a"), grid = TRUE)

# Write a function to stripplot a mini-dataset
makeStripplot <- function(x, ...)
  stripplot(gExp ~ devStage | gene, x, group = gType,
            jitter.data = TRUE, auto.key = TRUE,
            type = c("p", "a"), grid = TRUE,
            ...)
makeStripplot(jDat)
makeStripplot(jDat, pch = 17, cex = 3)
makeStripplot(newDat <- prepareData('1456341_a_at'))
str(newDat)
head(newDat)

# Do a two-sample t-test
t.test(gExp ~ devStage,
       subset(newDat, devStage %in% c('P2', '4_weeks')))

# Fit a linear model with a categorical covariate
makeStripplot(mDat <- prepareData('1438786_a_at'))
summary(mFit <- lm(gExp ~ devStage, mDat, gType == 'wt'))

# Perform inference for a contrast
coef(mFit)
contMat <- rbind(c(0, -1, 0, 1, 0))
obsDiff <- contMat %*% coef(mFit)
sampMeans <- with(subset(mDat, gType == 'wt'),
                  tapply(gExp, devStage, mean))
sampMeans["P2"] - sampMeans["P10"]
vcov(mFit)
summary(mFit)$coefficients[, "Std. Error"]
sqrt(diag(vcov(mFit)))
estSe <- contMat %*% vcov(mFit) %*% t(contMat)
testStat <- obsDiff/estSe
2 * pt(abs(testStat), df = df.residual(mFit), lower.tail = FALSE)

# Fit a linear model with two categorical covariates
makeStripplot(oDat <- prepareData("1448690_at"))
str(oDat)
summary(oFitBig <- lm(gExp ~ gType * devStage, oDat))
summary(oFitSmall <- lm(gExp ~ gType + devStage, oDat))
anova(oFitSmall, oFitBig)

makeStripplot(oDat <- prepareData('1429225_at'))
summary(oFitBig <- lm(gExp ~ gType * devStage, oDat))
summary(oFitSmall <- lm(gExp ~ gType + devStage, oDat))
anova(oFitSmall, oFitBig)

# Ideas for further work

# We wrote functions to prepare and plot data for more than 1 gene. But when we started fitting models and condusting tests, we only worked with 1 gene at a time. Can you use data aggregation strategies from last week to do some of the same work for small sets of genes?
makeStripplot(oDat <- prepareData(c('1448690_at', '1429225_at')))
by(oDat, oDat$gene, function(x) {
  print(summary(oFitBig <- lm(gExp ~ gType * devStage, x)))
  print(summary(oFitSmall <- lm(gExp ~ gType + devStage, x)))
  anova(oFitSmall, oFitBig)})

# In lecture we also experimented with a quantitative version of devStage, which we called age. This opens the door to modelling with a quantitative covariate. Can you fit linear and quadratic models to the expression data for one or several genes?
makeStripplot(oDat <- prepareData(c('1448690_at', '1429225_at')))
devStageToDays <- c('E2'=-2, 'P2'=2, 'P6'=6, 'P10'=10, '4_weeks'=28)
oDat <- prepareData(c('1448690_at', '1429225_at'))
oDat$age <- devStageToDays[oDat$devStage]
str(oDat)

makeXYplot <- function(x, ...)
  xyplot(gExp ~ age | gene, x, group = gType,
            jitter.data = TRUE, auto.key = TRUE,
            type = c("p", "a"), grid = TRUE,
            ...)
makeXYplot(oDat)

by(oDat, oDat$gene, function(x) {
  print(summary(oFitBig <- lm(gExp ~ gType + age, x)))
  print(summary(oFitSmall <- lm(gExp ~ gType + age + I(age^2), x)))
  anova(oFitSmall, oFitBig)})
	# STAT 540 Seminar 5
	# Shaun Jackman
	# 2013-01-30

	# Brief pause to load the photoRec data and the lattice package
	library(lattice) # if you don't already have this loaded ...
	prDat <- read.table("../data/photoRec/GSE4051_data.txt") # the whole enchilada
	str(prDat, max.level = 0)
	load("../data/photoRec/GSE4051_design.robj") # load exp design as 'prDes'
	str(prDes)

	# Write a function to prepare a mini-dataset for a small number of genes
	luckyGenes <- c("1419655_at", "1438815_at")

	library(reshape2)
	prepareData <- function(genes)
	melt(cbind(prDes, t(prDat[genes,])),
	id.vars=colnames(prDes),
	variable.name='gene', value.name='gExp')

	jDat <- prepareData(luckyGenes)
	str(jDat)
	head(jDat)
	tail(jDat)

	stripplot(gExp ~ devStage \| gene, jDat, group = gType,
	jitter.data = TRUE, auto.key = TRUE,
	type = c("p", "a"), grid = TRUE)

	# Write a function to stripplot a mini-dataset
	makeStripplot <- function(x, ...)
	stripplot(gExp ~ devStage \| gene, x, group = gType,
	jitter.data = TRUE, auto.key = TRUE,
	type = c("p", "a"), grid = TRUE,
	...)
	makeStripplot(jDat)
	makeStripplot(jDat, pch = 17, cex = 3)
	makeStripplot(newDat <- prepareData('1456341_a_at'))
	str(newDat)
	head(newDat)

	# Do a two-sample t-test
	t.test(gExp ~ devStage,
	subset(newDat, devStage %in% c('P2', '4_weeks')))

	# Fit a linear model with a categorical covariate
	makeStripplot(mDat <- prepareData('1438786_a_at'))
	summary(mFit <- lm(gExp ~ devStage, mDat, gType == 'wt'))

	# Perform inference for a contrast
	coef(mFit)
	contMat <- rbind(c(0, -1, 0, 1, 0))
	obsDiff <- contMat %*% coef(mFit)
	sampMeans <- with(subset(mDat, gType == 'wt'),
	tapply(gExp, devStage, mean))
	sampMeans["P2"] - sampMeans["P10"]
	vcov(mFit)
	summary(mFit)$coefficients[, "Std. Error"]
	sqrt(diag(vcov(mFit)))
	estSe <- contMat %% vcov(mFit) %% t(contMat)
	testStat <- obsDiff/estSe
	2 * pt(abs(testStat), df = df.residual(mFit), lower.tail = FALSE)

	# Fit a linear model with two categorical covariates
	makeStripplot(oDat <- prepareData("1448690_at"))
	str(oDat)
	summary(oFitBig <- lm(gExp ~ gType * devStage, oDat))
	summary(oFitSmall <- lm(gExp ~ gType + devStage, oDat))
	anova(oFitSmall, oFitBig)

	makeStripplot(oDat <- prepareData('1429225_at'))
	summary(oFitBig <- lm(gExp ~ gType * devStage, oDat))
	summary(oFitSmall <- lm(gExp ~ gType + devStage, oDat))
	anova(oFitSmall, oFitBig)

	# Ideas for further work

	# We wrote functions to prepare and plot data for more than 1 gene. But when we started fitting models and condusting tests, we only worked with 1 gene at a time. Can you use data aggregation strategies from last week to do some of the same work for small sets of genes?
	makeStripplot(oDat <- prepareData(c('1448690_at', '1429225_at')))
	by(oDat, oDat$gene, function(x) {
	print(summary(oFitBig <- lm(gExp ~ gType * devStage, x)))
	print(summary(oFitSmall <- lm(gExp ~ gType + devStage, x)))
	anova(oFitSmall, oFitBig)})

	# In lecture we also experimented with a quantitative version of devStage, which we called age. This opens the door to modelling with a quantitative covariate. Can you fit linear and quadratic models to the expression data for one or several genes?
	makeStripplot(oDat <- prepareData(c('1448690_at', '1429225_at')))
	devStageToDays <- c('E2'=-2, 'P2'=2, 'P6'=6, 'P10'=10, '4_weeks'=28)
	oDat <- prepareData(c('1448690_at', '1429225_at'))
	oDat$age <- devStageToDays[oDat$devStage]
	str(oDat)

	makeXYplot <- function(x, ...)
	xyplot(gExp ~ age \| gene, x, group = gType,
	jitter.data = TRUE, auto.key = TRUE,
	type = c("p", "a"), grid = TRUE,
	...)
	makeXYplot(oDat)

	by(oDat, oDat$gene, function(x) {
	print(summary(oFitBig <- lm(gExp ~ gType + age, x)))
	print(summary(oFitSmall <- lm(gExp ~ gType + age + I(age^2), x)))
	anova(oFitSmall, oFitBig)})