sjackman/stat540-hw1.R

## stat540-hw1.R
# STAT 540 Homework 1
# Shaun Jackman
# 2013-02-18

library(lattice)
library(limma)
library(preprocessCore)
library(reshape2)

design <- read.table('gse7191_design.txt')
design$sample <- factor(row.names(design))
design$DateRun <- as.Date(design$DateRun, format='%m/%d/%y')
design$Genotype <- relevel(design$Genotype, 'Wild_type')
data <- t(read.table('GSE7191_data.txt', header=TRUE, row.names=1))
tall <- melt(cbind(design, data), id.vars=colnames(design),
	variable.name = 'probe', value.name = 'expression')

# > 1a Indicate how many probes and how many samples are available.
cat('Number of probes:', ncol(data))
cat('Number of samples:', nrow(data))

# > 1b What is the breakdown of samples for Genotype, BrainRegion and Sex within each DateRun?
by(design, design$DateRun,
	function(x) summary(subset(x, select=c(Genotype, BrainRegion, Sex))))

# > 1c Create a plot showing the raw data for one probe, labeled by sample type (choose at least one factor to label, such as genotype).
stripplot(expression ~ Genotype, tall, group = BrainRegion,
	subset = probe == '100001_at',
	type = c('p', 'a'), jitter.data=TRUE, auto.key=TRUE, grid=TRUE)

# > 1d Report the average expression of the selected probe across Genotype, BrainRegion and Sex, i.e. one of the 12 average expression values you report will be for wild type neocortex in the male mice.
oneprobe = subset(tall, probe == '100001_at')
by(oneprobe[,c('expression')], oneprobe[,c('Genotype', 'BrainRegion', 'Sex')],
	mean)

# > 2a Order the data by run dates and examine batch effects using a heatmap.
heatmap(data[order(design$DateRun),], Rowv=NA)
heatmap(data)
outlier <- 'GSM172976'

# > 2b Document, identify and remove the worst outlier observed in the heatmap.
cat('The outlier is', outlier)
design.clean <- subset(design, sample != outlier)
data.clean <- data[design$sample != outlier,]
tall.clean <- subset(tall, sample != outlier)

# > 2c Scatter plot this outlier, to-be-removed sample against the other samples from the same group (i.e., same Genotype, BrainRegion and Sex)
x <- design[outlier,c('Genotype', 'BrainRegion', 'Sex')]
same.as.outlier <- subset(design,
	subset = Genotype == x$Genotype & BrainRegion == x$BrainRegion & Sex == x$Sex
	)$sample
splom(t(data[same.as.outlier,]))

# > 2d Re-make the heatmap of the sample correlation matrix after removing the outlier.
heatmap(data.clean)
# > Comment on the results.
# The initial clustering of the data showed that a single sample was isolated
# from all other samples in the dendrogram. After removing the outlier, no one
# sample is isolated from the rest.

# > 3a Re-normalize the datasets we have provided (before removing the outlier) using quantile normalization.
data.norm <- t(normalize.quantiles(t(data)))
dimnames(data.norm) <- dimnames(data)
tall.norm <- melt(cbind(design, data.norm), id.vars=colnames(design),
	variable.name = 'probe', value.name = 'expression')
# > Compare before and after re-normalization datasets using box plots. Identify the outlier with a different color.
sample.col <- rep('blue', nrow(design))
sample.col[design$sample == outlier] <- 'red'
boxplot(t(data), col=sample.col)
boxplot(t(data.norm), col=sample.col)

# > 3b Repeat a) for the clean data (after removing the outlier).
data.clean.norm <- t(normalize.quantiles(t(data.clean)))
dimnames(data.clean.norm) <- dimnames(data.clean)
tall.clean.norm <- melt(cbind(design.clean, data.clean.norm),
	id.vars=colnames(design.clean),
	variable.name = 'probe', value.name = 'expression')
boxplot(t(data.clean))
boxplot(t(data.clean.norm))

# > 3c Comment and describe the results from a) and b).
# > Does the outlier affect the normalization?
# No, the outlier does not greatly affect the normalization.
# > Does normalization have the desired effect in the clean data?
# Yes, the normalization does have the desired effect in the clean data. Namely,
# the data is now quantile normalized.
# > Does normalization solve the problem of the outlier?
# No, normalization does not solve the problem of the outlier.

# > 3d Compare the normalized data of the outlying sample against other samples in the same group (i.e., repeat 2c for the normalized data).
splom(t(data[same.as.outlier,]), xlab='Without quantile normalization')
splom(t(data.norm[same.as.outlier,]), xlab='With quantile normalization')
# > Does normalization solve the problem of the outlier?
# No. Quantile normalization does not solve the problem of the outlier.

# > 4a Write out, in an equation or English or, ideally, both, the model you are fitting. In the context of that model, what statistical test(s) are you conducting to find the requested probe hits?
# I am fitting a linear model with three parameters:
# 1. the mean expression for the probe, also known as the intercept
# 2. the effect on expression due to the Genotype S1P2_KO for that probe
# 3. the effect on expression due to the Genotype S1P3_KO for that probe
# I am conducting an F-test to determine whether the effect on expression due to
# both genotypes S1P2_KO and S1P3_KO is zero.

# > 4b How many probes have p-value < 10e-4?
mm <- model.matrix(~Genotype, design)
eb <- eBayes(lmFit(t(data), mm))
hits <- topTable(eb, coef = grep('Genotype', colnames(mm)), number=Inf)
countHits <- function(hits, p.value=10e-4) {
	cat('Number of probes with a p-value <', p.value , ':',
		sum(hits$P.Value < p.value), '\n')
	cat('Number of probes with an adjusted p-value <', p.value , ':',
		sum(hits$adj.P.Val < p.value))
}
countHits(hits)
stripplot(expression ~ Genotype | probe, tall.norm,
	subset = probe %in% subset(hits, adj.P.Val < 10e-4)$ID,
	type=c('p', 'a'), jitter.data=TRUE, grid=TRUE,
	scales=list(relation='sliced'))
# > Order results in ascending order of p-values and display the top 50 probes in a heat map.
top50 <- topTable(eb, coef = grep('Genotype', colnames(mm)),
	number=50)$ID
heatmap(data.norm[,top50])

# > 4c State explicitly what, if any, adjustment has been made to these p-values with respect to multiple testing.
# The p-values have been adjusted using Benjamini & Hochberg multiple testing
# correction to control the false discovery rate.
# > What is the associated probability statement for the probes reported in 4b?
# Using an adjusted p-value threshold of 10e-4, which equals 1e-3,
# the probablity that a reported discovery is false is 10e-4 = 1e-3 = 0.001.
# We expect one false discovery for every 1,000 reported discoveries.

# > 4d Provide a stripplot for the top-ranking probe and report the associated p-value.
stripplot(expression ~ Genotype, tall.norm, subset = probe == top50[1],
	type=c('p', 'a'), jitter.data=TRUE, grid=TRUE)
cat('The p-value is:', hits[1,]$P.Value)
cat('The adjusted p-value is:', hits[1,]$adj.P.Val)
# > Do the same for a probe near the bottom of the overall ranking (that is, a "boring" gene).
boring <- tail(hits, n=1)
stripplot(expression ~ Genotype, tall.norm, subset = probe == boring$ID,
	type=c('p', 'a'), jitter.data=TRUE, grid=TRUE)
cat('The p-value is:', boring$P.Value)
cat('The adjusted p-value is:', boring$adj.P.Val)
# > Comment on the differences you observe.
# The top-ranking probe has clearly different expression levels for all three
# genotypes and a low p-value, whereas the boring probe has nearly identical
# expression levels for all three genotypes and a high p-value.

# > 4e Test the null hypothesis that the mean probe expression of the S1P3 knockout is the same as that of the wild type.
mm <- model.matrix(~ 0 + Genotype, design)
fit <- lmFit(t(data.norm), mm)
eb <- eBayes(contrasts.fit(fit,
	makeContrasts(P3vsWT = GenotypeS1P3_KO - GenotypeWild_type, levels=mm)))
# > How many probes have a p-value < 10e-4?
P3vsWT.hits <- topTable(eb, number=Inf)
countHits(P3vsWT.hits)
stripplot(expression ~ Genotype | probe, tall.norm,
	subset = probe %in% subset(P3vsWT.hits, adj.P.Val < 10e-4)$ID
		& Genotype %in% c('Wild_type', 'S1P3_KO'),
	type=c('p', 'a'), jitter.data=TRUE, grid=TRUE,
	scales=list(relation='sliced'))

# > 4f Repeat the test for differential expression across the 3 genotypes using the clean normalized dataset (i.e., the normalized data after removing the outlier).
mm <- model.matrix(~Genotype, design.clean)
eb <- eBayes(lmFit(t(data.clean.norm), mm))
hits.clean <- topTable(eb, coef = grep('Genotype', colnames(mm)), number=Inf)
countHits(hits.clean)
stripplot(expression ~ Genotype | probe, tall.norm,
	subset = probe %in% subset(hits.clean, adj.P.Val < 10e-4)$ID,
	type=c('p', 'a'), jitter.data=TRUE, grid=TRUE,
	scales=list(relation='sliced'))
# > Compare the list of probes with p < 10e-4 between both analyses.
compareHits <- function(hits.id, hits.clean.id) {
	df <- data.frame(
		Raw = colnames(data) %in% hits.id,
		Clean = colnames(data) %in% hits.clean.id)
	print(addmargins(with(df, table(Raw, Clean))))
	vennDiagram(df)
}
compareHits(
	subset(hits, P.Value < 10e-4)$ID,
	subset(hits.clean, P.Value < 10e-4)$ID)
compareHits(
	subset(hits, adj.P.Val < 10e-4)$ID,
	subset(hits.clean, adj.P.Val < 10e-4)$ID)

# > 5a Using the parametrization of your choice, fit a 3x2 full factorial model.
# > Test for a difference of expression across the 6 groups.
# > How many probesets have an overall p-value < 10e-4?
mm <- model.matrix(~ Genotype * BrainRegion, design.clean)
eb <- eBayes(lmFit(t(data.clean.norm), mm))
hits <- topTable(eb, coef = grep('(Intercept)', colnames(mm), invert=TRUE),
	number=Inf)
countHits(hits)

# > 5b Use an F-test to test the null hypothesis that all the interaction effects between Genotype and BrainRegion are zero.
# > How many probes have a p-value < 0.05?
hits <- topTable(eb, coef = grep(':', colnames(mm)), number=Inf)
countHits(hits, 0.05)

# > 5c Identify the probe with the lowest p-value and pick additional "boring" gene (with respect to Genotype:BrainRegion interaction effects).
# > For this contrasting pair of probes, report the relevant statistical results and plot their data illustrating the interaction effect.
interesting = hits[1,]
boring = hits[9999,]
rbind(interesting, boring)
stripplot(expression ~ Genotype | probe, tall.clean.norm,
	group = BrainRegion,
	subset = probe %in% c(interesting$ID, boring$ID),
	type=c('p', 'a'), jitter.data=TRUE, grid=TRUE, auto.key=TRUE,
	scales=list(relation='sliced'))

# > 5d In problem 4, you identified probes with significant differential expression among the three genotypes. In 5a, you fit a more complex model considering the joint effect of Genotype and BrainRegion on each probe expression.
# > For each probe, use an F-test to test if the more complex model with Genotype and BrainRegion effects is better than the simpler model with only Genotype effect.
hits <- topTable(eb, coef = grep('BrainRegion', colnames(mm)), number=Inf)
countHits(hits)
# > Draw a density plot of the corresponding p-values and comment on the results.
densityplot(~P.Value, hits)
# The density plot appears to be a mixture of a uniform distribution, expected
# of probes for which the null hypothesis is true, and a peaked distribution
# of probes for which we can reject the null hypothesis.
	# STAT 540 Homework 1
	# Shaun Jackman
	# 2013-02-18

	library(lattice)
	library(limma)
	library(preprocessCore)
	library(reshape2)

	design <- read.table('gse7191_design.txt')
	design$sample <- factor(row.names(design))
	design$DateRun <- as.Date(design$DateRun, format='%m/%d/%y')
	design$Genotype <- relevel(design$Genotype, 'Wild_type')
	data <- t(read.table('GSE7191_data.txt', header=TRUE, row.names=1))
	tall <- melt(cbind(design, data), id.vars=colnames(design),
	variable.name = 'probe', value.name = 'expression')

	# > 1a Indicate how many probes and how many samples are available.
	cat('Number of probes:', ncol(data))
	cat('Number of samples:', nrow(data))

	# > 1b What is the breakdown of samples for Genotype, BrainRegion and Sex within each DateRun?
	by(design, design$DateRun,
	function(x) summary(subset(x, select=c(Genotype, BrainRegion, Sex))))

	# > 1c Create a plot showing the raw data for one probe, labeled by sample type (choose at least one factor to label, such as genotype).
	stripplot(expression ~ Genotype, tall, group = BrainRegion,
	subset = probe == '100001_at',
	type = c('p', 'a'), jitter.data=TRUE, auto.key=TRUE, grid=TRUE)

	# > 1d Report the average expression of the selected probe across Genotype, BrainRegion and Sex, i.e. one of the 12 average expression values you report will be for wild type neocortex in the male mice.
	oneprobe = subset(tall, probe == '100001_at')
	by(oneprobe[,c('expression')], oneprobe[,c('Genotype', 'BrainRegion', 'Sex')],
	mean)

	# > 2a Order the data by run dates and examine batch effects using a heatmap.
	heatmap(data[order(design$DateRun),], Rowv=NA)
	heatmap(data)
	outlier <- 'GSM172976'

	# > 2b Document, identify and remove the worst outlier observed in the heatmap.
	cat('The outlier is', outlier)
	design.clean <- subset(design, sample != outlier)
	data.clean <- data[design$sample != outlier,]
	tall.clean <- subset(tall, sample != outlier)

	# > 2c Scatter plot this outlier, to-be-removed sample against the other samples from the same group (i.e., same Genotype, BrainRegion and Sex)
	x <- design[outlier,c('Genotype', 'BrainRegion', 'Sex')]
	same.as.outlier <- subset(design,
	subset = Genotype == x$Genotype & BrainRegion == x$BrainRegion & Sex == x$Sex
	)$sample
	splom(t(data[same.as.outlier,]))

	# > 2d Re-make the heatmap of the sample correlation matrix after removing the outlier.
	heatmap(data.clean)
	# > Comment on the results.
	# The initial clustering of the data showed that a single sample was isolated
	# from all other samples in the dendrogram. After removing the outlier, no one
	# sample is isolated from the rest.

	# > 3a Re-normalize the datasets we have provided (before removing the outlier) using quantile normalization.
	data.norm <- t(normalize.quantiles(t(data)))
	dimnames(data.norm) <- dimnames(data)
	tall.norm <- melt(cbind(design, data.norm), id.vars=colnames(design),
	variable.name = 'probe', value.name = 'expression')
	# > Compare before and after re-normalization datasets using box plots. Identify the outlier with a different color.
	sample.col <- rep('blue', nrow(design))
	sample.col[design$sample == outlier] <- 'red'
	boxplot(t(data), col=sample.col)
	boxplot(t(data.norm), col=sample.col)

	# > 3b Repeat a) for the clean data (after removing the outlier).
	data.clean.norm <- t(normalize.quantiles(t(data.clean)))
	dimnames(data.clean.norm) <- dimnames(data.clean)
	tall.clean.norm <- melt(cbind(design.clean, data.clean.norm),
	id.vars=colnames(design.clean),
	variable.name = 'probe', value.name = 'expression')
	boxplot(t(data.clean))
	boxplot(t(data.clean.norm))

	# > 3c Comment and describe the results from a) and b).
	# > Does the outlier affect the normalization?
	# No, the outlier does not greatly affect the normalization.
	# > Does normalization have the desired effect in the clean data?
	# Yes, the normalization does have the desired effect in the clean data. Namely,
	# the data is now quantile normalized.
	# > Does normalization solve the problem of the outlier?
	# No, normalization does not solve the problem of the outlier.

	# > 3d Compare the normalized data of the outlying sample against other samples in the same group (i.e., repeat 2c for the normalized data).
	splom(t(data[same.as.outlier,]), xlab='Without quantile normalization')
	splom(t(data.norm[same.as.outlier,]), xlab='With quantile normalization')
	# > Does normalization solve the problem of the outlier?
	# No. Quantile normalization does not solve the problem of the outlier.

	# > 4a Write out, in an equation or English or, ideally, both, the model you are fitting. In the context of that model, what statistical test(s) are you conducting to find the requested probe hits?
	# I am fitting a linear model with three parameters:
	# 1. the mean expression for the probe, also known as the intercept
	# 2. the effect on expression due to the Genotype S1P2_KO for that probe
	# 3. the effect on expression due to the Genotype S1P3_KO for that probe
	# I am conducting an F-test to determine whether the effect on expression due to
	# both genotypes S1P2_KO and S1P3_KO is zero.

	# > 4b How many probes have p-value < 10e-4?
	mm <- model.matrix(~Genotype, design)
	eb <- eBayes(lmFit(t(data), mm))
	hits <- topTable(eb, coef = grep('Genotype', colnames(mm)), number=Inf)
	countHits <- function(hits, p.value=10e-4) {
	cat('Number of probes with a p-value <', p.value , ':',
	sum(hits$P.Value < p.value), '\n')
	cat('Number of probes with an adjusted p-value <', p.value , ':',
	sum(hits$adj.P.Val < p.value))
	}
	countHits(hits)
	stripplot(expression ~ Genotype \| probe, tall.norm,
	subset = probe %in% subset(hits, adj.P.Val < 10e-4)$ID,
	type=c('p', 'a'), jitter.data=TRUE, grid=TRUE,
	scales=list(relation='sliced'))
	# > Order results in ascending order of p-values and display the top 50 probes in a heat map.
	top50 <- topTable(eb, coef = grep('Genotype', colnames(mm)),
	number=50)$ID
	heatmap(data.norm[,top50])

	# > 4c State explicitly what, if any, adjustment has been made to these p-values with respect to multiple testing.
	# The p-values have been adjusted using Benjamini & Hochberg multiple testing
	# correction to control the false discovery rate.
	# > What is the associated probability statement for the probes reported in 4b?
	# Using an adjusted p-value threshold of 10e-4, which equals 1e-3,
	# the probablity that a reported discovery is false is 10e-4 = 1e-3 = 0.001.
	# We expect one false discovery for every 1,000 reported discoveries.

	# > 4d Provide a stripplot for the top-ranking probe and report the associated p-value.
	stripplot(expression ~ Genotype, tall.norm, subset = probe == top50[1],
	type=c('p', 'a'), jitter.data=TRUE, grid=TRUE)
	cat('The p-value is:', hits[1,]$P.Value)
	cat('The adjusted p-value is:', hits[1,]$adj.P.Val)
	# > Do the same for a probe near the bottom of the overall ranking (that is, a "boring" gene).
	boring <- tail(hits, n=1)
	stripplot(expression ~ Genotype, tall.norm, subset = probe == boring$ID,
	type=c('p', 'a'), jitter.data=TRUE, grid=TRUE)
	cat('The p-value is:', boring$P.Value)
	cat('The adjusted p-value is:', boring$adj.P.Val)
	# > Comment on the differences you observe.
	# The top-ranking probe has clearly different expression levels for all three
	# genotypes and a low p-value, whereas the boring probe has nearly identical
	# expression levels for all three genotypes and a high p-value.

	# > 4e Test the null hypothesis that the mean probe expression of the S1P3 knockout is the same as that of the wild type.
	mm <- model.matrix(~ 0 + Genotype, design)
	fit <- lmFit(t(data.norm), mm)
	eb <- eBayes(contrasts.fit(fit,
	makeContrasts(P3vsWT = GenotypeS1P3_KO - GenotypeWild_type, levels=mm)))
	# > How many probes have a p-value < 10e-4?
	P3vsWT.hits <- topTable(eb, number=Inf)
	countHits(P3vsWT.hits)
	stripplot(expression ~ Genotype \| probe, tall.norm,
	subset = probe %in% subset(P3vsWT.hits, adj.P.Val < 10e-4)$ID
	& Genotype %in% c('Wild_type', 'S1P3_KO'),
	type=c('p', 'a'), jitter.data=TRUE, grid=TRUE,
	scales=list(relation='sliced'))

	# > 4f Repeat the test for differential expression across the 3 genotypes using the clean normalized dataset (i.e., the normalized data after removing the outlier).
	mm <- model.matrix(~Genotype, design.clean)
	eb <- eBayes(lmFit(t(data.clean.norm), mm))
	hits.clean <- topTable(eb, coef = grep('Genotype', colnames(mm)), number=Inf)
	countHits(hits.clean)
	stripplot(expression ~ Genotype \| probe, tall.norm,
	subset = probe %in% subset(hits.clean, adj.P.Val < 10e-4)$ID,
	type=c('p', 'a'), jitter.data=TRUE, grid=TRUE,
	scales=list(relation='sliced'))
	# > Compare the list of probes with p < 10e-4 between both analyses.
	compareHits <- function(hits.id, hits.clean.id) {
	df <- data.frame(
	Raw = colnames(data) %in% hits.id,
	Clean = colnames(data) %in% hits.clean.id)
	print(addmargins(with(df, table(Raw, Clean))))
	vennDiagram(df)
	}
	compareHits(
	subset(hits, P.Value < 10e-4)$ID,
	subset(hits.clean, P.Value < 10e-4)$ID)
	compareHits(
	subset(hits, adj.P.Val < 10e-4)$ID,
	subset(hits.clean, adj.P.Val < 10e-4)$ID)

	# > 5a Using the parametrization of your choice, fit a 3x2 full factorial model.
	# > Test for a difference of expression across the 6 groups.
	# > How many probesets have an overall p-value < 10e-4?
	mm <- model.matrix(~ Genotype * BrainRegion, design.clean)
	eb <- eBayes(lmFit(t(data.clean.norm), mm))
	hits <- topTable(eb, coef = grep('(Intercept)', colnames(mm), invert=TRUE),
	number=Inf)
	countHits(hits)

	# > 5b Use an F-test to test the null hypothesis that all the interaction effects between Genotype and BrainRegion are zero.
	# > How many probes have a p-value < 0.05?
	hits <- topTable(eb, coef = grep(':', colnames(mm)), number=Inf)
	countHits(hits, 0.05)

	# > 5c Identify the probe with the lowest p-value and pick additional "boring" gene (with respect to Genotype:BrainRegion interaction effects).
	# > For this contrasting pair of probes, report the relevant statistical results and plot their data illustrating the interaction effect.
	interesting = hits[1,]
	boring = hits[9999,]
	rbind(interesting, boring)
	stripplot(expression ~ Genotype \| probe, tall.clean.norm,
	group = BrainRegion,
	subset = probe %in% c(interesting$ID, boring$ID),
	type=c('p', 'a'), jitter.data=TRUE, grid=TRUE, auto.key=TRUE,
	scales=list(relation='sliced'))

	# > 5d In problem 4, you identified probes with significant differential expression among the three genotypes. In 5a, you fit a more complex model considering the joint effect of Genotype and BrainRegion on each probe expression.
	# > For each probe, use an F-test to test if the more complex model with Genotype and BrainRegion effects is better than the simpler model with only Genotype effect.
	hits <- topTable(eb, coef = grep('BrainRegion', colnames(mm)), number=Inf)
	countHits(hits)
	# > Draw a density plot of the corresponding p-values and comment on the results.
	densityplot(~P.Value, hits)
	# The density plot appears to be a mixture of a uniform distribution, expected
	# of probes for which the null hypothesis is true, and a peaked distribution
	# of probes for which we can reject the null hypothesis.