# slavailn/illumina_array_limma.R

## illumina_array_limma.R
library(limma)
library(RColorBrewer)
library(gplots)
library(lumi)
library(lumiHumanAll.db)
library(lumiHumanIDMapping)
library(biomaRt)

# Locate the input files.
# NOTE(review): the working directory is a hard-coded absolute path on one
# machine; every relative path and output file below is resolved against it.
setwd("C:/Users/Yaroslav/Documents/teraherz2")
list.files()
setwd("raw_data/")
list.files()

# Every file in the chip directory whose name matches "idat", prefixed with the
# directory name so the paths are relative to raw_data/.
chip_dir <- "101035690003/"
chip_files <- list.files(chip_dir)
idat_path <- paste0(chip_dir, chip_files[grepl("idat", chip_files)])

# Bead manifest file, expected in raw_data/.
bgx_path <- "HumanHT-12_V4_0_R2_15002873_B.bgx"

# Read the IDAT files together with the BGX manifest into `x`.
# NOTE(review): the first annotation entry is the single string
# "ILMN_Gene, Entrez_Gene_ID"; it looks like two column names with a misplaced
# quote. It is kept unchanged here because later code indexes the columns of
# the probe annotation by position -- confirm which columns are intended.
annotation_cols <- c("ILMN_Gene, Entrez_Gene_ID", "Symbol", "Probe_Id")
x <- read.idat(idatfiles = idat_path, bgxfile = bgx_path,
               annotation = annotation_cols)

# Sample sheet: one row per array, in the column order of x$E.
# The labels and treatments are hard-coded for exactly 12 arrays.
targets <- data.frame(
  array = LETTERS[1:12],
  treatment = c("High", "Ct", "High", "Ct", "High", "Low",
                "High", "Low", "Ct", "Low", "Ct", "Low"),
  row.names = colnames(x$E)
)


head(x$E)

# Store detection p-values and the sample sheet on the object, then reorder
# the arrays so that samples are sorted by treatment label.
x$other$Detection <- detectionPValues(x)
x$targets <- targets
by_treatment <- order(x$targets$treatment)
x <- x[, by_treatment]
x
save(x, file = "raw_data.RData")

# Raw intensities on the log2 scale, one box per array.
# range = 0 extends the whiskers to the data extremes.
raw_log2 <- log2(x$E)
tiff("raw_intensities_distribution.tiff")
boxplot(raw_log2, range = 0, las = 2,
        main = "Distribution of raw intensities\nlog2")
dev.off()

# Proportion of expressed probes per array, printed and drawn as a bar plot.
head(x$other$Detection)
pe <- propexpr(x)
pe
tiff("proportion_of_expressed_probes.tiff")
par(mar = rep(6, 4))
barplot(pe, las = 2, main = "Proportion of expressed probes")
dev.off()

# Background-correct and normalize the raw data with neqc(); the full
# normalized object is saved before any probe filtering.
y <- neqc(x)
dim(y)
save(y, file = "normalized_data.RData")

# Keep a probe only if its detection p-value is below 0.01 on at least one
# array.
detected <- y$other$Detection < 0.01
expressed <- rowSums(detected) > 0
y <- y[expressed, ]
dim(y)

# MDS plot of the filtered, normalized data; each sample is labelled
# "<array>::<treatment>".
sample_labels <- paste(y$targets$array, y$targets$treatment, sep = "::")
tiff("MDSplot.tiff")
par(mar = rep(4, 4))
plotMDS(y, labels = sample_labels, main = "MDS plot")
dev.off()

# Per-array distribution of the normalized values (y$E is plotted as-is).
tiff("normalized_intensities_distribution.tiff")
boxplot(y$E, range = 0, las = 2,
        main = "Distribution of normalized intensities\nlog2")
dev.off()

# Sample clustering.
sample_labels <- paste(y$targets$array, y$targets$treatment, sep = "::")

## Heatmap of the 100 probes with the highest mean normalized expression,
## row-scaled, with dendrograms on both axes.
select <- order(rowMeans(y$E), decreasing = TRUE)[seq_len(100)]
hmcol <- colorRampPalette(brewer.pal(9, "GnBu"))(100)
tiff("heatmap_clustering_top100.tiff", height = 800, width = 800)
heatmap.2(y$E[select, ], col = hmcol, Rowv = TRUE, Colv = TRUE,
          scale = "row", dendrogram = "both", trace = "none",
          margins = c(10, 6), cexRow = 0.3, labCol = sample_labels,
          main = "Top 100 probes clustering")
dev.off()

## Heatmap of sample-to-sample distances computed with dist() on the
## transposed expression matrix (one row per sample).
dists <- dist(t(y$E))
mat <- as.matrix(dists)
dimnames(mat) <- list(sample_labels, sample_labels)
tiff("heatmap_sample_to_sample_distances.tiff", width = 800, height = 800)
# The palette is reversed for the distance heatmap.
heatmap.2(mat, trace = "none", col = rev(hmcol), margins = c(13, 13))
dev.off()

## Create annotation table: Illumina probe ID -> nuID -> Entrez gene ID ->
## Ensembl annotation (HGNC symbol, Ensembl gene ID, description).
probe_ids <- y$genes$Probe_Id

# Map Illumina probe IDs to nuIDs; the result is converted to a data frame and
# its nuID column is used as the join key below.
nuIDs <- IlluminaID2nuID(probe_ids, species = c("Human"))
nuIDs <- as.data.frame(nuIDs)

# Map nuIDs to Entrez gene IDs; after as.data.frame() the nuIDs are the row
# names, hence the merge on by.y = 0.
entrez <- nuID2EntrezID(nuID = as.character(nuIDs$nuID),
                        lib.mapping = "lumiHumanIDMapping")
entrez <- as.data.frame(entrez)
nuIDs_entrez <- merge(nuIDs, entrez, by.x = "nuID", by.y = 0)

# Query Ensembl BioMart (requires network access).
# The Entrez attribute/filter is called "entrezgene_id" in current Ensembl
# marts; the older name "entrezgene" is rejected as an invalid attribute.
ensembl <- useMart("ENSEMBL_MART_ENSEMBL", host = "www.ensembl.org")
ensembl <- useDataset("hsapiens_gene_ensembl", mart = ensembl)
annot <- getBM(attributes = c("entrezgene_id", "hgnc_symbol",
                              "ensembl_gene_id", "description"),
               filters = "entrezgene_id",
               values = as.character(nuIDs_entrez$entrez),
               mart = ensembl)

# Inner join: probes without an Entrez ID known to Ensembl are dropped. The
# key column keeps the name "entrez" in the result.
nuIDs_annot <- merge(nuIDs_entrez, annot, by.x = "entrez",
                     by.y = "entrezgene_id")

# Build the expression table: one row per probe (row names = Probe_Id), one
# column per sample named "<array>_<treatment>".
sample_cols <- paste(y$targets$array, y$targets$treatment, sep = "_")
exprs <- y$E
colnames(exprs) <- sample_cols
exprs <- as.data.frame(exprs)

# Attach the probe annotation by matching the row names of the expression
# matrix against Array_Address_Id, then re-key the rows by Probe_Id.
exprs_probes <- merge(exprs, y$genes, by.x = 0,
                      by.y = "Array_Address_Id")
row.names(exprs_probes) <- exprs_probes$Probe_Id

# Keep only the sample columns, selected by name. Dropping columns by position
# depends on exactly which annotation columns y$genes carries and can leave
# non-numeric columns in the table passed to lmFit().
exprs_probes <- exprs_probes[, sample_cols, drop = FALSE]
exprs <- exprs_probes
write.table(exprs, file = "normalized_bkg_subtracted_expressions.txt",
            sep = "\t", quote = FALSE, col.names = TRUE, row.names = TRUE)

# Differential expression between treatment groups.
# Design without intercept: one coefficient per treatment level, with the
# design columns renamed to the bare level names.
f <- factor(y$targets$treatment, levels = c("Ct", "High", "Low"))
design <- model.matrix(~ 0 + f)
colnames(design) <- levels(f)
fit <- lmFit(exprs, design)

# Three pairwise contrasts. The coefficients are later looked up under the
# names "Ct - High", "Ct - Low" and "Low - High".
contrast.matrix <- makeContrasts(Ct - High, Ct - Low, Low - High,
                                 levels = design)
fit2 <- eBayes(contrasts.fit(fit, contrast.matrix))

# Build, annotate and write the complete result table for one contrast.
#
# fit           eBayes fit holding the contrast coefficients.
# coef_name     name of the contrast coefficient, e.g. "Ct - High".
# exprs         expression data frame (row names = Probe_Id, one column per
#               sample).
# annot         probe annotation data frame with a Probe_Id column.
# group_pattern regular expression selecting the sample columns of `exprs`
#               to append to the table.
# out_file      path of the tab-separated output file.
#
# Returns the merged data frame that was written. Both merges are inner joins,
# so probes missing from `annot` are dropped.
contrast_results <- function(fit, coef_name, exprs, annot, group_pattern,
                             out_file) {
  res <- topTable(fit, coef = coef_name, adjust.method = "BH",
                  number = nrow(exprs))
  # topTable keeps the probe IDs as row names; merge on them.
  res <- merge(res, annot, by.x = 0, by.y = "Probe_Id")
  names(res)[1] <- "Probe_Id"
  group_cols <- grep(group_pattern, names(exprs))
  res <- merge(res, exprs[, group_cols, drop = FALSE],
               by.x = "Probe_Id", by.y = 0)
  write.table(res, file = out_file, sep = "\t",
              col.names = TRUE, row.names = FALSE, quote = FALSE)
  res
}

# NOTE(review): the coefficient names put Ct first ("Ct - High", "Ct - Low")
# while the file names read "<group>_vs_Ct" -- confirm the intended sign of
# the reported logFC.

# Get results for Ct versus High
high_ct <- contrast_results(fit2, "Ct - High", exprs, nuIDs_annot,
                            "Ct|High", "High_vs_Ct_results.txt")

# Get results for Ct versus Low
low_ct <- contrast_results(fit2, "Ct - Low", exprs, nuIDs_annot,
                           "Ct|Low", "Low_vs_Ct_results.txt")

# Get results for Low versus High
low_high <- contrast_results(fit2, "Low - High", exprs, nuIDs_annot,
                             "Low|High", "Low_vs_High_results.txt")
	library(limma)
	library(RColorBrewer)
	library(gplots)
	library(lumi)
	library(lumiHumanAll.db)
	library(lumiHumanIDMapping)
	library(biomaRt)

	# NOTE(review): from here on the file repeats the pipeline above statement
	# for statement (tab-indented); this looks like an accidental duplicate --
	# confirm whether it should be kept.
	# Locate the input files.
	# NOTE(review): the working directory is a hard-coded absolute path on one
	# machine; every relative path and output file below is resolved against it.
	setwd("C:/Users/Yaroslav/Documents/teraherz2")
	list.files()
	setwd("raw_data/")
	list.files()

	# Every file in the chip directory whose name matches "idat", prefixed with
	# the directory name so the paths are relative to raw_data/.
	chip_dir <- "101035690003/"
	chip_files <- list.files(chip_dir)
	idat_path <- paste0(chip_dir, chip_files[grepl("idat", chip_files)])

	# Bead manifest file, expected in raw_data/.
	bgx_path <- "HumanHT-12_V4_0_R2_15002873_B.bgx"

	# Read the IDAT files together with the BGX manifest into `x`.
	# NOTE(review): the first annotation entry is the single string
	# "ILMN_Gene, Entrez_Gene_ID"; it looks like two column names with a
	# misplaced quote. It is kept unchanged here because later code indexes the
	# columns of the probe annotation by position -- confirm which columns are
	# intended.
	annotation_cols <- c("ILMN_Gene, Entrez_Gene_ID", "Symbol", "Probe_Id")
	x <- read.idat(idatfiles = idat_path, bgxfile = bgx_path,
	               annotation = annotation_cols)

	# Sample sheet: one row per array, in the column order of x$E.
	# The labels and treatments are hard-coded for exactly 12 arrays.
	targets <- data.frame(
	  array = LETTERS[1:12],
	  treatment = c("High", "Ct", "High", "Ct", "High", "Low",
	                "High", "Low", "Ct", "Low", "Ct", "Low"),
	  row.names = colnames(x$E)
	)


	head(x$E)

	# Store detection p-values and the sample sheet on the object, then reorder
	# the arrays so that samples are sorted by treatment label.
	x$other$Detection <- detectionPValues(x)
	x$targets <- targets
	by_treatment <- order(x$targets$treatment)
	x <- x[, by_treatment]
	x
	save(x, file = "raw_data.RData")

	# Raw intensities on the log2 scale, one box per array.
	# range = 0 extends the whiskers to the data extremes.
	raw_log2 <- log2(x$E)
	tiff("raw_intensities_distribution.tiff")
	boxplot(raw_log2, range = 0, las = 2,
	        main = "Distribution of raw intensities\nlog2")
	dev.off()

	# Proportion of expressed probes per array, printed and drawn as a bar plot.
	head(x$other$Detection)
	pe <- propexpr(x)
	pe
	tiff("proportion_of_expressed_probes.tiff")
	par(mar = rep(6, 4))
	barplot(pe, las = 2, main = "Proportion of expressed probes")
	dev.off()

	# Background-correct and normalize the raw data with neqc(); the full
	# normalized object is saved before any probe filtering.
	y <- neqc(x)
	dim(y)
	save(y, file = "normalized_data.RData")

	# Keep a probe only if its detection p-value is below 0.01 on at least one
	# array.
	detected <- y$other$Detection < 0.01
	expressed <- rowSums(detected) > 0
	y <- y[expressed, ]
	dim(y)

	# MDS plot of the filtered, normalized data; each sample is labelled
	# "<array>::<treatment>".
	sample_labels <- paste(y$targets$array, y$targets$treatment, sep = "::")
	tiff("MDSplot.tiff")
	par(mar = rep(4, 4))
	plotMDS(y, labels = sample_labels, main = "MDS plot")
	dev.off()

	# Per-array distribution of the normalized values (y$E is plotted as-is).
	tiff("normalized_intensities_distribution.tiff")
	boxplot(y$E, range = 0, las = 2,
	        main = "Distribution of normalized intensities\nlog2")
	dev.off()

	# Sample clustering.
	sample_labels <- paste(y$targets$array, y$targets$treatment, sep = "::")

	## Heatmap of the 100 probes with the highest mean normalized expression,
	## row-scaled, with dendrograms on both axes.
	select <- order(rowMeans(y$E), decreasing = TRUE)[seq_len(100)]
	hmcol <- colorRampPalette(brewer.pal(9, "GnBu"))(100)
	tiff("heatmap_clustering_top100.tiff", height = 800, width = 800)
	heatmap.2(y$E[select, ], col = hmcol, Rowv = TRUE, Colv = TRUE,
	          scale = "row", dendrogram = "both", trace = "none",
	          margins = c(10, 6), cexRow = 0.3, labCol = sample_labels,
	          main = "Top 100 probes clustering")
	dev.off()

	## Heatmap of sample-to-sample distances computed with dist() on the
	## transposed expression matrix (one row per sample).
	dists <- dist(t(y$E))
	mat <- as.matrix(dists)
	dimnames(mat) <- list(sample_labels, sample_labels)
	tiff("heatmap_sample_to_sample_distances.tiff", width = 800, height = 800)
	# The palette is reversed for the distance heatmap.
	heatmap.2(mat, trace = "none", col = rev(hmcol), margins = c(13, 13))
	dev.off()

	## Create annotation table: Illumina probe ID -> nuID -> Entrez gene ID ->
	## Ensembl annotation (HGNC symbol, Ensembl gene ID, description).
	probe_ids <- y$genes$Probe_Id

	# Map Illumina probe IDs to nuIDs; the result is converted to a data frame
	# and its nuID column is used as the join key below.
	nuIDs <- IlluminaID2nuID(probe_ids, species = c("Human"))
	nuIDs <- as.data.frame(nuIDs)

	# Map nuIDs to Entrez gene IDs; after as.data.frame() the nuIDs are the row
	# names, hence the merge on by.y = 0.
	entrez <- nuID2EntrezID(nuID = as.character(nuIDs$nuID),
	                        lib.mapping = "lumiHumanIDMapping")
	entrez <- as.data.frame(entrez)
	nuIDs_entrez <- merge(nuIDs, entrez, by.x = "nuID", by.y = 0)

	# Query Ensembl BioMart (requires network access).
	# The Entrez attribute/filter is called "entrezgene_id" in current Ensembl
	# marts; the older name "entrezgene" is rejected as an invalid attribute.
	ensembl <- useMart("ENSEMBL_MART_ENSEMBL", host = "www.ensembl.org")
	ensembl <- useDataset("hsapiens_gene_ensembl", mart = ensembl)
	annot <- getBM(attributes = c("entrezgene_id", "hgnc_symbol",
	                              "ensembl_gene_id", "description"),
	               filters = "entrezgene_id",
	               values = as.character(nuIDs_entrez$entrez),
	               mart = ensembl)

	# Inner join: probes without an Entrez ID known to Ensembl are dropped. The
	# key column keeps the name "entrez" in the result.
	nuIDs_annot <- merge(nuIDs_entrez, annot, by.x = "entrez",
	                     by.y = "entrezgene_id")

	# Build the expression table: one row per probe (row names = Probe_Id), one
	# column per sample named "<array>_<treatment>".
	sample_cols <- paste(y$targets$array, y$targets$treatment, sep = "_")
	exprs <- y$E
	colnames(exprs) <- sample_cols
	exprs <- as.data.frame(exprs)

	# Attach the probe annotation by matching the row names of the expression
	# matrix against Array_Address_Id, then re-key the rows by Probe_Id.
	exprs_probes <- merge(exprs, y$genes, by.x = 0,
	                      by.y = "Array_Address_Id")
	row.names(exprs_probes) <- exprs_probes$Probe_Id

	# Keep only the sample columns, selected by name. Dropping columns by
	# position depends on exactly which annotation columns y$genes carries and
	# can leave non-numeric columns in the table passed to lmFit().
	exprs_probes <- exprs_probes[, sample_cols, drop = FALSE]
	exprs <- exprs_probes
	write.table(exprs, file = "normalized_bkg_subtracted_expressions.txt",
	            sep = "\t", quote = FALSE, col.names = TRUE, row.names = TRUE)

	# Differential expression between treatment groups.
	# Design without intercept: one coefficient per treatment level, with the
	# design columns renamed to the bare level names.
	f <- factor(y$targets$treatment, levels = c("Ct", "High", "Low"))
	design <- model.matrix(~ 0 + f)
	colnames(design) <- levels(f)
	fit <- lmFit(exprs, design)

	# Three pairwise contrasts. The coefficients are later looked up under the
	# names "Ct - High", "Ct - Low" and "Low - High".
	contrast.matrix <- makeContrasts(Ct - High, Ct - Low, Low - High,
	                                 levels = design)
	fit2 <- eBayes(contrasts.fit(fit, contrast.matrix))

	# Build, annotate and write the complete result table for one contrast.
	#
	# fit           eBayes fit holding the contrast coefficients.
	# coef_name     name of the contrast coefficient, e.g. "Ct - High".
	# exprs         expression data frame (row names = Probe_Id, one column per
	#               sample).
	# annot         probe annotation data frame with a Probe_Id column.
	# group_pattern regular expression selecting the sample columns of `exprs`
	#               to append to the table.
	# out_file      path of the tab-separated output file.
	#
	# Returns the merged data frame that was written. Both merges are inner
	# joins, so probes missing from `annot` are dropped.
	contrast_results <- function(fit, coef_name, exprs, annot, group_pattern,
	                             out_file) {
	  res <- topTable(fit, coef = coef_name, adjust.method = "BH",
	                  number = nrow(exprs))
	  # topTable keeps the probe IDs as row names; merge on them.
	  res <- merge(res, annot, by.x = 0, by.y = "Probe_Id")
	  names(res)[1] <- "Probe_Id"
	  group_cols <- grep(group_pattern, names(exprs))
	  res <- merge(res, exprs[, group_cols, drop = FALSE],
	               by.x = "Probe_Id", by.y = 0)
	  write.table(res, file = out_file, sep = "\t",
	              col.names = TRUE, row.names = FALSE, quote = FALSE)
	  res
	}

	# The alternation patterns use a plain "|": a backslash before "|" is not a
	# valid escape inside an R string literal and stops the file from parsing.
	# NOTE(review): the coefficient names put Ct first ("Ct - High", "Ct - Low")
	# while the file names read "<group>_vs_Ct" -- confirm the intended sign of
	# the reported logFC.

	# Get results for Ct versus High
	high_ct <- contrast_results(fit2, "Ct - High", exprs, nuIDs_annot,
	                            "Ct|High", "High_vs_Ct_results.txt")

	# Get results for Ct versus Low
	low_ct <- contrast_results(fit2, "Ct - Low", exprs, nuIDs_annot,
	                           "Ct|Low", "Low_vs_Ct_results.txt")

	# Get results for Low versus High
	low_high <- contrast_results(fit2, "Low - High", exprs, nuIDs_annot,
	                             "Low|High", "Low_vs_High_results.txt")