Created
October 26, 2015 12:12
-
-
Save jakeyeung/e707b1967a11a45d4cc8 to your computer and use it in GitHub Desktop.
Analyze gene enrichment
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# AnalyzeGeneEnrichment.R | |
# using topGO | |
# 25 Feb 2014 | |
# | |
# if necessary: | |
# source("http://bioconductor.org/biocLite.R") | |
# biocLite("topGO") | |
# source("http://bioconductor.org/biocLite.R") | |
# biocLite("org.Mm.eg.db") | |
library(topGO) | |
library(org.Mm.eg.db) | |
CreateSym2Entrez <- function(){ | |
sym2entrez <- as.list(org.Mm.egALIAS2EG) | |
sym2entrez <- sym2entrez[!is.na(sym2entrez)] | |
return(sym2entrez) | |
} | |
CreateEntrez2GO <- function(){ | |
entrez2GO.obj <- org.Mm.egGO | |
mapped.genes <- mappedkeys(entrez2GO.obj) | |
entrez2GO.full <- as.list(entrez2GO.obj[mapped.genes]) | |
# collapse sublist of GO-IDs to vector | |
entrez2GO <- lapply(entrez2GO.full, function(x){ | |
return(names(x)) # lapply is nice | |
}) | |
return(entrez2GO) | |
} | |
ConvertSym2Entrez <- function(gene.list, sym2entrez){ | |
# convert to entrez, auto filtering here | |
gene.list.entrez <- sapply(gene.list, function(g){ | |
entrez <- sym2entrez[[g]] | |
if (!is.null(entrez)){ | |
return(entrez) | |
} | |
}, simplify = TRUE) | |
gene.list.entrez <- gene.list.entrez[!sapply(gene.list.entrez, is.null)] | |
return(unlist(gene.list.entrez)) | |
} | |
AnalyzeGeneEnrichment <- function(genes.bg, genes.hit, | |
sym2entrez, | |
entrez2GO, | |
convert.sym.to.entrez = TRUE, | |
which.ontology = "BP", | |
write.path = FALSE, | |
node.size = 5, | |
FDR.cutoff = 0.05){ | |
# Analyze gene enrichment given background and hit genes. | |
# | |
# INPUT: | |
# genes.bg: vector of background genes | |
# genes.hit: vector of hit genes | |
# sym2entrez: Provide a mapping from gene symbol to entrez, see example via CreateSym2Entrez(). If missing, creates it via CreateSym2Entrez() | |
# entrez2GO: mapping from entrez to GO terms, example CreateEntrez2GO(). If missing, creates it via CreateEntrez2Go() | |
# convert.sym.to.entrez: convert gene symbols to entrez ID. Set to FALSE if genes already in entrez ID | |
# write.path: path to write topGO results object. If FALSE, does not write to file | |
# which.ontology: "BP", "MF", "CC" | |
# node.size: prune GO hierarchy from terms which have less than node.size genes | |
# FDR.cutoff: Fisher's exact test FDR-adjusted pvals cutoff to be significant | |
# write.path: if false jsut return object, if true also write to file given by write.path | |
# | |
# OUTPUT: | |
# all.res: topGO results object | |
# | |
if (missing(sym2entrez)){ | |
sym2entrez <- CreateSym2Entrez() | |
} | |
if (missing(entrez2GO)){ | |
entrez2GO <- CreateEntrez2GO() | |
} | |
if (convert.sym.to.entrez){ | |
genes.bg <- ConvertSym2Entrez(genes.bg, sym2entrez) | |
genes.hit <- ConvertSym2Entrez(genes.hit, sym2entrez) | |
} | |
# Select genes in bg that are in hit, binary matrix. | |
# used in topGO new() function | |
sel.genes <- factor(as.integer(genes.bg %in% genes.hit)) | |
names(sel.genes) <- genes.bg | |
# Get topGO object | |
GOdata <- new("topGOdata", | |
ontology = which.ontology, | |
allGenes = sel.genes, | |
nodeSize = node.size, | |
annot = annFUN.gene2GO, | |
gene2GO = entrez2GO) | |
# Run enrichment ---------------------------------------------------------- | |
result.fisher <- runTest(GOdata, algorithm = "classic", statistic = "fisher") | |
n.tests <- length(score(result.fisher)) | |
# Look at results --------------------------------------------------------- | |
all.res <- GenTable(GOdata, | |
classicFisher = result.fisher, | |
orderBy = "classicFisher", | |
ranksOf = "classicFisher", | |
topNodes = n.tests) | |
# adjust pvalues | |
all.res$FDRadj <- p.adjust(all.res$classicFisher, method = "BH") | |
# Filter table by pvalue -------------------------------------------------- | |
all.res <- all.res[all.res$FDRadj <= FDR.cutoff, ] | |
# Add all genes that were considered (can recapitulate contTable) --------- | |
N.genes <- length(genes(GOdata)) | |
all.res$N.genes <- N.genes | |
# Write to file ----------------------------------------------------------- | |
if (write.path != FALSE){ | |
write.table(all.res, file = write.path, | |
quote = FALSE, | |
sep = "\t", | |
row.names = FALSE, | |
col.names = TRUE) | |
} | |
return(all.res) | |
} | |
EnrichmentBinnedToFile <- function(sorted.hits, bin.vector, fname.base, sym2entrez, entrez2GO){ | |
# Take sorted hits and take top bin.vector[i] genes for enrichment | |
# to see how enrichment of certain processes evolve over time | |
# | |
# sorted.hits: list of genes sorted by "relevance" | |
# bin.vector: vector of integers to bin genes (take top 10 genes, then top 20 ... etc ) | |
# fname.base: full path of filename, we will add _i_Ontology.GOtop to end of file | |
for (i in bin.vector){ | |
genes.hit <- sorted.hits[1:i] | |
for (onto in c("BP", "MF", "CC")){ | |
fname.out <- paste0(fname.base, '_', i, '_', onto, '.GOtop') | |
res <- AnalyzeGeneEnrichment(sorted.hits, genes.hit, | |
sym2entrez, entrez2GO, | |
convert.sym.to.entrez = TRUE, | |
which.ontology = onto, | |
write.path = fname.out, | |
node.size = 5, | |
FDR.cutoff = 0.05) | |
} | |
} | |
} | |
GetGoObject <- function(genes.hit, genes.bg, onto, node.size, sym2entrez, entrez2GO){ | |
# Run GO | |
} | |
GetEnrichment <- function(sorted.hits, bin, onto, sym2entrez, entrez2GO){ | |
# Same as getEnrichmentOverBins but for just one bin, that way we can parallelize it | |
genes.hit <- sorted.hits[1:bin] | |
res <- AnalyzeGeneEnrichment(sorted.hits, genes.hit, | |
sym2entrez, entrez2GO, | |
convert.sym.to.entrez = TRUE, | |
which.ontology = onto, | |
write.path = FALSE, | |
node.size = 5, | |
FDR.cutoff = 1) | |
return(res) | |
} | |
GetEnrichmentParallel <- function(sorted.hits, bin.vector, onto, sym2entrez, entrez2GO, n.cores = 4){ | |
# Run GetEnrichment in parallel over all bins in bin.vector | |
library(parallel) | |
print("Running GO (~4 minutes)") | |
start <- Sys.time() | |
res.split <- mclapply(bin.vector, function(bin){ | |
genes.hit <- sorted.hits[1:bin] | |
res <- AnalyzeGeneEnrichment(sorted.hits, genes.hit, | |
sym2entrez, entrez2GO, | |
convert.sym.to.entrez = TRUE, | |
which.ontology = onto, | |
write.path = FALSE, | |
node.size = 5, | |
FDR.cutoff = 1) | |
res$bin <- bin | |
return(res) | |
}, mc.cores = n.cores) | |
print(Sys.time() - start) | |
res.all <- do.call(rbind, res.split) | |
return(res.all) | |
} | |
GetEnrichmentOverBins <- function(sorted.hits, bin.vector, onto, go.terms, sym2entrez, entrez2GO){ | |
# Iterate over bins and find enrichment for a specific go term | |
# track this enrichment as we sweep across the bins | |
# init output matrix: 8 columns ("GO.ID", "Term", "Annotated", "Significant", "Expected", "classicFisher", "FDRadj", "N.genes", "bin") | |
res.out <- matrix(NA, nrow = length(bin.vector) * length(go.terms), ncol = 9) | |
colnames(res.out) <- c("GO.ID", "Term", "Annotated", "Significant", "Expected", "classicFisher", "FDRadj", "N.genes", "bin") | |
rowcount <- 1 | |
for (i in 1:length(bin.vector)){ | |
n.genes <- bin.vector[i] | |
genes.hit <- sorted.hits[1:n.genes] | |
res <- AnalyzeGeneEnrichment(sorted.hits, genes.hit, | |
sym2entrez, entrez2GO, | |
convert.sym.to.entrez = TRUE, | |
which.ontology = onto, | |
write.path = FALSE, | |
node.size = 5, | |
FDR.cutoff = 1) | |
res.sub <- subset(res, Term %in% go.terms) | |
res.sub$bin <- n.genes | |
# make into matrix, makes life easier later | |
res.sub <- as.matrix(res.sub) | |
res.sub[, 3:ncol(res.sub)] <- as.numeric(res.sub[, 3:ncol(res.sub)]) | |
rowstart <- rowcount | |
rowend <- rowstart + length(go.terms) - 1 | |
res.out[rowstart:rowend, ] <- res.sub | |
rowcount <- rowcount + length(go.terms) | |
} | |
# # make to dataframe and unfactor things | |
# res.out <- data.frame(noquote(res.out)) | |
# for (cname in c("Annotated", "Significant", "Expected", "classicFisher", "FDRadj")){ | |
# res.out[[cname]] <- as.numeric(res.out[[cname]]) | |
# } | |
# res.out <- data.frame(res.out) | |
# for (cname in c("Annotated", "Significant", "Expected", "classicFisher", "FDRadj", "bin")){ | |
# res.out[[cname]] <- as.numeric(as.character(res.out[[cname]])) | |
# } | |
return(res.out) | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment