Skip to content

Instantly share code, notes, and snippets.

@jakeyeung
Created October 26, 2015 12:12
Show Gist options
  • Save jakeyeung/e707b1967a11a45d4cc8 to your computer and use it in GitHub Desktop.
Save jakeyeung/e707b1967a11a45d4cc8 to your computer and use it in GitHub Desktop.
Analyze gene enrichment
# AnalyzeGeneEnrichment.R
# using topGO
# 25 Feb 2014
#
# if necessary:
# source("http://bioconductor.org/biocLite.R")
# biocLite("topGO")
# source("http://bioconductor.org/biocLite.R")
# biocLite("org.Mm.eg.db")
library(topGO)
library(org.Mm.eg.db)
CreateSym2Entrez <- function(){
sym2entrez <- as.list(org.Mm.egALIAS2EG)
sym2entrez <- sym2entrez[!is.na(sym2entrez)]
return(sym2entrez)
}
CreateEntrez2GO <- function(){
entrez2GO.obj <- org.Mm.egGO
mapped.genes <- mappedkeys(entrez2GO.obj)
entrez2GO.full <- as.list(entrez2GO.obj[mapped.genes])
# collapse sublist of GO-IDs to vector
entrez2GO <- lapply(entrez2GO.full, function(x){
return(names(x)) # lapply is nice
})
return(entrez2GO)
}
ConvertSym2Entrez <- function(gene.list, sym2entrez){
# convert to entrez, auto filtering here
gene.list.entrez <- sapply(gene.list, function(g){
entrez <- sym2entrez[[g]]
if (!is.null(entrez)){
return(entrez)
}
}, simplify = TRUE)
gene.list.entrez <- gene.list.entrez[!sapply(gene.list.entrez, is.null)]
return(unlist(gene.list.entrez))
}
AnalyzeGeneEnrichment <- function(genes.bg, genes.hit,
sym2entrez,
entrez2GO,
convert.sym.to.entrez = TRUE,
which.ontology = "BP",
write.path = FALSE,
node.size = 5,
FDR.cutoff = 0.05){
# Analyze gene enrichment given background and hit genes.
#
# INPUT:
# genes.bg: vector of background genes
# genes.hit: vector of hit genes
# sym2entrez: Provide a mapping from gene symbol to entrez, see example via CreateSym2Entrez(). If missing, creates it via CreateSym2Entrez()
# entrez2GO: mapping from entrez to GO terms, example CreateEntrez2GO(). If missing, creates it via CreateEntrez2Go()
# convert.sym.to.entrez: convert gene symbols to entrez ID. Set to FALSE if genes already in entrez ID
# write.path: path to write topGO results object. If FALSE, does not write to file
# which.ontology: "BP", "MF", "CC"
# node.size: prune GO hierarchy from terms which have less than node.size genes
# FDR.cutoff: Fisher's exact test FDR-adjusted pvals cutoff to be significant
# write.path: if false jsut return object, if true also write to file given by write.path
#
# OUTPUT:
# all.res: topGO results object
#
if (missing(sym2entrez)){
sym2entrez <- CreateSym2Entrez()
}
if (missing(entrez2GO)){
entrez2GO <- CreateEntrez2GO()
}
if (convert.sym.to.entrez){
genes.bg <- ConvertSym2Entrez(genes.bg, sym2entrez)
genes.hit <- ConvertSym2Entrez(genes.hit, sym2entrez)
}
# Select genes in bg that are in hit, binary matrix.
# used in topGO new() function
sel.genes <- factor(as.integer(genes.bg %in% genes.hit))
names(sel.genes) <- genes.bg
# Get topGO object
GOdata <- new("topGOdata",
ontology = which.ontology,
allGenes = sel.genes,
nodeSize = node.size,
annot = annFUN.gene2GO,
gene2GO = entrez2GO)
# Run enrichment ----------------------------------------------------------
result.fisher <- runTest(GOdata, algorithm = "classic", statistic = "fisher")
n.tests <- length(score(result.fisher))
# Look at results ---------------------------------------------------------
all.res <- GenTable(GOdata,
classicFisher = result.fisher,
orderBy = "classicFisher",
ranksOf = "classicFisher",
topNodes = n.tests)
# adjust pvalues
all.res$FDRadj <- p.adjust(all.res$classicFisher, method = "BH")
# Filter table by pvalue --------------------------------------------------
all.res <- all.res[all.res$FDRadj <= FDR.cutoff, ]
# Add all genes that were considered (can recapitulate contTable) ---------
N.genes <- length(genes(GOdata))
all.res$N.genes <- N.genes
# Write to file -----------------------------------------------------------
if (write.path != FALSE){
write.table(all.res, file = write.path,
quote = FALSE,
sep = "\t",
row.names = FALSE,
col.names = TRUE)
}
return(all.res)
}
EnrichmentBinnedToFile <- function(sorted.hits, bin.vector, fname.base, sym2entrez, entrez2GO){
# Take sorted hits and take top bin.vector[i] genes for enrichment
# to see how enrichment of certain processes evolve over time
#
# sorted.hits: list of genes sorted by "relevance"
# bin.vector: vector of integers to bin genes (take top 10 genes, then top 20 ... etc )
# fname.base: full path of filename, we will add _i_Ontology.GOtop to end of file
for (i in bin.vector){
genes.hit <- sorted.hits[1:i]
for (onto in c("BP", "MF", "CC")){
fname.out <- paste0(fname.base, '_', i, '_', onto, '.GOtop')
res <- AnalyzeGeneEnrichment(sorted.hits, genes.hit,
sym2entrez, entrez2GO,
convert.sym.to.entrez = TRUE,
which.ontology = onto,
write.path = fname.out,
node.size = 5,
FDR.cutoff = 0.05)
}
}
}
GetGoObject <- function(genes.hit, genes.bg, onto, node.size, sym2entrez, entrez2GO){
# Run GO
}
GetEnrichment <- function(sorted.hits, bin, onto, sym2entrez, entrez2GO){
# Same as getEnrichmentOverBins but for just one bin, that way we can parallelize it
genes.hit <- sorted.hits[1:bin]
res <- AnalyzeGeneEnrichment(sorted.hits, genes.hit,
sym2entrez, entrez2GO,
convert.sym.to.entrez = TRUE,
which.ontology = onto,
write.path = FALSE,
node.size = 5,
FDR.cutoff = 1)
return(res)
}
GetEnrichmentParallel <- function(sorted.hits, bin.vector, onto, sym2entrez, entrez2GO, n.cores = 4){
# Run GetEnrichment in parallel over all bins in bin.vector
library(parallel)
print("Running GO (~4 minutes)")
start <- Sys.time()
res.split <- mclapply(bin.vector, function(bin){
genes.hit <- sorted.hits[1:bin]
res <- AnalyzeGeneEnrichment(sorted.hits, genes.hit,
sym2entrez, entrez2GO,
convert.sym.to.entrez = TRUE,
which.ontology = onto,
write.path = FALSE,
node.size = 5,
FDR.cutoff = 1)
res$bin <- bin
return(res)
}, mc.cores = n.cores)
print(Sys.time() - start)
res.all <- do.call(rbind, res.split)
return(res.all)
}
GetEnrichmentOverBins <- function(sorted.hits, bin.vector, onto, go.terms, sym2entrez, entrez2GO){
# Iterate over bins and find enrichment for a specific go term
# track this enrichment as we sweep across the bins
# init output matrix: 8 columns ("GO.ID", "Term", "Annotated", "Significant", "Expected", "classicFisher", "FDRadj", "N.genes", "bin")
res.out <- matrix(NA, nrow = length(bin.vector) * length(go.terms), ncol = 9)
colnames(res.out) <- c("GO.ID", "Term", "Annotated", "Significant", "Expected", "classicFisher", "FDRadj", "N.genes", "bin")
rowcount <- 1
for (i in 1:length(bin.vector)){
n.genes <- bin.vector[i]
genes.hit <- sorted.hits[1:n.genes]
res <- AnalyzeGeneEnrichment(sorted.hits, genes.hit,
sym2entrez, entrez2GO,
convert.sym.to.entrez = TRUE,
which.ontology = onto,
write.path = FALSE,
node.size = 5,
FDR.cutoff = 1)
res.sub <- subset(res, Term %in% go.terms)
res.sub$bin <- n.genes
# make into matrix, makes life easier later
res.sub <- as.matrix(res.sub)
res.sub[, 3:ncol(res.sub)] <- as.numeric(res.sub[, 3:ncol(res.sub)])
rowstart <- rowcount
rowend <- rowstart + length(go.terms) - 1
res.out[rowstart:rowend, ] <- res.sub
rowcount <- rowcount + length(go.terms)
}
# # make to dataframe and unfactor things
# res.out <- data.frame(noquote(res.out))
# for (cname in c("Annotated", "Significant", "Expected", "classicFisher", "FDRadj")){
# res.out[[cname]] <- as.numeric(res.out[[cname]])
# }
# res.out <- data.frame(res.out)
# for (cname in c("Annotated", "Significant", "Expected", "classicFisher", "FDRadj", "bin")){
# res.out[[cname]] <- as.numeric(as.character(res.out[[cname]]))
# }
return(res.out)
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment