Skip to content

Instantly share code, notes, and snippets.

@massyah
Last active December 19, 2015 22:48
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save massyah/6029531 to your computer and use it in GitHub Desktop.
Save massyah/6029531 to your computer and use it in GitHub Desktop.
R code to annotate genes (specified by their official symbol) with GO terms, as well as example usage.
source("go_annotations.R")
# Get the GO BP/MF/CC terms ids associated with STAT3
gene2gos("STAT3")
# Get the GO BP/MF/CC terms description associated with STAT3
go2terms(gene2gos("STAT3"))
# union of go terms for a list of genes
go2terms(gene2gos(c("STAT3","STAT2")))
# To select go terms of interest, either go to AMIGO, or build a mapping from terms to GO ID
# This is very slow, has to be done once
go.terms <- as.data.frame(GOTERM)
unique(go.terms[grep("DNA binding",go.terms$Term),c("go_id","Term")])
# Once we know the ID of genes of interest, let's annotate a list of genes with them
# We consider a gene to be associated with a GO term if it's associated with it or with any of its descendants
# For the exemple, we chose:
# DNA binding corresponds to GO:0003677,
# Protein binding to GO:0005515
# Insuling binding to GO:0043559
annotate.genes.with.gos(c("STAT3","STAT2","IGF1R","EGFR"),c("GO:0003677","GO:0005515","GO:0043559"),with.descendants=T)
# Let's display a wordle to get specific annotations (Can take several seconds and spit warnings for lengthy go terms)
gocloud.for.genes("STAT3")
# Go terms annotation ------
require(GO.db)
# For word cloud generation (can be commented if wordclouds are not used)
library(tm)
library(wordcloud)
# TODO: generalize so that we are specie agnostic. Requires to pass a param to the package?
# Uncomment for Mouse DB
# require(org.Mm.eg.db) # generalize
# annotation.db<-org.Mm.eg
# annotation.dbSYMBOL<-org.Mm.egSYMBOL
# annotation.egGO<-org.Mm.egGO
# Uncomment for Human DB
require(org.Hs.eg.db)
annotation.db<-org.Hs.eg
annotation.dbSYMBOL<-org.Hs.egSYMBOL
annotation.egGO<-org.Hs.egGO
gene2ez=revmap(annotation.dbSYMBOL)
gene2go<- function(gene){
if(!exists(gene,gene2ez)) return(NULL)
ez=get(gene,gene2ez)
gos=get(ez,annotation.egGO)
if(length(gos)==0 || is.na(gos)) return(c())
goids=lapply(gos,"[[","GOID")
sapply(lapply(goids, function(z) get(z, GOTERM)), "Term")
}
geneWithGo<- function(go){
if(!exists(go,revmap(annotation.egGO))) return(c())
ez.genes=get(go,revmap(annotation.egGO))
unlist(mget(ez.genes,annotation.dbSYMBOL))
}
geneHasAnnotation<- function(gene,term){
genes=unlist(strsplit(gene,"+",fixed=T))
for(gene in genes){
terms=gene2go(gene)
if(length(grep(term,terms))>0) return(T)
}
return(F)
}
geneHasAnnotations<- function(gene,terms=c()){
# If transcript info present, remove it
gene=strsplit(gene," ",fixed=T)[[1]][1]
genes=unlist(strsplit(gene,"+",fixed=T))
res=list()
for(term in terms){
res[term]=F
}
for(gene in genes){
gene.terms=gene2go(gene)
for(term in terms){
if(length(grep(term,gene.terms))>0){
res[term]=T
}
}
}
return(as.data.frame(res,row.names=c(gene)))
}
# go.descendants <- function(go){
# visited=c()
# to.visit=c(go)
# while(length(to.visit)>0 && !is.na(to.visit)){
# visited=union(visited,to.visit)
# children<-unique(unlist(mget(to.visit,GOBPCHILDREN)))
# children<-children[complete.cases(children)]
# to.visit=setdiff(children,visited)
# }
# return(visited)
# }
# More efficient
go.descendants <- function(go,onto=c("BP","MF","CC")){
onto=match.arg(onto)
if(onto=="BP"){
onto=GOBPOFFSPRING
}else if(onto=="MF"){
onto=GOMFOFFSPRING
}else if(onto=="CC"){
onto=GOCCOFFSPRING
}
children<-unique(unlist(mget(go,onto,ifnotfound=NA)))
children=union(children,go)
return(children[complete.cases(children)])
}
go2terms <- function(gos){
sapply(lapply(gos, function(z) get(z, GOTERM)), "Term")
# Maybe better?
# unlist(sapply(mget(go.descendants("GO:0072539"),GOTERM),function(z) Term(z)))
}
go2genes <- function(go,with.descendants=FALSE){
genes=c()
if(with.descendants){
#Check if recursive
goids<-go.descendants(go)
}else{
goids<-c(go)
}
ezgenes<-unique(unlist(mget(goids,revmap(annotation.egGO),ifnotfound=NA)))
ezgenes<-ezgenes[!is.na(ezgenes)]
symbols<-unique(unlist(mget(ezgenes,annotation.dbSYMBOL)))
return(symbols)
}
gene2gos <- function(gene,flatten=T){
ezgenes<-unique(unlist(mget(gene,gene2ez,ifnotfound=NA)))
ezgenes<-ezgenes[!is.na(ezgenes)]
if(flatten){
gos<-unlist(mget(ezgenes,annotation.egGO),recursive=F,use.names=F)
goids=unique(sapply(gos,"[[","GOID"))
return(goids)
}else{
res<-melt(mget(ezgenes,annotation.egGO))
res<-res[res$"L3"=="GOID",]
res["gene"]=unlist(mget(res$"L1",annotation.dbSYMBOL))
res["term"]=go2terms(as.character(res$L2))
res<-res[,c("L1","gene","L2","term")]
colnames(res)<-c("ezgene","gene","GO","term")
rownames(res)<-NULL
return(unique(res))
}
}
annotate.genes.with.gos<- function(genes,gos,with.descendants=F,use.descendants=F,go.terms=F){
## Value
## Returns a logical matrix of genes * terms
## If with.descendants, then the set of go terms is expanded to include the descendants
## If use.descendants, then genes are associated with a go term g if they are associated with any of its descendants
#get the genes of all the go terms
if(with.descendants){
gos<-go.descendants(gos)
}
if(use.descendants){
geneByGo<-sapply(gos,function(go) go2genes(go,with.descendants=T))
}else{
ezgenes=mget(gos,revmap(annotation.egGO),ifnotfound=NA)
geneByGo<-lapply(ezgenes,function (z){
if(length(z)==1 && is.na(z)) return(c())
unique(unlist(mget(z,annotation.dbSYMBOL)))
})
}
genes.annot<-t(unlist(sapply(genes,function(gene){
unlist(lapply(geneByGo,function(golist){
gene<-strsplit(gene,"_",fixed=T)[[1]][1]
length(intersect(unlist(strsplit(gene,"+",fixed=T)),golist))
}))
})))
if(go.terms){
colnames(genes.annot)<-go2terms(colnames(genes.annot))
}
return(genes.annot)
}
# go2terms(gene2gos(c("Stat3")))
gotable.for.genes <- function(genes){
some.gos<-gene2gos(genes)
some.gos.terms<-go2terms(some.gos)
some.gos.lengths<-unlist(lapply(some.gos,function(go) length(go2genes(go,with.descendants=T))))
this.gos<-data.table(gos=some.gos,terms=some.gos.terms,lengths=some.gos.lengths)
max.freq=max(this.gos[,lengths])
max.freq=9266
this.gos[,specificity:=min(c(round(max.freq/lengths),1000)),by=gos]
return(this.gos)
}
gocloud.for.genes <- function(genes,go.table=NULL,rot.per=0.15,scale=c(1.3,.5)){
if(is.null(go.table)){
go.table<-gotable.for.genes(genes)
}
#go.table[,specificity:=min(c(round(max.freq/lengths),100)),by=gos]
go.table[,terms:=gsub("positive","pos.",terms,ignore.case=T,fixed=F)]
go.table[,terms:=gsub("negative","neg.",terms,ignore.case=T,fixed=F)]
go.table[,terms:=gsub("protein","prot.",terms,ignore.case=T,fixed=F)]
go.table[,terms:=gsub("kinase","kin.",terms,ignore.case=T,fixed=F)]
go.table[,terms:=gsub("signaling","signal.",terms,ignore.case=T,fixed=F)]
go.table[,terms:=gsub("cellular","c.",terms,ignore.case=T,fixed=F)]
go.table[,terms:=gsub("cell","c.",terms,ignore.case=T,fixed=F)]
go.table[,terms:=gsub("reactive oxygen species","ROS",terms,ignore.case=T,fixed=F)]
go.table[,terms:=gsub("metabolic","metab.",terms,ignore.case=T,fixed=F)]
go.table[,terms:=gsub("metabolism","metab.",terms,ignore.case=T,fixed=F)]
go.table[,terms:=gsub("process","proc.",terms,ignore.case=T,fixed=F)]
go.table[,terms:=gsub("oxygen","O2",terms,ignore.case=T,fixed=F)]
go.table[,terms:=gsub("production","product.",terms,ignore.case=T,fixed=F)]
go.table[,terms:=gsub("regulation","reg.",terms,ignore.case=T,fixed=F)]
go.table[,terms:=gsub("activity","actvty.",terms,ignore.case=T,fixed=F)]
go.table[,terms:=gsub("response","resp.",terms,ignore.case=T,fixed=F)]
go.table[,terms:=gsub("electron","e-",terms,ignore.case=T,fixed=F)]
go.table[,terms:=gsub("endoplasmic","endop.",terms,ignore.case=T,fixed=F)]
go.table[,terms:=gsub("transcription factor","TF.",terms,ignore.case=T,fixed=F)]
go.table[,terms:=gsub("transcription","transcr.",terms,ignore.case=T,fixed=F)]
go.table[,terms:=gsub("retinoic acid","RA",terms,ignore.case=T,fixed=F)]
go.table[,terms:=gsub("interleukin","IL",terms,ignore.case=T,fixed=F)]
go.table[,terms:=gsub("interferon","IFN",terms,ignore.case=T,fixed=F)]
go.table[,terms:=gsub("tumor necrosis factor","TNF",terms,ignore.case=T,fixed=F)]
go.table[,terms:=gsub("aryl hydrocarbon receptor","AHR",terms,ignore.case=T,fixed=F)]
go.table[,terms:=gsub("complex","cplx.",terms,ignore.case=T,fixed=F)]
go.table[,terms:=gsub("proliferation","prolif.",terms,ignore.case=T,fixed=F)]
go.table[,terms:=gsub("nuclear","nucl.",terms,ignore.case=T,fixed=F)]
go.table[,terms:=gsub("response","resp.",terms,ignore.case=T,fixed=F)]
go.table<-go.table[order(specificity,decreasing=T)]
pal2 <- brewer.pal(8,"Dark2")
# Shorten some GO terms
wordcloud(go.table$terms,go.table$specificity, scale=scale,max.words=50, random.color=F,random.order=FALSE, rot.per=rot.per,colors=pal2,min.freq=10)
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment