gauthamnair/greedyGO.Rnw

## greedyGO.Rnw
% Instructions: Save as .Rnw, run Stangle in an R session and source the resulting .R file.
% Some required libraries: data.table, GO.db, stringr

A helper function to exclude GO terms that are too small or too big, as measured by the number or genes annotated for it.
<<>>=
require(data.table)
f.GOsizefilter <- function( gotable, gosizecutoff=250, keepsmaller = TRUE ){
    # gotable: One column is Category, another is GeneSymbol.
    # One row for every Category that contains the gene
    gotable <- as.data.table(gotable)
    gofreqs <- gotable[,list(Num=length(GeneSymbol)),by='Category']
    if (keepsmaller){
    subgos <- subset( gofreqs, Num < gosizecutoff, select = 'Category')
    } else {
	subgos <- subset( gofreqs, Num > gosizecutoff, select = 'Category')
    }
    subset(gotable, gotable$Category %in% subgos$Category)
}
@

There are two possible greedy strategies for a GO breakdown of a gene list.
One option is to exclude big categories, and look to efficiently cover the gene list with as few categories as possible. The greedy method is only an approximate solutionto this problem, known as the maximum coverage problem in complexity theory. The other option, which I prefer, is to first exclude small categories, and then seek to cover the gene list with categories that are as concentrated as possible for the hits by the greedy approach.
<<>>=
f.orderbymaxcover <- function( Num, Num.univ, Num.hits ) {
    # Sort first by high number of hit genes that haven't been assigned yet,
    # then by lower number  of members in Universe
    # Lastly by number of hits irrespective of whether they've been assigned.
    order( Num, -Num.univ, Num.hits, decreasing = TRUE )
}
f.orderbyhighconc <- function( Num, Num.univ, Num.hits ) {
    # Sort first by high concentration of hit genes not yet assigned.
    # Then by size of the category (prefer larger categories of same concentration)
    # Lastly by number of hits irrespective of whether they've been assigned.
    order( Num/Num.univ, Num.univ, Num.hits, decreasing = TRUE)
}
f.greedyGO <- function( gotable, genehits, goorderFUN, gonumcutoff=20){
    # Calculate hit fraction expected from a random GO-annotated gene subset
    # gotable: two-column data.table. Category & GeneSymbol
    # genehits: vector of GeneSymbol hits.
    # goorderFUN: returns an order given vector inputs Num, Num.univ, Num.hits
    # gonumcutoff: max number of terms/categories to return. NULL for no max.
    Num.overall.univ <- length( unique(gotable$GeneSymbol ) )
    Num.hits.univ <- sum( unique(genehits) %in% gotable$GeneSymbol )
    frac.hits.rndm <- Num.hits.univ / Num.overall.univ

    # Find the total number of genes and hits in each category
    mygotable <- as.data.table(gotable)
    gofreqs <- mygotable[, list(Num.univ=length(GeneSymbol)), by='Category']

    mygotable <- subset(mygotable, mygotable$GeneSymbol %in% genehits)
    hitfreqs <- mygotable[,list(Num.hits=length(GeneSymbol)),by='Category']
    gofreqs <- merge(gofreqs, hitfreqs, by='Category', all.x=FALSE, all.y=TRUE)

    outgo <- data.frame()
    doneflag <- FALSE
    while (!doneflag){
	# calculate remaining hits in each category
	freqs <- mygotable[,list(Num=length(GeneSymbol)),by='Category']
	freqs <- merge( gofreqs, freqs, by='Category', all.x=FALSE, all.y=TRUE)
	# Sort categories by desired criteria. Ex: highest conc. of remaining hits.
	freqs <- freqs[ with(freqs,
	    goorderFUN(Num=Num, Num.univ=Num.univ, Num.hits=Num.hits)), ]
	# Keep top category and remove the hits in it from further consideration.
	outgo <- rbind( outgo, freqs[1,])
	genes.in.cat <- subset(mygotable, mygotable$Category == freqs$Category[1],
	    select='GeneSymbol',drop=TRUE)
	mygotable <- subset(mygotable,
	    !(mygotable$GeneSymbol %in% genes.in.cat$GeneSymbol))
	# Finish if no more hits left to consider or enough categories found.
	if (nrow(mygotable) == 0) doneflag <- TRUE
	if (!is.null(gonumcutoff)){
	    if (nrow(outgo) == gonumcutoff ) doneflag <- TRUE
	}
    }
    outgo <- as.data.frame(outgo)
    outgo <- outgo[,c('Category','Num','Num.hits','Num.univ')]
    # Number of hits expected for a random subset of same size as GO category.
    outgo$Num.hits.rndm <- frac.hits.rndm * outgo$Num.univ
    return( outgo )
}
@

Wrapper for the common use case of finding most concentrated GO categories after cutting off too-small categories. Also adds conveniences like an abbreviated GO term name and calculates fraction of hits.
<<>>=
require(GO.db)
f.makegreedyGOtable <- function( genehits, gotable,
	gosizecutoff = 250 , abbrlength = 40, gonumcutoff=NULL) {
	# abbrlength: Terms will be abbreviated to this max. number of characters.
	# set abbrlength=Inf for no abbreviation.
    outgo <- f.greedyGO(
	gotable = f.GOsizefilter( gotable, gosizecutoff = gosizecutoff,
	    keepsmaller = FALSE ),
	genehits = genehits,
	goorderFUN = f.orderbyhighconc,
	gonumcutoff = gonumcutoff )
    termnames <- f.abbreviate( Term(as.character(outgo$Category)), abbrlength)
    outgo <- cbind(TERM=factor(termnames, termnames ), outgo)
    outgo <- transform( outgo,
	Frac.hits.rndm = Num.hits.rndm / Num.univ,
	Frac.hits = Num.hits / Num.univ)
    return(outgo)
}
require(stringr)
f.abbreviate <- function( string, charlimit = 40 ){
    string <- ifelse( nchar(string) < charlimit, string,
	str_replace_all( string, '\\B[aeiou]{1,}', ''))
    string <- ifelse( nchar(string) < charlimit, string,
	str_replace_all( string, '([[:alpha:]]{4})[[:alpha:]]{1,}', '\\1\\.'))
    string <- ifelse( nchar(string) < charlimit, string,
	paste0( str_sub(string, 1, charlimit-3), '...'))
    return(string)
}
@

<<>>=
f.projectGO <- function( gotable, genehits, categories, allowoverlap = FALSE ) {
    gotable <- subset(gotable, gotable$GeneSymbol %in% genehits )
    if (allowoverlap) return( subset( gotable, gotable$Category %in% categories ) )

    goout <- data.frame()
    for (ctgry in categories) {
	rows.in.ctgry <- subset( gotable, gotable$Category == ctgry )
	goout <- rbind( goout, rows.in.ctgry )
	gotable <- subset( gotable,
	    !(gotable$GeneSymbol %in% rows.in.ctgry$GeneSymbol))
    }
    return(goout)
}
@
	% Instructions: Save as .Rnw, run Stangle in an R session and source the resulting .R file.
	% Some required libraries: data.table, GO.db, stringr

	A helper function to exclude GO terms that are too small or too big, as measured by the number or genes annotated for it.
	<<>>=
	require(data.table)
	f.GOsizefilter <- function( gotable, gosizecutoff=250, keepsmaller = TRUE ){
	# gotable: One column is Category, another is GeneSymbol.
	# One row for every Category that contains the gene
	gotable <- as.data.table(gotable)
	gofreqs <- gotable[,list(Num=length(GeneSymbol)),by='Category']
	if (keepsmaller){
	subgos <- subset( gofreqs, Num < gosizecutoff, select = 'Category')
	} else {
	subgos <- subset( gofreqs, Num > gosizecutoff, select = 'Category')
	}
	subset(gotable, gotable$Category %in% subgos$Category)
	}
	@

	There are two possible greedy strategies for a GO breakdown of a gene list.
	One option is to exclude big categories, and look to efficiently cover the gene list with as few categories as possible. The greedy method is only an approximate solutionto this problem, known as the maximum coverage problem in complexity theory. The other option, which I prefer, is to first exclude small categories, and then seek to cover the gene list with categories that are as concentrated as possible for the hits by the greedy approach.
	<<>>=
	f.orderbymaxcover <- function( Num, Num.univ, Num.hits ) {
	# Sort first by high number of hit genes that haven't been assigned yet,
	# then by lower number of members in Universe
	# Lastly by number of hits irrespective of whether they've been assigned.
	order( Num, -Num.univ, Num.hits, decreasing = TRUE )
	}
	f.orderbyhighconc <- function( Num, Num.univ, Num.hits ) {
	# Sort first by high concentration of hit genes not yet assigned.
	# Then by size of the category (prefer larger categories of same concentration)
	# Lastly by number of hits irrespective of whether they've been assigned.
	order( Num/Num.univ, Num.univ, Num.hits, decreasing = TRUE)
	}
	f.greedyGO <- function( gotable, genehits, goorderFUN, gonumcutoff=20){
	# Calculate hit fraction expected from a random GO-annotated gene subset
	# gotable: two-column data.table. Category & GeneSymbol
	# genehits: vector of GeneSymbol hits.
	# goorderFUN: returns an order given vector inputs Num, Num.univ, Num.hits
	# gonumcutoff: max number of terms/categories to return. NULL for no max.
	Num.overall.univ <- length( unique(gotable$GeneSymbol ) )
	Num.hits.univ <- sum( unique(genehits) %in% gotable$GeneSymbol )
	frac.hits.rndm <- Num.hits.univ / Num.overall.univ

	# Find the total number of genes and hits in each category
	mygotable <- as.data.table(gotable)
	gofreqs <- mygotable[, list(Num.univ=length(GeneSymbol)), by='Category']

	mygotable <- subset(mygotable, mygotable$GeneSymbol %in% genehits)
	hitfreqs <- mygotable[,list(Num.hits=length(GeneSymbol)),by='Category']
	gofreqs <- merge(gofreqs, hitfreqs, by='Category', all.x=FALSE, all.y=TRUE)

	outgo <- data.frame()
	doneflag <- FALSE
	while (!doneflag){
	# calculate remaining hits in each category
	freqs <- mygotable[,list(Num=length(GeneSymbol)),by='Category']
	freqs <- merge( gofreqs, freqs, by='Category', all.x=FALSE, all.y=TRUE)
	# Sort categories by desired criteria. Ex: highest conc. of remaining hits.
	freqs <- freqs[ with(freqs,
	goorderFUN(Num=Num, Num.univ=Num.univ, Num.hits=Num.hits)), ]
	# Keep top category and remove the hits in it from further consideration.
	outgo <- rbind( outgo, freqs[1,])
	genes.in.cat <- subset(mygotable, mygotable$Category == freqs$Category[1],
	select='GeneSymbol',drop=TRUE)
	mygotable <- subset(mygotable,
	!(mygotable$GeneSymbol %in% genes.in.cat$GeneSymbol))
	# Finish if no more hits left to consider or enough categories found.
	if (nrow(mygotable) == 0) doneflag <- TRUE
	if (!is.null(gonumcutoff)){
	if (nrow(outgo) == gonumcutoff ) doneflag <- TRUE
	}
	}
	outgo <- as.data.frame(outgo)
	outgo <- outgo[,c('Category','Num','Num.hits','Num.univ')]
	# Number of hits expected for a random subset of same size as GO category.
	outgo$Num.hits.rndm <- frac.hits.rndm * outgo$Num.univ
	return( outgo )
	}
	@

	Wrapper for the common use case of finding most concentrated GO categories after cutting off too-small categories. Also adds conveniences like an abbreviated GO term name and calculates fraction of hits.
	<<>>=
	require(GO.db)
	f.makegreedyGOtable <- function( genehits, gotable,
	gosizecutoff = 250 , abbrlength = 40, gonumcutoff=NULL) {
	# abbrlength: Terms will be abbreviated to this max. number of characters.
	# set abbrlength=Inf for no abbreviation.
	outgo <- f.greedyGO(
	gotable = f.GOsizefilter( gotable, gosizecutoff = gosizecutoff,
	keepsmaller = FALSE ),
	genehits = genehits,
	goorderFUN = f.orderbyhighconc,
	gonumcutoff = gonumcutoff )
	termnames <- f.abbreviate( Term(as.character(outgo$Category)), abbrlength)
	outgo <- cbind(TERM=factor(termnames, termnames ), outgo)
	outgo <- transform( outgo,
	Frac.hits.rndm = Num.hits.rndm / Num.univ,
	Frac.hits = Num.hits / Num.univ)
	return(outgo)
	}
	require(stringr)
	f.abbreviate <- function( string, charlimit = 40 ){
	string <- ifelse( nchar(string) < charlimit, string,
	str_replace_all( string, '\\B[aeiou]{1,}', ''))
	string <- ifelse( nchar(string) < charlimit, string,
	str_replace_all( string, '([[:alpha:]]{4})[[:alpha:]]{1,}', '\\1\\.'))
	string <- ifelse( nchar(string) < charlimit, string,
	paste0( str_sub(string, 1, charlimit-3), '...'))
	return(string)
	}
	@

	<<>>=
	f.projectGO <- function( gotable, genehits, categories, allowoverlap = FALSE ) {
	gotable <- subset(gotable, gotable$GeneSymbol %in% genehits )
	if (allowoverlap) return( subset( gotable, gotable$Category %in% categories ) )

	goout <- data.frame()
	for (ctgry in categories) {
	rows.in.ctgry <- subset( gotable, gotable$Category == ctgry )
	goout <- rbind( goout, rows.in.ctgry )
	gotable <- subset( gotable,
	!(gotable$GeneSymbol %in% rows.in.ctgry$GeneSymbol))
	}
	return(goout)
	}
	@