Skip to content

Instantly share code, notes, and snippets.

@timedreamer
Created December 14, 2022 19:56
Show Gist options
  • Save timedreamer/ba8e8243bb1c9e264a737ce1fcc53c98 to your computer and use it in GitHub Desktop.
Save timedreamer/ba8e8243bb1c9e264a737ce1fcc53c98 to your computer and use it in GitHub Desktop.
Use topGO package for GO analysis in R.
# 0. Prep -----------------------------------------------------------------
library(tidyverse)
library(here)
# 1. Load and clean EggNOG output -----------------------------------------
## The output was downloaded from the eggnog-mapper online run.
eggnog_go <- read_tsv(here("data", "misc", "out.emapper.annotations.gz"),
skip = 4) %>%
dplyr::rename(query = "#query")
eggnog_go_clean <- eggnog_go %>% dplyr::select(query, GOs)
eggnog_go_clean <- eggnog_go_clean %>% filter(GOs != "-") %>%
filter(grepl(pattern = "^LOC", .$query))
eggnog_go_clean <- eggnog_go_clean %>%
separate(col = query, sep = "\\.", into = c("geneID", "rm1")) %>%
dplyr::select(-rm1)
write_tsv(eggnog_go_clean,
here("data", "misc", "EggNOG_GOv1.tsv"), col_names = FALSE)
# 2. Run GO enrichment analysis -------------------------------------------
library(topGO)
## 2.1 Prep geneID to GO table
geneID2GO <- readMappings(file = here("data", "misc", "EggNOG_GOv1.tsv"))
str(head(geneID2GO))
geneNames <- names(geneID2GO)
head(geneNames)
length(geneID2GO) # 14036 genes have at least one GO term
pdata <- tibble(GO_number_each_gene = lengths(geneID2GO),
geneid = names(geneID2GO),
group = "gene")
pdata %>% ggplot(., aes(x= group, y = GO_number_each_gene)) +
geom_boxplot() +
scale_y_log10(breaks = trans_breaks("log10", function(x) 10^x),
labels = trans_format("log10", math_format(10^.x))) +
cowplot::theme_cowplot() +
theme(axis.text.x=element_blank(), #remove x axis labels
axis.ticks.x=element_blank(), #remove x axis ticks
) +
xlab("Gene")
## 2.2 Build the topGOdata object
myInterestingGenes <- sample(geneNames, length(geneNames) / 10)
geneList <- factor(as.integer(geneNames %in% myInterestingGenes))
names(geneList) <- geneNames
sampleGOdata <- new("topGOdata",
description = "Simple test", ontology = "BP",
allGenes = geneList,
# geneSel = topDiffGenes, # use this for GSEA
nodeSize = 5,
annot = annFUN.gene2GO,
gene2GO = geneID2GO)
sampleGOdata
## 2.3 Run test
resultFisher <- runTest(sampleGOdata, algorithm = "classic", statistic = "fisher")
# resultKS <- runTest(sampleGOdata, algorithm = "classic", statistic = "ks")
# resultKS.elim <- runTest(sampleGOdata, algorithm = "elim", statistic = "ks")
## 2.4 Get result table and plot
allRes <- GenTable(sampleGOdata, classicFisher = resultFisher,
# classicKS = resultKS, elimKS = resultKS.elim,
ranksOf = "classicFisher", topNodes = 10)
head(allRes)
showSigOfNodes(sampleGOdata, score(resultFisher), firstSigNodes = 2,
useInfo = "all")
@azmigueldario
Copy link

Really helpful to organize any EggNOG results. I produced a plot with the top GO from a multifasta.fa file of CDS.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment