skurscheid/cluster_profiler_GO_SK_edits.R

## cluster_profiler_GO_SK_edits.R
library(clusterProfiler)
library(magrittr)
library(tidyverse)
library(qusage)
library(msigdbr)
library(biomaRt)
library(DOSE)
library(org.Mm.eg.db)
library(enrichplot)
library(data.table)

#reading in data
## assume that 1st column is ID
## 2nd column is fold change
data1 <- read.csv("wt_mal_ercc_results.csv", header = TRUE)
setDT(data1) # just for better readibility when printing

#removing duplicates (required for GSEA analysis)
data1_new = data1 %>% distinct(X, .keep_all = TRUE)
setDT(data1_new)

# remove all the rows which contain no data
data1_new <- data1_new[!is.na(data1_new$log2FoldChange)]

#entrez gene id
# not really needed here as you can use Ensembl ID for mapping to GO terms
mart <- useDataset("mmusculus_gene_ensembl", useMart("ensembl"))
bm <- getBM(attributes= c("ensembl_gene_id", "entrezgene_id"), mart= mart)
d1_new <- inner_join(data1_new, bm, by=c("X"="ensembl_gene_id"))

## feature 1: numeric vector
geneList1 <- d1_new[,3]

## feature 2: named vector
# I have changed this to use the Ensembl ID instead as I noticed lots of missing Entrez IDs
names(geneList1) <- as.character(d1_new[,1])

## feature 3: decreasing order
geneList1 <- sort(geneList1, decreasing = TRUE)

#define fold change greater than 2 as DEG
gene1 <- names(geneList1)[abs(geneList1) > 2]

# if you d
gene1 <- gene1[!is.na(gene1)]

#over-representation analysis
ego_bp1 <- enrichGO(gene      = gene1,
                    universe      = names(geneList1),
                    OrgDb         = org.Mm.eg.db,
                    keyType = 'ENSEMBL',
                    ont           = "ALL",
                    pAdjustMethod = "BH",
                    pvalueCutoff  = 0.01,
                    qvalueCutoff  = 0.05,
                    readable      = TRUE)
head(ego_bp1)

#GSEA analysis
ego_gse1 <- gseGO(geneList     = geneList1,
                  OrgDb        = org.Mm.eg.db,
                  ont          = "BP", # changed this for testing purposes
                  keyType      = "ENSEMBL",
                  nPerm        = 1000,
                  minGSSize    = 100,
                  maxGSSize    = 500,
                  pvalueCutoff = 0.1,
                  verbose      = FALSE)
head(ego_gse1, 10)

# inspect the whole range of qvalues
ego_gse1@result$qvalues
# there are indeed tests that do have a higher qvalue
# I recall that there is an issue with BH adjustment, as it is not continous (Maurits might be better informed!)
# which could lead to the effects we are seeing here

dotplot(ego_gse1)
	library(clusterProfiler)
	library(magrittr)
	library(tidyverse)
	library(qusage)
	library(msigdbr)
	library(biomaRt)
	library(DOSE)
	library(org.Mm.eg.db)
	library(enrichplot)
	library(data.table)

	#reading in data
	## assume that 1st column is ID
	## 2nd column is fold change
	data1 <- read.csv("wt_mal_ercc_results.csv", header = TRUE)
	setDT(data1) # just for better readibility when printing

	#removing duplicates (required for GSEA analysis)
	data1_new = data1 %>% distinct(X, .keep_all = TRUE)
	setDT(data1_new)

	# remove all the rows which contain no data
	data1_new <- data1_new[!is.na(data1_new$log2FoldChange)]

	#entrez gene id
	# not really needed here as you can use Ensembl ID for mapping to GO terms
	mart <- useDataset("mmusculus_gene_ensembl", useMart("ensembl"))
	bm <- getBM(attributes= c("ensembl_gene_id", "entrezgene_id"), mart= mart)
	d1_new <- inner_join(data1_new, bm, by=c("X"="ensembl_gene_id"))

	## feature 1: numeric vector
	geneList1 <- d1_new[,3]

	## feature 2: named vector
	# I have changed this to use the Ensembl ID instead as I noticed lots of missing Entrez IDs
	names(geneList1) <- as.character(d1_new[,1])

	## feature 3: decreasing order
	geneList1 <- sort(geneList1, decreasing = TRUE)

	#define fold change greater than 2 as DEG
	gene1 <- names(geneList1)[abs(geneList1) > 2]

	# if you d
	gene1 <- gene1[!is.na(gene1)]

	#over-representation analysis
	ego_bp1 <- enrichGO(gene = gene1,
	universe = names(geneList1),
	OrgDb = org.Mm.eg.db,
	keyType = 'ENSEMBL',
	ont = "ALL",
	pAdjustMethod = "BH",
	pvalueCutoff = 0.01,
	qvalueCutoff = 0.05,
	readable = TRUE)
	head(ego_bp1)

	#GSEA analysis
	ego_gse1 <- gseGO(geneList = geneList1,
	OrgDb = org.Mm.eg.db,
	ont = "BP", # changed this for testing purposes
	keyType = "ENSEMBL",
	nPerm = 1000,
	minGSSize = 100,
	maxGSSize = 500,
	pvalueCutoff = 0.1,
	verbose = FALSE)
	head(ego_gse1, 10)

	# inspect the whole range of qvalues
	ego_gse1@result$qvalues
	# there are indeed tests that do have a higher qvalue
	# I recall that there is an issue with BH adjustment, as it is not continous (Maurits might be better informed!)
	# which could lead to the effects we are seeing here

	dotplot(ego_gse1)