Skip to content

Instantly share code, notes, and snippets.

@nilesh-tawari
Last active May 8, 2019 09:55
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save nilesh-tawari/d6dd3951de0ff6671ccd5b096fe13958 to your computer and use it in GitHub Desktop.
Save nilesh-tawari/d6dd3951de0ff6671ccd5b096fe13958 to your computer and use it in GitHub Desktop.
featureCountsDEseq2
#!/usr/bin/env Rscript
################################################
################################################
## REQUIREMENTS ##
################################################
################################################
## DIFFERENTIAL ANALYSIS, SCATTERPLOTS AND PCA FOR SAMPLES IN FEATURECOUNTS FILE
## - FIRST SIX COLUMNS OF FEATURECOUNTS_FILE SHOULD BE INTERVAL INFO. REMAINDER OF COLUMNS SHOULD BE SAMPLES-SPECIFIC COUNTS.
## - SAMPLE NAMES HAVE TO END IN "_R1" REPRESENTING REPLICATE ID. LAST 3 CHARACTERS OF SAMPLE NAME WILL BE TRIMMED TO OBTAIN GROUP ID FOR DESEQ2 COMPARISONS.
## - BAM_SUFFIX IS PORTION OF FILENAME AFTER SAMPLE NAME IN FEATURECOUNTS COLUMN SAMPLE NAMES E.G. ".rmDup.bam" if "DRUG_R1.rmDup.bam"
## - PACKAGES BELOW NEED TO BE AVAILABLE TO LOAD WHEN RUNNING R
################################################
################################################
## LOAD LIBRARIES ##
################################################
################################################
library(optparse)
library(DESeq2)
library(vsn)
library(ggplot2)
library(RColorBrewer)
library(pheatmap)
library(lattice)
################################################
################################################
## PARSE COMMAND-LINE PARAMETERS ##
################################################
################################################
# option_list <- list(make_option(c("-i", "--featurecount_file"), type="character", default=NULL, help="Feature count file generated by the SubRead featureCounts command.", metavar="featurecount_file"),
# make_option(c("-g", "--group_file"), type="character", default=NULL, help="Condition file, with sample id and groups information.", metavar="group_file"),
# make_option(c("-b", "--bam_suffix"), type="character", default=NULL, help="Portion of filename after sample name in featurecount file header e.g. '.rmDup.bam' if 'DRUG_R1.rmDup.bam'", metavar="bam_suffix"),
# make_option(c("-o", "--outdir"), type="character", default='./', help="Output directory", metavar="path"),
# make_option(c("-p", "--outprefix"), type="character", default='differential', help="Output prefix", metavar="character"),
# make_option(c("-s", "--outsuffix"), type="character", default='', help="Output suffix for comparison-level results", metavar="character"))
#
# opt_parser <- OptionParser(option_list=option_list)
# opt <- parse_args(opt_parser)
#if (is.null(opt$featurecount_file)){
# print_help(opt_parser)
# stop("Please provide featurecount file.", call.=FALSE)
#}
# if (is.null(opt$bam_suffix)){
# print_help(opt_parser)
# stop("Please provide bam suffix in header of featurecount file.", call.=FALSE)
# }
################################################
################################################
## READ IN COUNTS FILE ##
################################################
################################################
#count.table <- read.delim(file=opt$featurecount_file,header=TRUE,skip=1)
setwd("/lrlhps/users/c274411/07_cat_RNAseq/analysis/results/featureCounts/DEseq2/test")
#analysis_func_go_kegg
count.table <- read.csv('analysis_fc.txt', sep="\t", head=TRUE)
coldata <- read.csv('analysis_pheno.txt', sep="\t", row.names=1)
rownames(count.table) <- count.table[,1]
#resultDESeq2[,1] <- NULL
#colnames(count.table) <- gsub(opt$bam_suffix,"",colnames(count.table))
#colnames(count.table) <- as.character(lapply(colnames(count.table), function (x) tail(strsplit(x,'.',fixed=TRUE)[[1]],1)))
#rownames(count.table) <- count.table$Geneid
interval.table <- count.table[,0:2]
count.table <- count.table[,3:ncol(count.table),drop=FALSE]
################################################
################################################
## RUN DESEQ2 ##
################################################
################################################
#if (file.exists(opt$outdir) == FALSE) {
# dir.create(opt$outdir,recursive=TRUE)
#}
#samples.vec <- sort(colnames(count.table))
samples.vec <- colnames(count.table)
groups <- substr(coldata$condition, 1, 20)
print(unique(groups))
if (length(unique(groups)) == 1) {
quit(save = "no", status = 0, runLast = FALSE)
}
############### Descriptive statistics
# Define a group-specific color for each sample, and add it as a supplementary column
groupColor <- c("Group_1"="violet", "Group_2"="red", "Group_3"="blue", "Group_4"="green", "Group_5"="yellow","Group_6"="orange") # Choose one color per group
print(groupColor)
## Add a column to the pheno type to specify the color of each sample according to its genotype
coldata$color[coldata$condition == "Group_1"] <- groupColor["Group_1"]
coldata$color[coldata$condition == "Group_2"] <- groupColor["Group_2"]
coldata$color[coldata$condition == "Group_3"] <- groupColor["Group_3"]
coldata$color[coldata$condition == "Group_4"] <- groupColor["Group_4"]
coldata$color[coldata$condition == "Group_5"] <- groupColor["Group_5"]
coldata$color[coldata$condition == "Group_6"] <- groupColor["Group_6"]
#Basic statistics
statsPerSample <- data.frame(t(apply(count.table, 2, summary)))
names(statsPerSample) <- c("min", "Q1", "median", "mean", "Q3", "max")
statsPerSample[,c("perc5", "perc10", "perc90", "perc95")] <-
apply(count.table, 2, quantile, c(0.05, 0.10, 0.90, 0.95))
statsPerSample$zeros <- apply(count.table==0, 2, sum)
statsPerSample$percZeros <- 100*statsPerSample$zeros/nrow(count.table)
write.table(statsPerSample,file=paste("analysis",".statsPerSample.txt",sep=""),row.names=TRUE,col.names=TRUE,sep="\t",quote=TRUE)
# DEseq2
DDSFile <- paste("analysis",".dds.rld.RData",sep="")
if (file.exists(DDSFile) == FALSE) {
counts <- count.table[,samples.vec,drop=FALSE]
#data.frame(row.names=colnames(counts),condition=groups)
dds <- DESeqDataSetFromMatrix(countData = round(counts), colData = coldata, design = ~ condition)
print(dds)
is(dds)
isS4(dds)
slotNames(dds)
dds <- DESeq(dds)
rld <- rlog(dds)
save(dds,rld,file=DDSFile)
}
################ Interactive PCA
# expmatrix <- SummarizedExperiment::assay(rld)
# if (!requireNamespace("BiocManager", quietly = TRUE))
# install.packages("BiocManager")
# BiocManager::install("pcaExplorer", version = "3.8")
#library("pcaExplorer")
#pcaExplorer(dds, rld)
#var_vs_mean(expmatrix)
#library(corrgram)
################################################
################################################
## PLOT QC ##
################################################
################################################
PlotFile <- paste("analysis",".plots.pdf",sep="")
if (file.exists(PlotFile) == FALSE) {
pdf(file=PlotFile,onefile=TRUE,width=7,height=7)
# histogram of counts
par(mfrow=c(3,1))
hist(as.matrix(count.table), col="blue", border="white", breaks=100)
hist(as.matrix(count.table), col="blue", border="white",
breaks=20000, xlim=c(0,2000), main="Counts per gene",
xlab="Counts (truncated axis)", ylab="Number of genes",
las=1, cex.axis=0.7)
epsilon <- 1 # pseudo-count to avoid problems with log(0)
hist(as.matrix(log2(count.table + epsilon)), breaks=100, col="blue", border="white",
main="Log2-transformed counts per gene", xlab="log2(counts+1)", ylab="Number of genes",
las=1, cex.axis=0.7)
par(mfrow=c(1,1))
# box plot of counts
boxplot(log2(count.table + epsilon), pch=".", col=coldata$color,
horizontal=TRUE, cex.axis=0.5,
las=1, ylab="Samples", xlab="log2(counts +1)")
# Density plot
# We will require one function from the affy package
if(!require("affy")){
source("http://bioconductor.org/biocLite.R")
biocLite("affy")
}
library(affy)
plotDensity(log2(count.table + epsilon), col=coldata$color, lty=1, lwd=2)
grid()
legend("topright", legend=names(groupColor), col=groupColor, lwd=2)
dds.norm <- estimateSizeFactors(dds)
# Checking the normalization
par(mfrow=c(2,2),cex.lab=0.7)
boxplot(log2(counts(dds.norm)+epsilon), col=coldata$color, cex.axis=0.7,
las=1, xlab="log2(counts+1)", horizontal=TRUE, main="Raw counts")
boxplot(log2(counts(dds.norm, normalized=TRUE)+epsilon), col=coldata$color, cex.axis=0.7,
las=1, xlab="log2(normalized counts)", horizontal=TRUE, main="Normalized counts")
plotDensity(log2(counts(dds.norm)+epsilon), col=coldata$color,
xlab="log2(counts+1)", cex.lab=0.7, panel.first=grid())
plotDensity(log2(counts(dds.norm, normalized=TRUE)+epsilon), col=coldata$color,
xlab="log2(normalized counts)", cex.lab=0.7, panel.first=grid())
# Restore default parameters
par(mfrow=c(1,1), cex.lab=1)
# # Scatter plot
# # Define a function to draw a scatter plot for a pair of variables (samples) with density colors
# plotFun <- function(x,y){
# dns <- densCols(x,y);
# points(x,y, col=dns, pch=".", panel.first=grid());
# # abline(a=0, b=1, col="brown")
# }
#
# # Plot the scatter plot for a few pairs of variables selected at random
# set.seed(123) # forces the random number generator to produce fixed results
# pairs(log2(count.table[,sample(ncol(count.table),10)] + epsilon),
# panel=plotFun, lower.panel = NULL)
#
## PCA
pca.data <- DESeq2::plotPCA(rld,intgroup=c("condition"),returnData=TRUE)
percentVar <- round(100 * attr(pca.data, "percentVar"))
myplot <- ggplot(pca.data, aes(PC1, PC2, color=condition)) +
geom_point(size=3) +
xlab(paste0("PC1: ",percentVar[1],"% variance")) +
ylab(paste0("PC2: ",percentVar[2],"% variance")) +
theme(panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
panel.background = element_blank(),
panel.border = element_rect(colour = "black", fill=NA, size=1))
print(myplot)
## WRITE PC1 vs PC2 VALUES TO FILE
pca.vals <- pca.data[,1:2]
colnames(pca.vals) <- paste(colnames(pca.vals),paste(percentVar,'% variance',sep=""), sep=": ")
pca.vals <- cbind(sample = rownames(pca.vals), pca.vals)
write.table(pca.vals,file=paste("analysis",".pca.vals.txt",sep=""),row.names=FALSE,col.names=TRUE,sep="\t",quote=TRUE)
## Dispersion plot
plotDispEsts(dds, main="Dispersion plot")
## SAMPLE CORRELATION HEATMAP
sampleDists <- dist(t(assay(rld)))
sampleDistMatrix <- as.matrix(sampleDists)
colors <- colorRampPalette( rev(brewer.pal(9, "Blues")) )(255)
pheatmap(sampleDistMatrix,clustering_distance_rows=sampleDists,clustering_distance_cols=sampleDists,col=colors)
library(RColorBrewer)
library(gplots)
(mycols <- brewer.pal(8, "Dark2")[1:length(unique(coldata$condition))])
par(mar=c(10,4,4,2))
heatmap.2(sampleDistMatrix, key=F, trace="none",
col=colorRampPalette( rev(brewer.pal(9, "Blues")) )(255),
ColSideColors=mycols[coldata$condition], RowSideColors=mycols[coldata$condition],
margin=c(10, 10), main="Sample Distance Matrix",cexRow=0.8, cexCol=0.8)
## Box plot of counts
par(mar=c(10,3,1,1))
nor.counts <- counts(dds,normalized=TRUE)
nor.counts = log2(nor.counts+1)
par(cex.lab=0.8) # is for y-axis
par(cex.axis=0.8) # is for x-axis
boxplot(nor.counts,col=mycols[coldata$condition],las=2,ylab='log2(nor.counts+1)', boxwex = 0.5)
## WRITE SAMPLE DISTANCES TO FILE
write.table(cbind(sample = rownames(sampleDistMatrix), sampleDistMatrix),file=paste("analysis",".sample.dists.txt",sep=""),row.names=FALSE,col.names=TRUE,sep="\t",quote=FALSE)
dev.off()
}
#plotDensity(log2(count.table + epsilon), col=coldata$color, lty=1, lwd=2)
#grid()
#legend("topright", legend=names(groupColor), col=groupColor, lwd=2)
################################################
################################################
## SAVE SIZE FACTORS ##
################################################
################################################
SizeFactorsDir <- "sizeFactors/"
if (file.exists(SizeFactorsDir) == FALSE) {
dir.create(SizeFactorsDir,recursive=TRUE)
}
NormFactorsFile <- paste(SizeFactorsDir, "analysis",".sizeFactors.RData",sep="")
if (file.exists(NormFactorsFile) == FALSE) {
normFactors <- sizeFactors(dds)
save(normFactors,file=NormFactorsFile)
for (name in names(sizeFactors(dds))) {
sizeFactorFile <- paste(SizeFactorsDir,name,"analysis",".sizeFactor.txt",sep="")
if (file.exists(sizeFactorFile) == FALSE) {
write(as.numeric(sizeFactors(dds)[name]),file=sizeFactorFile)
}
}
}
################################################
################################################
## WRITE LOG FILE ##
################################################
################################################
LogFile <- paste("analysis",".log",sep="")
if (file.exists(LogFile) == FALSE) {
cat("\nSamples =",samples.vec,"\n\n",file=LogFile,append=TRUE,sep=', ')
cat("Groups =",groups,"\n\n",file=LogFile,append=TRUE,sep=', ')
cat("Dimensions of count matrix =",dim(counts),"\n\n",file=LogFile,append=FALSE,sep=' ')
cat("\n",file=LogFile,append=TRUE,sep='')
}
################################################
################################################
## LOOP THROUGH COMPARISONS ##
################################################
################################################
ResultsFile <- paste("analysis",".results.txt",sep="")
raw.counts <- counts(dds,normalized=FALSE)
colnames(raw.counts) <- paste(colnames(raw.counts),'raw',sep='.')
pseudo.counts <- counts(dds,normalized=TRUE)
colnames(pseudo.counts) <- paste(colnames(pseudo.counts),'pseudo',sep='.')
deseq2_results_list <- list()
comparisons <- combn(unique(groups),2)
#ncol(comparisons)
library(AnnotationHub)
hub <- AnnotationHub()
#d <- display(hub)
#query(hub, "Felis")
Felis <- hub[["AH66297"]] #org.Felis_catus.eg.sqlite AH61890
#snapshotDate(): 2018-10-24 snapshotDate(): 2018-04-30
#pd <- possibleDates(hub)
#snapshotDate(hub) <- pd[111]
library(clusterProfiler)
#1:ncol(comparisons)
# 2, 3, 8, 12
#
for (idx in 1:ncol(comparisons)) {
control.group <- comparisons[1,idx]
treat.group <- comparisons[2,idx]
CompPrefix <- paste(control.group,treat.group,sep="vs")
cat("Saving results for ",CompPrefix," ...\n",sep="")
CompOutDir <- paste(CompPrefix,'/',sep="")
if (file.exists(CompOutDir) == FALSE) {
dir.create(CompOutDir,recursive=TRUE)
}
control.samples <- samples.vec[which(groups == control.group)]
treat.samples <- samples.vec[which(groups == treat.group)]
comp.samples <- c(control.samples,treat.samples)
print(c(control.group,treat.group))
print("=================================")
print(control.group)
print("control.samples")
print(control.samples)
print("=================================")
print("treat.samples")
print(treat.group)
print(treat.samples)
print("=================================")
print(c("condition",c(control.group,treat.group)))
comp.results <- results(dds,contrast=c("condition",c(control.group,treat.group)))
# Annotations #######################################################################################
# library(biomaRt) # Install the biomaRt package (Bioconductor) if this command does not work
# Get the Ensembl ID to gene name mapping:
# mart <- useMart("ENSEMBL_MART_ENSEMBL", dataset = "fcatus_gene_ensembl", host="www.ensembl.org")
# ensembl2name <- getBM(attributes=c("ensembl_gene_id","external_gene_name","entrezgene", "description"),mart=mart)
#library(stringr)
#ensembl2name$test <- str_split_fixed(ensembl2name$description, "Acc:", 2)
#, "uniprot_gn"
# Merge it with our gene-level statistics:
# comp.results <- merge(x=as.data.frame(comp.results), y=ensembl2name, by.x=0, by.y=1, all.x=TRUE)
# colnames(comp.results) <- c("ensembl", "baseMean","log2FoldChange", "lfcSE", "stat", "pvalue", "padj", "gene", "entrezgene", "description")
# , "uniprot_gn"
# comp.results <- comp.results[!duplicated(comp.results$ensembl), ] #### keep uniq gene ids , fromLast=T
# rownames(comp.results) <- comp.results[,1]
#library(tidyr)
#a <- separate(data = ensembl2name, col = description, into = c("left", "right"), sep = "Acc:")
#res <- comp.results
comp.df <- as.data.frame(comp.results)
comp.table <- cbind(interval.table, as.data.frame(comp.df), raw.counts[,paste(comp.samples,'raw',sep='.')], pseudo.counts[,paste(comp.samples,'pseudo',sep='.')])
resultDESeq2 <- as.data.frame(comp.results)
#resultDESeq2 <- cbind(interval.table, as.data.frame(comp.results))
#rownames(resultDESeq2) <- resultDESeq2[,1]
#resultDESeq2[,1] <- NULL
## WRITE RESULTS FILE
CompResultsFile <- paste(CompOutDir,CompPrefix,"analysis",".deseq2.results.txt",sep="")
write.table(comp.table, file=CompResultsFile, col.names=TRUE, row.names=FALSE, sep='\t', quote=FALSE)
## FILTER RESULTS BY FDR & LOGFC AND WRITE RESULTS FILE
pdf(file=paste(CompOutDir,CompPrefix,"analysis",".deseq2.plots.pdf",sep=""),width=10,height=8)
if (length(comp.samples) > 2) {
for (MIN_FDR in c(0.05)) {
#0.01,
## SUBSET RESULTS BY FDR
pass.fdr.table <- subset(comp.table, padj < MIN_FDR)
pass.fdr.up.table <- subset(comp.table, padj < MIN_FDR & log2FoldChange > 0)
pass.fdr.down.table <- subset(comp.table, padj < MIN_FDR & log2FoldChange < 0)
## SUBSET RESULTS BY FDR AND LOGFC
pass.fdr.logFC.table <- subset(comp.table, padj < MIN_FDR & abs(log2FoldChange) >= 1)
pass.fdr.logFC.up.table <- subset(comp.table, padj < MIN_FDR & abs(log2FoldChange) >= 1 & log2FoldChange > 0)
pass.fdr.logFC.down.table <- subset(comp.table, padj < MIN_FDR & abs(log2FoldChange) >= 1 & log2FoldChange < 0)
## WRITE RESULTS FILE
CompResultsFile <- paste(CompOutDir,CompPrefix,"analysis",".deseq2.FDR",MIN_FDR,".results.txt",sep="")
CompBEDFile <- paste(CompOutDir,CompPrefix,"analysis",".deseq2.FDR",MIN_FDR,".results.bed",sep="")
write.table(pass.fdr.table, file=CompResultsFile, col.names=TRUE, row.names=FALSE, sep='\t', quote=FALSE)
write.table(pass.fdr.table[,c("Geneid","log2FoldChange")], file=CompBEDFile, col.names=FALSE, row.names=FALSE, sep='\t', quote=FALSE)
#
CompResultsFile <- paste(CompOutDir,CompPrefix,"analysis",".deseq2.FDR",MIN_FDR,".FC2.results.txt",sep="")
CompBEDFile <- paste(CompOutDir,CompPrefix,"analysis",".deseq2.FDR",MIN_FDR,".FC2.results.bed",sep="")
write.table(pass.fdr.logFC.table, file=CompResultsFile, col.names=TRUE, row.names=FALSE, sep='\t', quote=FALSE)
write.table(pass.fdr.logFC.table[,c("Geneid","log2FoldChange")], file=CompBEDFile, col.names=FALSE, row.names=FALSE, sep='\t', quote=FALSE)
#Geneid
## MA PLOT & VOLCANO PLOT
#DESeq2::plotMA(comp.results, main=paste("MA plot FDR <= ",MIN_FDR,sep=""), colNonSig = "blue", ylim=c(-2,2),alpha=MIN_FDR)
#abline(h=c(-1:1), col="red")
#plot(comp.table$log2FoldChange, -1*log10(comp.table$padj), col=ifelse(comp.table$padj<=MIN_FDR, "red", "black"), xlab="logFC", ylab="-1*log10(FDR)", main=paste("Volcano plot FDR <=",MIN_FDR,sep=" "), pch=20)
# VOLCANO with labels, Volcano plot
alpha <- MIN_FDR # Threshold on the p-value
# par(mfrow=c(1,2))
# Compute significance, with a maximum of 320 for the p-values set to 0 due to limitation of computation precision
resultDESeq2$sig <- -log10(resultDESeq2$padj)
sum(is.infinite(resultDESeq2$sig))
resultDESeq2[is.infinite(resultDESeq2$sig),"sig"] <- 350
# View(resultDESeq2[is.na(resultDESeq2$pvalue),])
# Select genes with a defined p-value (DESeq2 assigns NA to some genes)
genes.to.plot <- !is.na(resultDESeq2$pvalue)
# sum(genes.to.plot)
range(resultDESeq2[genes.to.plot, "log2FoldChange"])
# View(resultDESeq2[genes.to.plot,])
# ## Volcano plot of adjusted p-values
# cols <- densCols(resultDESeq2$log2FoldChange, resultDESeq2$sig)
# cols[resultDESeq2$pvalue ==0] <- "purple"
# resultDESeq2$pch <- 19
# resultDESeq2$pch[resultDESeq2$pvalue ==0] <- 6
# plot(resultDESeq2$log2FoldChange,
# resultDESeq2$sig,
# col=cols, panel.first=grid(),
# main="Volcano plot",
# xlab="Effect size: log2(fold-change)",
# ylab="-log10(adjusted p-value)",
# pch=resultDESeq2$pch, cex=0.4)
# abline(v=0)
# abline(v=c(-1,1), col="brown")
# abline(h=-log10(alpha), col="brown")
#
# ## Plot the names of a reasonable number of genes, by selecting those begin not only significant but also having a strong effect size
# gn.selected <- abs(resultDESeq2$log2FoldChange) > 2 & resultDESeq2$padj < alpha
# text(resultDESeq2$log2FoldChange[gn.selected],
# -log10(resultDESeq2$padj)[gn.selected],
# lab=rownames(resultDESeq2)[gn.selected ], cex=0.6)
################################################
################################################
################################################
################################################
## GO TERM ANALYSIS USING GPROFILER ##
library(gprofiler2)
resultDESeq2.df <- na.omit(data.frame(comp.results))
resultDESeq2.df <- resultDESeq2.df[order((resultDESeq2.df$log2FoldChange), decreasing = TRUE),]
induced.sign <- rownames(resultDESeq2.df)[resultDESeq2.df$log2FoldChange >= 1 & resultDESeq2.df$padj < alpha]
# Sort the gene-level statistics by adjusted p-value:
term.induced <- gost(query=induced.sign, organism = "fcatus", ordered_query = TRUE,
multi_query = FALSE, significant = FALSE, exclude_iea = TRUE,
measure_underrepresentation = FALSE, evcodes = FALSE,
user_threshold = 0.05, correction_method = "g_SCS",
domain_scope = "annotated", custom_bg = NULL,
numeric_ns = "", sources = NULL)
termres <- term.induced$result
termres <- termres[order(termres$p_value),]
p <- gostplot(term.induced, capped = FALSE, interactive = FALSE)
ProResultsFileUp <- paste(CompOutDir,CompPrefix,"analysis",".deseq2.FDR",MIN_FDR,".gprofiler.up.txt",sep="")
ProResultsFileUppdf <- paste(CompOutDir,CompPrefix,"analysis","_deseq2_FDR",MIN_FDR*100,"_gprofiler_up.pdf",sep="")
h1 <- as.list(term.induced$result$term_id[1:4])
publish_gostplot(p, highlight_terms = h1, filename = ProResultsFileUppdf)
write.table(termres, file=ProResultsFileUp, col.names=TRUE, row.names=FALSE, sep='\t', quote=FALSE)
resultDESeq2.df <- resultDESeq2.df[order((resultDESeq2.df$log2FoldChange), decreasing = FALSE),]
repressed.sign <- rownames(resultDESeq2.df)[resultDESeq2.df$log2FoldChange <= -1 & resultDESeq2.df$padj < alpha]
# Sort the gene-level statistics by adjusted p-value:
term.repressed <- gost(query=repressed.sign, organism = "fcatus", ordered_query = TRUE,
multi_query = FALSE, significant = FALSE, exclude_iea = TRUE,
measure_underrepresentation = FALSE, evcodes = FALSE,
user_threshold = 0.05, correction_method = "g_SCS",
domain_scope = "annotated", custom_bg = NULL,
numeric_ns = "", sources = NULL)
termresrep <- term.repressed$result
termresrep <- termresrep[order(termresrep$p_value),]
r <- gostplot(term.repressed, capped = FALSE, interactive = FALSE)
ProResultsFileDN <- paste(CompOutDir,CompPrefix,"analysis",".deseq2.FDR",MIN_FDR,".gprofiler.dn.txt",sep="")
ProResultsFileDNpdf <- paste(CompOutDir,CompPrefix,"analysis","_deseq2_FDR",MIN_FDR*100,"_gprofiler_dn.pdf",sep="")
h <- as.list(term.repressed$result$term_id[1:5])
publish_gostplot(r, highlight_terms = h, filename = ProResultsFileDNpdf)
write.table(termresrep, file=ProResultsFileDN, col.names=TRUE, row.names=FALSE, sep='\t', quote=FALSE)
########################################################################################
library(biomaRt) # Install the biomaRt package (Bioconductor) if this command does not work
geneLevelStats <- subset(resultDESeq2.df , padj < alpha) #abs(log2FoldChange) >= 1 &
# Get the Ensembl ID to gene name mapping:
mart <- useMart("ENSEMBL_MART_ENSEMBL", dataset = "fcatus_gene_ensembl", host="www.ensembl.org")
ensembl2name <- getBM(attributes=c("ensembl_gene_id","external_gene_name","entrezgene", "description"),mart=mart)
#library(stringr)
#ensembl2name$test <- str_split_fixed(ensembl2name$description, "Acc:", 2)
#, "uniprot_gn"
# Merge it with our gene-level statistics:
geneLevelStats <- merge(x=geneLevelStats, y=ensembl2name, by.x=0, by.y=1, all.x=TRUE)
colnames(geneLevelStats) <- c("ensembl","baseMean","log2FoldChange", "lfcSE", "stat", "pvalue", "padj","gene","entrezgene", "description")
# , "uniprot_gn"
geneLevelStats <- geneLevelStats[!duplicated(geneLevelStats$ensembl), ] #### keep uniq gene ids , fromLast=T
rownames(geneLevelStats) <- geneLevelStats[,1]
# Sort the gene-level statistics by adjusted p-value:
geneLevelStats <- geneLevelStats[order(geneLevelStats$padj),]
## KEGG over-representation test
mydf <- geneLevelStats #data.frame(Entrez=gene, FC=geneList)
mydf <- mydf[abs(mydf$log2FoldChange) > 1,]
mydf$group <- "Up"
mydf$group[mydf$log2FoldChange < 0] <- "Down"
mydf$othergroup <- "<4 fold"
mydf$othergroup[abs(mydf$log2FoldChange) > 2] <- ">4 fold"
####
formula_res_go <- compareCluster(ensembl~group, data=mydf, fun="enrichGO", OrgDb = Felis, keyType = 'ENSEMBL',
pAdjustMethod = "BH", pvalueCutoff = 0.05, qvalueCutoff = 0.05, readable = TRUE)
GOResultsFile <- paste(CompOutDir,CompPrefix,"analysis",".deseq2.FDR",MIN_FDR,".GO.txt",sep="")
write.table(as.data.frame(formula_res_go), file=GOResultsFile, col.names=TRUE, row.names=FALSE, sep='\t', quote=FALSE)
title_comp_go = paste(CompPrefix, " enrichGO (FDR:",MIN_FDR, " FC>2)",sep="")
print(dotplot(formula_res_go, x=~group, showCategory=20, title=title_comp_go)) #+ ggplot2::facet_grid(~othergroup))
formula_res <- compareCluster(entrezgene~group, data=mydf, fun="enrichKEGG", organism = 'fca')
#head(as.data.frame(formula_res))
KEGGResultsFile <- paste(CompOutDir,CompPrefix,"analysis",".deseq2.FDR",MIN_FDR,".kegg.txt",sep="")
write.table(as.data.frame(formula_res), file=KEGGResultsFile, col.names=TRUE, row.names=FALSE, sep='\t', quote=FALSE)
#dotplot(formula_res)
title_comp = paste(CompPrefix, " enrichKEGG (FDR:",MIN_FDR, " FC>2)",sep="")
print(dotplot(formula_res, x=~group, showCategory=20, title=title_comp)) #+ ggplot2::facet_grid(~othergroup))
####
formula_res_gp_go <- compareCluster(ensembl~group+othergroup, data=mydf, fun="enrichGO", OrgDb = Felis, keyType = 'ENSEMBL',
pAdjustMethod = "BH", pvalueCutoff = 0.05, qvalueCutoff = 0.05, readable = TRUE)
GOgpResultsFile <- paste(CompOutDir,CompPrefix,"analysis",".deseq2.FDR",MIN_FDR,".GOgp.txt",sep="")
write.table(as.data.frame(formula_res_gp_go), file=GOgpResultsFile, col.names=TRUE, row.names=FALSE, sep='\t', quote=FALSE)
title_comp_gp_go = paste(CompPrefix, " enrichGO fold comparision (FDR:",MIN_FDR, " FC>2)",sep="")
print(dotplot(formula_res_gp_go, x=~group, showCategory=20, title=title_comp_gp_go) + ggplot2::facet_grid(~othergroup))
formula_res_gp <- compareCluster(entrezgene~group+othergroup, data=mydf, fun="enrichKEGG", organism = 'fca')
#head(as.data.frame(formula_res_gp))
KEGGResultsFile <- paste(CompOutDir,CompPrefix,"analysis",".deseq2.FDR",MIN_FDR,".kegg_grouped.txt",sep="")
write.table(as.data.frame(formula_res_gp), file=KEGGResultsFile, col.names=TRUE, row.names=FALSE, sep='\t', quote=FALSE)
#dotplot(formula_res_gp)
title_comp_gp = paste(CompPrefix, " enrichKEGG fold comparision" , " (FDR:",MIN_FDR, " FC>2)",sep="")
print(dotplot(formula_res_gp, x=~group, showCategory=20, title=title_comp_gp) + ggplot2::facet_grid(~othergroup))
############
mydf_up <- mydf[mydf$log2FoldChange > 1,]
mydf_up <- na.omit(mydf_up)
gene <- mydf_up$entrezgene
geneList <- as.vector(mydf_up$log2FoldChange)
kk <- clusterProfiler::enrichKEGG(gene = gene,
organism = 'fca',
pAdjustMethod = "BH",
pvalueCutoff = 0.05,
qvalueCutoff = 0.05)
# #gene = geneLevelStats$ensembl, keyType = 'ENSEMBL'
#
# head(as.data.frame(kk)[,-8])
#ProResultsFileUp <- paste(CompOutDir,CompPrefix,"analysis",".deseq2.FDR",MIN_FDR,".enrichKEGG.up.txt",sep="")
#write.table(as.data.frame(kk), file=ProResultsFileUp, col.names=TRUE, row.names=FALSE, sep='\t', quote=FALSE)
# #plots
#print(cnetplot(kk, categorySize="geneNum", foldChange=geneList))
kk <- setReadable(kk, OrgDb = Felis, keyType = "ENTREZID")
tryCatch(print(cnetplot(kk, categorySize="pvalue", foldChange=geneList)), error = function(e) cat("Error: ",e$message, "\n"))
# print(emapplot(kk))
# print(barplot(kk, drop=TRUE, showCategory=12))
#title_up = paste(CompPrefix, " (Upregulated, FDR:",MIN_FDR, ", FC>2)",sep="")
#print(dotplot(kk, title=title_up))
mydf_down <- mydf[mydf$log2FoldChange < -1,]
mydf_down <- na.omit(mydf_down)
gene <- mydf_down$entrezgene
geneList <- as.vector(mydf_down$log2FoldChange)
kk_dn <- clusterProfiler::enrichKEGG(gene = gene,
organism = 'fca',
pAdjustMethod = "BH",
pvalueCutoff = 0.05,
qvalueCutoff = 0.05)
# #gene = geneLevelStats$ensembl, keyType = 'ENSEMBL'
#
# head(as.data.frame(kk)[,-8])
#ProResultsFileDn <- paste(CompOutDir,CompPrefix,"analysis",".deseq2.FDR",MIN_FDR,".enrichKEGG.dn.txt",sep="")
#write.table(as.data.frame(kk_dn), file=ProResultsFileDn, col.names=TRUE, row.names=FALSE, sep='\t', quote=FALSE)
# # #plots
#print(cnetplot(kk_dn, categorySize="geneNum", foldChange=geneList))
#print(cnetplot(kk_dn, categorySize="pvalue", foldChange=geneList))
kk_dn <- setReadable(kk_dn, OrgDb = Felis, keyType = "ENTREZID")
tryCatch(print(cnetplot(kk_dn, categorySize="pvalue", foldChange=geneList)), error = function(e) cat("Error: ",e$message, "\n"))
#kk_dn1@gene2Symbol <- as.character(mydf_down1$gene)
#kk_dn1@keytype <- 'auto'
#kk_dn1@result <- kk_dn1@result
#kk_dn1@readable <- TRUE
# heatplot(kk_dn, foldChange = geneList, showCategory = 100)+ ggplot2::coord_flip()
# print(emapplot(kk_dn))
# print(barplot(kk_dn, drop=TRUE, showCategory=12))
# print(dotplot(kk_dn))
#title_dn = paste(CompPrefix, " (Downregulated, FDR:",MIN_FDR, ", FC>2)",sep="")
#print(dotplot(kk_dn, title=title_dn))
## ADD COUNTS TO LOGFILE
cat(CompPrefix," genes with FDR <= ",MIN_FDR,": ",nrow(pass.fdr.table)," (up=",nrow(pass.fdr.up.table),", down=",nrow(pass.fdr.down.table),")","\n",file=LogFile,append=TRUE,sep="")
cat(CompPrefix," genes with FDR <= ",MIN_FDR," & FC > 2: ",nrow(pass.fdr.logFC.table)," (up=",nrow(pass.fdr.logFC.up.table),", down=",nrow(pass.fdr.logFC.down.table),")","\n",file=LogFile,append=TRUE,sep="")
## correlogram
# corrgram::corrgram(raw.counts[,control.samples], order=TRUE, lower.panel=corrgram::panel.pie,
# upper.panel=corrgram::panel.pts, text.panel=corrgram::panel.txt,
# main="Correlogram of controls")
}
cat("\n",file=LogFile,append=TRUE,sep="")
}
## SAMPLE CORRELATION HEATMAP
rld.subset <- assay(rld)[,comp.samples]
sampleDists <- dist(t(rld.subset))
sampleDistMatrix <- as.matrix(sampleDists)
colors <- colorRampPalette( rev(brewer.pal(9, "Blues")) )(255)
pheatmap(sampleDistMatrix,clustering_distance_rows=sampleDists,clustering_distance_cols=sampleDists,col=colors)
#print(mycols)
(mycols1 <- brewer.pal(8, "Dark2")[1:length(unique(c(control.group,treat.group)))])
cl <-length(control.samples)
tl <-length(treat.samples)
col_scale <- c(rep(mycols1[1], cl), rep(mycols1[2], tl))
par(mar=c(10,4,4,2))
heatmap.2(sampleDistMatrix, key=F, trace="none",
col=colorRampPalette( rev(brewer.pal(9, "Blues")) )(255),
ColSideColors=col_scale[!is.na(col_scale)], RowSideColors=col_scale[!is.na(col_scale)],
margin=c(10, 10), main="Sample Distance Matrix",cexRow=0.8, cexCol=0.8)
# ## SCATTER PLOT FOR RLOG COUNTS
# combs <- combn(comp.samples,2,simplify=FALSE)
# clabels <- sapply(combs,function(x){paste(x,collapse=' & ')})
# plotdat <- data.frame(x=unlist(lapply(combs, function(x){rld.subset[, x[1] ]})),y=unlist(lapply(combs, function(y){rld.subset[, y[2] ]})),comp=rep(clabels, each=nrow(rld.subset)))
# myplot <- xyplot(y~x|comp,plotdat,
# panel=function(...){
# panel.xyplot(...)
# panel.abline(0,1,col="red")
# },
# par.strip.text=list(cex=0.5))
#print(myplot)
## Top gene
# res <- res[order(res$padj), ]
# topGene <- rownames(res)[which.min(res$padj)]
# print(res[topGene,])
# # plotCounts(dds, gene=topGene, intgroup=c("condition"))
#
# ## all genes with padj < 0.05 and logFC > 1
# # res.plot <- subset(res, padj < 0.05 & abs(log2FoldChange) >= 1)
# # for(i in 1:nrow(res.plot)) {
# # row <- res.plot[i,]
# # plotCounts(dds, gene=rownames(row), intgroup=c("condition"))
# # }
# ## Examine plot of p-values, draw an histogram of the p-values
hist(resultDESeq2$pvalue, breaks=50, col="grey", main="DESeq2 p-value distribution", xlab="Nominal P-value", ylab="Number of genes")
#########################################################
resultDESeq2$sig <- -log10(resultDESeq2$padj)
selectedGenes <- c(
"Most significant" = rownames(resultDESeq2)[which.max(resultDESeq2$sig)])
# top.logFC = rownames(resultDESeq2)[which.max(resultDESeq2$log2FoldChange)])
#gn.most.sign <- rownames(resultDESeq2)[1]
#gn.most.diff.val <- counts(dds.norm, normalized=T)[gn.most.sign,]
## Select a gene with small fold change but high significance
sel1 <- subset(
na.omit(resultDESeq2),
sig >= 50 & log2FoldChange > 0 & log2FoldChange < 0.5)
# dim(sel1)
selectedGenes <- append(selectedGenes,
c("Small FC yet significant"=rownames(sel1)[1]))
## Select the non-significant gene with the highest fold change
sel2 <- subset(x = na.omit(resultDESeq2), padj > alpha & log2FoldChange > 0 & baseMean > 1000 & baseMean < 10000)
# dim(sel2)
sel2 <- sel2[order(sel2$log2FoldChange, decreasing = TRUE),][1,]
selectedGenes <- append(
selectedGenes,
c("Non-significant"=rownames(sel2)[1]))
par(mfrow=c(length(na.omit(selectedGenes)),1))
# for (g in na.omit(selectedGenes)) {
# barplot(counts(dds.norm, normalized=TRUE)[g,],
# col=coldata$color,
# main=g, las=2, cex.names=0.5)
# }
compt <- cbind(pseudo.counts[,paste(comp.samples,'pseudo',sep='.')])
for (g in na.omit(selectedGenes)) {
barplot(compt[g,],
col=coldata[comp.samples,]$color,
main=g, las=2, cex.names=0.5)
}
dev.off()
colnames(comp.df) <- paste(CompPrefix,".",colnames(comp.df),sep="")
deseq2_results_list[[idx]] <- comp.df
}
## WRITE RESULTS FROM ALL COMPARISONS TO FILE
#
deseq2_results_table <- cbind(interval.table, do.call(cbind, deseq2_results_list),raw.counts,pseudo.counts)
write.table(deseq2_results_table, file=ResultsFile, col.names=TRUE, row.names=FALSE, sep='\t', quote=FALSE)
################################################
################################################
## R SESSION INFO ##
################################################
################################################
RLogFile <- "R_sessionInfo.log"
if (file.exists(RLogFile) == FALSE) {
sink(RLogFile)
a <- sessionInfo()
print(a)
sink()
}
################################################
################################################
################################################
################################################
# library(biomaRt) # Install the biomaRt package (Bioconductor) if this command does not work
#
# resultDESeq2.df <- na.omit(data.frame(comp.results))
# induced.sign <- subset(resultDESeq2.df , log2FoldChange >= 1 & padj < alpha)
# geneLevelStats <- induced.sign[,c("log2FoldChange","padj")]
#
#
# # Get the Ensembl ID to gene name mapping:
# mart <- useMart("ENSEMBL_MART_ENSEMBL", dataset = "fcatus_gene_ensembl", host="www.ensembl.org")
# ensembl2name <- getBM(attributes=c("ensembl_gene_id","external_gene_name","entrezgene"),mart=mart)
#
# # Merge it with our gene-level statistics:
# geneLevelStats <- merge(x=geneLevelStats, y=ensembl2name, by.x=0, by.y=1, all.x=TRUE)
# colnames(geneLevelStats) <- c("ensembl","log2fc","padj","gene","entrezgene")
# geneLevelStats <- geneLevelStats[!duplicated(geneLevelStats$ensembl), ] ####
# rownames(geneLevelStats) <- geneLevelStats[,1]
#
# # Sort the gene-level statistics by adjusted p-value:
# geneLevelStats <- geneLevelStats[order(geneLevelStats$padj),]
#
# ## KEGG over-representation test
# geneLevelStats <- na.omit(geneLevelStats, cols = c("entrezgene"))
#
# gene <- na.omit(geneLevelStats$entrezgene)
# geneList <- as.vector(geneLevelStats$log2fc)
#
# kk <- clusterProfiler::enrichKEGG(gene = gene,
# organism = 'fca',
# pAdjustMethod = "BH",
# pvalueCutoff = 0.05,
# qvalueCutoff = 0.05)
# #gene = geneLevelStats$ensembl, keyType = 'ENSEMBL'
#
# head(as.data.frame(kk)[,-8])
# #plots
# cnetplot(kk, categorySize="geneNum", foldChange=geneList)
# cnetplot(kk, categorySize="pvalue", foldChange=geneList)
# emapplot(kk)
# barplot(kk, drop=TRUE, showCategory=12)
# dotplot(kk)
# ## KEGG GSA test
# geneList <- geneLevelStats[c("entrezgene", "log2fc", "uniprot_gn")]
# geneList <- geneList[!duplicated(geneList$uniprot_gn), ] ####
# geneList <- geneList[geneList$log2fc < 0,]
# geneList <- geneList[abs(geneList$log2fc) > 1,]
# ## feature 1: numeric vector
# geneList1 <- as.vector(geneList$log2fc)
# ## feature 2: named vector
# names(geneList1) = geneList$uniprot_gn
# ## feature 3: decreasing order
# geneList1 = sort(geneList1, decreasing = TRUE)
#
# kk2 <- gseKEGG(geneList = geneList1,
# organism = 'fca',
# nPerm = 10000,
# keyType = 'uniprot',
# minGSSize = 1,
# pvalueCutoff = 0.9,
# verbose = TRUE)
# head(kk2)
#
#
# # if (!requireNamespace("BiocManager", quietly = TRUE))
# # install.packages("BiocManager")
# # BiocManager::install("biomaRt", version = "3.8")
# library("biomaRt")
# if (!requireNamespace("BiocManager", quietly = TRUE))
# install.packages("BiocManager")
# BiocManager::install("AnnotationForge", version = "3.8")
#
#
# library(clusterProfiler)
# Gff2GeneTable("/lrlhps/users/c274411/00_references/aws-igenomes/Felis_catus/Felis_catus_9.0/Ensembl/Annotation/Genes/Felis_catus.Felis_catus_9.0.94.gtf")
#
# Gff2GeneTable("/lrlhps/users/c274411/TEST/scripts/Felis_catus.Felis_catus_9.0.94.chr.gff3")
#
# Gff2GeneTable("/lrlhps/users/c274411/TEST/scripts/GCF_000181335.3_Felis_catus_9.0_genomic.gff")
#
# #gffread /lrlhps/users/c274411/00_references/aws-igenomes/Felis_catus/Felis_catus_9.0/Ensembl/Annotation/Genes/Felis_catus.Felis_catus_9.0.94.gtf -o- > Felis_catus.Felis_catus_9.0.94.gff
#
#
# Gff2GeneTable("/lrlhps/users/c274411/TEST/scripts/GCF_000181335.3_Felis_catus_9.0_genomic.gff")
# load("geneTable.rda")
# head(geneTable)
# eg <- geneTable$GeneID
#
#
#if (!requireNamespace("BiocManager", quietly = TRUE))
# install.packages("BiocManager")
#library(BiocInstaller)
#BiocManager::valid()
#BiocManager::install("AnnotationHub", version = "3.8")
#BiocManager::install("AnnotationHubData", version = "3.8")
## Gene ID conversion
#gene.df <- bitr(repressed.sign, fromType = "ENSEMBL",
# toType = c("ENTREZID", "SYMBOL"),
# OrgDb = Felis)
#keytypes(Felis)
ggo <- groupGO(gene = repressed.sign,
OrgDb = Felis,
keyType = 'ENSEMBL',
ont = "CC",
level = 3,
readable = TRUE)
head(ggo)
ego <- enrichGO(gene = repressed.sign,
OrgDb = Felis,
keyType = 'ENSEMBL',
pAdjustMethod = "BH",
pvalueCutoff = 0.05,
qvalueCutoff = 0.05,
readable = TRUE)
#universe = rownames(resultDESeq2.df), ont = "CC",
bp2 <- simplify(ego, cutoff=0.7, by="p.adjust", select_fun=min)
## feature 1: numeric vector
geneList = mydf_up$log2fc
## feature 2: named vector
names(geneList) = as.character(mydf_up$ensembl)
## feature 3: decreasing order
geneList = sort(geneList, decreasing = TRUE)
#ego <- setReadable(ego, OrgDb = Felis, keyType = "ENSEMBL")
gsecc <- gseGO(geneList=geneList, ont="MF", keyType = 'ENSEMBL', OrgDb=Felis, verbose=F)
head(summary(gsecc))
#enrichMap(ego, vertex.label.cex=1.2, layout=igraph::layout.kamada.kawai)
cnetplot(formula_res1, foldChange=mydf$log2fc)
plotGOgraph(ego)
heatplot(ego, foldChange = mydf$log2fc, showCategory = 20000)+ ggplot2::coord_flip()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment