nilesh-tawari/featureCountsDEseq2

## featureCountsDEseq2
#!/usr/bin/env Rscript

################################################
################################################
## REQUIREMENTS                               ##
################################################
################################################

## DIFFERENTIAL ANALYSIS, SCATTERPLOTS AND PCA FOR SAMPLES IN FEATURECOUNTS FILE
		## - FIRST SIX COLUMNS OF FEATURECOUNTS_FILE SHOULD BE INTERVAL INFO. REMAINDER OF COLUMNS SHOULD BE SAMPLES-SPECIFIC COUNTS.
		## - SAMPLE NAMES HAVE TO END IN "_R1" REPRESENTING REPLICATE ID. LAST 3 CHARACTERS OF SAMPLE NAME WILL BE TRIMMED TO OBTAIN GROUP ID FOR DESEQ2 COMPARISONS.
		## - BAM_SUFFIX IS PORTION OF FILENAME AFTER SAMPLE NAME IN FEATURECOUNTS COLUMN SAMPLE NAMES E.G. ".rmDup.bam" if "DRUG_R1.rmDup.bam"
		## - PACKAGES BELOW NEED TO BE AVAILABLE TO LOAD WHEN RUNNING R

################################################
################################################
## LOAD LIBRARIES                             ##
################################################
################################################

library(optparse)
library(DESeq2)
library(vsn)
library(ggplot2)
library(RColorBrewer)
library(pheatmap)
library(lattice)

################################################
################################################
## PARSE COMMAND-LINE PARAMETERS              ##
################################################
################################################

# option_list <- list(make_option(c("-i", "--featurecount_file"), type="character", default=NULL, help="Feature count file generated by the SubRead featureCounts command.", metavar="featurecount_file"),
#                     make_option(c("-g", "--group_file"), type="character", default=NULL, help="Condition file, with sample id and groups information.", metavar="group_file"),
# 										make_option(c("-b", "--bam_suffix"), type="character", default=NULL, help="Portion of filename after sample name in featurecount file header e.g. '.rmDup.bam' if 'DRUG_R1.rmDup.bam'", metavar="bam_suffix"),
# 									  make_option(c("-o", "--outdir"), type="character", default='./', help="Output directory", metavar="path"),
# 									  make_option(c("-p", "--outprefix"), type="character", default='differential', help="Output prefix", metavar="character"),
# 										make_option(c("-s", "--outsuffix"), type="character", default='', help="Output suffix for comparison-level results", metavar="character"))
#
# opt_parser <- OptionParser(option_list=option_list)
# opt <- parse_args(opt_parser)

#if (is.null(opt$featurecount_file)){
#		print_help(opt_parser)
#		stop("Please provide featurecount file.", call.=FALSE)
#}
# if (is.null(opt$bam_suffix)){
# 		print_help(opt_parser)
# 		stop("Please provide bam suffix in header of featurecount file.", call.=FALSE)
# }

################################################
################################################
## READ IN COUNTS FILE												##
################################################
################################################
#count.table <- read.delim(file=opt$featurecount_file,header=TRUE,skip=1)
setwd("/lrlhps/users/c274411/07_cat_RNAseq/analysis/results/featureCounts/DEseq2/test")
#analysis_func_go_kegg
count.table <- read.csv('analysis_fc.txt', sep="\t", head=TRUE)
coldata <- read.csv('analysis_pheno.txt', sep="\t", row.names=1)

rownames(count.table) <- count.table[,1]
#resultDESeq2[,1] <- NULL

#colnames(count.table) <- gsub(opt$bam_suffix,"",colnames(count.table))
#colnames(count.table) <- as.character(lapply(colnames(count.table), function (x) tail(strsplit(x,'.',fixed=TRUE)[[1]],1)))
#rownames(count.table) <- count.table$Geneid
interval.table <- count.table[,0:2]
count.table <- count.table[,3:ncol(count.table),drop=FALSE]


################################################
################################################
## RUN DESEQ2																	##
################################################
################################################

#if (file.exists(opt$outdir) == FALSE) {
#    dir.create(opt$outdir,recursive=TRUE)
#}


#samples.vec <- sort(colnames(count.table))
samples.vec <- colnames(count.table)
groups <- substr(coldata$condition, 1, 20)
print(unique(groups))
if (length(unique(groups)) == 1) {
	quit(save = "no", status = 0, runLast = FALSE)
}

############### Descriptive statistics
# Define a group-specific color for each sample, and add it as a supplementary column
groupColor <- c("Group_1"="violet", "Group_2"="red", "Group_3"="blue", "Group_4"="green", "Group_5"="yellow","Group_6"="orange") # Choose one color per group
print(groupColor)

## Add a column to the pheno type to specify the color of each sample according to its genotype
coldata$color[coldata$condition == "Group_1"] <- groupColor["Group_1"]
coldata$color[coldata$condition == "Group_2"] <- groupColor["Group_2"]
coldata$color[coldata$condition == "Group_3"] <- groupColor["Group_3"]
coldata$color[coldata$condition == "Group_4"] <- groupColor["Group_4"]
coldata$color[coldata$condition == "Group_5"] <- groupColor["Group_5"]
coldata$color[coldata$condition == "Group_6"] <- groupColor["Group_6"]

#Basic statistics
statsPerSample <- data.frame(t(apply(count.table, 2, summary)))
names(statsPerSample) <- c("min", "Q1", "median", "mean", "Q3", "max")
statsPerSample[,c("perc5", "perc10", "perc90", "perc95")] <-
  apply(count.table, 2, quantile, c(0.05, 0.10, 0.90, 0.95))
statsPerSample$zeros <- apply(count.table==0, 2, sum)
statsPerSample$percZeros <- 100*statsPerSample$zeros/nrow(count.table)
write.table(statsPerSample,file=paste("analysis",".statsPerSample.txt",sep=""),row.names=TRUE,col.names=TRUE,sep="\t",quote=TRUE)


# DEseq2
DDSFile <- paste("analysis",".dds.rld.RData",sep="")
if (file.exists(DDSFile) == FALSE) {
  counts <- count.table[,samples.vec,drop=FALSE]
  #data.frame(row.names=colnames(counts),condition=groups)
  dds <- DESeqDataSetFromMatrix(countData = round(counts), colData = coldata, design = ~ condition)
  print(dds)
  is(dds)
  isS4(dds)
  slotNames(dds)
  dds <- DESeq(dds)
  rld <- rlog(dds)
  save(dds,rld,file=DDSFile)
}


################ Interactive PCA
# expmatrix <- SummarizedExperiment::assay(rld)
# if (!requireNamespace("BiocManager", quietly = TRUE))
#   install.packages("BiocManager")
# BiocManager::install("pcaExplorer", version = "3.8")
#library("pcaExplorer")
#pcaExplorer(dds, rld)

#var_vs_mean(expmatrix)
#library(corrgram)


################################################
################################################
## PLOT QC  																	##
################################################
################################################

PlotFile <- paste("analysis",".plots.pdf",sep="")
if (file.exists(PlotFile) == FALSE) {
    pdf(file=PlotFile,onefile=TRUE,width=7,height=7)
    # histogram of counts
    par(mfrow=c(3,1))
    hist(as.matrix(count.table), col="blue", border="white", breaks=100)
    hist(as.matrix(count.table), col="blue", border="white",
         breaks=20000, xlim=c(0,2000), main="Counts per gene",
         xlab="Counts (truncated axis)", ylab="Number of genes",
         las=1, cex.axis=0.7)
    epsilon <- 1 # pseudo-count to avoid problems with log(0)
    hist(as.matrix(log2(count.table + epsilon)), breaks=100, col="blue", border="white",
         main="Log2-transformed counts per gene", xlab="log2(counts+1)", ylab="Number of genes",
         las=1, cex.axis=0.7)
    par(mfrow=c(1,1))

    # box plot of counts
    boxplot(log2(count.table + epsilon), pch=".", col=coldata$color,
            horizontal=TRUE, cex.axis=0.5,
            las=1, ylab="Samples", xlab="log2(counts +1)")

    # Density plot
    # We will require one function from the affy package
    if(!require("affy")){
      source("http://bioconductor.org/biocLite.R")
      biocLite("affy")
    }
    library(affy)
    plotDensity(log2(count.table + epsilon), col=coldata$color, lty=1, lwd=2)
    grid()
    legend("topright", legend=names(groupColor), col=groupColor, lwd=2)

    dds.norm <-  estimateSizeFactors(dds)
    # Checking the normalization
    par(mfrow=c(2,2),cex.lab=0.7)
    boxplot(log2(counts(dds.norm)+epsilon),  col=coldata$color, cex.axis=0.7,
            las=1, xlab="log2(counts+1)", horizontal=TRUE, main="Raw counts")
    boxplot(log2(counts(dds.norm, normalized=TRUE)+epsilon),  col=coldata$color, cex.axis=0.7,
            las=1, xlab="log2(normalized counts)", horizontal=TRUE, main="Normalized counts")
    plotDensity(log2(counts(dds.norm)+epsilon),  col=coldata$color,
                xlab="log2(counts+1)", cex.lab=0.7, panel.first=grid())
    plotDensity(log2(counts(dds.norm, normalized=TRUE)+epsilon), col=coldata$color,
                xlab="log2(normalized counts)", cex.lab=0.7, panel.first=grid())
    # Restore default parameters
    par(mfrow=c(1,1), cex.lab=1)


    # # Scatter plot
    # # Define a function to draw a scatter plot for a pair of variables (samples) with density colors
    # plotFun <- function(x,y){
    #   dns <- densCols(x,y);
    #   points(x,y, col=dns, pch=".", panel.first=grid());
    #   #  abline(a=0, b=1, col="brown")
    # }
    #
    # # Plot the scatter plot for a few pairs of variables selected at random
    # set.seed(123) # forces the random number generator to produce fixed results
    # pairs(log2(count.table[,sample(ncol(count.table),10)] + epsilon),
    #       panel=plotFun, lower.panel = NULL)
    #

    ## PCA
    pca.data <- DESeq2::plotPCA(rld,intgroup=c("condition"),returnData=TRUE)
    percentVar <- round(100 * attr(pca.data, "percentVar"))
    myplot <- ggplot(pca.data, aes(PC1, PC2, color=condition)) +
                     geom_point(size=3) +
                     xlab(paste0("PC1: ",percentVar[1],"% variance")) +
                     ylab(paste0("PC2: ",percentVar[2],"% variance")) +
										 theme(panel.grid.major = element_blank(),
													 panel.grid.minor = element_blank(),
													 panel.background = element_blank(),
                           panel.border = element_rect(colour = "black", fill=NA, size=1))
    print(myplot)

		## WRITE PC1 vs PC2 VALUES TO FILE
		pca.vals <- pca.data[,1:2]
		colnames(pca.vals) <- paste(colnames(pca.vals),paste(percentVar,'% variance',sep=""), sep=": ")
		pca.vals <- cbind(sample = rownames(pca.vals), pca.vals)
		write.table(pca.vals,file=paste("analysis",".pca.vals.txt",sep=""),row.names=FALSE,col.names=TRUE,sep="\t",quote=TRUE)

		## Dispersion plot
		plotDispEsts(dds, main="Dispersion plot")


    ## SAMPLE CORRELATION HEATMAP
    sampleDists <- dist(t(assay(rld)))
    sampleDistMatrix <- as.matrix(sampleDists)
    colors <- colorRampPalette( rev(brewer.pal(9, "Blues")) )(255)
    pheatmap(sampleDistMatrix,clustering_distance_rows=sampleDists,clustering_distance_cols=sampleDists,col=colors)

    library(RColorBrewer)
    library(gplots)
    (mycols <- brewer.pal(8, "Dark2")[1:length(unique(coldata$condition))])
    par(mar=c(10,4,4,2))
    heatmap.2(sampleDistMatrix, key=F, trace="none",
              col=colorRampPalette( rev(brewer.pal(9, "Blues")) )(255),
              ColSideColors=mycols[coldata$condition], RowSideColors=mycols[coldata$condition],
              margin=c(10, 10), main="Sample Distance Matrix",cexRow=0.8, cexCol=0.8)

    ## Box plot of counts
    par(mar=c(10,3,1,1))
    nor.counts <- counts(dds,normalized=TRUE)
    nor.counts = log2(nor.counts+1)
    par(cex.lab=0.8) # is for y-axis
    par(cex.axis=0.8) # is for x-axis
    boxplot(nor.counts,col=mycols[coldata$condition],las=2,ylab='log2(nor.counts+1)', boxwex = 0.5)


		## WRITE SAMPLE DISTANCES TO FILE
		write.table(cbind(sample = rownames(sampleDistMatrix), sampleDistMatrix),file=paste("analysis",".sample.dists.txt",sep=""),row.names=FALSE,col.names=TRUE,sep="\t",quote=FALSE)

		dev.off()
}

#plotDensity(log2(count.table + epsilon), col=coldata$color, lty=1, lwd=2)
#grid()
#legend("topright", legend=names(groupColor), col=groupColor, lwd=2)


################################################
################################################
## SAVE SIZE FACTORS  												##
################################################
################################################

SizeFactorsDir <- "sizeFactors/"
if (file.exists(SizeFactorsDir) == FALSE) {
    dir.create(SizeFactorsDir,recursive=TRUE)
}

NormFactorsFile <- paste(SizeFactorsDir, "analysis",".sizeFactors.RData",sep="")
if (file.exists(NormFactorsFile) == FALSE) {
    normFactors <- sizeFactors(dds)
    save(normFactors,file=NormFactorsFile)

		for (name in names(sizeFactors(dds))) {
		    sizeFactorFile <- paste(SizeFactorsDir,name,"analysis",".sizeFactor.txt",sep="")
		    if (file.exists(sizeFactorFile) == FALSE) {
		      write(as.numeric(sizeFactors(dds)[name]),file=sizeFactorFile)
		    }
		}
}

################################################
################################################
## WRITE LOG FILE   													##
################################################
################################################

LogFile <- paste("analysis",".log",sep="")
if (file.exists(LogFile) == FALSE) {
    cat("\nSamples =",samples.vec,"\n\n",file=LogFile,append=TRUE,sep=', ')
    cat("Groups =",groups,"\n\n",file=LogFile,append=TRUE,sep=', ')
    cat("Dimensions of count matrix =",dim(counts),"\n\n",file=LogFile,append=FALSE,sep=' ')
    cat("\n",file=LogFile,append=TRUE,sep='')
}

################################################
################################################
## LOOP THROUGH COMPARISONS    								##
################################################
################################################

ResultsFile <- paste("analysis",".results.txt",sep="")

raw.counts <- counts(dds,normalized=FALSE)
colnames(raw.counts) <- paste(colnames(raw.counts),'raw',sep='.')
pseudo.counts <- counts(dds,normalized=TRUE)
colnames(pseudo.counts) <- paste(colnames(pseudo.counts),'pseudo',sep='.')

deseq2_results_list <- list()
comparisons <- combn(unique(groups),2)


#ncol(comparisons)
library(AnnotationHub)
hub <- AnnotationHub()
#d <- display(hub)
#query(hub, "Felis")
Felis <- hub[["AH66297"]]  #org.Felis_catus.eg.sqlite AH61890
#snapshotDate(): 2018-10-24     snapshotDate(): 2018-04-30
#pd <- possibleDates(hub)
#snapshotDate(hub) <- pd[111]
library(clusterProfiler)
#1:ncol(comparisons)
# 2, 3, 8, 12
#
for (idx in 1:ncol(comparisons)) {
        control.group <- comparisons[1,idx]
        treat.group <- comparisons[2,idx]
        CompPrefix <- paste(control.group,treat.group,sep="vs")
        cat("Saving results for ",CompPrefix," ...\n",sep="")

        CompOutDir <- paste(CompPrefix,'/',sep="")
        if (file.exists(CompOutDir) == FALSE) {
            dir.create(CompOutDir,recursive=TRUE)
        }

        control.samples <- samples.vec[which(groups == control.group)]
        treat.samples <- samples.vec[which(groups == treat.group)]
        comp.samples <- c(control.samples,treat.samples)
        print(c(control.group,treat.group))
        print("=================================")
        print(control.group)
        print("control.samples")
        print(control.samples)
        print("=================================")
        print("treat.samples")
        print(treat.group)
        print(treat.samples)
        print("=================================")
        print(c("condition",c(control.group,treat.group)))
        comp.results <- results(dds,contrast=c("condition",c(control.group,treat.group)))

        # Annotations #######################################################################################
#        library(biomaRt) # Install the biomaRt package (Bioconductor) if this command does not work
        # Get the Ensembl ID to gene name mapping:
#        mart <- useMart("ENSEMBL_MART_ENSEMBL", dataset = "fcatus_gene_ensembl", host="www.ensembl.org")
#        ensembl2name <- getBM(attributes=c("ensembl_gene_id","external_gene_name","entrezgene", "description"),mart=mart)

        #library(stringr)
        #ensembl2name$test <- str_split_fixed(ensembl2name$description, "Acc:", 2)

        #, "uniprot_gn"
        # Merge it with our gene-level statistics:
#        comp.results <- merge(x=as.data.frame(comp.results), y=ensembl2name, by.x=0, by.y=1, all.x=TRUE)
#        colnames(comp.results) <- c("ensembl", "baseMean","log2FoldChange", "lfcSE", "stat", "pvalue", "padj", "gene", "entrezgene", "description")
        # , "uniprot_gn"
#        comp.results <- comp.results[!duplicated(comp.results$ensembl), ] #### keep uniq gene ids , fromLast=T
#        rownames(comp.results) <- comp.results[,1]

        #library(tidyr)
        #a <- separate(data = ensembl2name, col = description, into = c("left", "right"), sep = "Acc:")


        #res <- comp.results

        comp.df <- as.data.frame(comp.results)
        comp.table <- cbind(interval.table, as.data.frame(comp.df), raw.counts[,paste(comp.samples,'raw',sep='.')], pseudo.counts[,paste(comp.samples,'pseudo',sep='.')])

        resultDESeq2 <- as.data.frame(comp.results)
        #resultDESeq2 <- cbind(interval.table, as.data.frame(comp.results))
        #rownames(resultDESeq2) <- resultDESeq2[,1]
        #resultDESeq2[,1] <- NULL

        ## WRITE RESULTS FILE
        CompResultsFile <- paste(CompOutDir,CompPrefix,"analysis",".deseq2.results.txt",sep="")
        write.table(comp.table, file=CompResultsFile, col.names=TRUE, row.names=FALSE, sep='\t', quote=FALSE)

        ## FILTER RESULTS BY FDR & LOGFC AND WRITE RESULTS FILE
				pdf(file=paste(CompOutDir,CompPrefix,"analysis",".deseq2.plots.pdf",sep=""),width=10,height=8)
				if (length(comp.samples) > 2) {
		        for (MIN_FDR in c(0.05)) {
                #0.01,
		            ## SUBSET RESULTS BY FDR
		            pass.fdr.table <- subset(comp.table, padj < MIN_FDR)
		            pass.fdr.up.table <- subset(comp.table, padj < MIN_FDR & log2FoldChange > 0)
		            pass.fdr.down.table <- subset(comp.table, padj < MIN_FDR & log2FoldChange < 0)

		            ## SUBSET RESULTS BY FDR AND LOGFC
		            pass.fdr.logFC.table <- subset(comp.table, padj < MIN_FDR & abs(log2FoldChange) >= 1)
		            pass.fdr.logFC.up.table <- subset(comp.table, padj < MIN_FDR & abs(log2FoldChange) >= 1 & log2FoldChange > 0)
		            pass.fdr.logFC.down.table <- subset(comp.table, padj < MIN_FDR & abs(log2FoldChange) >= 1 & log2FoldChange < 0)

		            ## WRITE RESULTS FILE
		            CompResultsFile <- paste(CompOutDir,CompPrefix,"analysis",".deseq2.FDR",MIN_FDR,".results.txt",sep="")
		            CompBEDFile <- paste(CompOutDir,CompPrefix,"analysis",".deseq2.FDR",MIN_FDR,".results.bed",sep="")
		            write.table(pass.fdr.table, file=CompResultsFile, col.names=TRUE, row.names=FALSE, sep='\t', quote=FALSE)
		            write.table(pass.fdr.table[,c("Geneid","log2FoldChange")], file=CompBEDFile, col.names=FALSE, row.names=FALSE, sep='\t', quote=FALSE)
                #
		            CompResultsFile <- paste(CompOutDir,CompPrefix,"analysis",".deseq2.FDR",MIN_FDR,".FC2.results.txt",sep="")
		            CompBEDFile <- paste(CompOutDir,CompPrefix,"analysis",".deseq2.FDR",MIN_FDR,".FC2.results.bed",sep="")
		            write.table(pass.fdr.logFC.table, file=CompResultsFile, col.names=TRUE, row.names=FALSE, sep='\t', quote=FALSE)
		            write.table(pass.fdr.logFC.table[,c("Geneid","log2FoldChange")], file=CompBEDFile, col.names=FALSE, row.names=FALSE, sep='\t', quote=FALSE)
                #Geneid
		            ## MA PLOT & VOLCANO PLOT
		            #DESeq2::plotMA(comp.results, main=paste("MA plot FDR <= ",MIN_FDR,sep=""), colNonSig = "blue", ylim=c(-2,2),alpha=MIN_FDR)
		            #abline(h=c(-1:1), col="red")
		            #plot(comp.table$log2FoldChange, -1*log10(comp.table$padj), col=ifelse(comp.table$padj<=MIN_FDR, "red", "black"), xlab="logFC", ylab="-1*log10(FDR)", main=paste("Volcano plot FDR <=",MIN_FDR,sep=" "), pch=20)

		            # VOLCANO with labels, Volcano plot
		            alpha <- MIN_FDR # Threshold on the p-value

		            # par(mfrow=c(1,2))
		            # Compute significance, with a maximum of 320 for the p-values set to 0 due to limitation of computation precision
		            resultDESeq2$sig <- -log10(resultDESeq2$padj)
		            sum(is.infinite(resultDESeq2$sig))

		            resultDESeq2[is.infinite(resultDESeq2$sig),"sig"] <- 350

		            # View(resultDESeq2[is.na(resultDESeq2$pvalue),])

		            # Select genes with a defined p-value (DESeq2 assigns NA to some genes)
		            genes.to.plot <- !is.na(resultDESeq2$pvalue)
		            # sum(genes.to.plot)
		            range(resultDESeq2[genes.to.plot, "log2FoldChange"])


		            # View(resultDESeq2[genes.to.plot,])

		            # ## Volcano plot of adjusted p-values
		            # cols <- densCols(resultDESeq2$log2FoldChange, resultDESeq2$sig)
		            # cols[resultDESeq2$pvalue ==0] <- "purple"
		            # resultDESeq2$pch <- 19
		            # resultDESeq2$pch[resultDESeq2$pvalue ==0] <- 6
		            # plot(resultDESeq2$log2FoldChange,
		            #      resultDESeq2$sig,
		            #      col=cols, panel.first=grid(),
		            #      main="Volcano plot",
		            #      xlab="Effect size: log2(fold-change)",
		            #      ylab="-log10(adjusted p-value)",
		            #      pch=resultDESeq2$pch, cex=0.4)
		            # abline(v=0)
		            # abline(v=c(-1,1), col="brown")
		            # abline(h=-log10(alpha), col="brown")
		            #
		            # ## Plot the names of a reasonable number of genes, by selecting those begin not only significant but also having a strong effect size
		            # gn.selected <- abs(resultDESeq2$log2FoldChange) > 2 & resultDESeq2$padj < alpha
		            # text(resultDESeq2$log2FoldChange[gn.selected],
		            #      -log10(resultDESeq2$padj)[gn.selected],
		            #      lab=rownames(resultDESeq2)[gn.selected ], cex=0.6)
		            ################################################
		            ################################################
		            ################################################
		            ################################################
		            ## GO TERM ANALYSIS USING GPROFILER ##
		            library(gprofiler2)
		            resultDESeq2.df <- na.omit(data.frame(comp.results))

		            resultDESeq2.df <- resultDESeq2.df[order((resultDESeq2.df$log2FoldChange), decreasing = TRUE),]
		            induced.sign <- rownames(resultDESeq2.df)[resultDESeq2.df$log2FoldChange >= 1 &  resultDESeq2.df$padj < alpha]
		            # Sort the gene-level statistics by adjusted p-value:
		            term.induced <- gost(query=induced.sign, organism = "fcatus", ordered_query = TRUE,
		                                 multi_query = FALSE, significant = FALSE, exclude_iea = TRUE,
		                                 measure_underrepresentation = FALSE, evcodes = FALSE,
		                                 user_threshold = 0.05, correction_method = "g_SCS",
		                                 domain_scope = "annotated", custom_bg = NULL,
		                                 numeric_ns = "", sources = NULL)
		            termres <- term.induced$result
		            termres <- termres[order(termres$p_value),]
		            p <- gostplot(term.induced, capped = FALSE, interactive = FALSE)
		            ProResultsFileUp <- paste(CompOutDir,CompPrefix,"analysis",".deseq2.FDR",MIN_FDR,".gprofiler.up.txt",sep="")
		            ProResultsFileUppdf <- paste(CompOutDir,CompPrefix,"analysis","_deseq2_FDR",MIN_FDR*100,"_gprofiler_up.pdf",sep="")
		            h1 <- as.list(term.induced$result$term_id[1:4])
		            publish_gostplot(p, highlight_terms = h1, filename = ProResultsFileUppdf)
		            write.table(termres, file=ProResultsFileUp, col.names=TRUE, row.names=FALSE, sep='\t', quote=FALSE)

		            resultDESeq2.df <- resultDESeq2.df[order((resultDESeq2.df$log2FoldChange), decreasing = FALSE),]
		            repressed.sign <- rownames(resultDESeq2.df)[resultDESeq2.df$log2FoldChange <= -1 &  resultDESeq2.df$padj < alpha]
		            # Sort the gene-level statistics by adjusted p-value:
		            term.repressed <- gost(query=repressed.sign, organism = "fcatus", ordered_query = TRUE,
		                                   multi_query = FALSE, significant = FALSE, exclude_iea = TRUE,
		                                   measure_underrepresentation = FALSE, evcodes = FALSE,
		                                   user_threshold = 0.05, correction_method = "g_SCS",
		                                   domain_scope = "annotated", custom_bg = NULL,
		                                   numeric_ns = "", sources = NULL)
		            termresrep <- term.repressed$result
		            termresrep <- termresrep[order(termresrep$p_value),]
		            r <- gostplot(term.repressed, capped = FALSE, interactive = FALSE)
		            ProResultsFileDN <- paste(CompOutDir,CompPrefix,"analysis",".deseq2.FDR",MIN_FDR,".gprofiler.dn.txt",sep="")
		            ProResultsFileDNpdf <- paste(CompOutDir,CompPrefix,"analysis","_deseq2_FDR",MIN_FDR*100,"_gprofiler_dn.pdf",sep="")
		            h <- as.list(term.repressed$result$term_id[1:5])
		            publish_gostplot(r, highlight_terms = h, filename = ProResultsFileDNpdf)
		            write.table(termresrep, file=ProResultsFileDN, col.names=TRUE, row.names=FALSE, sep='\t', quote=FALSE)

		            ########################################################################################
		            library(biomaRt) # Install the biomaRt package (Bioconductor) if this command does not work
		            geneLevelStats <- subset(resultDESeq2.df , padj < alpha) #abs(log2FoldChange) >= 1 &

		            # Get the Ensembl ID to gene name mapping:
		            mart <- useMart("ENSEMBL_MART_ENSEMBL", dataset = "fcatus_gene_ensembl", host="www.ensembl.org")
		            ensembl2name <- getBM(attributes=c("ensembl_gene_id","external_gene_name","entrezgene", "description"),mart=mart)

		            #library(stringr)
		            #ensembl2name$test <- str_split_fixed(ensembl2name$description, "Acc:", 2)


		            #, "uniprot_gn"
		            # Merge it with our gene-level statistics:
		            geneLevelStats <- merge(x=geneLevelStats, y=ensembl2name, by.x=0, by.y=1, all.x=TRUE)
		            colnames(geneLevelStats) <- c("ensembl","baseMean","log2FoldChange", "lfcSE", "stat", "pvalue", "padj","gene","entrezgene", "description")
		            # , "uniprot_gn"
		            geneLevelStats <- geneLevelStats[!duplicated(geneLevelStats$ensembl), ] #### keep uniq gene ids , fromLast=T
		            rownames(geneLevelStats) <- geneLevelStats[,1]

		            # Sort the gene-level statistics by adjusted p-value:
		            geneLevelStats <- geneLevelStats[order(geneLevelStats$padj),]

		            ## KEGG over-representation test
		            mydf <- geneLevelStats #data.frame(Entrez=gene, FC=geneList)
		            mydf <- mydf[abs(mydf$log2FoldChange) > 1,]
		            mydf$group <- "Up"
		            mydf$group[mydf$log2FoldChange < 0] <- "Down"
		            mydf$othergroup <- "<4 fold"
		            mydf$othergroup[abs(mydf$log2FoldChange) > 2] <- ">4 fold"


		            ####
		            formula_res_go <- compareCluster(ensembl~group, data=mydf, fun="enrichGO", OrgDb = Felis, keyType = 'ENSEMBL',
		                                             pAdjustMethod = "BH", pvalueCutoff  = 0.05, qvalueCutoff  = 0.05, readable = TRUE)
		            GOResultsFile <- paste(CompOutDir,CompPrefix,"analysis",".deseq2.FDR",MIN_FDR,".GO.txt",sep="")
		            write.table(as.data.frame(formula_res_go), file=GOResultsFile, col.names=TRUE, row.names=FALSE, sep='\t', quote=FALSE)
		            title_comp_go = paste(CompPrefix,  " enrichGO (FDR:",MIN_FDR, " FC>2)",sep="")
		            print(dotplot(formula_res_go, x=~group, showCategory=20, title=title_comp_go)) #+ ggplot2::facet_grid(~othergroup))

		            formula_res <- compareCluster(entrezgene~group, data=mydf, fun="enrichKEGG", organism = 'fca')
		            #head(as.data.frame(formula_res))
		            KEGGResultsFile <- paste(CompOutDir,CompPrefix,"analysis",".deseq2.FDR",MIN_FDR,".kegg.txt",sep="")
		            write.table(as.data.frame(formula_res), file=KEGGResultsFile, col.names=TRUE, row.names=FALSE, sep='\t', quote=FALSE)
		            #dotplot(formula_res)
		            title_comp = paste(CompPrefix,  " enrichKEGG (FDR:",MIN_FDR, " FC>2)",sep="")
		            print(dotplot(formula_res, x=~group, showCategory=20, title=title_comp)) #+ ggplot2::facet_grid(~othergroup))


		            ####
		            formula_res_gp_go <- compareCluster(ensembl~group+othergroup, data=mydf, fun="enrichGO", OrgDb = Felis, keyType = 'ENSEMBL',
		                                             pAdjustMethod = "BH", pvalueCutoff  = 0.05, qvalueCutoff  = 0.05, readable = TRUE)
		            GOgpResultsFile <- paste(CompOutDir,CompPrefix,"analysis",".deseq2.FDR",MIN_FDR,".GOgp.txt",sep="")
		            write.table(as.data.frame(formula_res_gp_go), file=GOgpResultsFile, col.names=TRUE, row.names=FALSE, sep='\t', quote=FALSE)
		            title_comp_gp_go = paste(CompPrefix,  " enrichGO fold comparision (FDR:",MIN_FDR, " FC>2)",sep="")
		            print(dotplot(formula_res_gp_go, x=~group, showCategory=20, title=title_comp_gp_go) + ggplot2::facet_grid(~othergroup))

		            formula_res_gp <- compareCluster(entrezgene~group+othergroup, data=mydf, fun="enrichKEGG", organism = 'fca')
		            #head(as.data.frame(formula_res_gp))
		            KEGGResultsFile <- paste(CompOutDir,CompPrefix,"analysis",".deseq2.FDR",MIN_FDR,".kegg_grouped.txt",sep="")
		            write.table(as.data.frame(formula_res_gp), file=KEGGResultsFile, col.names=TRUE, row.names=FALSE, sep='\t', quote=FALSE)
		            #dotplot(formula_res_gp)
		            title_comp_gp = paste(CompPrefix, " enrichKEGG fold comparision" ,  " (FDR:",MIN_FDR, " FC>2)",sep="")
		            print(dotplot(formula_res_gp, x=~group, showCategory=20, title=title_comp_gp) + ggplot2::facet_grid(~othergroup))

		            ############
		            mydf_up <- mydf[mydf$log2FoldChange > 1,]
		            mydf_up <- na.omit(mydf_up)
		            gene <- mydf_up$entrezgene
		            geneList <- as.vector(mydf_up$log2FoldChange)
		            kk <- clusterProfiler::enrichKEGG(gene = gene,
		                                               organism     = 'fca',
		                                               pAdjustMethod = "BH",
		                                               pvalueCutoff = 0.05,
		                                               qvalueCutoff  = 0.05)
		            # #gene = geneLevelStats$ensembl, keyType = 'ENSEMBL'
		            #
		            # head(as.data.frame(kk)[,-8])
		            #ProResultsFileUp <- paste(CompOutDir,CompPrefix,"analysis",".deseq2.FDR",MIN_FDR,".enrichKEGG.up.txt",sep="")
		            #write.table(as.data.frame(kk), file=ProResultsFileUp, col.names=TRUE, row.names=FALSE, sep='\t', quote=FALSE)

		            # #plots
		            #print(cnetplot(kk, categorySize="geneNum", foldChange=geneList))
		            kk <- setReadable(kk, OrgDb = Felis, keyType = "ENTREZID")
		            tryCatch(print(cnetplot(kk, categorySize="pvalue", foldChange=geneList)), error = function(e) cat("Error: ",e$message, "\n"))
		            # print(emapplot(kk))
		            # print(barplot(kk, drop=TRUE, showCategory=12))
		            #title_up = paste(CompPrefix,  " (Upregulated, FDR:",MIN_FDR, ", FC>2)",sep="")
		            #print(dotplot(kk, title=title_up))

		            mydf_down <- mydf[mydf$log2FoldChange < -1,]
		            mydf_down <- na.omit(mydf_down)
		            gene <- mydf_down$entrezgene
		            geneList <- as.vector(mydf_down$log2FoldChange)
		            kk_dn <- clusterProfiler::enrichKEGG(gene = gene,
		                                              organism     = 'fca',
		                                              pAdjustMethod = "BH",
		                                              pvalueCutoff = 0.05,
		                                              qvalueCutoff  = 0.05)
		            # #gene = geneLevelStats$ensembl, keyType = 'ENSEMBL'
		            #
		            # head(as.data.frame(kk)[,-8])
		            #ProResultsFileDn <- paste(CompOutDir,CompPrefix,"analysis",".deseq2.FDR",MIN_FDR,".enrichKEGG.dn.txt",sep="")
		            #write.table(as.data.frame(kk_dn), file=ProResultsFileDn, col.names=TRUE, row.names=FALSE, sep='\t', quote=FALSE)
		            # # #plots
		            #print(cnetplot(kk_dn, categorySize="geneNum", foldChange=geneList))
		            #print(cnetplot(kk_dn, categorySize="pvalue", foldChange=geneList))
		            kk_dn <- setReadable(kk_dn, OrgDb = Felis, keyType = "ENTREZID")
		            tryCatch(print(cnetplot(kk_dn, categorySize="pvalue", foldChange=geneList)), error = function(e) cat("Error: ",e$message, "\n"))

		            #kk_dn1@gene2Symbol <- as.character(mydf_down1$gene)
		            #kk_dn1@keytype <- 'auto'
		            #kk_dn1@result <- kk_dn1@result
		            #kk_dn1@readable <- TRUE

		            # heatplot(kk_dn, foldChange = geneList, showCategory = 100)+ ggplot2::coord_flip()


		            # print(emapplot(kk_dn))
		            # print(barplot(kk_dn, drop=TRUE, showCategory=12))
		            # print(dotplot(kk_dn))
		            #title_dn = paste(CompPrefix,  " (Downregulated, FDR:",MIN_FDR, ", FC>2)",sep="")
		            #print(dotplot(kk_dn, title=title_dn))

		            ## ADD COUNTS TO LOGFILE
		            cat(CompPrefix," genes with FDR <= ",MIN_FDR,": ",nrow(pass.fdr.table)," (up=",nrow(pass.fdr.up.table),", down=",nrow(pass.fdr.down.table),")","\n",file=LogFile,append=TRUE,sep="")
		            cat(CompPrefix," genes with FDR <= ",MIN_FDR," & FC > 2: ",nrow(pass.fdr.logFC.table)," (up=",nrow(pass.fdr.logFC.up.table),", down=",nrow(pass.fdr.logFC.down.table),")","\n",file=LogFile,append=TRUE,sep="")
		            ## correlogram
		            # corrgram::corrgram(raw.counts[,control.samples], order=TRUE, lower.panel=corrgram::panel.pie,
		            #                    upper.panel=corrgram::panel.pts, text.panel=corrgram::panel.txt,
		            #                    main="Correlogram of controls")


		        }
		        cat("\n",file=LogFile,append=TRUE,sep="")
				}


				## SAMPLE CORRELATION HEATMAP
        rld.subset <- assay(rld)[,comp.samples]
        sampleDists <- dist(t(rld.subset))
        sampleDistMatrix <- as.matrix(sampleDists)
        colors <- colorRampPalette( rev(brewer.pal(9, "Blues")) )(255)
        pheatmap(sampleDistMatrix,clustering_distance_rows=sampleDists,clustering_distance_cols=sampleDists,col=colors)
        #print(mycols)
        (mycols1 <- brewer.pal(8, "Dark2")[1:length(unique(c(control.group,treat.group)))])
        cl <-length(control.samples)
        tl <-length(treat.samples)
        col_scale <- c(rep(mycols1[1], cl), rep(mycols1[2], tl))
        par(mar=c(10,4,4,2))
        heatmap.2(sampleDistMatrix, key=F, trace="none",
                  col=colorRampPalette( rev(brewer.pal(9, "Blues")) )(255),
                  ColSideColors=col_scale[!is.na(col_scale)], RowSideColors=col_scale[!is.na(col_scale)],
                  margin=c(10, 10), main="Sample Distance Matrix",cexRow=0.8, cexCol=0.8)


        # ## SCATTER PLOT FOR RLOG COUNTS
        # combs <- combn(comp.samples,2,simplify=FALSE)
        # clabels <- sapply(combs,function(x){paste(x,collapse=' & ')})
        # plotdat <- data.frame(x=unlist(lapply(combs, function(x){rld.subset[, x[1] ]})),y=unlist(lapply(combs, function(y){rld.subset[, y[2] ]})),comp=rep(clabels, each=nrow(rld.subset)))
        # myplot <- xyplot(y~x|comp,plotdat,
        #                  panel=function(...){
        #                      panel.xyplot(...)
        #                      panel.abline(0,1,col="red")
        #                  },
        #                  par.strip.text=list(cex=0.5))
        #print(myplot)
        ## Top gene
        # res <- res[order(res$padj), ]
        # topGene <- rownames(res)[which.min(res$padj)]
        # print(res[topGene,])
        # # plotCounts(dds, gene=topGene, intgroup=c("condition"))
        #
        # ## all genes with padj < 0.05 and logFC > 1
        # # res.plot <- subset(res, padj < 0.05 & abs(log2FoldChange) >= 1)
        # # for(i in 1:nrow(res.plot)) {
        # #   row <- res.plot[i,]
        # #   plotCounts(dds, gene=rownames(row), intgroup=c("condition"))
        # # }
        # ## Examine plot of p-values, draw an histogram of the p-values
        hist(resultDESeq2$pvalue, breaks=50, col="grey", main="DESeq2 p-value distribution", xlab="Nominal P-value", ylab="Number of genes")

        #########################################################
        resultDESeq2$sig <- -log10(resultDESeq2$padj)
        selectedGenes <- c(
          "Most significant" =  rownames(resultDESeq2)[which.max(resultDESeq2$sig)])

        #  top.logFC = rownames(resultDESeq2)[which.max(resultDESeq2$log2FoldChange)])
        #gn.most.sign <- rownames(resultDESeq2)[1]
        #gn.most.diff.val <- counts(dds.norm, normalized=T)[gn.most.sign,]

        ## Select a gene with small fold change but high significance
        sel1 <- subset(
          na.omit(resultDESeq2),
          sig >= 50 & log2FoldChange > 0 & log2FoldChange < 0.5)
        # dim(sel1)
        selectedGenes <- append(selectedGenes,
                                c("Small FC yet significant"=rownames(sel1)[1]))

        ## Select the non-significant gene with the highest fold change
        sel2 <- subset(x = na.omit(resultDESeq2), padj > alpha & log2FoldChange > 0 & baseMean > 1000 & baseMean < 10000)
        # dim(sel2)
        sel2 <- sel2[order(sel2$log2FoldChange, decreasing = TRUE),][1,]
        selectedGenes <- append(
          selectedGenes,
          c("Non-significant"=rownames(sel2)[1]))

        par(mfrow=c(length(na.omit(selectedGenes)),1))
        # for (g in na.omit(selectedGenes)) {
        #   barplot(counts(dds.norm, normalized=TRUE)[g,],
        #           col=coldata$color,
        #           main=g, las=2, cex.names=0.5)
        # }
        compt <- cbind(pseudo.counts[,paste(comp.samples,'pseudo',sep='.')])
        for (g in na.omit(selectedGenes)) {
          barplot(compt[g,],
                  col=coldata[comp.samples,]$color,
                  main=g, las=2, cex.names=0.5)
        }

        dev.off()

        colnames(comp.df) <- paste(CompPrefix,".",colnames(comp.df),sep="")
				deseq2_results_list[[idx]] <- comp.df
    }


## WRITE RESULTS FROM ALL COMPARISONS TO FILE
#
deseq2_results_table <- cbind(interval.table, do.call(cbind, deseq2_results_list),raw.counts,pseudo.counts)
write.table(deseq2_results_table, file=ResultsFile, col.names=TRUE, row.names=FALSE, sep='\t', quote=FALSE)


################################################
################################################
## R SESSION INFO															##
################################################
################################################

RLogFile <- "R_sessionInfo.log"
if (file.exists(RLogFile) == FALSE) {
    sink(RLogFile)
    a <- sessionInfo()
    print(a)
    sink()
}

################################################
################################################
################################################
################################################
# library(biomaRt) # Install the biomaRt package (Bioconductor) if this command does not work
#
# resultDESeq2.df <- na.omit(data.frame(comp.results))
# induced.sign <- subset(resultDESeq2.df , log2FoldChange >= 1 & padj < alpha)
# geneLevelStats <- induced.sign[,c("log2FoldChange","padj")]
#
#
# # Get the Ensembl ID to gene name mapping:
# mart <- useMart("ENSEMBL_MART_ENSEMBL", dataset = "fcatus_gene_ensembl", host="www.ensembl.org")
# ensembl2name <- getBM(attributes=c("ensembl_gene_id","external_gene_name","entrezgene"),mart=mart)
#
# # Merge it with our gene-level statistics:
# geneLevelStats <- merge(x=geneLevelStats, y=ensembl2name, by.x=0, by.y=1, all.x=TRUE)
# colnames(geneLevelStats) <- c("ensembl","log2fc","padj","gene","entrezgene")
# geneLevelStats <- geneLevelStats[!duplicated(geneLevelStats$ensembl), ] ####
# rownames(geneLevelStats) <- geneLevelStats[,1]
#
# # Sort the gene-level statistics by adjusted p-value:
# geneLevelStats <- geneLevelStats[order(geneLevelStats$padj),]
#
# ## KEGG over-representation test
# geneLevelStats <- na.omit(geneLevelStats, cols = c("entrezgene"))
#
# gene <- na.omit(geneLevelStats$entrezgene)
# geneList <- as.vector(geneLevelStats$log2fc)
#
# kk <- clusterProfiler::enrichKEGG(gene = gene,
#                                   organism     = 'fca',
#                                   pAdjustMethod = "BH",
#                                   pvalueCutoff = 0.05,
#                                   qvalueCutoff  = 0.05)
# #gene = geneLevelStats$ensembl, keyType = 'ENSEMBL'
#
# head(as.data.frame(kk)[,-8])
# #plots
# cnetplot(kk, categorySize="geneNum", foldChange=geneList)
# cnetplot(kk, categorySize="pvalue", foldChange=geneList)
# emapplot(kk)
# barplot(kk, drop=TRUE, showCategory=12)
# dotplot(kk)


# ## KEGG GSA test
# geneList <- geneLevelStats[c("entrezgene", "log2fc", "uniprot_gn")]
# geneList <- geneList[!duplicated(geneList$uniprot_gn), ] ####
# geneList <- geneList[geneList$log2fc < 0,]
# geneList <- geneList[abs(geneList$log2fc) > 1,]
# ## feature 1: numeric vector
# geneList1 <- as.vector(geneList$log2fc)
# ## feature 2: named vector
# names(geneList1) = geneList$uniprot_gn
# ## feature 3: decreasing order
# geneList1 = sort(geneList1, decreasing = TRUE)
#
# kk2 <- gseKEGG(geneList     = geneList1,
#                organism     = 'fca',
#                nPerm        = 10000,
#                keyType = 'uniprot',
#                minGSSize    = 1,
#                pvalueCutoff = 0.9,
#                verbose      = TRUE)
# head(kk2)
#
#
# # if (!requireNamespace("BiocManager", quietly = TRUE))
# #   install.packages("BiocManager")
# # BiocManager::install("biomaRt", version = "3.8")
# library("biomaRt")
# 		if (!requireNamespace("BiocManager", quietly = TRUE))
# 		  install.packages("BiocManager")
# 		BiocManager::install("AnnotationForge", version = "3.8")
#
#
# 		library(clusterProfiler)
# 		Gff2GeneTable("/lrlhps/users/c274411/00_references/aws-igenomes/Felis_catus/Felis_catus_9.0/Ensembl/Annotation/Genes/Felis_catus.Felis_catus_9.0.94.gtf")
#
# 		Gff2GeneTable("/lrlhps/users/c274411/TEST/scripts/Felis_catus.Felis_catus_9.0.94.chr.gff3")
#
# 		Gff2GeneTable("/lrlhps/users/c274411/TEST/scripts/GCF_000181335.3_Felis_catus_9.0_genomic.gff")
#
# 		#gffread /lrlhps/users/c274411/00_references/aws-igenomes/Felis_catus/Felis_catus_9.0/Ensembl/Annotation/Genes/Felis_catus.Felis_catus_9.0.94.gtf -o- > Felis_catus.Felis_catus_9.0.94.gff
#
#
# 		Gff2GeneTable("/lrlhps/users/c274411/TEST/scripts/GCF_000181335.3_Felis_catus_9.0_genomic.gff")
# 		load("geneTable.rda")
# 		head(geneTable)
# 		eg	<- geneTable$GeneID
#
#


#if (!requireNamespace("BiocManager", quietly = TRUE))
#  install.packages("BiocManager")
#library(BiocInstaller)
#BiocManager::valid()
#BiocManager::install("AnnotationHub", version = "3.8")
#BiocManager::install("AnnotationHubData", version = "3.8")


## Gene ID conversion
#gene.df <- bitr(repressed.sign, fromType = "ENSEMBL",
#                toType = c("ENTREZID", "SYMBOL"),
#                OrgDb = Felis)
#keytypes(Felis)


ggo <- groupGO(gene     = repressed.sign,
               OrgDb    = Felis,
               keyType  = 'ENSEMBL',
               ont      = "CC",
               level    = 3,
               readable = TRUE)

head(ggo)
ego <- enrichGO(gene          = repressed.sign,
                OrgDb         = Felis,
                keyType       = 'ENSEMBL',
                pAdjustMethod = "BH",
                pvalueCutoff  = 0.05,
                qvalueCutoff  = 0.05,
                readable = TRUE)
#universe      = rownames(resultDESeq2.df), ont           = "CC",
bp2 <- simplify(ego, cutoff=0.7, by="p.adjust", select_fun=min)


## feature 1: numeric vector
geneList = mydf_up$log2fc
## feature 2: named vector
names(geneList) = as.character(mydf_up$ensembl)
## feature 3: decreasing order
geneList = sort(geneList, decreasing = TRUE)
#ego <- setReadable(ego, OrgDb = Felis, keyType = "ENSEMBL")
gsecc <- gseGO(geneList=geneList, ont="MF", keyType = 'ENSEMBL', OrgDb=Felis, verbose=F)
head(summary(gsecc))


#enrichMap(ego, vertex.label.cex=1.2, layout=igraph::layout.kamada.kawai)


cnetplot(formula_res1, foldChange=mydf$log2fc)
plotGOgraph(ego)
heatplot(ego, foldChange = mydf$log2fc, showCategory = 20000)+ ggplot2::coord_flip()