tiagochst/ELMERv1.R

## ELMERv1.R
# March 1 2018
# ooo Introduction ooo
# This code performs ELMERv1 analysis on TCGA breast cancer
# data set for time comparison purporses.
# ooo Author ooo
# Tiago Chedraoui Silva (tiagochst at gmail.com)

# To run ELMER version 1
# the we clone ELMER from https://github.com/lijingya/ELMER
# and installed its auxiliary data from https://github.com/lijingya/ELMER.data
# Some updates in the NAMESPACE were required
# Then we load the modified version with devtools::load_all(path to clone repository)

#-----------------------------------
# 1 - Methylation
# ----------------------------------
library(TCGAbiolinks)
query.lgg <- GDCquery(project = "TCGA-BRCA",
                      data.category = "DNA methylation",
                      platform = "Illumina Human Methylation 450",
                      legacy = TRUE)
GDCdownload(query.lgg,files.per.chunk = 10)
met <-GDCprepare(query.lgg, save = FALSE)
#met.elmer <- TCGAprepare_elmer(met, platform = "HumanMethylation450")
#save(met, file = "met_f1000.rda")

#-----------------------------------
# 2 - Expression
# ----------------------------------
query.exp.lgg <- GDCquery(project = "TCGA-BRCA",
                          data.category = "Gene expression",
                          data.type = "Gene expression quantification",
                          platform = "Illumina HiSeq",
                          file.type  = "normalized_results",
                          experimental.strategy = "RNA-Seq",
                          legacy = TRUE )
GDCdownload(query.exp.lgg,files.per.chunk = 50)
exp <- GDCprepare(query.exp.lgg, save = FALSE)


# This function, which was removed from TCGAbiolinks,
# will help the data input to be prepared to the ELMER standard.
TCGAprepare_elmer <- function(data,
                              platform,
                              met.na.cut = 0.2,
                              save = FALSE){
  # parameters veryfication

  if (missing(data))  stop("Please set the data parameter")
  if (missing(platform))  stop("Please set the platform parameter")

  if (grepl("illuminahiseq_rnaseqv2|illuminahiseq_totalrnaseqv2",
            platform, ignore.case = TRUE)) {
    message("============ Pre-pocessing expression data =============")
    message(paste0("1 - expression = log2(expression + 1): ",
                   "To linearize \n    relation between ",
                   "methylation and expression"))
    if(typeof(data) == typeof(SummarizedExperiment())){
      row.names(data) <- paste0("ID",values(data)$entrezgene)
      data <- assay(data)
    }

    if(all(grepl("\\|",rownames(data)))){
      message("2 - rownames  (gene|loci) => ('ID'loci) ")
      aux <- strsplit(rownames(data),"\\|")
      GeneID <- unlist(lapply(aux,function(x) x[2]))
      row.names(data) <- paste0("ID",GeneID)
    }
    data <- log2(data+1)
    Exp <- data.matrix(data)

    if (save)  save(Exp,file = "Exp_elmer.rda")
    return(Exp)
  }

  if (grepl("humanmethylation", platform, ignore.case = TRUE)) {
    message("============ Pre-pocessing methylation data =============")
    if (class(data) == class(data.frame())){
      msg <- paste0("1 - Removing Columns: \n  * Gene_Symbol  \n",
                    "  * Chromosome  \n  * Genomic_Coordinate")
      message(msg)
      data <- subset(data,select = 4:ncol(data))
    }
    if(typeof(data) == typeof(SummarizedExperiment())){
      data <- assay(data)
    }
    msg <- paste0("2 - Removing probes with ",
                  "NA values in more than 20% samples")
    message(msg)
    data <- data[rowMeans(is.na(data)) < met.na.cut,]
    Met <- data.matrix(data)
    if (save)  save(Met,file = "Met_elmer.rda")
    return (Met)
  }
}
exp <- exp[,!exp$is_ffpe]
met <- met[,!met$is_fppe]
exp.elmer <- TCGAprepare_elmer(exp, platform = "IlluminaHiSeq_RNASeqV2")
met.elmer <- TCGAprepare_elmer(met, platform = "HumanMethylation450")

# STARTING ELMER CODE
geneAnnot <- txs()
geneAnnot$GENEID <- paste0("ID",geneAnnot$GENEID)
geneInfo <- promoters(geneAnnot,upstream = 0, downstream = 0)
probe <- get.feature.probe()

# create mee object, use @ to access the matrices inside the object
mee <- fetch.mee(meth = met.elmer,
                 exp = exp.elmer,
                 TCGA = TRUE,
                 probeInfo = probe,
                 geneInfo = geneInfo)
save(mee,file = "mee_brca.rda")
# 871 samples
# 84 normal
# 787 tumor


# binary data
# @param x A matrix.
# @param Break A value to binarize the data.
# @param Break2 A value to cut value to 3 categories.
# @return A binarized matrix.
Binary <- function(x,Break=0.3,Break2=NULL){
  if(!is.numeric(x)) stop("x need to be numeric")
  change <- x
  if(is.null(Break2)){
    change[x > Break] <- 1
    change[x < Break | x== Break] <- 0
  }else{
    change[x < Break | x== Break] <- 0
    change[x> Break & x < Break2] <- NA
    change[x > Break2 | x== Break2] <-1
  }

  return(change)
}


# Available directions are hypo and hyper, we will use only hypo
# due to speed constraint
direction <- c("hypo")
cores <- 10
for (j in direction){
  print(j)
  dir.out <- paste0("elmer_version1/",j)
  dir.create(dir.out, recursive = TRUE)
  #--------------------------------------
  # STEP 3: Analysis                     |
  #--------------------------------------
  # Step 3.1: Get diff methylated probes |
  #--------------------------------------
  time.diff.probes<- system.time({
  Sig.probes <- get.diff.meth(mee, cores=cores,
                              dir.out =dir.out,
                              diff.dir=j,
                              pvalue = 0.01)
  })

  #-------------------------------------------------------------
  # Step 3.2: Identify significant probe-gene pairs            |
  #-------------------------------------------------------------
  # Collect nearby 20 genes for Sig.probes
  time.neargenes <- system.time({
  nearGenes <- GetNearGenes(TRange=getProbeInfo(mee, probe=Sig.probes$probe),
                            cores=cores,
                            geneAnnot=getGeneInfo(mee))
  })
  time.get.pair <- system.time({
  pair <- get.pair(mee=mee,
                   probes=na.omit(Sig.probes$probe),
                   nearGenes=nearGenes,
                   permu.dir=paste0(dir.out,"/permu"),
                   dir.out=dir.out,
                   cores=cores,
                   label= j,
                   permu.size=10000, # For significant results use 10000
                   Pe = 0.001) # For significant results use 0.001
  })

  Sig.probes.paired <- fetch.pair(pair=pair,
                                  probeInfo = getProbeInfo(mee),
                                  geneInfo = getGeneInfo(mee))
  Sig.probes.paired <-read.csv(paste0(dir.out,
                                      "/getPair.",j,
                                      ".pairs.significant.csv"),
                               stringsAsFactors=FALSE)[,1]


  #-------------------------------------------------------------
  # Step 3.3: Motif enrichment analysis on the selected probes |
  #-------------------------------------------------------------
  if(length(Sig.probes.paired) > 0 ){
    #-------------------------------------------------------------
    # Step 3.3: Motif enrichment analysis on the selected probes |
    #-------------------------------------------------------------
    time.get.enriched.motif <- system.time({
    enriched.motif <- get.enriched.motif(probes=Sig.probes.paired,
                                         dir.out=dir.out,
                                         label=j,
                                         background.probes = probe$name)
    })
    motif.enrichment <- read.csv(paste0(dir.out,
                                        "/getMotif.",j,".motif.enrichment.csv"),
                                 stringsAsFactors=FALSE)
    if(length(enriched.motif) > 0){
      #-------------------------------------------------------------
      # Step 3.4: Identifying regulatory TFs                        |
      #-------------------------------------------------------------
      print("get.TFs")
      time.get.TFs <- system.time({
      TF <- get.TFs(mee = mee,
                    enriched.motif = enriched.motif,
                    dir.out = dir.out,
                    cores = cores, label = j)
      })

      TF.meth.cor <- get(load(paste0(dir.out,
                                     "/getTF.",j,".TFs.with.motif.pvalue.rda")))
      time.tbl <- rbind(time.get.TFs,time.get.enriched.motif,time.get.pair,time.diff.probes,time.neargenes)
      write.csv(as.data.frame(time.tbl),file=paste0("time_unsupervised ELMERv1.csv"))
      save(TF, enriched.motif, Sig.probes.paired,
           pair, nearGenes, Sig.probes, motif.enrichment,
           TF.meth.cor,
           file=paste0(dir.out,"/ELMER_results_",j,".rda"))
    }
  }
}
	# March 1 2018
	# ooo Introduction ooo
	# This code performs ELMERv1 analysis on TCGA breast cancer
	# data set for time comparison purporses.
	# ooo Author ooo
	# Tiago Chedraoui Silva (tiagochst at gmail.com)

	# To run ELMER version 1
	# the we clone ELMER from https://github.com/lijingya/ELMER
	# and installed its auxiliary data from https://github.com/lijingya/ELMER.data
	# Some updates in the NAMESPACE were required
	# Then we load the modified version with devtools::load_all(path to clone repository)

	#-----------------------------------
	# 1 - Methylation
	# ----------------------------------
	library(TCGAbiolinks)
	query.lgg <- GDCquery(project = "TCGA-BRCA",
	data.category = "DNA methylation",
	platform = "Illumina Human Methylation 450",
	legacy = TRUE)
	GDCdownload(query.lgg,files.per.chunk = 10)
	met <-GDCprepare(query.lgg, save = FALSE)
	#met.elmer <- TCGAprepare_elmer(met, platform = "HumanMethylation450")
	#save(met, file = "met_f1000.rda")

	#-----------------------------------
	# 2 - Expression
	# ----------------------------------
	query.exp.lgg <- GDCquery(project = "TCGA-BRCA",
	data.category = "Gene expression",
	data.type = "Gene expression quantification",
	platform = "Illumina HiSeq",
	file.type = "normalized_results",
	experimental.strategy = "RNA-Seq",
	legacy = TRUE )
	GDCdownload(query.exp.lgg,files.per.chunk = 50)
	exp <- GDCprepare(query.exp.lgg, save = FALSE)


	# This function, which was removed from TCGAbiolinks,
	# will help the data input to be prepared to the ELMER standard.
	TCGAprepare_elmer <- function(data,
	platform,
	met.na.cut = 0.2,
	save = FALSE){
	# parameters veryfication

	if (missing(data)) stop("Please set the data parameter")
	if (missing(platform)) stop("Please set the platform parameter")

	if (grepl("illuminahiseq_rnaseqv2\|illuminahiseq_totalrnaseqv2",
	platform, ignore.case = TRUE)) {
	message("============ Pre-pocessing expression data =============")
	message(paste0("1 - expression = log2(expression + 1): ",
	"To linearize \n relation between ",
	"methylation and expression"))
	if(typeof(data) == typeof(SummarizedExperiment())){
	row.names(data) <- paste0("ID",values(data)$entrezgene)
	data <- assay(data)
	}

	if(all(grepl("\\\|",rownames(data)))){
	message("2 - rownames (gene\|loci) => ('ID'loci) ")
	aux <- strsplit(rownames(data),"\\\|")
	GeneID <- unlist(lapply(aux,function(x) x[2]))
	row.names(data) <- paste0("ID",GeneID)
	}
	data <- log2(data+1)
	Exp <- data.matrix(data)

	if (save) save(Exp,file = "Exp_elmer.rda")
	return(Exp)
	}

	if (grepl("humanmethylation", platform, ignore.case = TRUE)) {
	message("============ Pre-pocessing methylation data =============")
	if (class(data) == class(data.frame())){
	msg <- paste0("1 - Removing Columns: \n * Gene_Symbol \n",
	" * Chromosome \n * Genomic_Coordinate")
	message(msg)
	data <- subset(data,select = 4:ncol(data))
	}
	if(typeof(data) == typeof(SummarizedExperiment())){
	data <- assay(data)
	}
	msg <- paste0("2 - Removing probes with ",
	"NA values in more than 20% samples")
	message(msg)
	data <- data[rowMeans(is.na(data)) < met.na.cut,]
	Met <- data.matrix(data)
	if (save) save(Met,file = "Met_elmer.rda")
	return (Met)
	}
	}
	exp <- exp[,!exp$is_ffpe]
	met <- met[,!met$is_fppe]
	exp.elmer <- TCGAprepare_elmer(exp, platform = "IlluminaHiSeq_RNASeqV2")
	met.elmer <- TCGAprepare_elmer(met, platform = "HumanMethylation450")

	# STARTING ELMER CODE
	geneAnnot <- txs()
	geneAnnot$GENEID <- paste0("ID",geneAnnot$GENEID)
	geneInfo <- promoters(geneAnnot,upstream = 0, downstream = 0)
	probe <- get.feature.probe()

	# create mee object, use @ to access the matrices inside the object
	mee <- fetch.mee(meth = met.elmer,
	exp = exp.elmer,
	TCGA = TRUE,
	probeInfo = probe,
	geneInfo = geneInfo)
	save(mee,file = "mee_brca.rda")
	# 871 samples
	# 84 normal
	# 787 tumor


	# binary data
	# @param x A matrix.
	# @param Break A value to binarize the data.
	# @param Break2 A value to cut value to 3 categories.
	# @return A binarized matrix.
	Binary <- function(x,Break=0.3,Break2=NULL){
	if(!is.numeric(x)) stop("x need to be numeric")
	change <- x
	if(is.null(Break2)){
	change[x > Break] <- 1
	change[x < Break \| x== Break] <- 0
	}else{
	change[x < Break \| x== Break] <- 0
	change[x> Break & x < Break2] <- NA
	change[x > Break2 \| x== Break2] <-1
	}

	return(change)
	}


	# Available directions are hypo and hyper, we will use only hypo
	# due to speed constraint
	direction <- c("hypo")
	cores <- 10
	for (j in direction){
	print(j)
	dir.out <- paste0("elmer_version1/",j)
	dir.create(dir.out, recursive = TRUE)
	#--------------------------------------
	# STEP 3: Analysis \|
	#--------------------------------------
	# Step 3.1: Get diff methylated probes \|
	#--------------------------------------
	time.diff.probes<- system.time({
	Sig.probes <- get.diff.meth(mee, cores=cores,
	dir.out =dir.out,
	diff.dir=j,
	pvalue = 0.01)
	})

	#-------------------------------------------------------------
	# Step 3.2: Identify significant probe-gene pairs \|
	#-------------------------------------------------------------
	# Collect nearby 20 genes for Sig.probes
	time.neargenes <- system.time({
	nearGenes <- GetNearGenes(TRange=getProbeInfo(mee, probe=Sig.probes$probe),
	cores=cores,
	geneAnnot=getGeneInfo(mee))
	})
	time.get.pair <- system.time({
	pair <- get.pair(mee=mee,
	probes=na.omit(Sig.probes$probe),
	nearGenes=nearGenes,
	permu.dir=paste0(dir.out,"/permu"),
	dir.out=dir.out,
	cores=cores,
	label= j,
	permu.size=10000, # For significant results use 10000
	Pe = 0.001) # For significant results use 0.001
	})

	Sig.probes.paired <- fetch.pair(pair=pair,
	probeInfo = getProbeInfo(mee),
	geneInfo = getGeneInfo(mee))
	Sig.probes.paired <-read.csv(paste0(dir.out,
	"/getPair.",j,
	".pairs.significant.csv"),
	stringsAsFactors=FALSE)[,1]


	#-------------------------------------------------------------
	# Step 3.3: Motif enrichment analysis on the selected probes \|
	#-------------------------------------------------------------
	if(length(Sig.probes.paired) > 0 ){
	#-------------------------------------------------------------
	# Step 3.3: Motif enrichment analysis on the selected probes \|
	#-------------------------------------------------------------
	time.get.enriched.motif <- system.time({
	enriched.motif <- get.enriched.motif(probes=Sig.probes.paired,
	dir.out=dir.out,
	label=j,
	background.probes = probe$name)
	})
	motif.enrichment <- read.csv(paste0(dir.out,
	"/getMotif.",j,".motif.enrichment.csv"),
	stringsAsFactors=FALSE)
	if(length(enriched.motif) > 0){
	#-------------------------------------------------------------
	# Step 3.4: Identifying regulatory TFs \|
	#-------------------------------------------------------------
	print("get.TFs")
	time.get.TFs <- system.time({
	TF <- get.TFs(mee = mee,
	enriched.motif = enriched.motif,
	dir.out = dir.out,
	cores = cores, label = j)
	})

	TF.meth.cor <- get(load(paste0(dir.out,
	"/getTF.",j,".TFs.with.motif.pvalue.rda")))
	time.tbl <- rbind(time.get.TFs,time.get.enriched.motif,time.get.pair,time.diff.probes,time.neargenes)
	write.csv(as.data.frame(time.tbl),file=paste0("time_unsupervised ELMERv1.csv"))
	save(TF, enriched.motif, Sig.probes.paired,
	pair, nearGenes, Sig.probes, motif.enrichment,
	TF.meth.cor,
	file=paste0(dir.out,"/ELMER_results_",j,".rda"))
	}
	}
	}