Last active
March 1, 2018 13:02
-
-
Save tiagochst/04c2c61b1f3f34f892cd0d0e12a81be6 to your computer and use it in GitHub Desktop.
Running ELMER v1 on TCGA breast cancer - Major aim: Get execution times
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# March 1 2018 | |
# ooo Introduction ooo | |
# This code performs ELMERv1 analysis on TCGA breast cancer | |
# data set for time comparison purporses. | |
# ooo Author ooo | |
# Tiago Chedraoui Silva (tiagochst at gmail.com) | |
# To run ELMER version 1 | |
# the we clone ELMER from https://github.com/lijingya/ELMER | |
# and installed its auxiliary data from https://github.com/lijingya/ELMER.data | |
# Some updates in the NAMESPACE were required | |
# Then we load the modified version with devtools::load_all(path to clone repository) | |
#----------------------------------- | |
# 1 - Methylation | |
# ---------------------------------- | |
library(TCGAbiolinks) | |
query.lgg <- GDCquery(project = "TCGA-BRCA", | |
data.category = "DNA methylation", | |
platform = "Illumina Human Methylation 450", | |
legacy = TRUE) | |
GDCdownload(query.lgg,files.per.chunk = 10) | |
met <-GDCprepare(query.lgg, save = FALSE) | |
#met.elmer <- TCGAprepare_elmer(met, platform = "HumanMethylation450") | |
#save(met, file = "met_f1000.rda") | |
#----------------------------------- | |
# 2 - Expression | |
# ---------------------------------- | |
query.exp.lgg <- GDCquery(project = "TCGA-BRCA", | |
data.category = "Gene expression", | |
data.type = "Gene expression quantification", | |
platform = "Illumina HiSeq", | |
file.type = "normalized_results", | |
experimental.strategy = "RNA-Seq", | |
legacy = TRUE ) | |
GDCdownload(query.exp.lgg,files.per.chunk = 50) | |
exp <- GDCprepare(query.exp.lgg, save = FALSE) | |
# This function, which was removed from TCGAbiolinks, | |
# will help the data input to be prepared to the ELMER standard. | |
TCGAprepare_elmer <- function(data, | |
platform, | |
met.na.cut = 0.2, | |
save = FALSE){ | |
# parameters veryfication | |
if (missing(data)) stop("Please set the data parameter") | |
if (missing(platform)) stop("Please set the platform parameter") | |
if (grepl("illuminahiseq_rnaseqv2|illuminahiseq_totalrnaseqv2", | |
platform, ignore.case = TRUE)) { | |
message("============ Pre-pocessing expression data =============") | |
message(paste0("1 - expression = log2(expression + 1): ", | |
"To linearize \n relation between ", | |
"methylation and expression")) | |
if(typeof(data) == typeof(SummarizedExperiment())){ | |
row.names(data) <- paste0("ID",values(data)$entrezgene) | |
data <- assay(data) | |
} | |
if(all(grepl("\\|",rownames(data)))){ | |
message("2 - rownames (gene|loci) => ('ID'loci) ") | |
aux <- strsplit(rownames(data),"\\|") | |
GeneID <- unlist(lapply(aux,function(x) x[2])) | |
row.names(data) <- paste0("ID",GeneID) | |
} | |
data <- log2(data+1) | |
Exp <- data.matrix(data) | |
if (save) save(Exp,file = "Exp_elmer.rda") | |
return(Exp) | |
} | |
if (grepl("humanmethylation", platform, ignore.case = TRUE)) { | |
message("============ Pre-pocessing methylation data =============") | |
if (class(data) == class(data.frame())){ | |
msg <- paste0("1 - Removing Columns: \n * Gene_Symbol \n", | |
" * Chromosome \n * Genomic_Coordinate") | |
message(msg) | |
data <- subset(data,select = 4:ncol(data)) | |
} | |
if(typeof(data) == typeof(SummarizedExperiment())){ | |
data <- assay(data) | |
} | |
msg <- paste0("2 - Removing probes with ", | |
"NA values in more than 20% samples") | |
message(msg) | |
data <- data[rowMeans(is.na(data)) < met.na.cut,] | |
Met <- data.matrix(data) | |
if (save) save(Met,file = "Met_elmer.rda") | |
return (Met) | |
} | |
} | |
exp <- exp[,!exp$is_ffpe] | |
met <- met[,!met$is_fppe] | |
exp.elmer <- TCGAprepare_elmer(exp, platform = "IlluminaHiSeq_RNASeqV2") | |
met.elmer <- TCGAprepare_elmer(met, platform = "HumanMethylation450") | |
# STARTING ELMER CODE | |
geneAnnot <- txs() | |
geneAnnot$GENEID <- paste0("ID",geneAnnot$GENEID) | |
geneInfo <- promoters(geneAnnot,upstream = 0, downstream = 0) | |
probe <- get.feature.probe() | |
# create mee object, use @ to access the matrices inside the object | |
mee <- fetch.mee(meth = met.elmer, | |
exp = exp.elmer, | |
TCGA = TRUE, | |
probeInfo = probe, | |
geneInfo = geneInfo) | |
save(mee,file = "mee_brca.rda") | |
# 871 samples | |
# 84 normal | |
# 787 tumor | |
# binary data | |
# @param x A matrix. | |
# @param Break A value to binarize the data. | |
# @param Break2 A value to cut value to 3 categories. | |
# @return A binarized matrix. | |
Binary <- function(x,Break=0.3,Break2=NULL){ | |
if(!is.numeric(x)) stop("x need to be numeric") | |
change <- x | |
if(is.null(Break2)){ | |
change[x > Break] <- 1 | |
change[x < Break | x== Break] <- 0 | |
}else{ | |
change[x < Break | x== Break] <- 0 | |
change[x> Break & x < Break2] <- NA | |
change[x > Break2 | x== Break2] <-1 | |
} | |
return(change) | |
} | |
# Available directions are hypo and hyper, we will use only hypo | |
# due to speed constraint | |
direction <- c("hypo") | |
cores <- 10 | |
for (j in direction){ | |
print(j) | |
dir.out <- paste0("elmer_version1/",j) | |
dir.create(dir.out, recursive = TRUE) | |
#-------------------------------------- | |
# STEP 3: Analysis | | |
#-------------------------------------- | |
# Step 3.1: Get diff methylated probes | | |
#-------------------------------------- | |
time.diff.probes<- system.time({ | |
Sig.probes <- get.diff.meth(mee, cores=cores, | |
dir.out =dir.out, | |
diff.dir=j, | |
pvalue = 0.01) | |
}) | |
#------------------------------------------------------------- | |
# Step 3.2: Identify significant probe-gene pairs | | |
#------------------------------------------------------------- | |
# Collect nearby 20 genes for Sig.probes | |
time.neargenes <- system.time({ | |
nearGenes <- GetNearGenes(TRange=getProbeInfo(mee, probe=Sig.probes$probe), | |
cores=cores, | |
geneAnnot=getGeneInfo(mee)) | |
}) | |
time.get.pair <- system.time({ | |
pair <- get.pair(mee=mee, | |
probes=na.omit(Sig.probes$probe), | |
nearGenes=nearGenes, | |
permu.dir=paste0(dir.out,"/permu"), | |
dir.out=dir.out, | |
cores=cores, | |
label= j, | |
permu.size=10000, # For significant results use 10000 | |
Pe = 0.001) # For significant results use 0.001 | |
}) | |
Sig.probes.paired <- fetch.pair(pair=pair, | |
probeInfo = getProbeInfo(mee), | |
geneInfo = getGeneInfo(mee)) | |
Sig.probes.paired <-read.csv(paste0(dir.out, | |
"/getPair.",j, | |
".pairs.significant.csv"), | |
stringsAsFactors=FALSE)[,1] | |
#------------------------------------------------------------- | |
# Step 3.3: Motif enrichment analysis on the selected probes | | |
#------------------------------------------------------------- | |
if(length(Sig.probes.paired) > 0 ){ | |
#------------------------------------------------------------- | |
# Step 3.3: Motif enrichment analysis on the selected probes | | |
#------------------------------------------------------------- | |
time.get.enriched.motif <- system.time({ | |
enriched.motif <- get.enriched.motif(probes=Sig.probes.paired, | |
dir.out=dir.out, | |
label=j, | |
background.probes = probe$name) | |
}) | |
motif.enrichment <- read.csv(paste0(dir.out, | |
"/getMotif.",j,".motif.enrichment.csv"), | |
stringsAsFactors=FALSE) | |
if(length(enriched.motif) > 0){ | |
#------------------------------------------------------------- | |
# Step 3.4: Identifying regulatory TFs | | |
#------------------------------------------------------------- | |
print("get.TFs") | |
time.get.TFs <- system.time({ | |
TF <- get.TFs(mee = mee, | |
enriched.motif = enriched.motif, | |
dir.out = dir.out, | |
cores = cores, label = j) | |
}) | |
TF.meth.cor <- get(load(paste0(dir.out, | |
"/getTF.",j,".TFs.with.motif.pvalue.rda"))) | |
time.tbl <- rbind(time.get.TFs,time.get.enriched.motif,time.get.pair,time.diff.probes,time.neargenes) | |
write.csv(as.data.frame(time.tbl),file=paste0("time_unsupervised ELMERv1.csv")) | |
save(TF, enriched.motif, Sig.probes.paired, | |
pair, nearGenes, Sig.probes, motif.enrichment, | |
TF.meth.cor, | |
file=paste0(dir.out,"/ELMER_results_",j,".rda")) | |
} | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment