Skip to content

Instantly share code, notes, and snippets.

@tiagochst
Last active March 1, 2018 13:02
Show Gist options
  • Save tiagochst/04c2c61b1f3f34f892cd0d0e12a81be6 to your computer and use it in GitHub Desktop.
Save tiagochst/04c2c61b1f3f34f892cd0d0e12a81be6 to your computer and use it in GitHub Desktop.
Running ELMER v1 on TCGA breast cancer - Major aim: Get execution times
# March 1 2018
# ooo Introduction ooo
# This code performs ELMERv1 analysis on TCGA breast cancer
# data set for time comparison purporses.
# ooo Author ooo
# Tiago Chedraoui Silva (tiagochst at gmail.com)
# To run ELMER version 1
# the we clone ELMER from https://github.com/lijingya/ELMER
# and installed its auxiliary data from https://github.com/lijingya/ELMER.data
# Some updates in the NAMESPACE were required
# Then we load the modified version with devtools::load_all(path to clone repository)
#-----------------------------------
# 1 - Methylation
# ----------------------------------
library(TCGAbiolinks)
query.lgg <- GDCquery(project = "TCGA-BRCA",
data.category = "DNA methylation",
platform = "Illumina Human Methylation 450",
legacy = TRUE)
GDCdownload(query.lgg,files.per.chunk = 10)
met <-GDCprepare(query.lgg, save = FALSE)
#met.elmer <- TCGAprepare_elmer(met, platform = "HumanMethylation450")
#save(met, file = "met_f1000.rda")
#-----------------------------------
# 2 - Expression
# ----------------------------------
query.exp.lgg <- GDCquery(project = "TCGA-BRCA",
data.category = "Gene expression",
data.type = "Gene expression quantification",
platform = "Illumina HiSeq",
file.type = "normalized_results",
experimental.strategy = "RNA-Seq",
legacy = TRUE )
GDCdownload(query.exp.lgg,files.per.chunk = 50)
exp <- GDCprepare(query.exp.lgg, save = FALSE)
# This function, which was removed from TCGAbiolinks,
# will help the data input to be prepared to the ELMER standard.
TCGAprepare_elmer <- function(data,
platform,
met.na.cut = 0.2,
save = FALSE){
# parameters veryfication
if (missing(data)) stop("Please set the data parameter")
if (missing(platform)) stop("Please set the platform parameter")
if (grepl("illuminahiseq_rnaseqv2|illuminahiseq_totalrnaseqv2",
platform, ignore.case = TRUE)) {
message("============ Pre-pocessing expression data =============")
message(paste0("1 - expression = log2(expression + 1): ",
"To linearize \n relation between ",
"methylation and expression"))
if(typeof(data) == typeof(SummarizedExperiment())){
row.names(data) <- paste0("ID",values(data)$entrezgene)
data <- assay(data)
}
if(all(grepl("\\|",rownames(data)))){
message("2 - rownames (gene|loci) => ('ID'loci) ")
aux <- strsplit(rownames(data),"\\|")
GeneID <- unlist(lapply(aux,function(x) x[2]))
row.names(data) <- paste0("ID",GeneID)
}
data <- log2(data+1)
Exp <- data.matrix(data)
if (save) save(Exp,file = "Exp_elmer.rda")
return(Exp)
}
if (grepl("humanmethylation", platform, ignore.case = TRUE)) {
message("============ Pre-pocessing methylation data =============")
if (class(data) == class(data.frame())){
msg <- paste0("1 - Removing Columns: \n * Gene_Symbol \n",
" * Chromosome \n * Genomic_Coordinate")
message(msg)
data <- subset(data,select = 4:ncol(data))
}
if(typeof(data) == typeof(SummarizedExperiment())){
data <- assay(data)
}
msg <- paste0("2 - Removing probes with ",
"NA values in more than 20% samples")
message(msg)
data <- data[rowMeans(is.na(data)) < met.na.cut,]
Met <- data.matrix(data)
if (save) save(Met,file = "Met_elmer.rda")
return (Met)
}
}
exp <- exp[,!exp$is_ffpe]
met <- met[,!met$is_fppe]
exp.elmer <- TCGAprepare_elmer(exp, platform = "IlluminaHiSeq_RNASeqV2")
met.elmer <- TCGAprepare_elmer(met, platform = "HumanMethylation450")
# STARTING ELMER CODE
geneAnnot <- txs()
geneAnnot$GENEID <- paste0("ID",geneAnnot$GENEID)
geneInfo <- promoters(geneAnnot,upstream = 0, downstream = 0)
probe <- get.feature.probe()
# create mee object, use @ to access the matrices inside the object
mee <- fetch.mee(meth = met.elmer,
exp = exp.elmer,
TCGA = TRUE,
probeInfo = probe,
geneInfo = geneInfo)
save(mee,file = "mee_brca.rda")
# 871 samples
# 84 normal
# 787 tumor
# binary data
# @param x A matrix.
# @param Break A value to binarize the data.
# @param Break2 A value to cut value to 3 categories.
# @return A binarized matrix.
Binary <- function(x,Break=0.3,Break2=NULL){
if(!is.numeric(x)) stop("x need to be numeric")
change <- x
if(is.null(Break2)){
change[x > Break] <- 1
change[x < Break | x== Break] <- 0
}else{
change[x < Break | x== Break] <- 0
change[x> Break & x < Break2] <- NA
change[x > Break2 | x== Break2] <-1
}
return(change)
}
# Available directions are hypo and hyper, we will use only hypo
# due to speed constraint
direction <- c("hypo")
cores <- 10
for (j in direction){
print(j)
dir.out <- paste0("elmer_version1/",j)
dir.create(dir.out, recursive = TRUE)
#--------------------------------------
# STEP 3: Analysis |
#--------------------------------------
# Step 3.1: Get diff methylated probes |
#--------------------------------------
time.diff.probes<- system.time({
Sig.probes <- get.diff.meth(mee, cores=cores,
dir.out =dir.out,
diff.dir=j,
pvalue = 0.01)
})
#-------------------------------------------------------------
# Step 3.2: Identify significant probe-gene pairs |
#-------------------------------------------------------------
# Collect nearby 20 genes for Sig.probes
time.neargenes <- system.time({
nearGenes <- GetNearGenes(TRange=getProbeInfo(mee, probe=Sig.probes$probe),
cores=cores,
geneAnnot=getGeneInfo(mee))
})
time.get.pair <- system.time({
pair <- get.pair(mee=mee,
probes=na.omit(Sig.probes$probe),
nearGenes=nearGenes,
permu.dir=paste0(dir.out,"/permu"),
dir.out=dir.out,
cores=cores,
label= j,
permu.size=10000, # For significant results use 10000
Pe = 0.001) # For significant results use 0.001
})
Sig.probes.paired <- fetch.pair(pair=pair,
probeInfo = getProbeInfo(mee),
geneInfo = getGeneInfo(mee))
Sig.probes.paired <-read.csv(paste0(dir.out,
"/getPair.",j,
".pairs.significant.csv"),
stringsAsFactors=FALSE)[,1]
#-------------------------------------------------------------
# Step 3.3: Motif enrichment analysis on the selected probes |
#-------------------------------------------------------------
if(length(Sig.probes.paired) > 0 ){
#-------------------------------------------------------------
# Step 3.3: Motif enrichment analysis on the selected probes |
#-------------------------------------------------------------
time.get.enriched.motif <- system.time({
enriched.motif <- get.enriched.motif(probes=Sig.probes.paired,
dir.out=dir.out,
label=j,
background.probes = probe$name)
})
motif.enrichment <- read.csv(paste0(dir.out,
"/getMotif.",j,".motif.enrichment.csv"),
stringsAsFactors=FALSE)
if(length(enriched.motif) > 0){
#-------------------------------------------------------------
# Step 3.4: Identifying regulatory TFs |
#-------------------------------------------------------------
print("get.TFs")
time.get.TFs <- system.time({
TF <- get.TFs(mee = mee,
enriched.motif = enriched.motif,
dir.out = dir.out,
cores = cores, label = j)
})
TF.meth.cor <- get(load(paste0(dir.out,
"/getTF.",j,".TFs.with.motif.pvalue.rda")))
time.tbl <- rbind(time.get.TFs,time.get.enriched.motif,time.get.pair,time.diff.probes,time.neargenes)
write.csv(as.data.frame(time.tbl),file=paste0("time_unsupervised ELMERv1.csv"))
save(TF, enriched.motif, Sig.probes.paired,
pair, nearGenes, Sig.probes, motif.enrichment,
TF.meth.cor,
file=paste0(dir.out,"/ELMER_results_",j,".rda"))
}
}
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment