jnpaulson/gtex.R

## gtex.R
library(yarn)
library(dplyr)
library(rafalib)

obj = downloadGTEx()

lowSampleSizes = c("Bladder","Cells - Leukemia cell line (CML)","Cervix - Ectocervix","Cervix - Endocervix","Fallopian Tube")
obj = filterSamples(obj,lowSampleSizes,"SMTSD") %>%
      filterMissingGenes

our_subtypes = gsub(" - "," ",as.character(pData(obj)$SMTSD))
our_subtypes = gsub(" ","_",our_subtypes)
our_subtypes = tolower(our_subtypes)
brain_0 = which(pData(obj)$SMTSD == "Brain - Hypothalamus")
brain_01 = which(pData(obj)$SMTSD == "Brain - Amygdala")
brain_02 = which(pData(obj)$SMTSD == "Brain - Anterior cingulate cortex (BA24)")
brain_03 = which(pData(obj)$SMTSD == "Brain - Hippocampus")
brain_04 = which(pData(obj)$SMTSD == "Brain - Cortex")
brain_05 = which(pData(obj)$SMTSD == "Brain - Substantia nigra")
brain_06 = which(pData(obj)$SMTSD == "Brain - Spinal cord (cervical c-1)")
brain_07 = which(pData(obj)$SMTSD == "Brain - Frontal Cortex (BA9)")
brain_0 = union(brain_0,c(brain_01,brain_02,brain_03,brain_04,brain_05,brain_06,brain_07))
our_subtypes[brain_0] = "Brain-0"
brain_1 = c(which(pData(obj)$SMTSD=="Brain - Cerebellum"),which(pData(obj)$SMTSD=="Brain - Cerebellar Hemisphere"))
our_subtypes[brain_1] = "Brain-1"
brain_2 = c(which(pData(obj)$SMTSD=="Brain - Caudate (basal ganglia)"),which(pData(obj)$SMTSD=="Brain - Nucleus accumbens (basal ganglia)"),which(pData(obj)$SMTSD=="Brain - Putamen (basal ganglia)"))
our_subtypes[brain_2] = "Brain-2"
skin = grep("sun",our_subtypes)
our_subtypes[skin] = "skin"
our_subtypes = factor(tolower(our_subtypes))
pData(obj)$our_subtypes = our_subtypes

message("smallest sample size is:")
show(min(table(pData(obj)$our_subtypes)))
obj = filterLowGenes(obj,"our_subtypes")

### Normalize using qsmooth
obj = normalizeTissueAware(obj,"our_subtypes")


saveRDS(obj,file=("gtex_portal_normalized.rds")

## voomWeightsCustomized.R
#' Compute voom weights using customized log2 counts
#' Modified from: https://raw.githubusercontent.com/jhsiao999/Humanzee/916ee1b3e0e213839c17a99280211d68edaca161/R/voomWeightsCustomized.r
#' and of course the voom function
#'
#' @param log2counts counts on log2 scale
#' @param design Experimental design of the data. Required to be an R
#'                design.matrix object
#' @param lib.size Library size.
#' @param is.cpm if the data is CPM normalized.
#'
#' @export
#'
# example:
# gender = pData(objsubset)$gender
# batch  = factor(as.character(pData(objsubset)$SMNABTCHT))
# design = model.matrix(~batch+gender)
# transformedCounts = assayData(objsubset)[["qsmooth"]]

# voomOutput = voomWeightsCustomized(transformedCounts,design)
# fitGood = lmFit(voomOutput,design)
# fitGood = eBayes(fitGood)
# gl  = fData(objsubset)$geneNames
# tt  = topTable(fit,number=Inf,genelist=gl,coef="genderMALE")

voomWeightsCustomized <- function(log2counts,design,lib.size = NULL,is.cpm = FALSE) {
  if (is.null(design)) {
    design <- model.matrix(~ 1 )
  }
  out<-list()

  if (is.cpm == TRUE) {
    fit <- lmFit(log2counts, design)
    xx <- fit$Amean + mean(log2(lib.size + 1)) - log2(1e+06)
    yy <- sqrt(fit$sigma)
    l <- lowess(xx, yy, f = .5)
    f <- approxfun(l, rule = 2)
    fitted.values <- fit$coef %*% t(fit$design)
    fitted.cpm <- 2^fitted.values
    fitted.count <- 1e-06 * t(t(fitted.cpm) * (lib.size + 1))
    fitted.logcount <- log2(fitted.count)
    w <- 1/f(fitted.logcount)^4
    dim(w) <- dim(fitted.logcount)
  }

  if (is.cpm == FALSE) {
    fit <- lmFit(log2counts, design)
    xx <- fit$Amean
    yy <- sqrt(fit$sigma)
    l <- lowess(xx, yy, f = .5)
    f <- approxfun(l, rule = 2)
    fitted.values <- fit$coef %*% t(fit$design)
    fitted.logcount <- log2(fitted.values)
    w <- 1/f(fitted.logcount)^4
    dim(w) <- dim(fitted.logcount)
  }
  out$E = log2counts
  out$weights = w
  out$design = design
  new("EList",out)
}
	library(yarn)
	library(dplyr)
	library(rafalib)

	obj = downloadGTEx()

	lowSampleSizes = c("Bladder","Cells - Leukemia cell line (CML)","Cervix - Ectocervix","Cervix - Endocervix","Fallopian Tube")
	obj = filterSamples(obj,lowSampleSizes,"SMTSD") %>%
	filterMissingGenes

	our_subtypes = gsub(" - "," ",as.character(pData(obj)$SMTSD))
	our_subtypes = gsub(" ","_",our_subtypes)
	our_subtypes = tolower(our_subtypes)
	brain_0 = which(pData(obj)$SMTSD == "Brain - Hypothalamus")
	brain_01 = which(pData(obj)$SMTSD == "Brain - Amygdala")
	brain_02 = which(pData(obj)$SMTSD == "Brain - Anterior cingulate cortex (BA24)")
	brain_03 = which(pData(obj)$SMTSD == "Brain - Hippocampus")
	brain_04 = which(pData(obj)$SMTSD == "Brain - Cortex")
	brain_05 = which(pData(obj)$SMTSD == "Brain - Substantia nigra")
	brain_06 = which(pData(obj)$SMTSD == "Brain - Spinal cord (cervical c-1)")
	brain_07 = which(pData(obj)$SMTSD == "Brain - Frontal Cortex (BA9)")
	brain_0 = union(brain_0,c(brain_01,brain_02,brain_03,brain_04,brain_05,brain_06,brain_07))
	our_subtypes[brain_0] = "Brain-0"
	brain_1 = c(which(pData(obj)$SMTSD=="Brain - Cerebellum"),which(pData(obj)$SMTSD=="Brain - Cerebellar Hemisphere"))
	our_subtypes[brain_1] = "Brain-1"
	brain_2 = c(which(pData(obj)$SMTSD=="Brain - Caudate (basal ganglia)"),which(pData(obj)$SMTSD=="Brain - Nucleus accumbens (basal ganglia)"),which(pData(obj)$SMTSD=="Brain - Putamen (basal ganglia)"))
	our_subtypes[brain_2] = "Brain-2"
	skin = grep("sun",our_subtypes)
	our_subtypes[skin] = "skin"
	our_subtypes = factor(tolower(our_subtypes))
	pData(obj)$our_subtypes = our_subtypes

	message("smallest sample size is:")
	show(min(table(pData(obj)$our_subtypes)))
	obj = filterLowGenes(obj,"our_subtypes")

	### Normalize using qsmooth
	obj = normalizeTissueAware(obj,"our_subtypes")


	saveRDS(obj,file=("gtex_portal_normalized.rds")
	#' Compute voom weights using customized log2 counts
	#' Modified from: https://raw.githubusercontent.com/jhsiao999/Humanzee/916ee1b3e0e213839c17a99280211d68edaca161/R/voomWeightsCustomized.r
	#' and of course the voom function
	#'
	#' @param log2counts counts on log2 scale
	#' @param design Experimental design of the data. Required to be an R
	#' design.matrix object
	#' @param lib.size Library size.
	#' @param is.cpm if the data is CPM normalized.
	#'
	#' @export
	#'
	# example:
	# gender = pData(objsubset)$gender
	# batch = factor(as.character(pData(objsubset)$SMNABTCHT))
	# design = model.matrix(~batch+gender)
	# transformedCounts = assayData(objsubset)[["qsmooth"]]

	# voomOutput = voomWeightsCustomized(transformedCounts,design)
	# fitGood = lmFit(voomOutput,design)
	# fitGood = eBayes(fitGood)
	# gl = fData(objsubset)$geneNames
	# tt = topTable(fit,number=Inf,genelist=gl,coef="genderMALE")

	voomWeightsCustomized <- function(log2counts,design,lib.size = NULL,is.cpm = FALSE) {
	if (is.null(design)) {
	design <- model.matrix(~ 1 )
	}
	out<-list()

	if (is.cpm == TRUE) {
	fit <- lmFit(log2counts, design)
	xx <- fit$Amean + mean(log2(lib.size + 1)) - log2(1e+06)
	yy <- sqrt(fit$sigma)
	l <- lowess(xx, yy, f = .5)
	f <- approxfun(l, rule = 2)
	fitted.values <- fit$coef %*% t(fit$design)
	fitted.cpm <- 2^fitted.values
	fitted.count <- 1e-06 * t(t(fitted.cpm) * (lib.size + 1))
	fitted.logcount <- log2(fitted.count)
	w <- 1/f(fitted.logcount)^4
	dim(w) <- dim(fitted.logcount)
	}

	if (is.cpm == FALSE) {
	fit <- lmFit(log2counts, design)
	xx <- fit$Amean
	yy <- sqrt(fit$sigma)
	l <- lowess(xx, yy, f = .5)
	f <- approxfun(l, rule = 2)
	fitted.values <- fit$coef %*% t(fit$design)
	fitted.logcount <- log2(fitted.values)
	w <- 1/f(fitted.logcount)^4
	dim(w) <- dim(fitted.logcount)
	}
	out$E = log2counts
	out$weights = w
	out$design = design
	new("EList",out)
	}