ahwagner/preprocess.R

## preprocess.R
source("https://bioconductor.org/biocLite.R")
# biocLite("sva")
# biocLite("ballgown")
library(ballgown)
library(sva)
library(pheatmap)
library(mixtools)

# Extract expression estimates
# data_directory = "/some/data/dir"
# bg = ballgown(dataDir=data_directory, samplePattern='^SCLC', meas='all')
# save(bg, file='bg.rda')
bg = load(file='bg.rda')
gene_fpkm = gexpr(bg)
genes = rownames(gene_fpkm)
samples = colnames(gene_fpkm)

# Annotate by library prep
relapse = samples[!grepl('tumor', samples)]
relapse.numbers = sub('FPKM.SCLC([[:digit:]]+)_.*','\\1', relapse, perl=TRUE)
relapse.first = relapse[as.integer(relapse.numbers) < 19]
relapse.second = relapse[as.integer(relapse.numbers) > 18]

# Filter by expressed status
relapse.first.expressed = apply(gene_fpkm[,relapse.first] > 1, 1, (function (x) table(x)['TRUE']))
relapse.first.expressed[is.na(relapse.first.expressed)] = 0
relapse.second.expressed = apply(gene_fpkm[,relapse.second] > 1, 1, (function (x) table(x)['TRUE']))
relapse.second.expressed[is.na(relapse.second.expressed)] = 0
expressed_genes = relapse.first.expressed > 1 & relapse.second.expressed > 1
expressed_gene_fpkm = log2(gene_fpkm[expressed_genes,] + 1)

# Normalize across batches
m = matrix(nrow=18, ncol=1)
batch = data.frame(m, row.names = relapse)
batch[relapse.first, 1] = 1
batch[relapse.second, 1] = 2
batch.colnames = c('batch')
modcombat = model.matrix(~1, data=batch)
combat_edata = ComBat(dat=expressed_gene_fpkm, batch=batch$m, mod=modcombat, par.prior=TRUE, prior.plots=FALSE)
save(combat_edata, file = 'corrected_expr.rda')

# ENSG IDs for DOI: 10.1016/j.ccell.2016.12.005
neuro.marker.genes = data.frame(
  ensembl = c(
    'ENSG00000139352',
    'ENSG00000162992',
    'ENSG00000102003',
    'ENSG00000173404',
    'ENSG00000100604',
    'ENSG00000089199',
    'ENSG00000171951',
    'ENSG00000134443',
    'ENSG00000149294',
    'ENSG00000154277',
    'ENSG00000110680',
    'ENSG00000175868'
  ),
  HGNC = c(
    'ASCL1',
    'NEUROD1',
    'SYP',
    'INSM1',
    'CHGA',
    'CHGB',
    'SCG2',
    'GRP',
    'NCAM1',
    'UCHL1',
    'CALCA',
    'CALCB'
  )
)

neuro.marker.genes.filtered = neuro.marker.genes[neuro.marker.genes$ensembl %in% rownames(combat_edata),]
neuro.marker.fpkm = combat_edata[neuro.marker.genes.filtered$ensembl,]

colnames(neuro.marker.fpkm) = gsub('^FPKM.', '', colnames(neuro.marker.fpkm))
rownames(neuro.marker.fpkm) = neuro.marker.genes.filtered$HGNC

intersect <- function(m1, s1, m2, s2, prop1, prop2){
  B <- (m1/s1^2 - m2/s2^2)
  A <- 0.5*(1/s2^2 - 1/s1^2)
  C <- 0.5*(m2^2/s2^2 - m1^2/s1^2) - log((s1/s2)*(prop2/prop1))
  (-B + c(1,-1)*sqrt(B^2 - 4*A*C))/(2*A)
}

m.apc = normalmixEM(combat_edata['ENSG00000134982', ], k = 2) # APC
m.myc = normalmixEM(combat_edata['ENSG00000136997', ], k = 2) # MYC
i.apc = intersect(m.apc$mu[1], m.apc$sigma[1], m.apc$mu[2], m.apc$sigma[2], m.apc$lambda[1], m.apc$lambda[2])
i.myc = intersect(m.myc$mu[1], m.myc$sigma[1], m.myc$mu[2], m.myc$sigma[2], m.myc$lambda[1], m.myc$lambda[2])
x.apc = ifelse(m.apc$x > i.apc[2], 'High', 'Low')
x.myc = ifelse(m.myc$x > i.myc[2], 'High', 'Low')

annotation_col = data.frame(
  MycStatus = x.myc,
  ApcStatus = x.apc
)

rownames(annotation_col) = colnames(neuro.marker.fpkm)

pheatmap(neuro.marker.fpkm, annotation_col = annotation_col, cluster_rows = FALSE, cutree_cols = 3)
	source("https://bioconductor.org/biocLite.R")
	# biocLite("sva")
	# biocLite("ballgown")
	library(ballgown)
	library(sva)
	library(pheatmap)
	library(mixtools)

	# Extract expression estimates
	# data_directory = "/some/data/dir"
	# bg = ballgown(dataDir=data_directory, samplePattern='^SCLC', meas='all')
	# save(bg, file='bg.rda')
	bg = load(file='bg.rda')
	gene_fpkm = gexpr(bg)
	genes = rownames(gene_fpkm)
	samples = colnames(gene_fpkm)

	# Annotate by library prep
	relapse = samples[!grepl('tumor', samples)]
	relapse.numbers = sub('FPKM.SCLC([[:digit:]]+)_.*','\\1', relapse, perl=TRUE)
	relapse.first = relapse[as.integer(relapse.numbers) < 19]
	relapse.second = relapse[as.integer(relapse.numbers) > 18]

	# Filter by expressed status
	relapse.first.expressed = apply(gene_fpkm[,relapse.first] > 1, 1, (function (x) table(x)['TRUE']))
	relapse.first.expressed[is.na(relapse.first.expressed)] = 0
	relapse.second.expressed = apply(gene_fpkm[,relapse.second] > 1, 1, (function (x) table(x)['TRUE']))
	relapse.second.expressed[is.na(relapse.second.expressed)] = 0
	expressed_genes = relapse.first.expressed > 1 & relapse.second.expressed > 1
	expressed_gene_fpkm = log2(gene_fpkm[expressed_genes,] + 1)

	# Normalize across batches
	m = matrix(nrow=18, ncol=1)
	batch = data.frame(m, row.names = relapse)
	batch[relapse.first, 1] = 1
	batch[relapse.second, 1] = 2
	batch.colnames = c('batch')
	modcombat = model.matrix(~1, data=batch)
	combat_edata = ComBat(dat=expressed_gene_fpkm, batch=batch$m, mod=modcombat, par.prior=TRUE, prior.plots=FALSE)
	save(combat_edata, file = 'corrected_expr.rda')

	# ENSG IDs for DOI: 10.1016/j.ccell.2016.12.005
	neuro.marker.genes = data.frame(
	ensembl = c(
	'ENSG00000139352',
	'ENSG00000162992',
	'ENSG00000102003',
	'ENSG00000173404',
	'ENSG00000100604',
	'ENSG00000089199',
	'ENSG00000171951',
	'ENSG00000134443',
	'ENSG00000149294',
	'ENSG00000154277',
	'ENSG00000110680',
	'ENSG00000175868'
	),
	HGNC = c(
	'ASCL1',
	'NEUROD1',
	'SYP',
	'INSM1',
	'CHGA',
	'CHGB',
	'SCG2',
	'GRP',
	'NCAM1',
	'UCHL1',
	'CALCA',
	'CALCB'
	)
	)

	neuro.marker.genes.filtered = neuro.marker.genes[neuro.marker.genes$ensembl %in% rownames(combat_edata),]
	neuro.marker.fpkm = combat_edata[neuro.marker.genes.filtered$ensembl,]

	colnames(neuro.marker.fpkm) = gsub('^FPKM.', '', colnames(neuro.marker.fpkm))
	rownames(neuro.marker.fpkm) = neuro.marker.genes.filtered$HGNC

	intersect <- function(m1, s1, m2, s2, prop1, prop2){
	B <- (m1/s1^2 - m2/s2^2)
	A <- 0.5*(1/s2^2 - 1/s1^2)
	C <- 0.5(m2^2/s2^2 - m1^2/s1^2) - log((s1/s2)(prop2/prop1))
	(-B + c(1,-1)sqrt(B^2 - 4AC))/(2A)
	}

	m.apc = normalmixEM(combat_edata['ENSG00000134982', ], k = 2) # APC
	m.myc = normalmixEM(combat_edata['ENSG00000136997', ], k = 2) # MYC
	i.apc = intersect(m.apc$mu[1], m.apc$sigma[1], m.apc$mu[2], m.apc$sigma[2], m.apc$lambda[1], m.apc$lambda[2])
	i.myc = intersect(m.myc$mu[1], m.myc$sigma[1], m.myc$mu[2], m.myc$sigma[2], m.myc$lambda[1], m.myc$lambda[2])
	x.apc = ifelse(m.apc$x > i.apc[2], 'High', 'Low')
	x.myc = ifelse(m.myc$x > i.myc[2], 'High', 'Low')

	annotation_col = data.frame(
	MycStatus = x.myc,
	ApcStatus = x.apc
	)

	rownames(annotation_col) = colnames(neuro.marker.fpkm)

	pheatmap(neuro.marker.fpkm, annotation_col = annotation_col, cluster_rows = FALSE, cutree_cols = 3)