Skip to content

Instantly share code, notes, and snippets.

@ahwagner
Last active July 21, 2017 21:26
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save ahwagner/619b2d8fd49495f9ffb9d4dd873c0c56 to your computer and use it in GitHub Desktop.
Save ahwagner/619b2d8fd49495f9ffb9d4dd873c0c56 to your computer and use it in GitHub Desktop.
regularization and batch correction of samples run through cufflinks, followed by GMM annotation of genes and plotting with pheatmap
source("https://bioconductor.org/biocLite.R")
# biocLite("sva")
# biocLite("ballgown")
library(ballgown)
library(sva)
library(pheatmap)
library(mixtools)
# Extract expression estimates
# data_directory = "/some/data/dir"
# bg = ballgown(dataDir=data_directory, samplePattern='^SCLC', meas='all')
# save(bg, file='bg.rda')
bg = load(file='bg.rda')
gene_fpkm = gexpr(bg)
genes = rownames(gene_fpkm)
samples = colnames(gene_fpkm)
# Annotate by library prep
relapse = samples[!grepl('tumor', samples)]
relapse.numbers = sub('FPKM.SCLC([[:digit:]]+)_.*','\\1', relapse, perl=TRUE)
relapse.first = relapse[as.integer(relapse.numbers) < 19]
relapse.second = relapse[as.integer(relapse.numbers) > 18]
# Filter by expressed status
relapse.first.expressed = apply(gene_fpkm[,relapse.first] > 1, 1, (function (x) table(x)['TRUE']))
relapse.first.expressed[is.na(relapse.first.expressed)] = 0
relapse.second.expressed = apply(gene_fpkm[,relapse.second] > 1, 1, (function (x) table(x)['TRUE']))
relapse.second.expressed[is.na(relapse.second.expressed)] = 0
expressed_genes = relapse.first.expressed > 1 & relapse.second.expressed > 1
expressed_gene_fpkm = log2(gene_fpkm[expressed_genes,] + 1)
# Normalize across batches
m = matrix(nrow=18, ncol=1)
batch = data.frame(m, row.names = relapse)
batch[relapse.first, 1] = 1
batch[relapse.second, 1] = 2
batch.colnames = c('batch')
modcombat = model.matrix(~1, data=batch)
combat_edata = ComBat(dat=expressed_gene_fpkm, batch=batch$m, mod=modcombat, par.prior=TRUE, prior.plots=FALSE)
save(combat_edata, file = 'corrected_expr.rda')
# ENSG IDs for DOI: 10.1016/j.ccell.2016.12.005
neuro.marker.genes = data.frame(
ensembl = c(
'ENSG00000139352',
'ENSG00000162992',
'ENSG00000102003',
'ENSG00000173404',
'ENSG00000100604',
'ENSG00000089199',
'ENSG00000171951',
'ENSG00000134443',
'ENSG00000149294',
'ENSG00000154277',
'ENSG00000110680',
'ENSG00000175868'
),
HGNC = c(
'ASCL1',
'NEUROD1',
'SYP',
'INSM1',
'CHGA',
'CHGB',
'SCG2',
'GRP',
'NCAM1',
'UCHL1',
'CALCA',
'CALCB'
)
)
neuro.marker.genes.filtered = neuro.marker.genes[neuro.marker.genes$ensembl %in% rownames(combat_edata),]
neuro.marker.fpkm = combat_edata[neuro.marker.genes.filtered$ensembl,]
colnames(neuro.marker.fpkm) = gsub('^FPKM.', '', colnames(neuro.marker.fpkm))
rownames(neuro.marker.fpkm) = neuro.marker.genes.filtered$HGNC
intersect <- function(m1, s1, m2, s2, prop1, prop2){
B <- (m1/s1^2 - m2/s2^2)
A <- 0.5*(1/s2^2 - 1/s1^2)
C <- 0.5*(m2^2/s2^2 - m1^2/s1^2) - log((s1/s2)*(prop2/prop1))
(-B + c(1,-1)*sqrt(B^2 - 4*A*C))/(2*A)
}
m.apc = normalmixEM(combat_edata['ENSG00000134982', ], k = 2) # APC
m.myc = normalmixEM(combat_edata['ENSG00000136997', ], k = 2) # MYC
i.apc = intersect(m.apc$mu[1], m.apc$sigma[1], m.apc$mu[2], m.apc$sigma[2], m.apc$lambda[1], m.apc$lambda[2])
i.myc = intersect(m.myc$mu[1], m.myc$sigma[1], m.myc$mu[2], m.myc$sigma[2], m.myc$lambda[1], m.myc$lambda[2])
x.apc = ifelse(m.apc$x > i.apc[2], 'High', 'Low')
x.myc = ifelse(m.myc$x > i.myc[2], 'High', 'Low')
annotation_col = data.frame(
MycStatus = x.myc,
ApcStatus = x.apc
)
rownames(annotation_col) = colnames(neuro.marker.fpkm)
pheatmap(neuro.marker.fpkm, annotation_col = annotation_col, cluster_rows = FALSE, cutree_cols = 3)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment