IanSudbery/Find_detained_introns.R

## Find_detained_introns.R
# intron_meta is a dataframe with the columns: gene_id, intron_id, CHr, Start, End, Strand, Length and efflen
# intron_chunks is a datafarme with a column for Geneid-Chr-Start-End-Strand-Length (as output by featureCounts) and other
#              columns are the counts in each sample.
# They have the same order.

library(dplyr)
library(tidyr)
intron_meta$weight <- sqrt(intron_meta$efflen)
intron_meta <- intron_meta %>% group_by(gene_id) %>% mutate(norm_weight=weight/sum(weight)) %>% ungroup()

# We should filter out the the zero weight introns now to prevent them from contaminating the others
intron_chunks <- intron_chunks[intron_meta$weight>0,]
intron_meta <- intron_meta[intron_meta$weight>0,]
intron_meta <- intron_meta %>% group_by(gene_id) %>% mutate(norm_weight=weight/sum(weight)) %>% ungroup()

# Now we need to take each sample and generate its null version. We sum all the reads in all the introns and
# then distribute them according to the weights for each sample
library(DESeq2)
dds <- DESeqDataSetFromMatrix(intron_chunks,  colData=data.frame(cond=rep(1, ncol(chunk_counts))), design=~1)
dds <- estimateSizeFactors(dds)
normed_counts <- counts(dds, normalized=TRUE)
combined_tab = as.data.frame(c(intron_meta, as.data.frame(normed_counts)))
combined_tab <- combined_tab %>% gather(samp, count, Cyto.PolyA.R1:Nuc.PolyA.R3)
null_reps <- combined_tab %>% group_by(gene_id, samp) %>% mutate(count = round(sum(count)*norm_weight)) %>% ungroup()
null_reps <- null_reps %>% spread(samp, count)
null_mat <- as.matrix(null_reps[,12:17])
colnames(null_mat) <- paste(colnames(null_mat), "null", sep="_")
colnames(intron_chunks) <- paste(colnames(intron_chunks), "real", sep="_")
combined_count_mat <- as.matrix(cbind(intron_chunks, null_mat))
colData <- data.frame(name=colnames(combined_count_mat)) %>% separate(name, into=c("Fraction","Method", "Replicate","null") )
rownames(combined_count_mat) = paste(intron_meta$gene_id, intron_meta$intron_id, sep="_")
combined_count_mat <- combined_count_mat[is.finite(rowSums(combined_count_mat)),]

# Now we have the matrix we can do the test.
cyto_dds <- DESeqDataSetFromMatrix(combined_count_mat[,colData$Fraction=="Cyto"],
                                    colData=colData[ colData$Fraction=="Cyto",],
                                    design=~Replicate+null)
cyto_dds <-DESeq(cyto_dds,test="LRT", reduced=~Replicate)
cyto_results <- results(cyto_dds, contrast = c("null","real","null"),
                         , alpha=0.1)

summary(cyto_results)
table(cyto_results$padj <0.01 & cyto_results$log2FoldChange > 2)

# Save the results
write.table(cyto_results, "chunk_splicing.dir/detained_intron_calls.tsv", sep = "\t", quote=F, col.names=T, row.names=F)
	# intron_meta is a dataframe with the columns: gene_id, intron_id, CHr, Start, End, Strand, Length and efflen
	# intron_chunks is a datafarme with a column for Geneid-Chr-Start-End-Strand-Length (as output by featureCounts) and other
	# columns are the counts in each sample.
	# They have the same order.

	library(dplyr)
	library(tidyr)
	intron_meta$weight <- sqrt(intron_meta$efflen)
	intron_meta <- intron_meta %>% group_by(gene_id) %>% mutate(norm_weight=weight/sum(weight)) %>% ungroup()

	# We should filter out the the zero weight introns now to prevent them from contaminating the others
	intron_chunks <- intron_chunks[intron_meta$weight>0,]
	intron_meta <- intron_meta[intron_meta$weight>0,]
	intron_meta <- intron_meta %>% group_by(gene_id) %>% mutate(norm_weight=weight/sum(weight)) %>% ungroup()

	# Now we need to take each sample and generate its null version. We sum all the reads in all the introns and
	# then distribute them according to the weights for each sample
	library(DESeq2)
	dds <- DESeqDataSetFromMatrix(intron_chunks, colData=data.frame(cond=rep(1, ncol(chunk_counts))), design=~1)
	dds <- estimateSizeFactors(dds)
	normed_counts <- counts(dds, normalized=TRUE)
	combined_tab = as.data.frame(c(intron_meta, as.data.frame(normed_counts)))
	combined_tab <- combined_tab %>% gather(samp, count, Cyto.PolyA.R1:Nuc.PolyA.R3)
	null_reps <- combined_tab %>% group_by(gene_id, samp) %>% mutate(count = round(sum(count)*norm_weight)) %>% ungroup()
	null_reps <- null_reps %>% spread(samp, count)
	null_mat <- as.matrix(null_reps[,12:17])
	colnames(null_mat) <- paste(colnames(null_mat), "null", sep="_")
	colnames(intron_chunks) <- paste(colnames(intron_chunks), "real", sep="_")
	combined_count_mat <- as.matrix(cbind(intron_chunks, null_mat))
	colData <- data.frame(name=colnames(combined_count_mat)) %>% separate(name, into=c("Fraction","Method", "Replicate","null") )
	rownames(combined_count_mat) = paste(intron_meta$gene_id, intron_meta$intron_id, sep="_")
	combined_count_mat <- combined_count_mat[is.finite(rowSums(combined_count_mat)),]

	# Now we have the matrix we can do the test.
	cyto_dds <- DESeqDataSetFromMatrix(combined_count_mat[,colData$Fraction=="Cyto"],
	colData=colData[ colData$Fraction=="Cyto",],
	design=~Replicate+null)
	cyto_dds <-DESeq(cyto_dds,test="LRT", reduced=~Replicate)
	cyto_results <- results(cyto_dds, contrast = c("null","real","null"),
	, alpha=0.1)

	summary(cyto_results)
	table(cyto_results$padj <0.01 & cyto_results$log2FoldChange > 2)

	# Save the results
	write.table(cyto_results, "chunk_splicing.dir/detained_intron_calls.tsv", sep = "\t", quote=F, col.names=T, row.names=F)