Skip to content

Instantly share code, notes, and snippets.

@IanSudbery
Last active January 11, 2018 11:10
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save IanSudbery/df92ebf4f84d76027394750a6516db1b to your computer and use it in GitHub Desktop.
Save IanSudbery/df92ebf4f84d76027394750a6516db1b to your computer and use it in GitHub Desktop.
Find_detained_inrons
# intron_meta is a dataframe with the columns: gene_id, intron_id, CHr, Start, End, Strand, Length and efflen
# intron_chunks is a datafarme with a column for Geneid-Chr-Start-End-Strand-Length (as output by featureCounts) and other
# columns are the counts in each sample.
# They have the same order.
library(dplyr)
library(tidyr)
intron_meta$weight <- sqrt(intron_meta$efflen)
intron_meta <- intron_meta %>% group_by(gene_id) %>% mutate(norm_weight=weight/sum(weight)) %>% ungroup()
# We should filter out the the zero weight introns now to prevent them from contaminating the others
intron_chunks <- intron_chunks[intron_meta$weight>0,]
intron_meta <- intron_meta[intron_meta$weight>0,]
intron_meta <- intron_meta %>% group_by(gene_id) %>% mutate(norm_weight=weight/sum(weight)) %>% ungroup()
# Now we need to take each sample and generate its null version. We sum all the reads in all the introns and
# then distribute them according to the weights for each sample
library(DESeq2)
dds <- DESeqDataSetFromMatrix(intron_chunks, colData=data.frame(cond=rep(1, ncol(chunk_counts))), design=~1)
dds <- estimateSizeFactors(dds)
normed_counts <- counts(dds, normalized=TRUE)
combined_tab = as.data.frame(c(intron_meta, as.data.frame(normed_counts)))
combined_tab <- combined_tab %>% gather(samp, count, Cyto.PolyA.R1:Nuc.PolyA.R3)
null_reps <- combined_tab %>% group_by(gene_id, samp) %>% mutate(count = round(sum(count)*norm_weight)) %>% ungroup()
null_reps <- null_reps %>% spread(samp, count)
null_mat <- as.matrix(null_reps[,12:17])
colnames(null_mat) <- paste(colnames(null_mat), "null", sep="_")
colnames(intron_chunks) <- paste(colnames(intron_chunks), "real", sep="_")
combined_count_mat <- as.matrix(cbind(intron_chunks, null_mat))
colData <- data.frame(name=colnames(combined_count_mat)) %>% separate(name, into=c("Fraction","Method", "Replicate","null") )
rownames(combined_count_mat) = paste(intron_meta$gene_id, intron_meta$intron_id, sep="_")
combined_count_mat <- combined_count_mat[is.finite(rowSums(combined_count_mat)),]
# Now we have the matrix we can do the test.
cyto_dds <- DESeqDataSetFromMatrix(combined_count_mat[,colData$Fraction=="Cyto"],
colData=colData[ colData$Fraction=="Cyto",],
design=~Replicate+null)
cyto_dds <-DESeq(cyto_dds,test="LRT", reduced=~Replicate)
cyto_results <- results(cyto_dds, contrast = c("null","real","null"),
, alpha=0.1)
summary(cyto_results)
table(cyto_results$padj <0.01 & cyto_results$log2FoldChange > 2)
# Save the results
write.table(cyto_results, "chunk_splicing.dir/detained_intron_calls.tsv", sep = "\t", quote=F, col.names=T, row.names=F)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment