Skip to content

Instantly share code, notes, and snippets.

@kieranrcampbell
Created March 21, 2023 19:12
Show Gist options
  • Save kieranrcampbell/0d198f5f17aca34278bce49dc390281c to your computer and use it in GitHub Desktop.
Save kieranrcampbell/0d198f5f17aca34278bce49dc390281c to your computer and use it in GitHub Desktop.
## Going to take non tumour cells as just fibroblast and T/NK
## as these are (i) abundant, and (ii) sufficiently distinct transcriptionally
## from tumour that we're unlikely to confuse them
non_tumour_cells <- c("Fibroblast", "T/NK")
get_baf_df <- function(sce) {
message(paste0("[Running] Sample ", sce$id[1]))
colnames(sce) <- paste0(colData(sce)$id, "-", colData(sce)$barcode)
sce$cell_type <- cell_annots[colnames(sce)]
## First, define a set of sites that we're really confident
## are heterozygous in normal tissue. To do this we take only
## normal cells, and sum the ad/dp per cell, then perform a
## binomial test (2 tailed) to test whether the fractions
## are significantly different from 0.5
sce_normal <- sce[, sce$cell_type %in% non_tumour_cells]
ad_sum <- rowSums(assay(sce_normal, 'ad'))
dp_sum <- rowSums(assay(sce_normal, 'dp'))
# if we can't compute the p value, reject it has having p < 0.05
pvals <- rep(0, length(dp_sum))
for(i in seq_along(ad_sum)) {
if(dp_sum[i] > 0) {
test <- binom.test(ad_sum[i], dp_sum[i], p=0.5)
pvals[i] <- c(test$p.value)
}
}
## Here we accept heterozygous sites as having a p-value > 0.05
## (i.e. we can't reject the null that BAF=0.5), and filter
## on DP sum > 20 to avoid sites with very few reads
highconf_hzyg_sites <- dp_sum > 20 & pvals > 0.05
## Subset to heterozygous sites
sce_hc <- sce[highconf_hzyg_sites,]
## Boiler plate setting up stuff we want to return
all_cell_types = unique(sce$cell_type)
all_cell_types <- all_cell_types[!is.na(all_cell_types)]
bafs = rep(NA, length(all_cell_types))
names(bafs) <- all_cell_types
dp_sums = rep(NA, length(all_cell_types))
names(dp_sums) <- all_cell_types
pvals = rep(NA, length(all_cell_types))
names(pvals) <- all_cell_types
## Iterate over each cell type, looking at high confidence
## heterozygous sites
for(cell_type in all_cell_types) {
is_cell_type <- which(sce_hc$cell_type ==cell_type)
sce_hc_ct <- sce_hc[, is_cell_type]
dp_vec <- as.vector(assay(sce_hc_ct, 'dp'))
ad_vec <- as.vector(assay(sce_hc_ct, 'ad'))
ad_sum <- sum(ad_vec)#[dp_vec > 5])
dp_sum <- sum(dp_vec)#[dp_vec > 5])
## Compute BAFs, DP sums over all cells and positions
avg_baf <- ad_sum / dp_sum
bafs[cell_type] <- avg_baf
dp_sums[cell_type] <- dp_sum
if(dp_sum > 0) {
## Finally, we want to test if the BAF is significantly
## different to 0.5, using a binomial test, but this
## time interested in the case of p being < 0.05
test <- binom.test(ad_sum, dp_sum)
pvals[cell_type] <- test$p.value
}
}
## Flip the BAFs to always be < 0.5
for(i in seq_along(bafs)) {
if(!is.nan(bafs[i]) && !is.na(bafs[i])) {
if(bafs[i] > 0.5) {
bafs[i] <- 1 - bafs[i]
}
}
}
tibble(cell_type = all_cell_types, baf = bafs, dp_sum = dp_sums, pval = pvals, id = sce$id[1])
}
df_baf <- bind_rows(
lapply(sce_gt, get_baf_df)
)
df_baf$p_adj <- p.adjust(df_baf$pval, method="BH")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment