Skip to content

Instantly share code, notes, and snippets.

@LiNk-NY
Last active March 28, 2022 21:09
Show Gist options
  • Save LiNk-NY/ffc5d49e51c93280fbcdb559ce3373fa to your computer and use it in GitHub Desktop.
Save LiNk-NY/ffc5d49e51c93280fbcdb559ce3373fa to your computer and use it in GitHub Desktop.
metrics_file <- "pbmc_granulocyte_sorted_3k_per_barcode_metrics.csv"
bm <- read.csv(metrics_file)
dim(bm)
## all unique barcodes
identical(length(unique(bm$atac_barcode)), length(bm$atac_barcode))
abcs <- bm[, "atac_barcode", drop = FALSE]
# with col.names = TRUE
write.table(abcs, file = "atac_barcodes.tsv", sep = "\t", row.names = FALSE)
# with col.names = FALSE
# write.table(abcs, file = "atac_barcodes_nc.tsv", sep = "\t", row.names = FALSE, col.names = FALSE)
## test barcodes sample
set.seed(200)
k10 <- bm[sample(seq_len(nrow(bm)), 10), "atac_barcode", drop = FALSE]
k11 <- cbind.data.frame(
k10, atac_group_barcode = LETTERS[1:10], row.names = NULL
)
# k11 <- cbind.data.frame(
# k10, atac_group_barcode = unlist(strsplit(k10[["atac_barcode"]], "-1")),
# row.names = NULL
# )
# with col.names = TRUE
write.table(
k11, file = "test_atac_barcodes.tsv", sep = "\t",
row.names = FALSE, col.names = FALSE, quote = FALSE
)
## test with writeLines
writeLines(unlist(k10, use.names = FALSE), con = "test_atac_barcodes.txt")
# readLines("test_atac_barcodes.txt")
library(Rsamtools)
basefolder <- "~/data/10x/pbmc_3k/"
bfile <- file.path(basefolder, "pbmc_granulocyte_sorted_3k_atac_possorted_bam.bam")
ifile <- file.path(basefolder, "pbmc_granulocyte_sorted_3k_atac_possorted_bam.bam.bai")
bam <- BamFile(bfile, index = ifile)
head(idxstatsBam(bam, ifile))
## CB: Chromium cellular barcode sequence that is error-corrected
## and confirmed against a list of known-good barcode sequences.
cbtags <- unlist(scanBam(bam, param = ScanBamParam(tag = "CB")), use.names = FALSE)
uCB <- na.omit(unique(cbtags))
## check against metrics file "barcode" column and identify is_cell == 1
metrics_file <- "../pbmc_granulocyte_sorted_3k_per_barcode_metrics.csv"
bm <- read.csv(metrics_file)
all(uCB %in% bm$barcode)
#' [1] TRUE
bcoi <- bm$barcode %in% uCB & bm$is_cell == 1
cellmetrix <- bm[bcoi, ]
cell_bcodes <- cellmetrix[, "barcode", drop = FALSE]
bcode_file <- file.path(basefolder, "method0/atac_barcodes.tsv")
write.table(
cell_bcodes, file = bcode_file, sep = "\t",
row.names = FALSE, quote = FALSE
)
library(Rsamtools)
basefolder <- "~/data/10x/pbmc_3k/"
bfile <- file.path(basefolder, "pbmc_granulocyte_sorted_3k_atac_possorted_bam.bam")
ifile <- file.path(basefolder, "pbmc_granulocyte_sorted_3k_atac_possorted_bam.bam.bai")
bam <- BamFile(bfile, index = ifile)
bcode_file <- file.path(basefolder, "method0/atac_barcodes.tsv")
bm <- read.table(bcode_file, sep = "\t", header = TRUE)
tbc <- head(bm[["barcode"]], 2)
library(TxDb.Hsapiens.UCSC.hg38.knownGene)
txdb <- TxDb.Hsapiens.UCSC.hg38.knownGene
genes <- genes(txdb, single.strand.genes.only = FALSE)
genes <- keepStandardChromosomes(genes, pruning.mode = "coarse")
ugenes <- unlist(genes)
library(BiocParallel)
mcp <- MulticoreParam(workers = 86, jobname = "countBam_genic")
register(mcp)
dir_name <- "genic_barcodes"
if (!dir.exists(dir_name))
dir.create(dir_name)
## for all the beans
## max threads = 96
## write data to file after counting
system.time({
bplapply(bm[["barcode"]], function(bcode) {
dir_name <- "genic_barcodes"
count_file <- file.path(dir_name, paste0(bcode, "_counts.csv"))
if (!file.exists(count_file)) {
bcounts <- Rsamtools::countBam(
file = bam,
param =
ScanBamParam(tagFilter = list(CB = bcode), which = ugenes)
)
bcounts <- bcounts[, names(bcounts) != "file"]
names(bcounts)[1] <- "chr"
readr::write_csv(bcounts, file = count_file, num_threads = 1L)
}
}, BPPARAM = mcp)
})
# Timing stopped at: 7.943e+05 7271 1.553e+05
library(Rsamtools)
basefolder <- "~/data/10x/pbmc_3k/"
bfile <- file.path(basefolder, "pbmc_granulocyte_sorted_3k_atac_possorted_bam.bam")
ifile <- file.path(basefolder, "pbmc_granulocyte_sorted_3k_atac_possorted_bam.bam.bai")
bam <- BamFile(bfile, index = ifile)
bm <- read.table("atac_barcodes.tsv", sep = "\t", header = TRUE)
library(TxDb.Hsapiens.UCSC.hg38.knownGene)
txdb <- TxDb.Hsapiens.UCSC.hg38.knownGene
genes <- genes(txdb, single.strand.genes.only = FALSE)
## intergenic regions
igr <- gaps(unlist(genes))
igr <- keepStandardChromosomes(igr, pruning.mode = "coarse")
library(BiocParallel)
## max threads = 96
mcp <- MulticoreParam(workers = 86, jobname = "countBam_igr")
register(mcp)
dir_name <- "igr_barcodes"
if (!dir.exists(dir_name))
dir.create(dir_name)
## write data to file after counting
system.time({
bplapply(bm[["barcode"]], function(bcode) {
dir_name <- "igr_barcodes"
count_file <- file.path(dir_name, paste0(bcode, "_counts.csv"))
if (!file.exists(count_file)) {
bcounts <- Rsamtools::countBam(bam,
param = ScanBamParam(tagFilter = list(CB = bcode), which = igr)
)
bcounts <- bcounts[, names(bcounts) != "file"]
names(bcounts)[1] <- "chr"
readr::write_csv(bcounts, file = count_file, num_threads = 1L)
}
}, BPPARAM = mcp)
})
# after interrupt only a few were left... ~100 barcodes
# > NvimR.selection()
# user system elapsed
# 55560.390 480.301 14297.638
library(Rsamtools)
library(TxDb.Hsapiens.UCSC.hg38.knownGene)
library(GenomicAlignments)
data_folder <- "~/data/10x"
# data_folder <- "~/Downloads/"
onlinebam <- "https://cf.10xgenomics.com/samples/cell-arc/2.0.0/pbmc_granulocyte_sorted_3k/pbmc_granulocyte_sorted_3k_gex_possorted_bam.bam"
bamfile <- file.path(data_folder, basename(onlinebam))
if (!file.exists(bamfile))
download.file(onlinebam, bamfile)
onlineindex <- "https://cf.10xgenomics.com/samples/cell-arc/2.0.0/pbmc_granulocyte_sorted_3k/pbmc_granulocyte_sorted_3k_gex_possorted_bam.bam.bai"
bindex <- paste0(bamfile, ".bai")
if (!file.exists(bindex))
download.file(onlineindex, bindex)
stopifnot(
file.exists(bindex),
file.exists(bamfile)
)
bam <- BamFile(bamfile, yieldSize = 20)
scanBam(bam)
bam <- BamFile(bamfile, yieldSize = 200000)
res <- scanBam(bam, param = ScanBamParam(what = scanBamWhat(), tag = "RE"))
unlist(res[[1]]$tag)
## check
bam <- BamFile(bamfile, yieldSize = 20)
res <- scanBam(bam,
param = ScanBamParam(what = scanBamWhat(),
tag = "RE", tagFilter = list(RE = "I")))
filBam <-
"~/data/10x/pbmc_granulocyte_sorted_3k_gex_possorted_bam_filter_RE_I.bam"
filterBam(file = bam,
destination = filBam,
param = ScanBamParam(what = scanBamWhat(),
tag = "RE", tagFilter = list(RE = "I")))
# bam2 <- BamFile(filBam)
bam2 <- BamFile(filBam, yieldSize = 20)
# scanBam(bam2, param = ScanBamParam(what = scanBamWhat()))
allTags <- scanBam(bam2, param = ScanBamParam(tag = "CR"))
length(allTags)
uTags <- unique(unlist(allTags, use.names = FALSE))
writeLines(uTags, "~/data/10x/barcodes.txt")
library(BiocParallel)
args(MulticoreParam)
mcp <- MulticoreParam(workers = 36, jobname = "bamsplit")
register(mcp)
bplapply(uTags, function(bcode) {
filBam <-
"~/data/10x/pbmc_granulocyte_sorted_3k_gex_possorted_bam_filter_RE_I.bam"
codeDir <- file.path("~/data/10x/barcodes/", bcode)
bcBam <- gsub("\\.bam", "", filBam)
outBam <- file.path(codeDir, paste0(basename(bcBam), "_", bcode, ".bam"))
dir.create(codeDir)
library(Rsamtools)
filterBam(filBam,
destination = outBam,
param = ScanBamParam(what = scanBamWhat(),
tag = "CR", tagFilter = list(CR = bcode))
)
}, BPPARAM = bpparam())
bams <- list.files("~/data/10x/barcodes", pattern = "\\.bam$",
recursive = TRUE, full.names = TRUE)
set.seed(200)
blist <- BamFileList(sample(bams, 10000))
txdb <- (TxDb.Hsapiens.UCSC.hg38.knownGene)
genes <- genes(txdb, single.strand.genes.only = FALSE)
## counts in each gene -------------------------------------------------------
# se <- summarizeOverlaps(
# features = genes, reads = bam, mode = "Union",
# singleEnd = FALSE, ignore.strand = TRUE, fragments = TRUE
# )
# saveRDS(se, "pbmc_granulocyte_sorted_3k_gez_possorted_bam_summ_overlaps.Rds")
## spaces between genes -------------------------------------------------------
intergenicRegions <- gaps(unlist(genes))
interse <- summarizeOverlaps(
features = intergenicRegions, reads = blist, mode = "Union",
singleEnd = FALSE, ignore.strand = TRUE, fragments = TRUE
)
#' class: RangedSummarizedExperiment
#' dim: 31664 1
#' metadata(0):
#' assays(1): counts
#' rownames: NULL
#' rowData names(0):
#' colnames(1): pbmc_granulocyte_sorted_3k_gex_possorted_bam.bam
#' colData names(0):
## example on 10K cell barcodes
table(rowSums(assay(interse)))
#'
#' 0 1 2 6 7 12 26 32 33 44 88 96 131 257
#' 31647 2 3 1 1 1 1 1 1 1 1 2 1 1
saveRDS(interse, "pbmc_granulocyte_sorted_3k_gez_possorted_interse_summ_overlaps.Rds")
library(rhdf5)
h5ls("pbmc_granulocyte_sorted_3k_gex_molecule_info.h5")
h5read("pbmc_granulocyte_sorted_3k_gex_molecule_info.h5", "/features/feature_type")
# auto-read bai file
bf <- BamFile("~/data/10x/pbmc_granulocyte_sorted_3k_gex_possorted_bam.bam",
yieldSize = 10)
scanBam(bf)
## read tabix format with Rsamtools
tb <- TabixFile("~/data/10x/pbmc_granulocyte_sorted_3k_atac_fragments.tsv.gz",
yieldSize = 2000000)
scanTabix(tb)
ex <- read.table(textConnection(scanTabix(tb)[[1]]))
## names from table in the link below
## https://support.10xgenomics.com/single-cell-atac/software/pipelines/latest/output/fragments
names(ex) <- c("chrom", "chromStart", "chromEnd", "barcode", "readSupport")
library(GenomicRanges)
ggr <- makeGRangesFromDataFrame(ex, keep.extra.columns = TRUE, starts.in.df.are.0based = TRUE)
ggrl <- as(splitAsList(ggr, mcols(ggr)$barcode), "GRangesList")
library(RaggedExperiment)
rgg <- RaggedExperiment(ggrl)
set.seed(123)
rowindx <- sample(seq_len(nrow(rgg)), 100000)
erg <- rgg[rowindx, ]
compactAssay(erg, i = "readSupport", sparse = TRUE)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment