LiNk-NY/barcodes.R

## barcodes.R
metrics_file <- "pbmc_granulocyte_sorted_3k_per_barcode_metrics.csv"
bm <- read.csv(metrics_file)
dim(bm)

## all unique barcodes
identical(length(unique(bm$atac_barcode)), length(bm$atac_barcode))

abcs <- bm[, "atac_barcode", drop = FALSE]

# with col.names = TRUE
write.table(abcs, file = "atac_barcodes.tsv", sep = "\t", row.names = FALSE)
# with col.names = FALSE
# write.table(abcs, file = "atac_barcodes_nc.tsv", sep = "\t", row.names = FALSE, col.names = FALSE)

## test barcodes sample
set.seed(200)
k10 <- bm[sample(seq_len(nrow(bm)), 10), "atac_barcode", drop = FALSE]
k11 <- cbind.data.frame(
    k10, atac_group_barcode = LETTERS[1:10], row.names = NULL
)
# k11 <- cbind.data.frame(
#     k10, atac_group_barcode = unlist(strsplit(k10[["atac_barcode"]], "-1")),
#     row.names = NULL
# )

# with col.names = TRUE
write.table(
    k11, file = "test_atac_barcodes.tsv", sep = "\t",
    row.names = FALSE, col.names = FALSE, quote = FALSE
)
## test with writeLines
writeLines(unlist(k10, use.names = FALSE), con = "test_atac_barcodes.txt")
# readLines("test_atac_barcodes.txt")

## extract_barcodes.R
library(Rsamtools)
basefolder <- "~/data/10x/pbmc_3k/"
bfile <- file.path(basefolder, "pbmc_granulocyte_sorted_3k_atac_possorted_bam.bam")
ifile <- file.path(basefolder, "pbmc_granulocyte_sorted_3k_atac_possorted_bam.bam.bai")
bam <- BamFile(bfile, index = ifile)
head(idxstatsBam(bam, ifile))

## CB: Chromium cellular barcode sequence that is error-corrected
##     and confirmed against a list of known-good barcode sequences.
cbtags <- unlist(scanBam(bam, param = ScanBamParam(tag = "CB")), use.names = FALSE)
uCB <- na.omit(unique(cbtags))

## check against metrics file "barcode" column and identify is_cell == 1
metrics_file <- "../pbmc_granulocyte_sorted_3k_per_barcode_metrics.csv"
bm <- read.csv(metrics_file)

all(uCB %in% bm$barcode)
#' [1] TRUE

bcoi <- bm$barcode %in% uCB & bm$is_cell == 1
cellmetrix <- bm[bcoi, ]

cell_bcodes <- cellmetrix[, "barcode", drop = FALSE]

bcode_file <- file.path(basefolder, "method0/atac_barcodes.tsv")

write.table(
    cell_bcodes, file = bcode_file, sep = "\t",
    row.names = FALSE, quote = FALSE
)


## genicUniBam.R
library(Rsamtools)
basefolder <- "~/data/10x/pbmc_3k/"
bfile <- file.path(basefolder, "pbmc_granulocyte_sorted_3k_atac_possorted_bam.bam")
ifile <- file.path(basefolder, "pbmc_granulocyte_sorted_3k_atac_possorted_bam.bam.bai")
bam <- BamFile(bfile, index = ifile)

bcode_file <- file.path(basefolder, "method0/atac_barcodes.tsv")
bm <- read.table(bcode_file, sep = "\t", header = TRUE)

tbc <- head(bm[["barcode"]], 2)

library(TxDb.Hsapiens.UCSC.hg38.knownGene)
txdb <- TxDb.Hsapiens.UCSC.hg38.knownGene
genes <- genes(txdb, single.strand.genes.only = FALSE)
genes <- keepStandardChromosomes(genes, pruning.mode = "coarse")
ugenes <- unlist(genes)

library(BiocParallel)
mcp <- MulticoreParam(workers = 86, jobname = "countBam_genic")
register(mcp)

dir_name <- "genic_barcodes"
if (!dir.exists(dir_name))
    dir.create(dir_name)

## for all the beans
## max threads = 96
## write data to file after counting
system.time({
    bplapply(bm[["barcode"]], function(bcode) {
        dir_name <- "genic_barcodes"
        count_file <- file.path(dir_name, paste0(bcode, "_counts.csv"))
        if (!file.exists(count_file)) {
            bcounts <- Rsamtools::countBam(
                file = bam,
                param =
                    ScanBamParam(tagFilter = list(CB = bcode), which = ugenes)
            )
            bcounts <- bcounts[, names(bcounts) != "file"]
            names(bcounts)[1] <- "chr"
            readr::write_csv(bcounts, file = count_file, num_threads = 1L)
        }
    }, BPPARAM = mcp)
})
# Timing stopped at: 7.943e+05 7271 1.553e+05

## igrUniBam.R
library(Rsamtools)
basefolder <- "~/data/10x/pbmc_3k/"
bfile <- file.path(basefolder, "pbmc_granulocyte_sorted_3k_atac_possorted_bam.bam")
ifile <- file.path(basefolder, "pbmc_granulocyte_sorted_3k_atac_possorted_bam.bam.bai")
bam <- BamFile(bfile, index = ifile)

bm <- read.table("atac_barcodes.tsv", sep = "\t", header = TRUE)

library(TxDb.Hsapiens.UCSC.hg38.knownGene)
txdb <- TxDb.Hsapiens.UCSC.hg38.knownGene
genes <- genes(txdb, single.strand.genes.only = FALSE)
## intergenic regions
igr <- gaps(unlist(genes))
igr <- keepStandardChromosomes(igr, pruning.mode = "coarse")

library(BiocParallel)
## max threads = 96
mcp <- MulticoreParam(workers = 86, jobname = "countBam_igr")
register(mcp)

dir_name <- "igr_barcodes"
if (!dir.exists(dir_name))
    dir.create(dir_name)

## write data to file after counting
system.time({
    bplapply(bm[["barcode"]], function(bcode) {
        dir_name <- "igr_barcodes"
        count_file <- file.path(dir_name, paste0(bcode, "_counts.csv"))
        if (!file.exists(count_file)) {
            bcounts <- Rsamtools::countBam(bam,
                param = ScanBamParam(tagFilter = list(CB = bcode), which = igr)
            )
            bcounts <- bcounts[, names(bcounts) != "file"]
            names(bcounts)[1] <- "chr"
            readr::write_csv(bcounts, file = count_file, num_threads = 1L)
        }
    }, BPPARAM = mcp)
})

# after interrupt only a few were left... ~100 barcodes
# > NvimR.selection()
#      user    system   elapsed
# 55560.390   480.301 14297.638


## summOverlaps.R
library(Rsamtools)
library(TxDb.Hsapiens.UCSC.hg38.knownGene)
library(GenomicAlignments)

data_folder <- "~/data/10x"
# data_folder <- "~/Downloads/"


onlinebam <- "https://cf.10xgenomics.com/samples/cell-arc/2.0.0/pbmc_granulocyte_sorted_3k/pbmc_granulocyte_sorted_3k_gex_possorted_bam.bam"
bamfile <- file.path(data_folder, basename(onlinebam))
if (!file.exists(bamfile))
    download.file(onlinebam, bamfile)

onlineindex <- "https://cf.10xgenomics.com/samples/cell-arc/2.0.0/pbmc_granulocyte_sorted_3k/pbmc_granulocyte_sorted_3k_gex_possorted_bam.bam.bai"
bindex <- paste0(bamfile, ".bai")
if (!file.exists(bindex))
    download.file(onlineindex, bindex)

stopifnot(
    file.exists(bindex),
    file.exists(bamfile)
)

bam <- BamFile(bamfile, yieldSize = 20)
scanBam(bam)

bam <- BamFile(bamfile, yieldSize = 200000)
res <- scanBam(bam, param = ScanBamParam(what = scanBamWhat(), tag = "RE"))
unlist(res[[1]]$tag)

## check
bam <- BamFile(bamfile, yieldSize = 20)
res <- scanBam(bam,
    param = ScanBamParam(what = scanBamWhat(),
    tag = "RE", tagFilter = list(RE = "I")))

filBam <-
    "~/data/10x/pbmc_granulocyte_sorted_3k_gex_possorted_bam_filter_RE_I.bam"
filterBam(file = bam,
    destination = filBam,
    param = ScanBamParam(what = scanBamWhat(),
    tag = "RE", tagFilter = list(RE = "I")))

# bam2 <- BamFile(filBam)
bam2 <- BamFile(filBam, yieldSize = 20)
# scanBam(bam2, param = ScanBamParam(what = scanBamWhat()))
allTags <- scanBam(bam2, param = ScanBamParam(tag = "CR"))
length(allTags)
uTags <- unique(unlist(allTags, use.names = FALSE))
writeLines(uTags, "~/data/10x/barcodes.txt")


library(BiocParallel)
args(MulticoreParam)
mcp <- MulticoreParam(workers = 36, jobname = "bamsplit")
register(mcp)

bplapply(uTags, function(bcode) {
    filBam <-
      "~/data/10x/pbmc_granulocyte_sorted_3k_gex_possorted_bam_filter_RE_I.bam"
    codeDir <- file.path("~/data/10x/barcodes/", bcode)
    bcBam <- gsub("\\.bam", "", filBam)
    outBam <- file.path(codeDir, paste0(basename(bcBam), "_", bcode, ".bam"))
    dir.create(codeDir)

    library(Rsamtools)
    filterBam(filBam,
        destination = outBam,
        param = ScanBamParam(what = scanBamWhat(),
        tag = "CR", tagFilter = list(CR = bcode))
    )
}, BPPARAM = bpparam())

bams <- list.files("~/data/10x/barcodes", pattern = "\\.bam$",
    recursive = TRUE, full.names = TRUE)

set.seed(200)
blist <- BamFileList(sample(bams, 10000))

txdb <- (TxDb.Hsapiens.UCSC.hg38.knownGene)
genes <- genes(txdb, single.strand.genes.only = FALSE)

## counts in each gene -------------------------------------------------------
# se <- summarizeOverlaps(
#     features = genes, reads = bam, mode = "Union",
#     singleEnd = FALSE, ignore.strand = TRUE, fragments = TRUE
# )
# saveRDS(se, "pbmc_granulocyte_sorted_3k_gez_possorted_bam_summ_overlaps.Rds")

## spaces between genes -------------------------------------------------------
intergenicRegions <- gaps(unlist(genes))
interse <- summarizeOverlaps(
    features = intergenicRegions, reads = blist, mode = "Union",
    singleEnd = FALSE, ignore.strand = TRUE, fragments = TRUE
)
#' class: RangedSummarizedExperiment
#' dim: 31664 1
#' metadata(0):
#' assays(1): counts
#' rownames: NULL
#' rowData names(0):
#' colnames(1): pbmc_granulocyte_sorted_3k_gex_possorted_bam.bam
#' colData names(0):

## example on 10K cell barcodes
table(rowSums(assay(interse)))
#'
#'     0     1     2     6     7    12    26    32    33    44    88    96   131   257
#' 31647     2     3     1     1     1     1     1     1     1     1     2     1     1

saveRDS(interse, "pbmc_granulocyte_sorted_3k_gez_possorted_interse_summ_overlaps.Rds")

library(rhdf5)
h5ls("pbmc_granulocyte_sorted_3k_gex_molecule_info.h5")
h5read("pbmc_granulocyte_sorted_3k_gex_molecule_info.h5", "/features/feature_type")

# auto-read bai file
bf <- BamFile("~/data/10x/pbmc_granulocyte_sorted_3k_gex_possorted_bam.bam",
    yieldSize = 10)
scanBam(bf)

## read tabix format with Rsamtools
tb <- TabixFile("~/data/10x/pbmc_granulocyte_sorted_3k_atac_fragments.tsv.gz",
    yieldSize = 2000000)
scanTabix(tb)

ex <- read.table(textConnection(scanTabix(tb)[[1]]))

## names from table in the link below
## https://support.10xgenomics.com/single-cell-atac/software/pipelines/latest/output/fragments
names(ex) <- c("chrom", "chromStart", "chromEnd", "barcode", "readSupport")
library(GenomicRanges)
ggr <- makeGRangesFromDataFrame(ex, keep.extra.columns = TRUE, starts.in.df.are.0based = TRUE)
ggrl <- as(splitAsList(ggr, mcols(ggr)$barcode), "GRangesList")
library(RaggedExperiment)
rgg <- RaggedExperiment(ggrl)
set.seed(123)
rowindx <- sample(seq_len(nrow(rgg)), 100000)
erg <- rgg[rowindx, ]
compactAssay(erg, i = "readSupport", sparse = TRUE)
	metrics_file <- "pbmc_granulocyte_sorted_3k_per_barcode_metrics.csv"
	bm <- read.csv(metrics_file)
	dim(bm)

	## all unique barcodes
	identical(length(unique(bm$atac_barcode)), length(bm$atac_barcode))

	abcs <- bm[, "atac_barcode", drop = FALSE]

	# with col.names = TRUE
	write.table(abcs, file = "atac_barcodes.tsv", sep = "\t", row.names = FALSE)
	# with col.names = FALSE
	# write.table(abcs, file = "atac_barcodes_nc.tsv", sep = "\t", row.names = FALSE, col.names = FALSE)

	## test barcodes sample
	set.seed(200)
	k10 <- bm[sample(seq_len(nrow(bm)), 10), "atac_barcode", drop = FALSE]
	k11 <- cbind.data.frame(
	k10, atac_group_barcode = LETTERS[1:10], row.names = NULL
	)
	# k11 <- cbind.data.frame(
	# k10, atac_group_barcode = unlist(strsplit(k10[["atac_barcode"]], "-1")),
	# row.names = NULL
	# )

	# with col.names = TRUE
	write.table(
	k11, file = "test_atac_barcodes.tsv", sep = "\t",
	row.names = FALSE, col.names = FALSE, quote = FALSE
	)
	## test with writeLines
	writeLines(unlist(k10, use.names = FALSE), con = "test_atac_barcodes.txt")
	# readLines("test_atac_barcodes.txt")
	library(Rsamtools)
	basefolder <- "~/data/10x/pbmc_3k/"
	bfile <- file.path(basefolder, "pbmc_granulocyte_sorted_3k_atac_possorted_bam.bam")
	ifile <- file.path(basefolder, "pbmc_granulocyte_sorted_3k_atac_possorted_bam.bam.bai")
	bam <- BamFile(bfile, index = ifile)
	head(idxstatsBam(bam, ifile))

	## CB: Chromium cellular barcode sequence that is error-corrected
	## and confirmed against a list of known-good barcode sequences.
	cbtags <- unlist(scanBam(bam, param = ScanBamParam(tag = "CB")), use.names = FALSE)
	uCB <- na.omit(unique(cbtags))

	## check against metrics file "barcode" column and identify is_cell == 1
	metrics_file <- "../pbmc_granulocyte_sorted_3k_per_barcode_metrics.csv"
	bm <- read.csv(metrics_file)

	all(uCB %in% bm$barcode)
	#' [1] TRUE

	bcoi <- bm$barcode %in% uCB & bm$is_cell == 1
	cellmetrix <- bm[bcoi, ]

	cell_bcodes <- cellmetrix[, "barcode", drop = FALSE]

	bcode_file <- file.path(basefolder, "method0/atac_barcodes.tsv")

	write.table(
	cell_bcodes, file = bcode_file, sep = "\t",
	row.names = FALSE, quote = FALSE
	)
	library(Rsamtools)
	library(TxDb.Hsapiens.UCSC.hg38.knownGene)
	library(GenomicAlignments)

	data_folder <- "~/data/10x"
	# data_folder <- "~/Downloads/"


	onlinebam <- "https://cf.10xgenomics.com/samples/cell-arc/2.0.0/pbmc_granulocyte_sorted_3k/pbmc_granulocyte_sorted_3k_gex_possorted_bam.bam"
	bamfile <- file.path(data_folder, basename(onlinebam))
	if (!file.exists(bamfile))
	download.file(onlinebam, bamfile)

	onlineindex <- "https://cf.10xgenomics.com/samples/cell-arc/2.0.0/pbmc_granulocyte_sorted_3k/pbmc_granulocyte_sorted_3k_gex_possorted_bam.bam.bai"
	bindex <- paste0(bamfile, ".bai")
	if (!file.exists(bindex))
	download.file(onlineindex, bindex)

	stopifnot(
	file.exists(bindex),
	file.exists(bamfile)
	)

	bam <- BamFile(bamfile, yieldSize = 20)
	scanBam(bam)

	bam <- BamFile(bamfile, yieldSize = 200000)
	res <- scanBam(bam, param = ScanBamParam(what = scanBamWhat(), tag = "RE"))
	unlist(res[[1]]$tag)

	## check
	bam <- BamFile(bamfile, yieldSize = 20)
	res <- scanBam(bam,
	param = ScanBamParam(what = scanBamWhat(),
	tag = "RE", tagFilter = list(RE = "I")))

	filBam <-
	"~/data/10x/pbmc_granulocyte_sorted_3k_gex_possorted_bam_filter_RE_I.bam"
	filterBam(file = bam,
	destination = filBam,
	param = ScanBamParam(what = scanBamWhat(),
	tag = "RE", tagFilter = list(RE = "I")))

	# bam2 <- BamFile(filBam)
	bam2 <- BamFile(filBam, yieldSize = 20)
	# scanBam(bam2, param = ScanBamParam(what = scanBamWhat()))
	allTags <- scanBam(bam2, param = ScanBamParam(tag = "CR"))
	length(allTags)
	uTags <- unique(unlist(allTags, use.names = FALSE))
	writeLines(uTags, "~/data/10x/barcodes.txt")


	library(BiocParallel)
	args(MulticoreParam)
	mcp <- MulticoreParam(workers = 36, jobname = "bamsplit")
	register(mcp)

	bplapply(uTags, function(bcode) {
	filBam <-
	"~/data/10x/pbmc_granulocyte_sorted_3k_gex_possorted_bam_filter_RE_I.bam"
	codeDir <- file.path("~/data/10x/barcodes/", bcode)
	bcBam <- gsub("\\.bam", "", filBam)
	outBam <- file.path(codeDir, paste0(basename(bcBam), "_", bcode, ".bam"))
	dir.create(codeDir)

	library(Rsamtools)
	filterBam(filBam,
	destination = outBam,
	param = ScanBamParam(what = scanBamWhat(),
	tag = "CR", tagFilter = list(CR = bcode))
	)
	}, BPPARAM = bpparam())

	bams <- list.files("~/data/10x/barcodes", pattern = "\\.bam$",
	recursive = TRUE, full.names = TRUE)

	set.seed(200)
	blist <- BamFileList(sample(bams, 10000))

	txdb <- (TxDb.Hsapiens.UCSC.hg38.knownGene)
	genes <- genes(txdb, single.strand.genes.only = FALSE)

	## counts in each gene -------------------------------------------------------
	# se <- summarizeOverlaps(
	# features = genes, reads = bam, mode = "Union",
	# singleEnd = FALSE, ignore.strand = TRUE, fragments = TRUE
	# )
	# saveRDS(se, "pbmc_granulocyte_sorted_3k_gez_possorted_bam_summ_overlaps.Rds")

	## spaces between genes -------------------------------------------------------
	intergenicRegions <- gaps(unlist(genes))
	interse <- summarizeOverlaps(
	features = intergenicRegions, reads = blist, mode = "Union",
	singleEnd = FALSE, ignore.strand = TRUE, fragments = TRUE
	)
	#' class: RangedSummarizedExperiment
	#' dim: 31664 1
	#' metadata(0):
	#' assays(1): counts
	#' rownames: NULL
	#' rowData names(0):
	#' colnames(1): pbmc_granulocyte_sorted_3k_gex_possorted_bam.bam
	#' colData names(0):

	## example on 10K cell barcodes
	table(rowSums(assay(interse)))
	#'
	#' 0 1 2 6 7 12 26 32 33 44 88 96 131 257
	#' 31647 2 3 1 1 1 1 1 1 1 1 2 1 1

	saveRDS(interse, "pbmc_granulocyte_sorted_3k_gez_possorted_interse_summ_overlaps.Rds")

	library(rhdf5)
	h5ls("pbmc_granulocyte_sorted_3k_gex_molecule_info.h5")
	h5read("pbmc_granulocyte_sorted_3k_gex_molecule_info.h5", "/features/feature_type")

	# auto-read bai file
	bf <- BamFile("~/data/10x/pbmc_granulocyte_sorted_3k_gex_possorted_bam.bam",
	yieldSize = 10)
	scanBam(bf)

	## read tabix format with Rsamtools
	tb <- TabixFile("~/data/10x/pbmc_granulocyte_sorted_3k_atac_fragments.tsv.gz",
	yieldSize = 2000000)
	scanTabix(tb)

	ex <- read.table(textConnection(scanTabix(tb)[[1]]))

	## names from table in the link below
	## https://support.10xgenomics.com/single-cell-atac/software/pipelines/latest/output/fragments
	names(ex) <- c("chrom", "chromStart", "chromEnd", "barcode", "readSupport")
	library(GenomicRanges)
	ggr <- makeGRangesFromDataFrame(ex, keep.extra.columns = TRUE, starts.in.df.are.0based = TRUE)
	ggrl <- as(splitAsList(ggr, mcols(ggr)$barcode), "GRangesList")
	library(RaggedExperiment)
	rgg <- RaggedExperiment(ggrl)
	set.seed(123)
	rowindx <- sample(seq_len(nrow(rgg)), 100000)
	erg <- rgg[rowindx, ]
	compactAssay(erg, i = "readSupport", sparse = TRUE)