Last active
March 28, 2022 21:09
-
-
Save LiNk-NY/ffc5d49e51c93280fbcdb559ce3373fa to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
metrics_file <- "pbmc_granulocyte_sorted_3k_per_barcode_metrics.csv" | |
bm <- read.csv(metrics_file) | |
dim(bm) | |
## all unique barcodes | |
identical(length(unique(bm$atac_barcode)), length(bm$atac_barcode)) | |
abcs <- bm[, "atac_barcode", drop = FALSE] | |
# with col.names = TRUE | |
write.table(abcs, file = "atac_barcodes.tsv", sep = "\t", row.names = FALSE) | |
# with col.names = FALSE | |
# write.table(abcs, file = "atac_barcodes_nc.tsv", sep = "\t", row.names = FALSE, col.names = FALSE) | |
## test barcodes sample | |
set.seed(200) | |
k10 <- bm[sample(seq_len(nrow(bm)), 10), "atac_barcode", drop = FALSE] | |
k11 <- cbind.data.frame( | |
k10, atac_group_barcode = LETTERS[1:10], row.names = NULL | |
) | |
# k11 <- cbind.data.frame( | |
# k10, atac_group_barcode = unlist(strsplit(k10[["atac_barcode"]], "-1")), | |
# row.names = NULL | |
# ) | |
# with col.names = TRUE | |
write.table( | |
k11, file = "test_atac_barcodes.tsv", sep = "\t", | |
row.names = FALSE, col.names = FALSE, quote = FALSE | |
) | |
## test with writeLines | |
writeLines(unlist(k10, use.names = FALSE), con = "test_atac_barcodes.txt") | |
# readLines("test_atac_barcodes.txt") |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
library(Rsamtools) | |
basefolder <- "~/data/10x/pbmc_3k/" | |
bfile <- file.path(basefolder, "pbmc_granulocyte_sorted_3k_atac_possorted_bam.bam") | |
ifile <- file.path(basefolder, "pbmc_granulocyte_sorted_3k_atac_possorted_bam.bam.bai") | |
bam <- BamFile(bfile, index = ifile) | |
head(idxstatsBam(bam, ifile)) | |
## CB: Chromium cellular barcode sequence that is error-corrected | |
## and confirmed against a list of known-good barcode sequences. | |
cbtags <- unlist(scanBam(bam, param = ScanBamParam(tag = "CB")), use.names = FALSE) | |
uCB <- na.omit(unique(cbtags)) | |
## check against metrics file "barcode" column and identify is_cell == 1 | |
metrics_file <- "../pbmc_granulocyte_sorted_3k_per_barcode_metrics.csv" | |
bm <- read.csv(metrics_file) | |
all(uCB %in% bm$barcode) | |
#' [1] TRUE | |
bcoi <- bm$barcode %in% uCB & bm$is_cell == 1 | |
cellmetrix <- bm[bcoi, ] | |
cell_bcodes <- cellmetrix[, "barcode", drop = FALSE] | |
bcode_file <- file.path(basefolder, "method0/atac_barcodes.tsv") | |
write.table( | |
cell_bcodes, file = bcode_file, sep = "\t", | |
row.names = FALSE, quote = FALSE | |
) | |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
library(Rsamtools) | |
basefolder <- "~/data/10x/pbmc_3k/" | |
bfile <- file.path(basefolder, "pbmc_granulocyte_sorted_3k_atac_possorted_bam.bam") | |
ifile <- file.path(basefolder, "pbmc_granulocyte_sorted_3k_atac_possorted_bam.bam.bai") | |
bam <- BamFile(bfile, index = ifile) | |
bcode_file <- file.path(basefolder, "method0/atac_barcodes.tsv") | |
bm <- read.table(bcode_file, sep = "\t", header = TRUE) | |
tbc <- head(bm[["barcode"]], 2) | |
library(TxDb.Hsapiens.UCSC.hg38.knownGene) | |
txdb <- TxDb.Hsapiens.UCSC.hg38.knownGene | |
genes <- genes(txdb, single.strand.genes.only = FALSE) | |
genes <- keepStandardChromosomes(genes, pruning.mode = "coarse") | |
ugenes <- unlist(genes) | |
library(BiocParallel) | |
mcp <- MulticoreParam(workers = 86, jobname = "countBam_genic") | |
register(mcp) | |
dir_name <- "genic_barcodes" | |
if (!dir.exists(dir_name)) | |
dir.create(dir_name) | |
## for all the beans | |
## max threads = 96 | |
## write data to file after counting | |
system.time({ | |
bplapply(bm[["barcode"]], function(bcode) { | |
dir_name <- "genic_barcodes" | |
count_file <- file.path(dir_name, paste0(bcode, "_counts.csv")) | |
if (!file.exists(count_file)) { | |
bcounts <- Rsamtools::countBam( | |
file = bam, | |
param = | |
ScanBamParam(tagFilter = list(CB = bcode), which = ugenes) | |
) | |
bcounts <- bcounts[, names(bcounts) != "file"] | |
names(bcounts)[1] <- "chr" | |
readr::write_csv(bcounts, file = count_file, num_threads = 1L) | |
} | |
}, BPPARAM = mcp) | |
}) | |
# Timing stopped at: 7.943e+05 7271 1.553e+05 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
library(Rsamtools) | |
basefolder <- "~/data/10x/pbmc_3k/" | |
bfile <- file.path(basefolder, "pbmc_granulocyte_sorted_3k_atac_possorted_bam.bam") | |
ifile <- file.path(basefolder, "pbmc_granulocyte_sorted_3k_atac_possorted_bam.bam.bai") | |
bam <- BamFile(bfile, index = ifile) | |
bm <- read.table("atac_barcodes.tsv", sep = "\t", header = TRUE) | |
library(TxDb.Hsapiens.UCSC.hg38.knownGene) | |
txdb <- TxDb.Hsapiens.UCSC.hg38.knownGene | |
genes <- genes(txdb, single.strand.genes.only = FALSE) | |
## intergenic regions | |
igr <- gaps(unlist(genes)) | |
igr <- keepStandardChromosomes(igr, pruning.mode = "coarse") | |
library(BiocParallel) | |
## max threads = 96 | |
mcp <- MulticoreParam(workers = 86, jobname = "countBam_igr") | |
register(mcp) | |
dir_name <- "igr_barcodes" | |
if (!dir.exists(dir_name)) | |
dir.create(dir_name) | |
## write data to file after counting | |
system.time({ | |
bplapply(bm[["barcode"]], function(bcode) { | |
dir_name <- "igr_barcodes" | |
count_file <- file.path(dir_name, paste0(bcode, "_counts.csv")) | |
if (!file.exists(count_file)) { | |
bcounts <- Rsamtools::countBam(bam, | |
param = ScanBamParam(tagFilter = list(CB = bcode), which = igr) | |
) | |
bcounts <- bcounts[, names(bcounts) != "file"] | |
names(bcounts)[1] <- "chr" | |
readr::write_csv(bcounts, file = count_file, num_threads = 1L) | |
} | |
}, BPPARAM = mcp) | |
}) | |
# after interrupt only a few were left... ~100 barcodes | |
# > NvimR.selection() | |
# user system elapsed | |
# 55560.390 480.301 14297.638 | |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
library(Rsamtools) | |
library(TxDb.Hsapiens.UCSC.hg38.knownGene) | |
library(GenomicAlignments) | |
data_folder <- "~/data/10x" | |
# data_folder <- "~/Downloads/" | |
onlinebam <- "https://cf.10xgenomics.com/samples/cell-arc/2.0.0/pbmc_granulocyte_sorted_3k/pbmc_granulocyte_sorted_3k_gex_possorted_bam.bam" | |
bamfile <- file.path(data_folder, basename(onlinebam)) | |
if (!file.exists(bamfile)) | |
download.file(onlinebam, bamfile) | |
onlineindex <- "https://cf.10xgenomics.com/samples/cell-arc/2.0.0/pbmc_granulocyte_sorted_3k/pbmc_granulocyte_sorted_3k_gex_possorted_bam.bam.bai" | |
bindex <- paste0(bamfile, ".bai") | |
if (!file.exists(bindex)) | |
download.file(onlineindex, bindex) | |
stopifnot( | |
file.exists(bindex), | |
file.exists(bamfile) | |
) | |
bam <- BamFile(bamfile, yieldSize = 20) | |
scanBam(bam) | |
bam <- BamFile(bamfile, yieldSize = 200000) | |
res <- scanBam(bam, param = ScanBamParam(what = scanBamWhat(), tag = "RE")) | |
unlist(res[[1]]$tag) | |
## check | |
bam <- BamFile(bamfile, yieldSize = 20) | |
res <- scanBam(bam, | |
param = ScanBamParam(what = scanBamWhat(), | |
tag = "RE", tagFilter = list(RE = "I"))) | |
filBam <- | |
"~/data/10x/pbmc_granulocyte_sorted_3k_gex_possorted_bam_filter_RE_I.bam" | |
filterBam(file = bam, | |
destination = filBam, | |
param = ScanBamParam(what = scanBamWhat(), | |
tag = "RE", tagFilter = list(RE = "I"))) | |
# bam2 <- BamFile(filBam) | |
bam2 <- BamFile(filBam, yieldSize = 20) | |
# scanBam(bam2, param = ScanBamParam(what = scanBamWhat())) | |
allTags <- scanBam(bam2, param = ScanBamParam(tag = "CR")) | |
length(allTags) | |
uTags <- unique(unlist(allTags, use.names = FALSE)) | |
writeLines(uTags, "~/data/10x/barcodes.txt") | |
library(BiocParallel) | |
args(MulticoreParam) | |
mcp <- MulticoreParam(workers = 36, jobname = "bamsplit") | |
register(mcp) | |
bplapply(uTags, function(bcode) { | |
filBam <- | |
"~/data/10x/pbmc_granulocyte_sorted_3k_gex_possorted_bam_filter_RE_I.bam" | |
codeDir <- file.path("~/data/10x/barcodes/", bcode) | |
bcBam <- gsub("\\.bam", "", filBam) | |
outBam <- file.path(codeDir, paste0(basename(bcBam), "_", bcode, ".bam")) | |
dir.create(codeDir) | |
library(Rsamtools) | |
filterBam(filBam, | |
destination = outBam, | |
param = ScanBamParam(what = scanBamWhat(), | |
tag = "CR", tagFilter = list(CR = bcode)) | |
) | |
}, BPPARAM = bpparam()) | |
bams <- list.files("~/data/10x/barcodes", pattern = "\\.bam$", | |
recursive = TRUE, full.names = TRUE) | |
set.seed(200) | |
blist <- BamFileList(sample(bams, 10000)) | |
txdb <- (TxDb.Hsapiens.UCSC.hg38.knownGene) | |
genes <- genes(txdb, single.strand.genes.only = FALSE) | |
## counts in each gene ------------------------------------------------------- | |
# se <- summarizeOverlaps( | |
# features = genes, reads = bam, mode = "Union", | |
# singleEnd = FALSE, ignore.strand = TRUE, fragments = TRUE | |
# ) | |
# saveRDS(se, "pbmc_granulocyte_sorted_3k_gez_possorted_bam_summ_overlaps.Rds") | |
## spaces between genes ------------------------------------------------------- | |
intergenicRegions <- gaps(unlist(genes)) | |
interse <- summarizeOverlaps( | |
features = intergenicRegions, reads = blist, mode = "Union", | |
singleEnd = FALSE, ignore.strand = TRUE, fragments = TRUE | |
) | |
#' class: RangedSummarizedExperiment | |
#' dim: 31664 1 | |
#' metadata(0): | |
#' assays(1): counts | |
#' rownames: NULL | |
#' rowData names(0): | |
#' colnames(1): pbmc_granulocyte_sorted_3k_gex_possorted_bam.bam | |
#' colData names(0): | |
## example on 10K cell barcodes | |
table(rowSums(assay(interse))) | |
#' | |
#' 0 1 2 6 7 12 26 32 33 44 88 96 131 257 | |
#' 31647 2 3 1 1 1 1 1 1 1 1 2 1 1 | |
saveRDS(interse, "pbmc_granulocyte_sorted_3k_gez_possorted_interse_summ_overlaps.Rds") | |
library(rhdf5) | |
h5ls("pbmc_granulocyte_sorted_3k_gex_molecule_info.h5") | |
h5read("pbmc_granulocyte_sorted_3k_gex_molecule_info.h5", "/features/feature_type") | |
# auto-read bai file | |
bf <- BamFile("~/data/10x/pbmc_granulocyte_sorted_3k_gex_possorted_bam.bam", | |
yieldSize = 10) | |
scanBam(bf) | |
## read tabix format with Rsamtools | |
tb <- TabixFile("~/data/10x/pbmc_granulocyte_sorted_3k_atac_fragments.tsv.gz", | |
yieldSize = 2000000) | |
scanTabix(tb) | |
ex <- read.table(textConnection(scanTabix(tb)[[1]])) | |
## names from table in the link below | |
## https://support.10xgenomics.com/single-cell-atac/software/pipelines/latest/output/fragments | |
names(ex) <- c("chrom", "chromStart", "chromEnd", "barcode", "readSupport") | |
library(GenomicRanges) | |
ggr <- makeGRangesFromDataFrame(ex, keep.extra.columns = TRUE, starts.in.df.are.0based = TRUE) | |
ggrl <- as(splitAsList(ggr, mcols(ggr)$barcode), "GRangesList") | |
library(RaggedExperiment) | |
rgg <- RaggedExperiment(ggrl) | |
set.seed(123) | |
rowindx <- sample(seq_len(nrow(rgg)), 100000) | |
erg <- rgg[rowindx, ] | |
compactAssay(erg, i = "readSupport", sparse = TRUE) | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment