-
-
Save winni2k/978b33d62fee5e3484ec757de1a00412 to your computer and use it in GitHub Desktop.
""" | |
MIT License | |
Copyright (c) 2020 Warren W. Kretzschmar | |
Permission is hereby granted, free of charge, to any person obtaining a copy | |
of this software and associated documentation files (the "Software"), to deal | |
in the Software without restriction, including without limitation the rights | |
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell | |
copies of the Software, and to permit persons to whom the Software is | |
furnished to do so, subject to the following conditions: | |
The above copyright notice and this permission notice shall be included in all | |
copies or substantial portions of the Software. | |
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | |
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | |
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE | |
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER | |
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, | |
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE | |
SOFTWARE. | |
""" | |
from pathlib import Path | |
import pysam | |
class BamWriter: | |
def __init__(self, alignment, barcodes, prefix): | |
self.alignment = alignment | |
self.prefix = prefix | |
self.barcodes = set(barcodes) | |
self._out_files = {} | |
def write_record_to_barcode(self, rec, barcode): | |
if barcode not in self.barcodes: | |
return | |
if barcode not in self._out_files: | |
self._open_file_for_barcode(barcode) | |
self._out_files[barcode].write(rec) | |
def _open_file_for_barcode(self, barcode): | |
self._out_files[barcode] = pysam.AlignmentFile( | |
f"{self.prefix}_{barcode}.bam", "wb", template=self.alignment | |
) | |
def main(input_bam, barcodes_file, output_prefix, contigs, barcode_tag): | |
"""Split a 10x barcoded sequencing file into barcode-specific BAMs | |
input: | |
barcodes_file can be a file containing barcodes, or a single barcode | |
contigs can be '.' for all contigs, 'chr1' for the contig 'chr1', | |
or '1-5' for chromosomes 1, 2, 3, 4, and 5 | |
""" | |
alignment = pysam.AlignmentFile(input_bam) | |
if Path(barcodes_file).is_file(): | |
with open(barcodes_file, "r") as fh: | |
barcodes = [l.rstrip() for l in fh.readlines()] | |
else: | |
barcodes = [barcodes_file] | |
print(f"Extracting single barcode: {barcodes}") | |
writer = BamWriter(alignment=alignment, barcodes=barcodes, prefix=output_prefix) | |
if contigs == ".": | |
print("Extracting reads from all contigs") | |
recs = [alignment.fetch()] | |
else: | |
if "-" in contigs: | |
start, end = contigs.split("-") | |
print(f"Extracting reads from contigs {start} to {end}") | |
recs = (alignment.fetch(str(contig)) for contig in range(start, end + 1)) | |
elif "," in contigs: | |
contigs = contigs.split(",") | |
print(f"Extracting reads from contigs {contigs}") | |
recs = (alignment.fetch(str(contig)) for contig in contigs) | |
else: | |
print(f"Extracting reads for one contig: {contigs}") | |
recs = (alignment.fetch(c) for c in [contigs]) | |
for region in recs: | |
for rec in region: | |
try: | |
barcode = rec.get_tag(barcode_tag) | |
writer.write_record_to_barcode(rec=rec, barcode=barcode) | |
except KeyError: | |
pass | |
if __name__ == "__main__": | |
import argparse | |
parser = argparse.ArgumentParser(description="Split a 10x barcoded sequencing file into barcode-specific BAMs") | |
parser.add_argument("input_bam", help="Input BAM file") | |
parser.add_argument("barcodes_file", help="File containing barcodes, or a single barcode") | |
parser.add_argument("output_prefix", help="Output file prefix for barcode-specific BAM files") | |
parser.add_argument("contigs", help="Contigs to extract reads from: '.' for all contigs, 'chr1' for the contig 'chr1', '1-5' for chromosomes 1, 2, 3, 4, and 5, or '1,2,3' for contigs 1, 2, and 3") | |
parser.add_argument("--barcode_tag", default="CB", help="Barcode tag to use for extracting barcodes") | |
args = parser.parse_args() | |
main( | |
input_bam=args.input_bam, | |
barcodes_file=args.barcodes_file, | |
output_prefix=args.output_prefix, | |
contigs=args.contigs, | |
barcode_tag=args.barcode_tag, | |
) |
Thanks for the helpful comment! I agree that this amendment to the script makes a lot of sense.
Hi @winni2k! Thanks for developing this script
I ran the script with a BAM file resulting from cellranger-v7 and wouldn't work. This is because the cell barcode tag is CB:Z
instead of CB
(as in your script).
An example of one read:
sample_name.96433731 0 chr1 11304 0 84M * 0 0 GCACTGCAGGGCCCTCTTGCTTACTGTATAGTGGTGGCACGCCGCCTGCTGGCAGCTAGGGACATTGCAGGGTCCTCTTGCTCA FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF NH:i:8 HI:i:1 AS:i:82 nM:i:0 RG:Z:GSM4955739:0:1:unknow_flowcell:0 RE:A:I xf:i:0 CR:Z:CCCAGTTAGCCACCTG CY:Z:FFFFFFFFFFFFFFFF CB:Z:CCCAGTTAGCCACCTG-1 UR:Z:GCATTAGCGG UY:Z:FFFFFFFFFF UB:Z:GCATTAGCGG
I just changed the line in your script:
barcode = rec.get_tag("CB")
--> barcode = rec.get_tag("CB:Z")
and worked perfectly.
Thanks for the info @rubenchazarra! There seems to be a bit of demand for this script. I'll see if I can make an update in my spare time.
I had GPT4 change the script so that it now uses argparse and allows the user to define the barcode tag to use on the command line. I have not tested the changes. Please let me know if anything has broken.
Hi,
Thanks for the great tool! I just have a small comment. As it is, your script does not retrieve unmapped reads. Might be useful to have an option to obtain all reads, which can be achieved with 'until_eof=True' in the 'fetch' method.