Skip to content

Instantly share code, notes, and snippets.

Embed
What would you like to do?
Convert 10X FASTQ files to single end BAM file with barcodes annotated as tags. Useful for pipelines using BAM format as input (e.g. STAR mapping, featureCounts quantification)
#! /usr/bin/env python
"""
Convert 10X FASTQ files to single end BAM file with barcodes annotated as tags
"""
from argparse import ArgumentParser
from glob import glob
from itertools import zip_longest
import gzip
import pysam
parser = ArgumentParser()
parser.add_argument(dest="input_prefix")
parser.add_argument("-o", "--output", dest="output_bam", default=None)
args = parser.parse_args()
# args = parser.parse_args("-o test.bam hgmm_1k_v2_S1_L001".split(" "))
if args.output_bam is None:
args.output_bam = args.input_prefix + ".bam"
files = glob(args.input_prefix + "*fastq.gz")
r1_file = [x for x in files if "R1" in x]
r2_file = [x for x in files if "R2" in x]
index_file = [x for x in files if "I1" in x]
assert sum(map(len, [r1_file, r2_file, index_file])) == 3
bam = pysam.AlignmentFile(
args.output_bam, mode="wb",
reference_names=["chr"], reference_lengths=[1])
read = pysam.AlignedSegment()
for i, (r1, r2, ind) in enumerate(zip_longest(
gzip.open(r1_file[0], 'rb'),
gzip.open(r2_file[0], 'rb'),
gzip.open(index_file[0], 'rb'))):
# FASTQ line 1
if i % 4 == 0:
# # read name
read.query_name = r1.strip().split()[0][1:]
# FASTQ line 2
if i % 4 == 1:
read.query_sequence = r2.strip()
read.set_tags([("CB", r1[:16]), ("UM", r1.strip()[16:]), ("RG", ind.strip())])
# FASTQ line 4
if i % 4 == 3:
read.query_qualities = pysam.qualitystring_to_array(r2.strip())
read.set_tags(read.get_tags() + [("CQ", r1[:16]), ("UQ", r1.strip()[16:]), ("GQ", ind.strip())])
bam.write(read)
read = pysam.AlignedSegment()
bam.close()
"""R1:
@A00519:7:HFWFLDMXX:1:1101:29586:1000 1:N:0:TAACAAGG
NCACCTACATGGTCATGCATACGCCTTT
+
#FFFFFFFFFFFFFFFFFFFFFFFFFFF
"""
"""R2:
@A00519:7:HFWFLDMXX:1:1101:29586:1000 2:N:0:TAACAAGG
CTTTGAGAGGCCGAGGCGGGTGTATCACCTGAGGTCGGGAGTTCGAGACTAGCCTGACCAACATGGAGAAACCCTGTCTCTACTAAAAATA
+
FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF:FFFFFF:FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF
"""
"""I1
@A00519:7:HFWFLDMXX:1:1101:29586:1000 1:N:0:TAACAAGG
TAACAAGG
+
FFFFFFFF
"""
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment