Last active
May 21, 2019 14:31
-
-
Save afrendeiro/346696d9a0a40f1f8dd9571a874f6eca to your computer and use it in GitHub Desktop.
Convert 10X FASTQ files to single end BAM file with barcodes annotated as tags. Useful for pipelines using BAM format as input (e.g. STAR mapping, featureCounts quantification)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#! /usr/bin/env python | |
""" | |
Convert 10X FASTQ files to single end BAM file with barcodes annotated as tags | |
""" | |
from argparse import ArgumentParser | |
from glob import glob | |
from itertools import zip_longest | |
import gzip | |
import pysam | |
parser = ArgumentParser() | |
parser.add_argument(dest="input_prefix") | |
parser.add_argument("-o", "--output", dest="output_bam", default=None) | |
args = parser.parse_args() | |
# args = parser.parse_args("-o test.bam hgmm_1k_v2_S1_L001".split(" ")) | |
if args.output_bam is None: | |
args.output_bam = args.input_prefix + ".bam" | |
files = glob(args.input_prefix + "*fastq.gz") | |
r1_file = [x for x in files if "R1" in x] | |
r2_file = [x for x in files if "R2" in x] | |
index_file = [x for x in files if "I1" in x] | |
assert sum(map(len, [r1_file, r2_file, index_file])) == 3 | |
bam = pysam.AlignmentFile( | |
args.output_bam, mode="wb", | |
reference_names=["chr"], reference_lengths=[1]) | |
read = pysam.AlignedSegment() | |
for i, (r1, r2, ind) in enumerate(zip_longest( | |
gzip.open(r1_file[0], 'rb'), | |
gzip.open(r2_file[0], 'rb'), | |
gzip.open(index_file[0], 'rb'))): | |
# FASTQ line 1 | |
if i % 4 == 0: | |
# # read name | |
read.query_name = r1.strip().split()[0][1:] | |
# FASTQ line 2 | |
if i % 4 == 1: | |
read.query_sequence = r2.strip() | |
read.set_tags([("CB", r1[:16]), ("UM", r1.strip()[16:]), ("RG", ind.strip())]) | |
# FASTQ line 4 | |
if i % 4 == 3: | |
read.query_qualities = pysam.qualitystring_to_array(r2.strip()) | |
read.set_tags(read.get_tags() + [("CQ", r1[:16]), ("UQ", r1.strip()[16:]), ("GQ", ind.strip())]) | |
bam.write(read) | |
read = pysam.AlignedSegment() | |
bam.close() | |
"""R1: | |
@A00519:7:HFWFLDMXX:1:1101:29586:1000 1:N:0:TAACAAGG | |
NCACCTACATGGTCATGCATACGCCTTT | |
+ | |
#FFFFFFFFFFFFFFFFFFFFFFFFFFF | |
""" | |
"""R2: | |
@A00519:7:HFWFLDMXX:1:1101:29586:1000 2:N:0:TAACAAGG | |
CTTTGAGAGGCCGAGGCGGGTGTATCACCTGAGGTCGGGAGTTCGAGACTAGCCTGACCAACATGGAGAAACCCTGTCTCTACTAAAAATA | |
+ | |
FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF:FFFFFF:FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF | |
""" | |
"""I1 | |
@A00519:7:HFWFLDMXX:1:1101:29586:1000 1:N:0:TAACAAGG | |
TAACAAGG | |
+ | |
FFFFFFFF | |
""" |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment