Skip to content

Instantly share code, notes, and snippets.

@afrendeiro
Last active May 21, 2019 14:31
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save afrendeiro/346696d9a0a40f1f8dd9571a874f6eca to your computer and use it in GitHub Desktop.
Save afrendeiro/346696d9a0a40f1f8dd9571a874f6eca to your computer and use it in GitHub Desktop.
Convert 10X FASTQ files to single end BAM file with barcodes annotated as tags. Useful for pipelines using BAM format as input (e.g. STAR mapping, featureCounts quantification)
#! /usr/bin/env python
"""
Convert 10X FASTQ files to single end BAM file with barcodes annotated as tags
"""
from argparse import ArgumentParser
from glob import glob
from itertools import zip_longest
import gzip
import pysam
parser = ArgumentParser()
parser.add_argument(dest="input_prefix")
parser.add_argument("-o", "--output", dest="output_bam", default=None)
args = parser.parse_args()
# args = parser.parse_args("-o test.bam hgmm_1k_v2_S1_L001".split(" "))
if args.output_bam is None:
args.output_bam = args.input_prefix + ".bam"
files = glob(args.input_prefix + "*fastq.gz")
r1_file = [x for x in files if "R1" in x]
r2_file = [x for x in files if "R2" in x]
index_file = [x for x in files if "I1" in x]
assert sum(map(len, [r1_file, r2_file, index_file])) == 3
bam = pysam.AlignmentFile(
args.output_bam, mode="wb",
reference_names=["chr"], reference_lengths=[1])
read = pysam.AlignedSegment()
for i, (r1, r2, ind) in enumerate(zip_longest(
gzip.open(r1_file[0], 'rb'),
gzip.open(r2_file[0], 'rb'),
gzip.open(index_file[0], 'rb'))):
# FASTQ line 1
if i % 4 == 0:
# # read name
read.query_name = r1.strip().split()[0][1:]
# FASTQ line 2
if i % 4 == 1:
read.query_sequence = r2.strip()
read.set_tags([("CB", r1[:16]), ("UM", r1.strip()[16:]), ("RG", ind.strip())])
# FASTQ line 4
if i % 4 == 3:
read.query_qualities = pysam.qualitystring_to_array(r2.strip())
read.set_tags(read.get_tags() + [("CQ", r1[:16]), ("UQ", r1.strip()[16:]), ("GQ", ind.strip())])
bam.write(read)
read = pysam.AlignedSegment()
bam.close()
"""R1:
@A00519:7:HFWFLDMXX:1:1101:29586:1000 1:N:0:TAACAAGG
NCACCTACATGGTCATGCATACGCCTTT
+
#FFFFFFFFFFFFFFFFFFFFFFFFFFF
"""
"""R2:
@A00519:7:HFWFLDMXX:1:1101:29586:1000 2:N:0:TAACAAGG
CTTTGAGAGGCCGAGGCGGGTGTATCACCTGAGGTCGGGAGTTCGAGACTAGCCTGACCAACATGGAGAAACCCTGTCTCTACTAAAAATA
+
FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF:FFFFFF:FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF
"""
"""I1
@A00519:7:HFWFLDMXX:1:1101:29586:1000 1:N:0:TAACAAGG
TAACAAGG
+
FFFFFFFF
"""
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment