Created
January 18, 2019 00:16
-
-
Save sminot/de6a1d5cd59f50d1c4b84f138bb71c03 to your computer and use it in GitHub Desktop.
Split reads by 10X barcode (in header)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3 | |
from Bio.SeqIO.QualityIO import FastqGeneralIterator | |
from collections import defaultdict | |
import gzip | |
from functools import lru_cache | |
import os | |
import sys | |
assert len(sys.argv) == 3, "Please specify input file and output prefix" | |
fp_in = sys.argv[1] | |
assert os.path.exists(fp_in), "File does not exist: {}".format(fp_in) | |
def parse_barcode(header): | |
if "_" not in header: | |
return None | |
bc = header.split("_", 1)[1] | |
if is_valid_barcode(bc): | |
return bc | |
else: | |
return None | |
@lru_cache(maxsize=None) | |
def is_valid_barcode(bc): | |
for c in bc: | |
if c not in ["A", "T", "C", "G"]: | |
return False | |
return True | |
barcode_reads = defaultdict(list) | |
with gzip.open(fp_in, "rt") as f: | |
for record in FastqGeneralIterator(f): | |
barcode = parse_barcode(record[0]) | |
if barcode is None: | |
continue | |
barcode_reads[barcode].append(record) | |
if len(barcode_reads[barcode]) % 1000 == 0: | |
print("{}\t{:,}".format(barcode, len(barcode_reads[barcode]))) | |
for barcode, reads in barcode_reads.items(): | |
fp_out = "{}_{}.fastq.gz".format(sys.argv[2], barcode) | |
print("Writing {:,} reads to {}".format(len(reads), fp_out)) | |
with gzip.open(fp_out, "wt") as fo: | |
for r in reads: | |
fo.write("@{}\n{}\n+{}\n{}\n".format(r[0], r[1], r[0], r[2])) | |
print("Done") |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment