Skip to content

Instantly share code, notes, and snippets.

@sminot
Created January 18, 2019 00:16
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save sminot/de6a1d5cd59f50d1c4b84f138bb71c03 to your computer and use it in GitHub Desktop.
Save sminot/de6a1d5cd59f50d1c4b84f138bb71c03 to your computer and use it in GitHub Desktop.
Split reads by 10X barcode (in header)
#!/usr/bin/env python3
from Bio.SeqIO.QualityIO import FastqGeneralIterator
from collections import defaultdict
import gzip
from functools import lru_cache
import os
import sys
assert len(sys.argv) == 3, "Please specify input file and output prefix"
fp_in = sys.argv[1]
assert os.path.exists(fp_in), "File does not exist: {}".format(fp_in)
def parse_barcode(header):
if "_" not in header:
return None
bc = header.split("_", 1)[1]
if is_valid_barcode(bc):
return bc
else:
return None
@lru_cache(maxsize=None)
def is_valid_barcode(bc):
for c in bc:
if c not in ["A", "T", "C", "G"]:
return False
return True
barcode_reads = defaultdict(list)
with gzip.open(fp_in, "rt") as f:
for record in FastqGeneralIterator(f):
barcode = parse_barcode(record[0])
if barcode is None:
continue
barcode_reads[barcode].append(record)
if len(barcode_reads[barcode]) % 1000 == 0:
print("{}\t{:,}".format(barcode, len(barcode_reads[barcode])))
for barcode, reads in barcode_reads.items():
fp_out = "{}_{}.fastq.gz".format(sys.argv[2], barcode)
print("Writing {:,} reads to {}".format(len(reads), fp_out))
with gzip.open(fp_out, "wt") as fo:
for r in reads:
fo.write("@{}\n{}\n+{}\n{}\n".format(r[0], r[1], r[0], r[2]))
print("Done")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment