Skip to content

Instantly share code, notes, and snippets.

@eco32i
Created February 22, 2018 21:14
Show Gist options
  • Save eco32i/24448e743b25b2c9957fab4dd935b6a7 to your computer and use it in GitHub Desktop.
Save eco32i/24448e743b25b2c9957fab4dd935b6a7 to your computer and use it in GitHub Desktop.
#!/usr/bin/env python
import os
import contextlib
import gzip
import argparse
from collections import Counter
from itertools import zip_longest
'''
Requires python >= 3.5
Example usage:
$ ./demux.py ../data/2018-02-12/unindexed/R1.fastq.gz ../data/2018-02-12/unindexed/R2.fastq.gz -b GCCAAG GACTAG -m 1 -d ../data/2018-02-12/unindexed
This will create files R1+GCCAAG.fastq.gz, R1+GACTAG.fastq.gz, R2+GCCAAG.fastq.gz, R2+GACTAG.fastq.gz in
../data/2018-02-12/unindexed directory and run demultiplexing allowing for 1 mismatch in barcode.
'''
# The following was shamelessly lifted from scikit-bio circa v0.2.0
def _ascii_to_phred(s, offset):
"""Convert ascii to Phred quality score with specified ASCII offset."""
return np.fromstring(s, dtype='|S1').view(np.int8) - offset
def ascii_to_phred33(s):
"""Convert ascii string to Phred quality score with ASCII offset of 33.
Standard "Sanger" ASCII offset of 33. This is used by Illumina in CASAVA
versions after 1.8.0, and most other places. Note that internal Illumina
files still use offset of 64
"""
return _ascii_to_phred(s, 33)
def ascii_to_phred64(s):
"""Convert ascii string to Phred quality score with ASCII offset of 64.
Illumina-specific ASCII offset of 64. This is used by Illumina in CASAVA
versions prior to 1.8.0, and in Illumina internal formats (e.g.,
export.txt files).
"""
return _ascii_to_phred(s, 64)
def _drop_id_marker(s):
"""Drop the first character and decode bytes to text"""
id_ = s[1:]
try:
return str(id_.decode('utf-8'))
except AttributeError:
return id_
def parse_fastq(data, strict=False, enforce_qual_range=True, phred_offset=None):
r"""yields label, seq, and qual from a fastq file.
Parameters
----------
data : open file object or str
An open fastq file (opened in binary mode) or a path to it.
strict : bool, optional
Defaults to ``False``. If strict is true a FastqParse error will be
raised if the seq and qual labels dont' match.
enforce_qual_range : bool, optional
Defaults to ``True``. If ``True``, an exception will be raised if a
quality score outside the range [0, 62] is detected
phred_offset : {33, 64, None}, optional
What Phred offset to use when converting qual score symbols to integers
If ``None`` quality is returned as string.
Returns
-------
label, seq, qual : (str, bytes, np.array)
yields the label, sequence and quality for each entry
Examples
--------
Assume we have a fastq formatted file with the following contents::
@seq1
AACACCAAACTTCTCCACCACGTGAGCTACAAAAG
+
````Y^T]`]c^cabcacc`^Lb^ccYT\T\Y\WF
@seq2
TATGTATATATAACATATACATATATACATACATA
+
]KZ[PY]_[YY^```ac^\\`bT``c`\aT``bbb
We can use the following code:
>>> from StringIO import StringIO
>>> from skbio.parse.sequences import parse_fastq
>>> fastq_f = StringIO('@seq1\n'
... 'AACACCAAACTTCTCCACCACGTGAGCTACAAAAG\n'
... '+\n'
... '````Y^T]`]c^cabcacc`^Lb^ccYT\T\Y\WF\n'
... '@seq2\n'
... 'TATGTATATATAACATATACATATATACATACATA\n'
... '+\n'
... ']KZ[PY]_[YY^```ac^\\\`bT``c`\\aT``bbb\n')
>>> for label, seq, qual in parse_fastq(fastq_f, phred_offset=64):
... print(label)
... print(seq)
... print(qual)
seq1
AACACCAAACTTCTCCACCACGTGAGCTACAAAAG
[32 32 32 32 25 30 20 29 32 29 35 30 35 33 34 35 33 35 35 32 30 12 34 30 35
35 25 20 28 20 28 25 28 23 6]
seq2
TATGTATATATAACATATACATATATACATACATA
[29 11 26 27 16 25 29 31 27 25 25 30 32 32 32 33 35 30 28 28 32 34 20 32 32
35 32 28 33 20 32 32 34 34 34]
"""
if phred_offset == 33:
phred_f = ascii_to_phred33
elif phred_offset == 64:
phred_f = ascii_to_phred64
else:
phred_f = None
with gzip.open(data, mode='rt') as fi:
iters = [iter(fi)] * 4
for seqid, seq, qualid, qual in zip_longest(*iters):
seqid = seqid.strip()
# If the file simply ended in a blankline, do not error
if seqid is '':
continue
# Error if an incomplete record is found
# Note: seqid cannot be None, because if all 4 values were None,
# then the loop condition would be false, and we could not have
# gotten to this point
if seq is None or qualid is None or qual is None:
raise ValueError("Incomplete FASTQ record found at end "
"of file")
seq = seq.strip()
qualid = qualid.strip()
qual = qual.strip()
seqid = _drop_id_marker(seqid)
try:
seq = str(seq.decode("utf-8"))
except AttributeError:
pass
qualid = _drop_id_marker(qualid)
if strict:
if seqid != qualid:
raise ValueError('ID mismatch: {} != {}'.format(
seqid, qualid))
# bounds based on illumina limits, see:
# http://nar.oxfordjournals.org/content/38/6/1767/T1.expansion.html
if phred_f is not None:
qual = phred_f(qual)
if (enforce_qual_range
and (phred_f is not None)
and ((qual < 0).any() or (qual > 62).any())):
raise ValueError("Failed qual conversion for seq id: %s. "
"This may be because you passed an "
"incorrect value for phred_offset." %
seqid)
yield (seqid, seq, qual)
def hamming(s1, s2):
'''
Computes Hamming distance between s1 and s2.
'''
if len(s1) != len(s2):
raise ValueError('{s1} and {s2} must be the same length to compute Hamming distance!'.format(s1=s1, s2=s2))
return sum(ch1 != ch2 for ch1,ch2 in zip(s1, s2))
def demux(read1, read2, barcodes, mismatches=0, data_dir=None, progress=1000000):
fq_tpl='@{}\n{}\n+\n{}\n'
fname_tpl = 'R{}+{}.fastq.gz'
barcodes = [s.upper() for s in barcodes]
output_files = {}
stat = Counter()
i = 0
with contextlib.ExitStack() as stack:
for fq1,fq2 in zip(parse_fastq(read1), parse_fastq(read2)):
i += 1
if i % progress == 0:
print('{} reads processed'.format(i))
print(stat)
*_, bc = fq1[0].split()[1].split(':')
for b in barcodes:
if hamming(b, bc) <= mismatches:
if not b in output_files:
output_files[b] = (
stack.enter_context(gzip.open(os.path.join(data_dir, fname_tpl.format(1, b)), 'wt')),
stack.enter_context(gzip.open(os.path.join(data_dir, fname_tpl.format(2, b)), 'wt'))
)
output_files[b][0].write(fq_tpl.format(*fq1))
output_files[b][1].write(fq_tpl.format(*fq2))
stat[b] += 1
continue
return stat
def main():
parser = argparse.ArgumentParser(
description='demux: demultiplexing Illumina .fastq files.')
parser.add_argument('R1', type=str, help='.fastq file with read1 sequences')
parser.add_argument('R2', type=str, help='.fastq file with read2 sequences')
parser.add_argument('-b', '--barcodes', nargs='+', type=str, help='barcode sequences')
parser.add_argument('-m', '--mismatches', type=int, default=0, help='maximum number of mismatches in barcode')
parser.add_argument('-d', '--data_dir', type=str, default='',
help='directory to write output files to')
args = parser.parse_args()
kwargs = vars(args)
print(args.R1)
print(args.R2)
print(kwargs)
stat = demux(args.R1, args.R2, args.barcodes,
mismatches=kwargs['mismatches'], data_dir=kwargs['data_dir'])
print(stat)
print('Done.')
if __name__ == "__main__":
main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment