Created
February 22, 2018 21:14
-
-
Save eco32i/24448e743b25b2c9957fab4dd935b6a7 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
import os | |
import contextlib | |
import gzip | |
import argparse | |
from collections import Counter | |
from itertools import zip_longest | |
''' | |
Requires python >= 3.5 | |
Example usage: | |
$ ./demux.py ../data/2018-02-12/unindexed/R1.fastq.gz ../data/2018-02-12/unindexed/R2.fastq.gz -b GCCAAG GACTAG -m 1 -d ../data/2018-02-12/unindexed | |
This will create files R1+GCCAAG.fastq.gz, R1+GACTAG.fastq.gz, R2+GCCAAG.fastq.gz, R2+GACTAG.fastq.gz in | |
../data/2018-02-12/unindexed directory and run demultiplexing allowing for 1 mismatch in barcode. | |
''' | |
# The following was shamelessly lifted from scikit-bio circa v0.2.0 | |
def _ascii_to_phred(s, offset): | |
"""Convert ascii to Phred quality score with specified ASCII offset.""" | |
return np.fromstring(s, dtype='|S1').view(np.int8) - offset | |
def ascii_to_phred33(s): | |
"""Convert ascii string to Phred quality score with ASCII offset of 33. | |
Standard "Sanger" ASCII offset of 33. This is used by Illumina in CASAVA | |
versions after 1.8.0, and most other places. Note that internal Illumina | |
files still use offset of 64 | |
""" | |
return _ascii_to_phred(s, 33) | |
def ascii_to_phred64(s): | |
"""Convert ascii string to Phred quality score with ASCII offset of 64. | |
Illumina-specific ASCII offset of 64. This is used by Illumina in CASAVA | |
versions prior to 1.8.0, and in Illumina internal formats (e.g., | |
export.txt files). | |
""" | |
return _ascii_to_phred(s, 64) | |
def _drop_id_marker(s): | |
"""Drop the first character and decode bytes to text""" | |
id_ = s[1:] | |
try: | |
return str(id_.decode('utf-8')) | |
except AttributeError: | |
return id_ | |
def parse_fastq(data, strict=False, enforce_qual_range=True, phred_offset=None): | |
r"""yields label, seq, and qual from a fastq file. | |
Parameters | |
---------- | |
data : open file object or str | |
An open fastq file (opened in binary mode) or a path to it. | |
strict : bool, optional | |
Defaults to ``False``. If strict is true a FastqParse error will be | |
raised if the seq and qual labels dont' match. | |
enforce_qual_range : bool, optional | |
Defaults to ``True``. If ``True``, an exception will be raised if a | |
quality score outside the range [0, 62] is detected | |
phred_offset : {33, 64, None}, optional | |
What Phred offset to use when converting qual score symbols to integers | |
If ``None`` quality is returned as string. | |
Returns | |
------- | |
label, seq, qual : (str, bytes, np.array) | |
yields the label, sequence and quality for each entry | |
Examples | |
-------- | |
Assume we have a fastq formatted file with the following contents:: | |
@seq1 | |
AACACCAAACTTCTCCACCACGTGAGCTACAAAAG | |
+ | |
````Y^T]`]c^cabcacc`^Lb^ccYT\T\Y\WF | |
@seq2 | |
TATGTATATATAACATATACATATATACATACATA | |
+ | |
]KZ[PY]_[YY^```ac^\\`bT``c`\aT``bbb | |
We can use the following code: | |
>>> from StringIO import StringIO | |
>>> from skbio.parse.sequences import parse_fastq | |
>>> fastq_f = StringIO('@seq1\n' | |
... 'AACACCAAACTTCTCCACCACGTGAGCTACAAAAG\n' | |
... '+\n' | |
... '````Y^T]`]c^cabcacc`^Lb^ccYT\T\Y\WF\n' | |
... '@seq2\n' | |
... 'TATGTATATATAACATATACATATATACATACATA\n' | |
... '+\n' | |
... ']KZ[PY]_[YY^```ac^\\\`bT``c`\\aT``bbb\n') | |
>>> for label, seq, qual in parse_fastq(fastq_f, phred_offset=64): | |
... print(label) | |
... print(seq) | |
... print(qual) | |
seq1 | |
AACACCAAACTTCTCCACCACGTGAGCTACAAAAG | |
[32 32 32 32 25 30 20 29 32 29 35 30 35 33 34 35 33 35 35 32 30 12 34 30 35 | |
35 25 20 28 20 28 25 28 23 6] | |
seq2 | |
TATGTATATATAACATATACATATATACATACATA | |
[29 11 26 27 16 25 29 31 27 25 25 30 32 32 32 33 35 30 28 28 32 34 20 32 32 | |
35 32 28 33 20 32 32 34 34 34] | |
""" | |
if phred_offset == 33: | |
phred_f = ascii_to_phred33 | |
elif phred_offset == 64: | |
phred_f = ascii_to_phred64 | |
else: | |
phred_f = None | |
with gzip.open(data, mode='rt') as fi: | |
iters = [iter(fi)] * 4 | |
for seqid, seq, qualid, qual in zip_longest(*iters): | |
seqid = seqid.strip() | |
# If the file simply ended in a blankline, do not error | |
if seqid is '': | |
continue | |
# Error if an incomplete record is found | |
# Note: seqid cannot be None, because if all 4 values were None, | |
# then the loop condition would be false, and we could not have | |
# gotten to this point | |
if seq is None or qualid is None or qual is None: | |
raise ValueError("Incomplete FASTQ record found at end " | |
"of file") | |
seq = seq.strip() | |
qualid = qualid.strip() | |
qual = qual.strip() | |
seqid = _drop_id_marker(seqid) | |
try: | |
seq = str(seq.decode("utf-8")) | |
except AttributeError: | |
pass | |
qualid = _drop_id_marker(qualid) | |
if strict: | |
if seqid != qualid: | |
raise ValueError('ID mismatch: {} != {}'.format( | |
seqid, qualid)) | |
# bounds based on illumina limits, see: | |
# http://nar.oxfordjournals.org/content/38/6/1767/T1.expansion.html | |
if phred_f is not None: | |
qual = phred_f(qual) | |
if (enforce_qual_range | |
and (phred_f is not None) | |
and ((qual < 0).any() or (qual > 62).any())): | |
raise ValueError("Failed qual conversion for seq id: %s. " | |
"This may be because you passed an " | |
"incorrect value for phred_offset." % | |
seqid) | |
yield (seqid, seq, qual) | |
def hamming(s1, s2): | |
''' | |
Computes Hamming distance between s1 and s2. | |
''' | |
if len(s1) != len(s2): | |
raise ValueError('{s1} and {s2} must be the same length to compute Hamming distance!'.format(s1=s1, s2=s2)) | |
return sum(ch1 != ch2 for ch1,ch2 in zip(s1, s2)) | |
def demux(read1, read2, barcodes, mismatches=0, data_dir=None, progress=1000000): | |
fq_tpl='@{}\n{}\n+\n{}\n' | |
fname_tpl = 'R{}+{}.fastq.gz' | |
barcodes = [s.upper() for s in barcodes] | |
output_files = {} | |
stat = Counter() | |
i = 0 | |
with contextlib.ExitStack() as stack: | |
for fq1,fq2 in zip(parse_fastq(read1), parse_fastq(read2)): | |
i += 1 | |
if i % progress == 0: | |
print('{} reads processed'.format(i)) | |
print(stat) | |
*_, bc = fq1[0].split()[1].split(':') | |
for b in barcodes: | |
if hamming(b, bc) <= mismatches: | |
if not b in output_files: | |
output_files[b] = ( | |
stack.enter_context(gzip.open(os.path.join(data_dir, fname_tpl.format(1, b)), 'wt')), | |
stack.enter_context(gzip.open(os.path.join(data_dir, fname_tpl.format(2, b)), 'wt')) | |
) | |
output_files[b][0].write(fq_tpl.format(*fq1)) | |
output_files[b][1].write(fq_tpl.format(*fq2)) | |
stat[b] += 1 | |
continue | |
return stat | |
def main(): | |
parser = argparse.ArgumentParser( | |
description='demux: demultiplexing Illumina .fastq files.') | |
parser.add_argument('R1', type=str, help='.fastq file with read1 sequences') | |
parser.add_argument('R2', type=str, help='.fastq file with read2 sequences') | |
parser.add_argument('-b', '--barcodes', nargs='+', type=str, help='barcode sequences') | |
parser.add_argument('-m', '--mismatches', type=int, default=0, help='maximum number of mismatches in barcode') | |
parser.add_argument('-d', '--data_dir', type=str, default='', | |
help='directory to write output files to') | |
args = parser.parse_args() | |
kwargs = vars(args) | |
print(args.R1) | |
print(args.R2) | |
print(kwargs) | |
stat = demux(args.R1, args.R2, args.barcodes, | |
mismatches=kwargs['mismatches'], data_dir=kwargs['data_dir']) | |
print(stat) | |
print('Done.') | |
if __name__ == "__main__": | |
main() |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment