Skip to content

Instantly share code, notes, and snippets.

@brwnj
Last active August 15, 2016 21:09
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 2 You must be signed in to fork a gist
  • Save brwnj/3861828 to your computer and use it in GitHub Desktop.
Save brwnj/3861828 to your computer and use it in GitHub Desktop.
join paired-end or print unique single-end reads
#!/usr/bin/env python
# encoding: utf-8
"""
Join R1 and R2, or print unpaired reads of R1 or R2
This script assumes there is whitespace separating the readname from the pair number:
> <
@HW-ST997:140:D0UA5ACXX:8:1101:1497:2138 1:N:0:ATCACG
@HW-ST997:140:D0UA5ACXX:8:1101:1497:2138 2:N:0:ATCACG
> <
"""
import sys
import bz2
import gzip
import string
import os.path as op
from itertools import islice
from toolshed import nopen
__author__ = "Joe Brown"
__author_email__ = "brwnjm@gmail.com"
COMPLEMENT = string.maketrans('ACGTNSRYMKWHBVD','TGCANSRYMKWHBVD')
def read_fastq(fq):
r"""fastq parser that returns name, seq, and qual."""
while True:
values = list(islice(fq, 4))
if len(values) == 4:
id1, seq, id2, qual = values
elif len(values) == 0:
raise StopIteration
else:
raise EOFError("unexpected end of file")
assert id1.startswith('@'),\
">> Fastq out of sync at read:\n%s\n" % id1
assert id2.startswith('+'),\
">> Fastq out of sync at read:\n%s\n" % id1
assert len(seq) == len(qual),\
">> Sequence and Quality are not the same length \
for read:\n%s\n" % id1
yield id1[1:-1], seq[:-1], qual[:-1]
def reversecomplement(seq):
"""return reverse complement of seq."""
return seq.translate(COMPLEMENT)[::-1]
def fastqtodict(fastq):
"""returns dict of read name to sequence"""
fdict = {}
with nopen(fastq) as fq:
for name, seq, qual in read_fastq(fq):
# explicitly state space to facilitate future changes
fdict[name.split(" ")[0]] = seq
return fdict
def main(args):
# set dictionary based on mode
if args.mode == "R2":
fastqdict = fastqtodict(args.R1)
fastq = args.R2
else:
fastqdict = fastqtodict(args.R2)
fastq = args.R1
with nopen(fastq) as fq:
for name, seq, qual in read_fastq(fq):
try:
# explicitly state space to facilitate future changes
name = name.split(" ")[0]
cseq = fastqdict.get(name)
if args.reverse_complement:
cseq = reversecomplement(cseq)
print "@%s\n%s%s" % (name, seq, cseq)
except KeyError:
# without pairs
if not args.mode == "paired":
print "@%s\n%s" % (name, seq)
if __name__ == "__main__":
import argparse
p = argparse.ArgumentParser(description=__doc__,
formatter_class=argparse.RawDescriptionHelpFormatter)
p.add_argument('R1', help="fastq of R1")
p.add_argument('R2', help="fastq of R2")
p.add_argument("--mode", "-m", choices=["paired", "R1", "R2"],
default="paired", help="prints <mode> to stdout where \
R1 is equivalent to reads without a pair in R2 [%(default)s]")
p.add_argument("--reverse_complement", "-r", action="store_false",
default=True, help="reverse complement R2 when joining [%(default)s]")
args = p.parse_args()
main(args)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment