Skip to content

Instantly share code, notes, and snippets.

@walterst
Last active August 8, 2017 09:26
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save walterst/f06ceae7cd25a69b4f80d923e0ff17fc to your computer and use it in GitHub Desktop.
Save walterst/f06ceae7cd25a69b4f80d923e0ff17fc to your computer and use it in GitHub Desktop.
Filter a fastq file to match target fastq labels, e.g. after stitching reads.
#!/usr/bin/env python
# Used to filter a fastq to match another fastq that is a subset of the query one, e.g. matching a
# index fastq to the pear assembled subset fastq
# Usage: python filter_fastq.py input_fastq target_fastq output_fastq
from sys import argv
from cogent.parse.fastq import MinimalFastqParser
from qiime.util import gzip_open
header_index = 0
sequence_index = 1
quality_index = 2
if argv[1].endswith('.gz'):
query_reads = gzip_open(argv[1])
else:
query_reads = open(argv[1], "U")
if argv[2].endswith('.gz'):
target_reads = gzip_open(argv[2])
else:
target_reads = open(argv[2], 'U')
output_fastq = open(argv[3], "w")
target_labels = []
for read_data in MinimalFastqParser(target_reads, strict=False):
target_labels.append(read_data[header_index].split()[0])
target_labels = set(target_labels)
for read_data in MinimalFastqParser(query_reads, strict=False):
if read_data[header_index].split()[0] in target_labels:
curr_read = "@%s\n" % read_data[header_index]
curr_read += "%s\n" % read_data[sequence_index]
curr_read += "+\n"
curr_read += "%s\n" % read_data[quality_index]
output_fastq.write(curr_read)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment