walterst/filter_fastqV2.py

## filter_fastqV2.py
#!/usr/bin/env python

# Used to filter a fastq to match another fastq that is a subset of the query one, e.g. matching a
# index fastq to the pear assembled subset fastq
# Usage:  python filter_fastq.py input_fastq target_fastq output_fastq

from sys import argv

from cogent.parse.fastq import MinimalFastqParser
from qiime.util import gzip_open

header_index = 0
sequence_index = 1
quality_index = 2

if argv[1].endswith('.gz'):
    query_reads = gzip_open(argv[1])
else:
    query_reads = open(argv[1], "U")

if argv[2].endswith('.gz'):
    target_reads = gzip_open(argv[2])
else:
    target_reads = open(argv[2], 'U')

output_fastq = open(argv[3], "w")

target_labels = []

for read_data in MinimalFastqParser(target_reads, strict=False):
    target_labels.append(read_data[header_index].split()[0])

target_labels = set(target_labels)

for read_data in MinimalFastqParser(query_reads, strict=False):
    if read_data[header_index].split()[0] in target_labels:
        curr_read = "@%s\n" % read_data[header_index]
        curr_read += "%s\n" % read_data[sequence_index]
        curr_read += "+\n"
        curr_read += "%s\n" % read_data[quality_index]
        output_fastq.write(curr_read)
	#!/usr/bin/env python

	# Used to filter a fastq to match another fastq that is a subset of the query one, e.g. matching a
	# index fastq to the pear assembled subset fastq
	# Usage: python filter_fastq.py input_fastq target_fastq output_fastq

	from sys import argv

	from cogent.parse.fastq import MinimalFastqParser
	from qiime.util import gzip_open

	header_index = 0
	sequence_index = 1
	quality_index = 2

	if argv[1].endswith('.gz'):
	query_reads = gzip_open(argv[1])
	else:
	query_reads = open(argv[1], "U")

	if argv[2].endswith('.gz'):
	target_reads = gzip_open(argv[2])
	else:
	target_reads = open(argv[2], 'U')

	output_fastq = open(argv[3], "w")

	target_labels = []

	for read_data in MinimalFastqParser(target_reads, strict=False):
	target_labels.append(read_data[header_index].split()[0])

	target_labels = set(target_labels)

	for read_data in MinimalFastqParser(query_reads, strict=False):
	if read_data[header_index].split()[0] in target_labels:
	curr_read = "@%s\n" % read_data[header_index]
	curr_read += "%s\n" % read_data[sequence_index]
	curr_read += "+\n"
	curr_read += "%s\n" % read_data[quality_index]
	output_fastq.write(curr_read)