Last active
August 8, 2017 09:26
-
-
Save walterst/f06ceae7cd25a69b4f80d923e0ff17fc to your computer and use it in GitHub Desktop.
Filter a fastq file to match target fastq labels, e.g. after stitching reads.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
# Used to filter a fastq to match another fastq that is a subset of the query one, e.g. matching a | |
# index fastq to the pear assembled subset fastq | |
# Usage: python filter_fastq.py input_fastq target_fastq output_fastq | |
from sys import argv | |
from cogent.parse.fastq import MinimalFastqParser | |
from qiime.util import gzip_open | |
header_index = 0 | |
sequence_index = 1 | |
quality_index = 2 | |
if argv[1].endswith('.gz'): | |
query_reads = gzip_open(argv[1]) | |
else: | |
query_reads = open(argv[1], "U") | |
if argv[2].endswith('.gz'): | |
target_reads = gzip_open(argv[2]) | |
else: | |
target_reads = open(argv[2], 'U') | |
output_fastq = open(argv[3], "w") | |
target_labels = [] | |
for read_data in MinimalFastqParser(target_reads, strict=False): | |
target_labels.append(read_data[header_index].split()[0]) | |
target_labels = set(target_labels) | |
for read_data in MinimalFastqParser(query_reads, strict=False): | |
if read_data[header_index].split()[0] in target_labels: | |
curr_read = "@%s\n" % read_data[header_index] | |
curr_read += "%s\n" % read_data[sequence_index] | |
curr_read += "+\n" | |
curr_read += "%s\n" % read_data[quality_index] | |
output_fastq.write(curr_read) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment