Skip to content

Instantly share code, notes, and snippets.

What would you like to do?
Script to compare two fastq files, print labels that do not match, and tests for sequence/quality lengths that are different (useful in cases where one suspects index reads and other reads do not have matching labels). Made to work with CASAVA 1.8.0 and later, might fail on earlier versions.Usage: python fastq1_fp fastq2_fp
#!/usr/bin/env python
# Modified from Greg Caporaso's code in qiime/
# Usage: python fastq1_fp fastq2_fp
from itertools import izip
from sys import argv
from cogent.parse.fastq import MinimalFastqParser
header_index = 0
sequence_index = 1
quality_index = 2
read1 = open(argv[1], "U")
read2 = open(argv[2], "U")
def check_header_match_180_or_later(header1,header2):
""" Confirm headers are compatible in CASAVA 1.8.0 or later
These contain information on the read number, so can differ
header1 = header1.split(':')
header2 = header2.split(':')
for e1,e2 in zip(header1,header2):
if e1.split(' ')[0] != e2.split(' ')[0]:
return False
return True
print "Printing labels that do not match before space character:"
mismatched_labels_found = False
for read1_data,read2_data in izip(MinimalFastqParser(read1,strict=False),
if not check_header_match_180_or_later(read1_data[header_index], read2_data[header_index]):
print "Mismatched labels: %s, %s" % (read1_data[header_index], read2_data[header_index])
mismatched_labels_found = True
if not (len(read1_data[sequence_index]) == len(read1_data[quality_index])):
print "Sequence and quality score lengths do not match for read 1 label %s " % read1_data[header_index]
if not (len(read2_data[sequence_index]) == len(read2_data[quality_index])):
print "Sequence and quality score lengths do not match for read 2 label %s " % read2_data[header_index]
if not mismatched_labels_found:
print "No mismatched labels found."
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment