Skip to content

Instantly share code, notes, and snippets.

@walterst
Last active April 17, 2018 12:57
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save walterst/96de484cdcfb13c9bed3fa27d8576dce to your computer and use it in GitHub Desktop.
Save walterst/96de484cdcfb13c9bed3fa27d8576dce to your computer and use it in GitHub Desktop.
Very simple fastq parser/checker to try and detect errors. assumes lines will be exactly (@Label, sequence, +, quality scores). Checks for expected chars at label/optional label, equal length of seq/qual.
#!/usr/bin/env python
# Used to find fastq seqs in gzipped files, write first error, if any, to a log file
# Usage: python find_fastq_errors.py fastq_folder log_file
# where fastq_folder has all of the fastq files in it-will search subdirectories
from sys import argv
from glob import glob
import gzip
import os
output_log = open(argv[2], "w")
fastq_files = []
fastq_files.extend(glob(argv[1] + "/*.fastq.gz") + glob(argv[1] + "/*.fastq"))
for root,dirs,files in os.walk(argv[1]):
for curr_dir in dirs:
fastq_files.extend(glob(curr_dir + "/*.fastq.gz") + glob(curr_dir + "/*.fastq"))
# Expected to be 4 line per sequence, no empty lines, this is limited parser
for curr_file in fastq_files:
if curr_file.endswith('.gz'):
query_reads = gzip.open(curr_file, "rb")
else:
query_reads = open(curr_file, "U")
keep_reading = True
error_data = ""
while keep_reading:
label = query_reads.readline().strip()
if(label==""): # hit end of file
keep_reading = False
break
if not label.startswith("@"):
error_data += "Found read label without @: %s\n" % label
seq = query_reads.readline().strip()
opt_label = query_reads.readline().strip()
if not opt_label.startswith("+"):
error_data += "Found optional read label without +: %s\n" % opt_label
qual = query_reads.readline().strip()
if len(seq) != len(qual):
error_data += "Found seq and qual of unequal lengths: \n%s\n%s" % (seq, qual)
if error_data:
break
if error_data:
output_log.write("%s\n%s\n" % (curr_file, error_data))
query_reads.close()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment