Created
August 26, 2015 06:57
-
-
Save Tabea-K/94672c786697ae2cfcb2 to your computer and use it in GitHub Desktop.
Prints the lengths of the quality and the sequence lines of a fastq file, and prints a boolean indicating whether the quality line and the sequence line have the same length.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
import sys | |
import os | |
import argparse | |
q_ascii = "!#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz{|}~" | |
q_ascii += '"' | |
def id_error(line_no, name, line): | |
""" | |
Prints error message. | |
""" | |
print "Error in line %s of file, because no name (%s):\n'%s'"%(line_no, name, line) | |
def check_lengths(fh): | |
""" | |
Goes through each line and checks length | |
""" | |
lineno = 0 | |
for i, line in enumerate(fh.readlines()): | |
line = line.strip() | |
#print i, lineno, line | |
if lineno == 0: | |
if not line.startswith('@'): | |
print "Error 01 in line %s. Invalid first line:"%i | |
print " %s"%line | |
print " Line needs to start with '@'!" | |
name = line | |
lineno += 1 | |
elif lineno == 1: | |
line = line.upper() | |
if not set(line).issubset(set('ACTGN')): | |
print "Error 02 in line %s. Invalid nucleotide sequence:"%i | |
print " %s"%line | |
print " Contains these characters: %s"%set(line) | |
print " Invalid character/s: %s"%(set(line) - set('ACTGN')) | |
print " Allowed characters are: %s"%set('ACTGN') | |
seq_len = len(line) | |
lineno += 1 | |
elif lineno == 2: | |
if not line.startswith('+'): | |
print "Error 02 in line %s. Invalid third line:"%i | |
print " %s"%line | |
print " Line needs to start with '+'!" | |
lineno += 1 | |
elif lineno == 3: | |
if not set(line).issubset(set(q_ascii)): | |
print "Error 04 in line %s. Invalid quality line:"%i | |
print " %s"%line | |
print " Contains these characters: %s"%set(line) | |
print " Invalid character/s: %s"%(set(line) - set(q_ascii)) | |
print " Allowed characters are: %s"%set(q_ascii) | |
qual_len = len(line) | |
lineno = 0 | |
print name, seq_len, qual_len, '\t', seq_len == qual_len | |
# MAIN | |
if __name__ == "__main__": | |
# parse arguments | |
parser = argparse.ArgumentParser() | |
parser.add_argument("Fastq_file", nargs='?', type=argparse.FileType('r'), help="file with Fastq sequences", \ | |
default=sys.stdin) | |
args = parser.parse_args() | |
seq_file = args.Fastq_file | |
check_lengths(seq_file) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment