Last active
June 1, 2016 12:37
-
-
Save Tabea-K/9fc01daab8e3c3b18002 to your computer and use it in GitHub Desktop.
Script to search for specific strings in the annotation lines of fastq files. Similar to grep, therefore the name!
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
''' | |
This script extracts specific fastq sequences | |
from a multi-fastq sequence file, with the ID's | |
of the sequences to be extracted in a txt file. | |
''' | |
import sys | |
import argparse | |
import os | |
def yield_fastq(filename): | |
""" | |
Yields a tuple of idline, seq, spacer and quals | |
from a fastq file | |
""" | |
fh = open(filename, 'r') | |
for line in fh: | |
idline=line.strip() | |
seq =next(fh).strip() | |
spacer=next(fh).strip() | |
quals =next(fh).strip() | |
yield idline, seq, spacer, quals | |
# MAIN | |
if __name__ == "__main__": | |
count_records, count_hits = 0, 0 | |
# parse arguments | |
parser = argparse.ArgumentParser() | |
parser.add_argument("ID_file", help="file with IDs to extract OR string of ID to look for") | |
parser.add_argument("Fasta_file", help="file with Fasta sequences") | |
parser.add_argument("--p", "--partial", default=False, action="store_true", help="optional argument that tells the script to only look for partial match between the annotation line in the fasta file and the ID. Default is to NOT look for exact match") | |
parser.add_argument("--i", "--ignore_case", default=False, action="store_true", help="ignore the latter case of the pattern") | |
parser.add_argument("--v", "--verbose", help="increase output verbosity", action="store_true") | |
args = parser.parse_args() | |
pattern = args.ID_file | |
seq_file = args.Fasta_file | |
partial_match_allowed = args.p | |
ignore_case = args.i | |
verbose = args.v | |
# try to open patterns file | |
if os.path.isfile(pattern): | |
# try to open a file with this name | |
pattern_file = open(pattern, 'r') | |
patterns = [] | |
for p in pattern_file.readlines(): | |
p = p.strip() | |
patterns.append(p) | |
else: | |
# if file can not be opened, it is probably an ID to look for | |
patterns = [] | |
patterns.append(pattern.strip()) | |
#print "01", patterns, type(patterns) | |
if verbose: | |
print "Looking for pattern(s):\n", '\n\t'.join(patterns) | |
# if ignore case, make upper case | |
if ignore_case: | |
patterns_upper = [] | |
if len(patterns) > 1: | |
for p in pattern: patterns_upper.append(p.upper()) | |
else: | |
patterns_upper.append(patterns[0].upper()) | |
patterns = patterns_upper | |
#print "02", patterns, type(patterns) | |
# make unique set of all patterns to look for | |
if len(patterns) > 1: | |
patterns = set(patterns) | |
#print "03", patterns, type(patterns) | |
for idline, seq, spacer, quals in yield_fastq(seq_file): | |
if verbose: print "idline: ",idline | |
# make id upper case, if case is ignored | |
id = idline | |
if ignore_case: id = id.upper() | |
# check if pattern is found in idline | |
if id in patterns or id.replace('@', '') in patterns or '@'+id in patterns: | |
sys.stdout.write( '%s\n%s\n%s\n%s\n' % ( idline, seq, spacer, quals ) ) | |
count_hits += 1 | |
else: | |
if partial_match_allowed == True: | |
for pattern in patterns: | |
if pattern in id: | |
#print pattern, id | |
sys.stdout.write( '%s\n%s\n%s\n%s\n' % ( idline, seq, spacer, quals ) ) | |
count_hits += 1 | |
break | |
count_records += 1 | |
if args.v: | |
if count_hits == 1: | |
print "Searched in %s records, found %s hit."%(count_records, count_hits) | |
else: | |
print "Searched in %s records, found %s hits."%(count_records, count_hits) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment