Skip to content

Instantly share code, notes, and snippets.

@Tabea-K
Last active June 1, 2016 12:37
Show Gist options
  • Save Tabea-K/9fc01daab8e3c3b18002 to your computer and use it in GitHub Desktop.
Save Tabea-K/9fc01daab8e3c3b18002 to your computer and use it in GitHub Desktop.
Script to search for specific strings in the annotation lines of fastq files. Similar to grep, therefore the name!
#!/usr/bin/env python
'''
This script extracts specific fastq sequences
from a multi-fastq sequence file, with the ID's
of the sequences to be extracted in a txt file.
'''
import sys
import argparse
import os
def yield_fastq(filename):
"""
Yields a tuple of idline, seq, spacer and quals
from a fastq file
"""
fh = open(filename, 'r')
for line in fh:
idline=line.strip()
seq =next(fh).strip()
spacer=next(fh).strip()
quals =next(fh).strip()
yield idline, seq, spacer, quals
# MAIN
if __name__ == "__main__":
count_records, count_hits = 0, 0
# parse arguments
parser = argparse.ArgumentParser()
parser.add_argument("ID_file", help="file with IDs to extract OR string of ID to look for")
parser.add_argument("Fasta_file", help="file with Fasta sequences")
parser.add_argument("--p", "--partial", default=False, action="store_true", help="optional argument that tells the script to only look for partial match between the annotation line in the fasta file and the ID. Default is to NOT look for exact match")
parser.add_argument("--i", "--ignore_case", default=False, action="store_true", help="ignore the latter case of the pattern")
parser.add_argument("--v", "--verbose", help="increase output verbosity", action="store_true")
args = parser.parse_args()
pattern = args.ID_file
seq_file = args.Fasta_file
partial_match_allowed = args.p
ignore_case = args.i
verbose = args.v
# try to open patterns file
if os.path.isfile(pattern):
# try to open a file with this name
pattern_file = open(pattern, 'r')
patterns = []
for p in pattern_file.readlines():
p = p.strip()
patterns.append(p)
else:
# if file can not be opened, it is probably an ID to look for
patterns = []
patterns.append(pattern.strip())
#print "01", patterns, type(patterns)
if verbose:
print "Looking for pattern(s):\n", '\n\t'.join(patterns)
# if ignore case, make upper case
if ignore_case:
patterns_upper = []
if len(patterns) > 1:
for p in pattern: patterns_upper.append(p.upper())
else:
patterns_upper.append(patterns[0].upper())
patterns = patterns_upper
#print "02", patterns, type(patterns)
# make unique set of all patterns to look for
if len(patterns) > 1:
patterns = set(patterns)
#print "03", patterns, type(patterns)
for idline, seq, spacer, quals in yield_fastq(seq_file):
if verbose: print "idline: ",idline
# make id upper case, if case is ignored
id = idline
if ignore_case: id = id.upper()
# check if pattern is found in idline
if id in patterns or id.replace('@', '') in patterns or '@'+id in patterns:
sys.stdout.write( '%s\n%s\n%s\n%s\n' % ( idline, seq, spacer, quals ) )
count_hits += 1
else:
if partial_match_allowed == True:
for pattern in patterns:
if pattern in id:
#print pattern, id
sys.stdout.write( '%s\n%s\n%s\n%s\n' % ( idline, seq, spacer, quals ) )
count_hits += 1
break
count_records += 1
if args.v:
if count_hits == 1:
print "Searched in %s records, found %s hit."%(count_records, count_hits)
else:
print "Searched in %s records, found %s hits."%(count_records, count_hits)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment