Skip to content

Instantly share code, notes, and snippets.

Created July 16, 2015 11:51
Show Gist options
  • Star 2 You must be signed in to star a gist
  • Fork 1 You must be signed in to fork a gist
  • Save walterst/2c592044b3b9e44a4290 to your computer and use it in GitHub Desktop.
Save walterst/2c592044b3b9e44a4290 to your computer and use it in GitHub Desktop.
See USAGE text below. The purpose of the script is to find forward/reverse primers in an input fastq file, and remove everything before/after these primers.
#!/usr/bin/env python
# USAGE: Mapping_file input_fasta output_fasta log_filename
from sys import argv
from string import upper
from re import compile
from skbio.parse.sequences import parse_fastq
from skbio.sequence import DNA
from skbio.format.sequences import format_fastq_record
from qiime.check_id_map import process_id_map
def get_primers(header,
""" Returns lists of forward/reverse primer regular expression generators
header: list of strings of header data.
mapping_data: list of lists of mapping data
Will raise error if either the LinkerPrimerSequence or ReversePrimer fields
are not present
if "LinkerPrimerSequence" in header:
primer_ix = header.index("LinkerPrimerSequence")
raise IndexError(
("Mapping file is missing LinkerPrimerSequence field."))
if "ReversePrimer" in header:
rev_primer_ix = header.index("ReversePrimer")
raise IndexError(("Mapping file is missing ReversePrimer field."))
iupac = {'A': 'A', 'T': 'T', 'G': 'G', 'C': 'C', 'R': '[AG]', 'Y': '[CT]',
'S': '[GC]', 'W': '[AT]', 'K': '[GT]', 'M': '[AC]', 'B': '[CGT]',
'D': '[AGT]', 'H': '[ACT]', 'V': '[ACG]', 'N': '[ACGT]'}
raw_forward_primers = set([])
raw_reverse_primers = set([])
for line in mapping_data:
# Split on commas to handle pool of primers
raw_forward_primers.update([upper(primer).strip() for
primer in line[primer_ix].split(',')])
raw_reverse_primers.update([upper(str(DNA(primer).rc())) for
primer in line[rev_primer_ix].split(',')])
if not raw_forward_primers:
raise ValueError(("No forward primers detected in mapping file."))
if not raw_reverse_primers:
raise ValueError(("No reverse primers detected in mapping file."))
forward_primers = []
reverse_primers = []
for curr_primer in raw_forward_primers:
forward_primers.append(compile(''.join([iupac[symbol] for
symbol in curr_primer])))
for curr_primer in raw_reverse_primers:
reverse_primers.append(compile(''.join([iupac[symbol] for
symbol in curr_primer])))
return forward_primers, reverse_primers
map_fp = open(argv[1], "U")
header, mapping_data, run_description, errors, warnings = process_id_map(map_fp)
forward_primers, reverse_primers = get_primers(header, mapping_data)
seqs = open(argv[2], "U")
out_seqs = open(argv[3], "w")
log_out = open(argv[4], "w")
f_count = 0
r_count = 0
no_seq_left = 0
for label,seq,qual in parse_fastq(seqs):
start_slice = 0
end_slice = -1
for curr_primer in forward_primers:
start_slice = int([1])
f_count += 1
for curr_primer in reverse_primers:
end_slice = int([0])
r_count += 1
curr_seq = seq[start_slice:end_slice]
curr_qual = qual[start_slice:end_slice]
if len(curr_seq) < 1:
no_seq_left += 1
formatted_fastq_line = format_fastq_record(label, curr_seq, curr_qual)
out_seqs.write("%s" % (formatted_fastq_line))
log_out.write("Forward primer hits: %d\n" % f_count)
log_out.write("Reverse primer hits: %d\n" % r_count)
log_out.write("No seq left after truncation: %d" % no_seq_left)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment