Created
August 13, 2013 22:42
-
-
Save jonesken/6226417 to your computer and use it in GitHub Desktop.
This is a script to trim the low quality tails off of Illumina fastQ data, removes sequences that are trimmed below 75bp, and finally writes out the first 5 million pairs of reads that pass QC.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import os, sys | |
from Bio import SeqIO | |
from Bio.SeqIO.QualityIO import * | |
import itertools | |
filename1 = sys.argv[1] ## Read in $File from .SH script | |
good_pairs = 0 | |
count = 0 | |
filename2 = filename1[:filename1.find("_R1_")] + "_R2_" + filename1[filename1.find("_R1_")+4:] ## take filename1 split it by the "_R1_" and replace it with "_R2" | |
f_iter = FastqGeneralIterator(open(filename1,"rU")) | |
r_iter = FastqGeneralIterator(open(filename2,"rU")) | |
print filename1 | |
print filename2 | |
outfilename1 = filename1[:-6] + "_filtered.fastq" ## take filename1...remove ".fastq" and replace with "_filtered.fastq" | |
outfilename2 = filename2[:-6] + "_filtered.fastq" ## take filename2...remove ".fastq" and replace with "_filtered.fastq" | |
outfile1 = open(outfilename1, "w") | |
outfile2 = open(outfilename2, "w") | |
for (f_id, seq1, qual1), (r_id, seq2, qual2) in itertools.izip(f_iter,r_iter): | |
good_seq1 = 0 | |
good_seq2 = 0 | |
try: | |
while qual1[len(qual1)-1] == "#" or qual1[len(qual1)-2] == "#" or qual1[len(qual1)-3] == "#": | |
seq1 = seq1[:len(seq1)-1] | |
qual1 = qual1[:len(qual1)-1] | |
if len(seq1) >= 75: | |
good_seq1 = 1 | |
while qual2[len(qual2)-1] == "#" or qual2[len(qual2)-2] == "#" or qual2[len(qual2)-3] == "#": | |
seq2 = seq2[:len(seq2)-1] | |
qual2 = qual2[:len(qual2)-1] | |
if len(seq2) >= 75: | |
good_seq2 = 1 | |
if good_seq1 == 1 and good_seq2 == 1 and good_pairs < 5000000: | |
good_pairs = good_pairs + 1 | |
outfile1.write("@") | |
outfile1.write(f_id) | |
outfile1.write("\n") | |
outfile1.write(seq1) | |
outfile1.write("\n") | |
outfile1.write("+") | |
outfile1.write("\n") | |
outfile1.write(qual1) | |
outfile1.write("\n") | |
outfile2.write("@") | |
outfile2.write(r_id) | |
outfile2.write("\n") | |
outfile2.write(seq2) | |
outfile2.write("\n") | |
outfile2.write("+") | |
outfile2.write("\n") | |
outfile2.write(qual2) | |
outfile2.write("\n") | |
except: | |
bad_seq = 1 | |
f_iter.close() | |
r_iter.close() | |
outfile1.close() | |
outfile2.close() | |
print "Good_pairs = ", good_pairs | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment