Skip to content

Instantly share code, notes, and snippets.

@jonesken
Created August 13, 2013 22:42
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save jonesken/6226417 to your computer and use it in GitHub Desktop.
Save jonesken/6226417 to your computer and use it in GitHub Desktop.
This is a script to trim the low quality tails off of Illumina fastQ data, removes sequences that are trimmed below 75bp, and finally writes out the first 5 million pairs of reads that pass QC.
import os, sys
from Bio import SeqIO
from Bio.SeqIO.QualityIO import *
import itertools
filename1 = sys.argv[1] ## Read in $File from .SH script
good_pairs = 0
count = 0
filename2 = filename1[:filename1.find("_R1_")] + "_R2_" + filename1[filename1.find("_R1_")+4:] ## take filename1 split it by the "_R1_" and replace it with "_R2"
f_iter = FastqGeneralIterator(open(filename1,"rU"))
r_iter = FastqGeneralIterator(open(filename2,"rU"))
print filename1
print filename2
outfilename1 = filename1[:-6] + "_filtered.fastq" ## take filename1...remove ".fastq" and replace with "_filtered.fastq"
outfilename2 = filename2[:-6] + "_filtered.fastq" ## take filename2...remove ".fastq" and replace with "_filtered.fastq"
outfile1 = open(outfilename1, "w")
outfile2 = open(outfilename2, "w")
for (f_id, seq1, qual1), (r_id, seq2, qual2) in itertools.izip(f_iter,r_iter):
good_seq1 = 0
good_seq2 = 0
try:
while qual1[len(qual1)-1] == "#" or qual1[len(qual1)-2] == "#" or qual1[len(qual1)-3] == "#":
seq1 = seq1[:len(seq1)-1]
qual1 = qual1[:len(qual1)-1]
if len(seq1) >= 75:
good_seq1 = 1
while qual2[len(qual2)-1] == "#" or qual2[len(qual2)-2] == "#" or qual2[len(qual2)-3] == "#":
seq2 = seq2[:len(seq2)-1]
qual2 = qual2[:len(qual2)-1]
if len(seq2) >= 75:
good_seq2 = 1
if good_seq1 == 1 and good_seq2 == 1 and good_pairs < 5000000:
good_pairs = good_pairs + 1
outfile1.write("@")
outfile1.write(f_id)
outfile1.write("\n")
outfile1.write(seq1)
outfile1.write("\n")
outfile1.write("+")
outfile1.write("\n")
outfile1.write(qual1)
outfile1.write("\n")
outfile2.write("@")
outfile2.write(r_id)
outfile2.write("\n")
outfile2.write(seq2)
outfile2.write("\n")
outfile2.write("+")
outfile2.write("\n")
outfile2.write(qual2)
outfile2.write("\n")
except:
bad_seq = 1
f_iter.close()
r_iter.close()
outfile1.close()
outfile2.close()
print "Good_pairs = ", good_pairs
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment