Script to run Trimmomatic to discard read-pairs that have low quality bases
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import os | |
import sys | |
#====================================================================# | |
def run_trimmomatic_for_subsets_of_data(sample_name, num_subsets): | |
# need to run trimmomatic for each subset of the data: | |
for x in range(num_subsets): | |
subset = x + 1 # eg. if num_subsets is 17, 'subset' goes from 1 to 17 | |
# the command for trimmomatic is something like this: | |
# java -Xmx128M -jar /software/pathogen/external/apps/usr/local/Trimmomatic-0.33/trimmomatic-0.33.jar PE -threads 1 -phred33 sample1topsmall_1.fastq.gz sample1topsmall_2.fastq.gz sample1topsmallout_1_paired.fastq.gz sample1topsmallout_1_ | |
unpaired.fastq.gz sample1topsmallout_2_paired.fastq.gz sample1topsmallout_2_unpaired.fastq.gz SLIDINGWINDOW:150:20 | |
input1 = '%s_1_sub_%d.fastq.gz' % (sample_name, subset) # eg. sample1_1_sub_1.fastq.gz | |
input2 = '%s_2_sub_%d.fastq.gz' % (sample_name, subset) # eg. sample1_2_sub_1.fastq.gz | |
output1 = '%s_1_sub_%d_outpaired.fastq.gz' % (sample_name, subset) # eg. sample1_1_sub_1_outpaired.fastq.gz | |
output2 = '%s_1_sub_%d_outunpaired.fastq.gz' % (sample_name, subset) # eg. sample1_1_sub_1_outunpaired.fastq.gz | |
output3 = '%s_2_sub_%d_outpaired.fastq.gz' % (sample_name, subset) # eg. sample1_2_sub_1_outpaired.fastq.gz | |
output4 = '%s_2_sub_%d_outunpaired.fastq.gz' % (sample_name, subset) # eg. sample1_2_sub_1_outunpaired.fastq.gz | |
# this looks at the average base quality in 150-bp windows of each read and discard windows that have average base quality of <20. If the whole read length is 150 bp, it discards | |
# the whole read if the average base quality in the read is <20. | |
# output1 and output3 have the read-pairs where both reads of a pair pass this quality filter. | |
command1 = 'java -Xmx128M -jar /software/pathogen/external/apps/usr/local/Trimmomatic-0.33/trimmomatic-0.33.jar PE -threads 1 -phred33 %s %s %s %s %s %s SLIDINGWINDOW:150:20' % (input1,input2,output1,output2,output3,output4) | |
print("Running",command1) | |
os.system(command1) | |
#====================================================================# | |
def main(): | |
# check the command-line arguments: | |
if len(sys.argv) != 3: | |
print("Usage: %s sample_name num_subsets" % sys.argv[0]) | |
sys.exit(1) | |
sample_name = sys.argv[1] # name for this sample used in the fastq file names, eg. sample1 | |
num_subsets = int(sys.argv[2]) # number of subsets of data that the fastq for this sample was split into eg. 17 | |
# run trimmomatic for each of the subsets of the data: | |
run_trimmomatic_for_subsets_of_data(sample_name, num_subsets) | |
#====================================================================# | |
if __name__=="__main__": | |
main() | |
#====================================================================# |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment