avrilcoghlan/filter_fastq_files_using_trimmomatic.py

## filter_fastq_files_using_trimmomatic.py
import os
import sys

#====================================================================#

def run_trimmomatic_for_subsets_of_data(sample_name, num_subsets):

    # need to run trimmomatic for each subset of the data:
    for x in range(num_subsets):
        subset = x + 1 # eg. if num_subsets is 17, 'subset' goes from 1 to 17
        # the command for trimmomatic is something like this:
        # java -Xmx128M  -jar /software/pathogen/external/apps/usr/local/Trimmomatic-0.33/trimmomatic-0.33.jar PE -threads 1 -phred33 sample1topsmall_1.fastq.gz sample1topsmall_2.fastq.gz sample1topsmallout_1_paired.fastq.gz sample1topsmallout_1_
unpaired.fastq.gz sample1topsmallout_2_paired.fastq.gz sample1topsmallout_2_unpaired.fastq.gz SLIDINGWINDOW:150:20
        input1 = '%s_1_sub_%d.fastq.gz' % (sample_name, subset) # eg. sample1_1_sub_1.fastq.gz
        input2 = '%s_2_sub_%d.fastq.gz' % (sample_name, subset) # eg. sample1_2_sub_1.fastq.gz
        output1 = '%s_1_sub_%d_outpaired.fastq.gz' % (sample_name, subset) # eg. sample1_1_sub_1_outpaired.fastq.gz
        output2 = '%s_1_sub_%d_outunpaired.fastq.gz' % (sample_name, subset) # eg. sample1_1_sub_1_outunpaired.fastq.gz
        output3 = '%s_2_sub_%d_outpaired.fastq.gz' % (sample_name, subset) # eg. sample1_2_sub_1_outpaired.fastq.gz
        output4 = '%s_2_sub_%d_outunpaired.fastq.gz' % (sample_name, subset) # eg. sample1_2_sub_1_outunpaired.fastq.gz
        # this looks at the average base quality in 150-bp windows of each read and discard windows that have average base quality of <20. If the whole read length is 150 bp, it discards
        # the whole read if the average base quality in the read is <20.
        # output1 and output3 have the read-pairs where both reads of a pair pass this quality filter.
        command1 = 'java -Xmx128M  -jar /software/pathogen/external/apps/usr/local/Trimmomatic-0.33/trimmomatic-0.33.jar PE -threads 1 -phred33 %s %s %s %s %s %s SLIDINGWINDOW:150:20' % (input1,input2,output1,output2,output3,output4)
        print("Running",command1)
        os.system(command1)

#====================================================================#

def main():

    # check the command-line arguments:
    if len(sys.argv) != 3:
        print("Usage: %s sample_name num_subsets" % sys.argv[0])
        sys.exit(1)
    sample_name = sys.argv[1] # name for this sample used in the fastq file names, eg. sample1
    num_subsets = int(sys.argv[2]) # number of subsets of data that the fastq for this sample was split into eg. 17

    # run trimmomatic for each of the subsets of the data:
    run_trimmomatic_for_subsets_of_data(sample_name, num_subsets)

#====================================================================#

if __name__=="__main__":
    main()

#====================================================================#
	import os
	import sys

	#====================================================================#

	def run_trimmomatic_for_subsets_of_data(sample_name, num_subsets):

	# need to run trimmomatic for each subset of the data:
	for x in range(num_subsets):
	subset = x + 1 # eg. if num_subsets is 17, 'subset' goes from 1 to 17
	# the command for trimmomatic is something like this:
	# java -Xmx128M -jar /software/pathogen/external/apps/usr/local/Trimmomatic-0.33/trimmomatic-0.33.jar PE -threads 1 -phred33 sample1topsmall_1.fastq.gz sample1topsmall_2.fastq.gz sample1topsmallout_1_paired.fastq.gz sample1topsmallout_1_
	unpaired.fastq.gz sample1topsmallout_2_paired.fastq.gz sample1topsmallout_2_unpaired.fastq.gz SLIDINGWINDOW:150:20
	input1 = '%s_1_sub_%d.fastq.gz' % (sample_name, subset) # eg. sample1_1_sub_1.fastq.gz
	input2 = '%s_2_sub_%d.fastq.gz' % (sample_name, subset) # eg. sample1_2_sub_1.fastq.gz
	output1 = '%s_1_sub_%d_outpaired.fastq.gz' % (sample_name, subset) # eg. sample1_1_sub_1_outpaired.fastq.gz
	output2 = '%s_1_sub_%d_outunpaired.fastq.gz' % (sample_name, subset) # eg. sample1_1_sub_1_outunpaired.fastq.gz
	output3 = '%s_2_sub_%d_outpaired.fastq.gz' % (sample_name, subset) # eg. sample1_2_sub_1_outpaired.fastq.gz
	output4 = '%s_2_sub_%d_outunpaired.fastq.gz' % (sample_name, subset) # eg. sample1_2_sub_1_outunpaired.fastq.gz
	# this looks at the average base quality in 150-bp windows of each read and discard windows that have average base quality of <20. If the whole read length is 150 bp, it discards
	# the whole read if the average base quality in the read is <20.
	# output1 and output3 have the read-pairs where both reads of a pair pass this quality filter.
	command1 = 'java -Xmx128M -jar /software/pathogen/external/apps/usr/local/Trimmomatic-0.33/trimmomatic-0.33.jar PE -threads 1 -phred33 %s %s %s %s %s %s SLIDINGWINDOW:150:20' % (input1,input2,output1,output2,output3,output4)
	print("Running",command1)
	os.system(command1)

	#====================================================================#

	def main():

	# check the command-line arguments:
	if len(sys.argv) != 3:
	print("Usage: %s sample_name num_subsets" % sys.argv[0])
	sys.exit(1)
	sample_name = sys.argv[1] # name for this sample used in the fastq file names, eg. sample1
	num_subsets = int(sys.argv[2]) # number of subsets of data that the fastq for this sample was split into eg. 17

	# run trimmomatic for each of the subsets of the data:
	run_trimmomatic_for_subsets_of_data(sample_name, num_subsets)

	#====================================================================#

	if __name__=="__main__":
	main()

	#====================================================================#