Skip to content

Instantly share code, notes, and snippets.

@avrilcoghlan
Created October 26, 2018 12:06
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save avrilcoghlan/e867c1c95c84152909358cfe1dcde1a3 to your computer and use it in GitHub Desktop.
Save avrilcoghlan/e867c1c95c84152909358cfe1dcde1a3 to your computer and use it in GitHub Desktop.
Script to run Trimmomatic to discard read-pairs that have low quality bases
import os
import sys
#====================================================================#
def run_trimmomatic_for_subsets_of_data(sample_name, num_subsets):
# need to run trimmomatic for each subset of the data:
for x in range(num_subsets):
subset = x + 1 # eg. if num_subsets is 17, 'subset' goes from 1 to 17
# the command for trimmomatic is something like this:
# java -Xmx128M -jar /software/pathogen/external/apps/usr/local/Trimmomatic-0.33/trimmomatic-0.33.jar PE -threads 1 -phred33 sample1topsmall_1.fastq.gz sample1topsmall_2.fastq.gz sample1topsmallout_1_paired.fastq.gz sample1topsmallout_1_
unpaired.fastq.gz sample1topsmallout_2_paired.fastq.gz sample1topsmallout_2_unpaired.fastq.gz SLIDINGWINDOW:150:20
input1 = '%s_1_sub_%d.fastq.gz' % (sample_name, subset) # eg. sample1_1_sub_1.fastq.gz
input2 = '%s_2_sub_%d.fastq.gz' % (sample_name, subset) # eg. sample1_2_sub_1.fastq.gz
output1 = '%s_1_sub_%d_outpaired.fastq.gz' % (sample_name, subset) # eg. sample1_1_sub_1_outpaired.fastq.gz
output2 = '%s_1_sub_%d_outunpaired.fastq.gz' % (sample_name, subset) # eg. sample1_1_sub_1_outunpaired.fastq.gz
output3 = '%s_2_sub_%d_outpaired.fastq.gz' % (sample_name, subset) # eg. sample1_2_sub_1_outpaired.fastq.gz
output4 = '%s_2_sub_%d_outunpaired.fastq.gz' % (sample_name, subset) # eg. sample1_2_sub_1_outunpaired.fastq.gz
# this looks at the average base quality in 150-bp windows of each read and discard windows that have average base quality of <20. If the whole read length is 150 bp, it discards
# the whole read if the average base quality in the read is <20.
# output1 and output3 have the read-pairs where both reads of a pair pass this quality filter.
command1 = 'java -Xmx128M -jar /software/pathogen/external/apps/usr/local/Trimmomatic-0.33/trimmomatic-0.33.jar PE -threads 1 -phred33 %s %s %s %s %s %s SLIDINGWINDOW:150:20' % (input1,input2,output1,output2,output3,output4)
print("Running",command1)
os.system(command1)
#====================================================================#
def main():
# check the command-line arguments:
if len(sys.argv) != 3:
print("Usage: %s sample_name num_subsets" % sys.argv[0])
sys.exit(1)
sample_name = sys.argv[1] # name for this sample used in the fastq file names, eg. sample1
num_subsets = int(sys.argv[2]) # number of subsets of data that the fastq for this sample was split into eg. 17
# run trimmomatic for each of the subsets of the data:
run_trimmomatic_for_subsets_of_data(sample_name, num_subsets)
#====================================================================#
if __name__=="__main__":
main()
#====================================================================#
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment