# avrilcoghlan/submit_crispresso_jobs_for_subsetsoffastq.py

## submit_crispresso_jobs_for_subsetsoffastq.py
import os
import sys

#====================================================================#

def submit_crispresso_jobs(sample_name, num_subsets):
    """Submit one CRISPResso job, via 'bsub', for each subset of a sample's reads.

    For each subset number from 1 to num_subsets, this builds a 'bsub' command
    that runs CRISPResso on the fastq files
    '<sample_name>_1_sub_<subset>_outpaired.fastq.gz' and
    '<sample_name>_2_sub_<subset>_outpaired.fastq.gz', writing the CRISPResso
    output to 'crispresso_<sample_name>_<subset>' under the current directory,
    and runs that command with os.system().

    Args:
        sample_name: name for this sample used in the fastq file names, eg. sample1
        num_subsets: number of subsets that the fastq data for this sample was
            split into, eg. 17. If it is less than 1, no jobs are submitted.

    Returns:
        None. If a 'bsub' command returns a non-zero status, a warning is
        written to stderr and the remaining subsets are still submitted.
    """

    # xxx need to put your amplicon sequence here:
    amplicon = ""
    # xxx need to put your gRNA sequence here:
    gRNA = ""
    # xxx need to put your coding sequence here:
    coding_seq = ""
    # the current directory is the same for every subset, so only look it up once:
    current_dir = os.getcwd()

    # need to submit a crispresso job for each subset of the data:
    for subset in range(1, num_subsets + 1): # eg. if num_subsets is 17, 'subset' goes from 1 to 17
        # we are using paired read-pairs that passed a base quality filtering step (run using trimmomatic)
        fastq_file1 = '%s_1_sub_%d_outpaired.fastq.gz' % (sample_name, subset) # eg. sample1_1_sub_1_outpaired.fastq.gz
        fastq_file2 = '%s_2_sub_%d_outpaired.fastq.gz' % (sample_name, subset) # eg. sample1_2_sub_1_outpaired.fastq.gz
        # now submit a crispresso job:
        bsub_out = "bsub_out_crispresso_%s_%d" % (sample_name, subset)
        bsub_err = "bsub_err_crispresso_%s_%d" % (sample_name, subset)
        output_dir = "%s/crispresso_%s_%d" % (current_dir, sample_name, subset)
        # the primer lengths are 21/22 bp for our amplicon, hard-coded values for --exclude_bp_from_left and --exclude_bp_from_right into the script here xxx
        command1 = ("/nfs/team87/farm3_lims2_vms/software/python_local/bin/CRISPResso"
                    " -w 500 -r1 %s -r2 %s -a %s -g %s -o %s"
                    " --exclude_bp_from_left 22 --exclude_bp_from_right 22 -c %s"
                    % (fastq_file1, fastq_file2, amplicon, gRNA, output_dir, coding_seq))
        job_name = "%s_%d_crispresso" % (sample_name, subset)
        # all of the bsub options (including -J) have to come before the command to run,
        # because bsub treats everything from the command onwards as the job's own command line:
        command2 = ('bsub -q long -o %s -e %s -R "select[mem>50000] rusage[mem=50000]"'
                    ' -M50000 -p 8 -J %s %s'
                    % (bsub_out, bsub_err, job_name, command1))
        status = os.system(command2)
        if status != 0:
            # do not stop here, so that the other subsets still get submitted:
            sys.stderr.write("Warning: bsub returned status %d for job %s\n" % (status, job_name))

#====================================================================#

def main():
    """Read the command-line arguments and submit the CRISPResso jobs.

    Expects exactly two command-line arguments, sample_name and num_subsets.
    If the number of arguments is wrong, a usage line is printed and the
    script exits with status 1. If num_subsets is not an integer, or is less
    than 1, the script exits with an error message (written to stderr by
    sys.exit) and a non-zero status.
    """

    # check the command-line arguments:
    if len(sys.argv) != 3:
        print("Usage: %s sample_name num_subsets" % sys.argv[0])
        sys.exit(1)
    sample_name = sys.argv[1] # name for this sample used in the fastq file names, eg. sample1
    # number of subsets of data that the fastq for this sample was split into eg. 17
    try:
        num_subsets = int(sys.argv[2])
    except ValueError:
        # give a readable message rather than a traceback:
        sys.exit("Error: num_subsets must be an integer, but got '%s'" % sys.argv[2])
    if num_subsets < 1:
        # no jobs would be submitted for zero or negative values, so treat this as a mistake:
        sys.exit("Error: num_subsets must be at least 1, but got %d" % num_subsets)

    # submit crispresso jobs for each subset of the data:
    submit_crispresso_jobs(sample_name, num_subsets)

#====================================================================#

# only call main() when this file is run as a script, not when it is imported:
if __name__ == "__main__":
    main()

#====================================================================#
	import os
	import sys

	#====================================================================#

	def submit_crispresso_jobs(sample_name, num_subsets):
		"""Submit one CRISPResso job, via 'bsub', for each subset of a sample's reads.

		For each subset number from 1 to num_subsets, this builds a 'bsub' command
		that runs CRISPResso on the fastq files
		'<sample_name>_1_sub_<subset>_outpaired.fastq.gz' and
		'<sample_name>_2_sub_<subset>_outpaired.fastq.gz', writing the CRISPResso
		output to 'crispresso_<sample_name>_<subset>' under the current directory,
		and runs that command with os.system().

		Args:
			sample_name: name for this sample used in the fastq file names, eg. sample1
			num_subsets: number of subsets that the fastq data for this sample was
				split into, eg. 17. If it is less than 1, no jobs are submitted.

		Returns:
			None. If a 'bsub' command returns a non-zero status, a warning is
			written to stderr and the remaining subsets are still submitted.
		"""

		# xxx need to put your amplicon sequence here:
		amplicon = ""
		# xxx need to put your gRNA sequence here:
		gRNA = ""
		# xxx need to put your coding sequence here:
		coding_seq = ""
		# the current directory is the same for every subset, so only look it up once:
		current_dir = os.getcwd()

		# need to submit a crispresso job for each subset of the data:
		for subset in range(1, num_subsets + 1): # eg. if num_subsets is 17, 'subset' goes from 1 to 17
			# we are using paired read-pairs that passed a base quality filtering step (run using trimmomatic)
			fastq_file1 = '%s_1_sub_%d_outpaired.fastq.gz' % (sample_name, subset) # eg. sample1_1_sub_1_outpaired.fastq.gz
			fastq_file2 = '%s_2_sub_%d_outpaired.fastq.gz' % (sample_name, subset) # eg. sample1_2_sub_1_outpaired.fastq.gz
			# now submit a crispresso job:
			bsub_out = "bsub_out_crispresso_%s_%d" % (sample_name, subset)
			bsub_err = "bsub_err_crispresso_%s_%d" % (sample_name, subset)
			output_dir = "%s/crispresso_%s_%d" % (current_dir, sample_name, subset)
			# the primer lengths are 21/22 bp for our amplicon, hard-coded values for --exclude_bp_from_left and --exclude_bp_from_right into the script here xxx
			command1 = ("/nfs/team87/farm3_lims2_vms/software/python_local/bin/CRISPResso"
				" -w 500 -r1 %s -r2 %s -a %s -g %s -o %s"
				" --exclude_bp_from_left 22 --exclude_bp_from_right 22 -c %s"
				% (fastq_file1, fastq_file2, amplicon, gRNA, output_dir, coding_seq))
			job_name = "%s_%d_crispresso" % (sample_name, subset)
			# all of the bsub options (including -J) have to come before the command to run,
			# because bsub treats everything from the command onwards as the job's own command line:
			command2 = ('bsub -q long -o %s -e %s -R "select[mem>50000] rusage[mem=50000]"'
				' -M50000 -p 8 -J %s %s'
				% (bsub_out, bsub_err, job_name, command1))
			status = os.system(command2)
			if status != 0:
				# do not stop here, so that the other subsets still get submitted:
				sys.stderr.write("Warning: bsub returned status %d for job %s\n" % (status, job_name))

	#====================================================================#

	def main():
		"""Read the command-line arguments and submit the CRISPResso jobs.

		Expects exactly two command-line arguments, sample_name and num_subsets.
		If the number of arguments is wrong, a usage line is printed and the
		script exits with status 1. If num_subsets is not an integer, or is less
		than 1, the script exits with an error message (written to stderr by
		sys.exit) and a non-zero status.
		"""

		# check the command-line arguments:
		if len(sys.argv) != 3:
			print("Usage: %s sample_name num_subsets" % sys.argv[0])
			sys.exit(1)
		sample_name = sys.argv[1] # name for this sample used in the fastq file names, eg. sample1
		# number of subsets of data that the fastq for this sample was split into eg. 17
		try:
			num_subsets = int(sys.argv[2])
		except ValueError:
			# give a readable message rather than a traceback:
			sys.exit("Error: num_subsets must be an integer, but got '%s'" % sys.argv[2])
		if num_subsets < 1:
			# no jobs would be submitted for zero or negative values, so treat this as a mistake:
			sys.exit("Error: num_subsets must be at least 1, but got %d" % num_subsets)

		# submit crispresso jobs for each subset of the data:
		submit_crispresso_jobs(sample_name, num_subsets)

	#====================================================================#

	# only call main() when this file is run as a script, not when it is imported:
	if __name__ == "__main__":
		main()

	#====================================================================#