Script to run CRISPResso jobs on a farm, for lots of subsets of data
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import os
import subprocess
import sys
#====================================================================#
def submit_crispresso_jobs(sample_name, num_subsets):
    """Submit one CRISPResso job to the farm (via bsub) per subset of a sample.

    For each subset number from 1 to num_subsets inclusive, the paired
    fastq files '<sample_name>_1_sub_<n>_outpaired.fastq.gz' and
    '<sample_name>_2_sub_<n>_outpaired.fastq.gz' are handed to CRISPResso,
    with results written to 'crispresso_<sample_name>_<n>' under the
    current working directory.

    Args:
        sample_name: name for this sample used in the fastq file names,
            eg. sample1
        num_subsets: number of subsets the fastq data for this sample was
            split into, eg. 17; a value below 1 submits nothing.

    Returns:
        None. A submission that cannot be started, or for which bsub exits
        with a non-zero status, is reported on stderr and the remaining
        subsets are still submitted.
    """
    # These settings are the same for every subset, so set them once.
    # xxx need to put your amplicon sequence here:
    amplicon = ""
    # xxx need to put your gRNA sequence here:
    gRNA = ""
    # xxx need to put your coding sequence here:
    coding_seq = ""
    # get the current directory:
    current_dir = os.getcwd()

    # need to submit a crispresso job for each subset of the data:
    # eg. if num_subsets is 17, 'subset' goes from 1 to 17
    for subset in range(1, num_subsets + 1):
        # we are using paired read-pairs that passed a base quality
        # filtering step (run using trimmomatic)
        # eg. sample1_1_sub_1_outpaired.fastq.gz
        fastq_file1 = '%s_1_sub_%d_outpaired.fastq.gz' % (sample_name, subset)
        # eg. sample1_2_sub_1_outpaired.fastq.gz
        fastq_file2 = '%s_2_sub_%d_outpaired.fastq.gz' % (sample_name, subset)
        bsub_out = "bsub_out_crispresso_%s_%d" % (sample_name, subset)
        bsub_err = "bsub_err_crispresso_%s_%d" % (sample_name, subset)
        output_dir = "%s/crispresso_%s_%d" % (current_dir, sample_name, subset)
        job_name = "%s_%d_crispresso" % (sample_name, subset)

        # the primer lengths are 21/22 bp for our amplicon, hard-coded
        # values for --exclude_bp_from_left and --exclude_bp_from_right
        # into the script here xxx
        crispresso_args = [
            "/nfs/team87/farm3_lims2_vms/software/python_local/bin/CRISPResso",
            "-w", "500",
            "-r1", fastq_file1,
            "-r2", fastq_file2,
            "-a", amplicon,
            "-g", gRNA,
            "-o", output_dir,
            "--exclude_bp_from_left", "22",
            "--exclude_bp_from_right", "22",
            "-c", coding_seq,
        ]
        # Every bsub option, including -J, goes before the job command so
        # that it cannot be taken as an argument to CRISPResso.
        bsub_args = [
            "bsub",
            "-q", "long",
            "-o", bsub_out,
            "-e", bsub_err,
            "-R", "select[mem>50000] rusage[mem=50000]",
            "-M50000",
            "-p", "8",
            "-J", job_name,
        ]

        # Pass an argument list rather than a shell string, so that names
        # containing spaces or shell metacharacters are not reinterpreted
        # by a local shell.
        try:
            status = subprocess.call(bsub_args + crispresso_args)
        except OSError as err:
            # eg. bsub is not installed or not on the PATH
            sys.stderr.write("Could not run bsub for job %s: %s\n"
                             % (job_name, err))
            continue
        if status != 0:
            sys.stderr.write("bsub exited with status %d for job %s\n"
                             % (status, job_name))
#====================================================================#
def main():
    """Read the command-line arguments and submit the CRISPResso jobs.

    Expects exactly two arguments after the script name: sample_name and
    num_subsets. If the argument count is wrong, or num_subsets is not an
    integer of at least 1, a message is printed and the script exits with
    status 1 without submitting anything.
    """
    usage = "Usage: %s sample_name num_subsets" % sys.argv[0]

    # check the command-line arguments:
    if len(sys.argv) != 3:
        print(usage)
        sys.exit(1)

    # name for this sample used in the fastq file names, eg. sample1
    sample_name = sys.argv[1]

    # number of subsets of data that the fastq for this sample was split
    # into eg. 17
    try:
        num_subsets = int(sys.argv[2])
    except ValueError:
        # report bad input with the usage line instead of a traceback
        sys.stderr.write("Error: num_subsets must be an integer, got '%s'\n"
                         % sys.argv[2])
        print(usage)
        sys.exit(1)
    if num_subsets < 1:
        # nothing would be submitted, which is almost certainly a mistake
        sys.stderr.write("Error: num_subsets must be at least 1, got %d\n"
                         % num_subsets)
        print(usage)
        sys.exit(1)

    # submit crispresso jobs for each subset of the data:
    submit_crispresso_jobs(sample_name, num_subsets)
#====================================================================#
# Run main() only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()
#====================================================================#
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment