Skip to content

Instantly share code, notes, and snippets.

Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save avrilcoghlan/5798bdc25a6c5fbf673c69491822bf7f to your computer and use it in GitHub Desktop.
Save avrilcoghlan/5798bdc25a6c5fbf673c69491822bf7f to your computer and use it in GitHub Desktop.
Script to submit CRISPResso jobs to a compute farm (via bsub), one job per subset of a sample's data
import os
import sys
#====================================================================#
def submit_crispresso_jobs(sample_name, num_subsets, amplicon="", guide_rna="", coding_seq=""):
    """Submit one CRISPResso job (via bsub) for each subset of a sample's data.

    For each subset number i in 1..num_subsets the job reads the paired fastq
    files '<sample_name>_1_sub_<i>_outpaired.fastq.gz' and
    '<sample_name>_2_sub_<i>_outpaired.fastq.gz' (read-pairs that passed a base
    quality filtering step, run using trimmomatic) and writes its results to
    '<current directory>/crispresso_<sample_name>_<i>'.

    Args:
        sample_name: name for this sample used in the fastq file names, eg. sample1
        num_subsets: number of subsets the fastq data was split into, eg. 17
        amplicon: amplicon sequence, passed to CRISPResso with -a.
        guide_rna: gRNA sequence, passed to CRISPResso with -g.
        coding_seq: coding sequence, passed to CRISPResso with -c.
            xxx the three sequence defaults are empty placeholders: pass your
            own sequences (or edit the defaults) before running real jobs.

    Returns:
        A list with the value returned by os.system() for each bsub call, in
        subset order. A non-zero entry means that bsub command failed.
    """
    # the working directory does not change between subsets, so look it up once:
    current_dir = os.getcwd()
    statuses = []
    # need to submit a crispresso job for each subset of the data:
    for subset in range(1, num_subsets + 1):  # eg. if num_subsets is 17, 'subset' goes from 1 to 17
        fastq_file1 = '%s_1_sub_%d_outpaired.fastq.gz' % (sample_name, subset)  # eg. sample1_1_sub_1_outpaired.fastq.gz
        fastq_file2 = '%s_2_sub_%d_outpaired.fastq.gz' % (sample_name, subset)  # eg. sample1_2_sub_1_outpaired.fastq.gz
        bsub_out = "bsub_out_crispresso_%s_%d" % (sample_name, subset)
        bsub_err = "bsub_err_crispresso_%s_%d" % (sample_name, subset)
        output_dir = "%s/crispresso_%s_%d" % (current_dir, sample_name, subset)
        # the primer lengths are 21/22 bp for our amplicon, hard-coded values for
        # --exclude_bp_from_left and --exclude_bp_from_right into the script here xxx
        command1 = "/nfs/team87/farm3_lims2_vms/software/python_local/bin/CRISPResso -w 500 -r1 %s -r2 %s -a %s -g %s -o %s --exclude_bp_from_left 22 --exclude_bp_from_right 22 -c %s" % (fastq_file1, fastq_file2, amplicon, guide_rna, output_dir, coding_seq)
        job_name = "%s_%d_crispresso" % (sample_name, subset)
        # bsub options must come before the job command: anything placed after
        # command1 would be handed to CRISPResso as an argument, so -J goes first.
        # NOTE(review): '-p 8' is given to bsub here, not to CRISPResso - confirm
        # that is the intended option.
        command2 = 'bsub -q long -o %s -e %s -R "select[mem>50000] rusage[mem=50000]" -M50000 -p 8 -J %s %s' % (bsub_out, bsub_err, job_name, command1)
        status = os.system(command2)
        if status != 0:
            # keep going so the remaining subsets are still submitted, but say so
            sys.stderr.write("WARNING: bsub returned status %d for job %s\n" % (status, job_name))
        statuses.append(status)
    return statuses
#====================================================================#
def main():
    """Read sample_name and num_subsets from the command line and submit the jobs.

    Exits with status 1 after printing a message when the number of arguments
    is wrong, or when num_subsets is not a positive integer.
    """
    # check the command-line arguments:
    if len(sys.argv) != 3:
        print("Usage: %s sample_name num_subsets" % sys.argv[0])
        sys.exit(1)
    sample_name = sys.argv[1]  # name for this sample used in the fastq file names, eg. sample1
    # number of subsets of data that the fastq for this sample was split into eg. 17
    try:
        num_subsets = int(sys.argv[2])
    except ValueError:
        # report a readable error instead of a traceback for eg. 'seventeen'
        num_subsets = None
    if num_subsets is None or num_subsets < 1:
        sys.stderr.write("ERROR: num_subsets must be a positive integer, got '%s'\n" % sys.argv[2])
        sys.exit(1)
    # submit crispresso jobs for each subset of the data:
    submit_crispresso_jobs(sample_name, num_subsets)
#====================================================================#
# only submit jobs when this file is run as a script, not when it is imported:
if __name__ == "__main__":
    main()
#====================================================================#
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment