Skip to content

Instantly share code, notes, and snippets.

@avrilcoghlan
Created October 27, 2017 09:45
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save avrilcoghlan/e7e8d49dcf6c716199ffba1a60321517 to your computer and use it in GitHub Desktop.
Save avrilcoghlan/e7e8d49dcf6c716199ffba1a60321517 to your computer and use it in GitHub Desktop.
Python script to submit CRISPresso jobs on a compute farm
import sys
import os
from collections import defaultdict
#====================================================================#
# define a function to read in a list of files:
def read_file_list(input_file_list):
file_list = []
fileObj = open(input_file_list, "r")
for line in fileObj:
line = line.rstrip()
file_list.append(line)
fileObj.close()
return(file_list)
#====================================================================#
# define a function to read in sequences from a file.
# The file has two columns, a barcode (1-96) in the first column,
# then a space, then the sequence (if any) in the second column.
def read_seqs(seq_file):
seq = defaultdict()
fileObj = open(seq_file, "r")
for line in fileObj:
line = line.rstrip()
temp = line.split()
if len(temp) == 2: # there is some sequence for this barcode
barcode = int(temp[0]) # convert to an integer
sequence = temp[1]
assert(barcode not in seq)
seq[barcode] = sequence
fileObj.close()
return seq
#====================================================================#
def submit_crispresso_job(amplicon, gRNA, barcode, plate_number, fastq_dir, lane_id):
# get the current directory:
current_dir = os.getcwd()
# submit the CRISPresso job:
output_dir = "plate%d_barcode%d" % (plate_number, barcode)
fastq_file1 = "%s/%s#%d_1.fastq.gz" % (fastq_dir, lane_id, barcode)
fastq_file2 = "%s/%s#%d_2.fastq.gz" % (fastq_dir, lane_id, barcode)
# This uses a 50 bp window (suggested by Peter Keen). Patrick said "That way we are being generous in our definition of an indel event".
# This uses the --hide_mutations_outside_window_NHEJ option. Patrick said "I think '--hide_mutations_outside_window_NHEJ' is a great option.
# If it isn’t happening around the double stranded break site it probably isn’t happening.".
command1 = "/nfs/team87/farm3_lims2_vms/software/python_local/bin/CRISPResso -w 50 --hide_mutations_outside_window_NHEJ -o %s -r1 %s -r2 %s -a %s -g %s" % (output_dir, fastq_file1, fastq_file2, amplicon, gRNA)
# specify the bsub output and error file names:
bsub_out = "plate%d_barcode%d.o" % (plate_number, barcode)
bsub_err = "plate%d_barcode%d.e" % (plate_number, barcode)
# submit job, here I requested 1000 Mbyte of RAM
command2 = 'bsub -o %s -e %s -R "select[mem>1000] rusage[mem=1000]" -M1000 "%s"' % (bsub_out, bsub_err, command1)
print(command2)
os.system(command2)
return
#====================================================================#
# define a function to submit the CRISPresso jobs for a plate:
def submit_crispresso_jobs_for_plate(amplicon_file, gRNA_file, plate_number, fastq_dir, lane_id):
# first read in the amplicon sequences:
amplicons = read_seqs(amplicon_file)
# read in the gRNA sequences:
gRNAs = read_seqs(gRNA_file)
# submit the CRISPresso jobs for barcodes 1 - 96:
for barcode in range(1, 97): # index will go from 1 to 96
if barcode in amplicons:
assert(barcode in gRNAs)
amplicon = amplicons[barcode]
gRNA = gRNAs[barcode]
# submit the CRISPResso job for this amplicon, barcode and gRNA:
submit_crispresso_job(amplicon, gRNA, barcode, plate_number, fastq_dir, lane_id)
else:
assert(barcode not in gRNAs)
return
#====================================================================#
def main():
# check the command-line arguments:
if len(sys.argv) != 5 or os.path.exists(sys.argv[1]) == False or os.path.exists(sys.argv[2]) == False or os.path.exists(sys.argv[3]) == False:
print("Usage: %s input_amplicon_file_list input_gRNA_file_list fastq_dir lane_id" % sys.argv[0])
sys.exit(1)
input_amplicon_file_list = sys.argv[1] # input list of amplicon files
input_gRNA_file_list = sys.argv[2] # input list of gRNA files
fastq_dir = sys.argv[3] # directory containing the fastq files
lane_id = sys.argv[4] # the lane id., eg. 23528_1
# read in the amplicon files:
amplicon_files = read_file_list(input_amplicon_file_list)
# read in the gRNA files:
gRNA_files = read_file_list(input_gRNA_file_list)
# each plate has 96 wells, look at the data plate by plate:
# (each plate has an amplicon file, and a gRNA file)
for index, amplicon_file in enumerate(amplicon_files):
plate_number = index + 1
# find the corresponding gRNA file:
gRNA_file = gRNA_files[index]
# submit the CRISPresso jobs for this plate:
submit_crispresso_jobs_for_plate(amplicon_file, gRNA_file, plate_number, fastq_dir, lane_id)
print("FINISHED")
#====================================================================#
if __name__=="__main__":
main()
#====================================================================#
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment