Skip to content

Instantly share code, notes, and snippets.

@shriphani
Created March 5, 2013 20:13
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save shriphani/5093858 to your computer and use it in GitHub Desktop.
Save shriphani/5093858 to your computer and use it in GitHub Desktop.
Create condor_run strings for running jobs in parallel over a list of files
"""
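Given a large list of files and the number of processes,
this script will divide up the work, write each share of the list to its own
file-list file in the output directory, and print one condor_run command per
share (every '{}' in the command string is replaced with the path of that
file-list file). The commands are printed, not submitted, by this script.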
"""
import argparse
import errno
import math
import os
class OutputDirectoryExistsError(Exception):
    """Raised when the requested output directory already exists."""
def chunk(l, n):
    """Split the sequence ``l`` into at most ``n`` consecutive slices.

    Every slice except possibly the last holds ``ceil(len(l) / n)`` items, so
    fewer than ``n`` slices may come back (5 items with ``n=4`` yields slices
    of sizes 2, 2 and 1).

    Args:
        l: A sliceable sequence that supports ``len()`` (e.g. a list).
        n: The maximum number of slices; must be at least 1.

    Returns:
        A list of slices of ``l`` in their original order, or an empty list
        when ``l`` is empty.

    Raises:
        ValueError: If ``n`` is less than 1.
    """
    if n < 1:
        raise ValueError('n must be a positive integer, got %r' % (n,))
    total = len(l)
    if total == 0:
        # A chunk size of zero would make range() fail on a zero step.
        return []
    # Compute the chunk size once instead of once per use.
    size = int(math.ceil(float(total) / n))
    return [l[start:start + size] for start in range(0, total, size)]
def build_condor_run_string(job_name, output_directory, command_str, my_job_id, file_list):
    """Write one job's file list to disk and print its condor_run command.

    The names in ``file_list`` are written, one per line, to
    ``<job_name>_<my_job_id>.txt`` inside ``output_directory``. Every ``{}``
    in ``command_str`` is then replaced with the path of that file and the
    result is printed as ``condor_run "<command>" &``.

    Args:
        job_name: Prefix used for the name of the file-list file.
        output_directory: Directory in which the file-list file is created.
        command_str: Shell command; each ``{}`` is replaced with the path of
            the file-list file.
        my_job_id: Identifier appended to ``job_name`` in the file name.
        file_list: Iterable of file names to record for this job.

    Returns:
        None. The command line is written to standard output.
    """
    # Build the path once; it is needed for both the file and the command.
    job_files_list_path = os.path.join(
        output_directory, job_name + '_' + str(my_job_id) + '.txt')
    with open(job_files_list_path, 'w') as file_list_handle:
        file_list_handle.writelines(
            file_name + '\n' for file_name in file_list)
    # NOTE(review): the command is wrapped in double quotes without any
    # escaping, so a command_str or path containing '"' produces a broken
    # shell line -- confirm whether callers need quoting here.
    # print() with a single argument behaves the same on Python 2 and 3.
    print('condor_run "' + command_str.replace('{}', job_files_list_path) + '" &')
if __name__ == '__main__':
    # Script entry point: parse the arguments, optionally create the output
    # directory, split the list of files into per-job shares and emit one
    # condor_run command per share.

    def parse_cmdline_args():
        """Parse and validate the command-line arguments.

        Returns:
            The argparse namespace with ``job_name``, ``list_of_files``,
            ``num_procs``, ``command_str`` and ``output_directory``.
        """
        parser = argparse.ArgumentParser()
        parser.add_argument(
            'job_name',
            metavar='job-name',
            help='We use this as the name of the process'
        )
        parser.add_argument(
            'list_of_files',
            metavar='list-of-files',
            help='List of files to use'
        )
        parser.add_argument(
            'num_procs',
            metavar='num-procs',
            help='Number of processes we are using',
            type=int
        )
        parser.add_argument(
            'command_str',
            metavar='command-str',
            help='A shell command pls.'
        )
        parser.add_argument(
            '--output-directory',
            dest='output_directory',
            help='Where to dump the files',
            default='.'
        )
        args = parser.parse_args()
        if args.num_procs < 1:
            # Zero or negative counts cannot be used to divide the work;
            # report a usage error instead of failing later with a traceback.
            parser.error('num-procs must be a positive integer')
        return args

    def safe_mkdir(path):
        """Create ``path``, refusing to reuse a directory that already exists.

        Raises:
            OutputDirectoryExistsError: If ``path`` already exists.
            OSError: For any other failure reported by ``os.makedirs``.
        """
        try:
            os.makedirs(path)
        except OSError as exception:
            if exception.errno == errno.EEXIST:
                raise OutputDirectoryExistsError('Output directory must not already exist')
            raise

    parsed = parse_cmdline_args()

    # The current directory always exists, so only other targets are created.
    if parsed.output_directory != '.':
        safe_mkdir(parsed.output_directory)

    with open(parsed.list_of_files, 'r') as file_list_handle:
        # Build a real list (chunk() needs len() and slicing, which a lazy
        # map object on Python 3 does not offer) and skip blank lines so they
        # do not turn into empty file names.
        file_names = [line.strip() for line in file_list_handle if line.strip()]

    # With nothing to distribute there are no jobs to emit.
    chunked_file_list = chunk(file_names, parsed.num_procs) if file_names else []

    for job_id, job_files in enumerate(chunked_file_list):
        build_condor_run_string(
            parsed.job_name,
            parsed.output_directory,
            parsed.command_str,
            job_id,
            job_files
        )
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment