Last active
May 19, 2023 01:10
-
-
Save cabb99/e7cd08955907de50754a53b58c71ae24 to your computer and use it in GitHub Desktop.
Submit a list of SLURM jobs with a maximum number of simultaneous jobs, taking into account previously submitted jobs.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/bash

#######################################
# Print the command-line help text.
# Globals:   none
# Arguments: none
# Outputs:   the usage text on stdout; $0 is shown as the program name
#######################################
print_usage() {
  # Unquoted delimiter on purpose: $0 must be expanded inside the text.
  cat <<EOF
Usage: $0 [--max-jobs <max_jobs>] [--user <username>] [--help|-h] <script_path1> <script_path2> ... <script_pathN>

This script submits a list of SLURM jobs with a maximum number of simultaneous jobs, taking into account previously submitted jobs.

Options:
 --max-jobs <max_jobs> Set the maximum number of simultaneous jobs (default: 10)
 --user <username> Set the user name for checking the SLURM queue (default: current user)
 --help, -h Show this help message and exit

Arguments:
 script_path Path to the SLURM script to be submitted
EOF
}
# Defaults, overridable with --max-jobs and --user.
max_jobs=10
# USER may be unset (cron, minimal containers); fall back to the effective user name.
user="${USER:-$(id -un)}"

# Invoked with no arguments at all: show the help text and fail.
# The usage goes to stderr here because this is a misuse, not a help request.
if [[ "$#" -lt 1 ]]; then
  print_usage >&2
  exit 1
fi
#######################################
# Parse the command line.
# Globals:   max_jobs     (written by --max-jobs, normalised to plain decimal)
#            user         (written by --user)
#            script_paths (reset, then one element appended per positional argument)
# Arguments: the script's command-line arguments ("$@")
# Outputs:   help text on stdout for --help/-h; error messages (and usage) on stderr
# Exits:     0 after --help/-h; 1 on an unknown, invalid or incomplete option
#######################################
parse_args() {
  script_paths=()

  while [[ "$#" -gt 0 ]]; do
    case "$1" in
      --max-jobs)
        # Digits only, and strictly greater than zero. 10# forces base 10 so
        # values such as 08 are not rejected as invalid octal.
        if ! [[ "${2-}" =~ ^[0-9]+$ ]] || (( 10#$2 < 1 )); then
          echo "Error: --max-jobs value must be a positive integer." >&2
          exit 1
        fi
        max_jobs=$((10#$2))
        shift 2
        ;;
      --user)
        # Without this guard a trailing --user makes `shift 2` fail without
        # shifting anything, and the loop would spin forever.
        if [[ "$#" -lt 2 || -z "$2" ]]; then
          echo "Error: --user requires a user name." >&2
          exit 1
        fi
        user="$2"
        shift 2
        ;;
      --help|-h)
        print_usage
        exit 0
        ;;
      --*)
        echo "Error: Unrecognized option '$1'" >&2
        print_usage >&2
        exit 1
        ;;
      *)
        # Anything else (including other single-dash words) is a script path.
        script_paths+=("$1")
        shift
        ;;
    esac
  done
}

parse_args "$@"
# Options alone are not enough: at least one script path must have been given.
# Diagnostics go to stderr so they do not mix with regular output.
if [[ "${#script_paths[@]}" -eq 0 ]]; then
  echo "Error: No script paths provided." >&2
  print_usage >&2
  exit 1
fi
#######################################
# Reduce raw queue job IDs to the newest distinct base job IDs.
# Arguments: $1 - maximum number of IDs to keep
# Inputs:    job IDs on stdin, one per line; everything from the first "_"
#            onwards (the array part of the ID) is dropped
# Outputs:   at most $1 of the numerically largest distinct IDs, in ascending
#            order, joined by commas on a single line
#######################################
latest_job_ids() {
  awk -F '_' '{print $1}' | sort -nu | tail -n "$1" | paste -sd "," -
}

# Job IDs currently in the SLURM queue for the selected user.
# A failing squeue is reported instead of being silently read as "queue empty";
# the run continues best-effort with no pre-existing jobs taken into account.
if ! job_ids_queue=$(squeue -u "$user" --noheader --format="%i"); then
  echo "Warning: could not query the SLURM queue for user '$user'; assuming it is empty." >&2
  job_ids_queue=""
fi

# Seed the dependency list with the max_jobs largest job IDs already queued.
if [[ -n "$job_ids_queue" ]]; then
  dependency_list=$(latest_job_ids "$max_jobs" <<<"$job_ids_queue")
else
  dependency_list=""
fi

# job_counter starts at the number of IDs in the seed list (commas + 1).
if [[ -n "$dependency_list" ]]; then
  list_commas=${dependency_list//[^,]/}
  job_counter=$(( ${#list_commas} + 1 ))
else
  job_counter=0
fi
#######################################
# Extract the job ID from the text sbatch prints on success.
# Arguments: $1 - sbatch's stdout
# Outputs:   the number following "Submitted batch job"; if that phrase is
#            absent, the last word of the last non-empty line
#######################################
extract_job_id() {
  local pattern='Submitted batch job ([0-9]+)'
  if [[ "$1" =~ $pattern ]]; then
    printf '%s\n' "${BASH_REMATCH[1]}"
  else
    awk 'NF {last = $NF} END {print last}' <<<"$1"
  fi
}

#######################################
# Keep only the last N entries of a comma-separated list.
# Arguments: $1 - comma-separated list, $2 - number of trailing entries to keep
# Outputs:   the shortened list (no trailing newline)
#######################################
keep_last_ids() {
  awk -F ',' -v keep="$2" '{
    first = NF - keep + 1
    if (first < 1) first = 1
    for (i = first; i <= NF; i++) printf "%s%s", $i, (i == NF ? "" : ",")
  }' <<<"$1"
}

#######################################
# Submit every script in script_paths with sbatch, from the script's own
# directory. Once job_counter has reached max_jobs, each new job is made to
# wait (afterany) for the first job ID in dependency_list; the list is then
# advanced so that it holds the max_jobs most recent job IDs.
# Globals:   script_paths, max_jobs (read)
#            dependency_list, job_counter (read and updated)
#            failed_scripts (reset, then filled with every script that was
#            missing or could not be submitted)
# Outputs:   progress on stdout, errors and warnings on stderr
#######################################
submit_jobs() {
  local script_path script_dir script_name sbatch_output job_id dependency_job
  local -a sbatch_args

  failed_scripts=()

  for script_path in "${script_paths[@]}"; do
    if [[ ! -f "$script_path" ]]; then
      echo "Error: Script not found at $script_path" >&2
      # Record it, so the final summary and exit status reflect the problem.
      failed_scripts+=("$script_path")
      continue
    fi

    script_dir=$(dirname -- "$script_path")
    script_name=$(basename -- "$script_path")

    echo "Submitting job for script $script_path"
    sbatch_args=()
    # `[` rather than `[[`: its -ge/-gt compare in plain decimal, so a value
    # such as 08 is not misread as octal.
    if [ "$job_counter" -ge "$max_jobs" ] && [[ -n "$dependency_list" ]]; then
      dependency_job=${dependency_list%%,*}
      if [[ -n "$dependency_job" ]]; then
        echo " with dependencies on job IDs: $dependency_job"
        sbatch_args+=(--dependency="afterany:$dependency_job")
      fi
    fi

    # The command substitution is a subshell, so the cd never leaks into this
    # shell, and a failing cd is reported as a failed submission instead of
    # running sbatch from the wrong directory.
    if ! sbatch_output=$(cd -- "$script_dir" && sbatch "${sbatch_args[@]}" "$script_name"); then
      echo "Error: Job submission failed for script $script_path" >&2
      if [[ -n "$sbatch_output" ]]; then
        printf '%s\n' "$sbatch_output" >&2
      fi
      failed_scripts+=("$script_path")
      continue
    fi

    job_id=$(extract_job_id "$sbatch_output")
    if [[ -z "$job_id" ]]; then
      # Submitted, but there is no ID to chain later jobs on.
      echo "Warning: could not determine the job ID for script $script_path" >&2
      continue
    fi
    echo " submitted with job ID: $job_id"

    job_counter=$((job_counter + 1))
    dependency_list="${dependency_list:+$dependency_list,}$job_id"

    # Past the limit: drop the oldest IDs so only the newest max_jobs remain.
    if [ "$job_counter" -gt "$max_jobs" ]; then
      dependency_list=$(keep_last_ids "$dependency_list" "$max_jobs")
    fi
  done
}

submit_jobs
#######################################
# Print the final summary of the run.
# Globals:   failed_scripts (read)
# Outputs:   "All jobs submitted." on stdout when nothing failed; otherwise
#            the list of failed scripts, one per line, on stderr
# Returns:   0 when no script failed, 1 otherwise
#######################################
report_submission_results() {
  if [[ "${#failed_scripts[@]}" -eq 0 ]]; then
    echo "All jobs submitted."
    return 0
  fi

  echo "Error: The following scripts failed to submit:" >&2
  printf ' %s\n' "${failed_scripts[@]}" >&2
  return 1
}

# The script's exit status mirrors the summary.
report_submission_results || exit 1
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment