Last active
May 18, 2016 09:17
-
-
Save jmiserez/6410da9daa6bc3cdc3f1 to your computer and use it in GitHub Desktop.
Run bash functions in parallel for a list of directories. Emulates GNU parallel (grouped output) but only uses standard tools (find, xargs, bash, etc.)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/bash
# Resolve where this script lives and validate the argument count.
#   SCRIPT     - path of this file as resolved by `readlink -f`
#   SCRIPTPATH - directory containing the script (exported for child processes)
#   SCRIPTNAME - file name of the script, used in the usage text
SCRIPT=$(readlink -f -- "$0")
SCRIPTPATH=$(dirname -- "$SCRIPT")
export SCRIPTPATH
SCRIPTNAME=$(basename -- "$SCRIPT")

# At least one argument (root folder, or -i <dir>) is required.
if [ "$#" -lt 1 ]; then
  {
    echo "Usage: ./$SCRIPTNAME <root folder path> [<pattern for matching directories within root directory, default is \"*\">]"
    echo "or ./$SCRIPTNAME -i <single directory path to process>"
  } >&2
  exit 1
fi
# Note: variables used here must have been exported beforehand.
#######################################
# Job that is executed once for every selected directory.
# Exported with `export -f` so that child bash processes can call it.
# Arguments:
#   $1 - directory to process
# Outputs:
#   Writes one progress line to stdout.
#######################################
run_per_dir() {
  printf '%s\n' "Running custom function for directory $1"
  # This is the function that will run in parallel. Put your code here!
  #
  # ...
  #
}
export -f run_per_dir
# Build the list of directories to process.
#   dirs_array    - resulting list of directory paths
#   IS_SINGLE_JOB - exported; "true" when a single directory was given via -i
dirs_array=()
export IS_SINGLE_JOB=false
case "$1" in
  -i)
    # Single-directory mode: exactly one path must follow -i.
    if [ "$#" -ne 2 ]; then
      echo "No directory specified for option -i" >&2
      exit 1
    fi
    dirs_array+=("$2")
    export IS_SINGLE_JOB=true
    ;;
  *)
    WORKSPACE=$1
    # set default value if not set
    MATCHPATTERN=${2:-"*"}
    # Read the NUL-terminated names printed by find into the array; NUL is
    # the only delimiter that is safe for arbitrary file names.
    # sort -z keeps the NUL terminators; -n requests numeric comparison.
    # NOTE(review): there is no -mindepth 1, so the root folder itself is
    # also listed whenever its own name matches the pattern - confirm that
    # this is intended.
    while IFS= read -r -d '' dir_entry; do
      dirs_array+=("$dir_entry")
    done < <(find "$WORKSPACE" -maxdepth 1 -type d -name "$MATCHPATTERN" -print0 | sort -nz)
    ;;
esac
# Show the selected directories, one per line, indented by two spaces.
echo "Directories to be processed:"
for dir_entry in "${dirs_array[@]}"; do
  printf '  %s\n' "$dir_entry"
done
# Unless a single job was requested, create a private temporary directory
# (exported as CURRENT_TMP_DIR so child processes can use it).
if [ "$IS_SINGLE_JOB" != true ]; then
  # Assign first, export afterwards: `export VAR=$(cmd)` would hide a
  # failure of mktemp.
  CURRENT_TMP_DIR=$(mktemp -d) || {
    echo "Failed to create temporary directory" >&2
    exit 1
  }
  export CURRENT_TMP_DIR
  # set trap to cleanup upon exit/CTRL-C. Note: not triggered when using kill -9.
  trap 'rm -rf "$CURRENT_TMP_DIR"' EXIT
fi
#######################################
# Call a function by name. In parallel mode its output is buffered in a
# temporary file and printed in one piece once the call has finished, so
# the output of concurrently running jobs is not interleaved.
# Globals:
#   IS_SINGLE_JOB   (read) - "true": call directly, no buffering
#   CURRENT_TMP_DIR (read) - directory for the buffer file; falls back to
#                            $TMPDIR, then /tmp, when empty
# Arguments:
#   $1   - name of the function to call
#   $2.. - arguments handed to that function unchanged
# Outputs:
#   Parallel mode: stdout and stderr of the callee, merged, on stdout.
# Returns:
#   Exit status of the callee; 1 if the buffer file cannot be created.
#######################################
func_call_by_name() {
  local fn=$1
  shift

  if [ "$IS_SINGLE_JOB" = true ]; then
    # no redirection, no tmpfile
    "$fn" "$@"
    return
  fi

  # Template form of mktemp instead of the GNU-only --tmpdir option.
  local out_file status
  out_file=$(mktemp "${CURRENT_TMP_DIR:-${TMPDIR:-/tmp}}/tmp.XXXXXXXXXX") || {
    echo "func_call_by_name: cannot create temporary file" >&2
    return 1
  }

  # redirect output to tmpfile, then print out once done
  "$fn" "$@" >> "$out_file" 2>&1
  status=$?
  cat "$out_file"
  rm -f "$out_file"

  # Report the callee's status, exactly as single-job mode does.
  return "$status"
}
export -f func_call_by_name
# How this works:
# (Equivalent to: parallel run_per_dir ::: "${dirs_array[@]}")
# 1. print each entry in the array followed by a NUL char
# 2. xargs:
#    -0    handle NUL chars as delimiter
#    -n 1  pass at most 1 entry from the array to each process
#    -P N  run N processes in parallel
# Each entry reaches bash as positional parameter $1 (the "_" only fills
# $0). It is deliberately not substituted into the -c string: a textual
# {} replacement would be re-parsed as shell code, which breaks on names
# containing spaces and executes any shell metacharacters in them.

# Count processors via /proc/cpuinfo; where that file is unavailable, ask
# getconf instead.
NUM_CPU_CORES=$(grep -c '^processor' /proc/cpuinfo 2>/dev/null)
if ! [[ "$NUM_CPU_CORES" =~ ^[1-9][0-9]*$ ]]; then
  NUM_CPU_CORES=$(getconf _NPROCESSORS_ONLN 2>/dev/null)
fi
case "$NUM_CPU_CORES" in
  '' | *[!0-9]*)
    # not a number, let's set it to 1
    NUM_CPU_CORES=1
    ;;
  *)
    if [ "$NUM_CPU_CORES" -lt 1 ]; then
      NUM_CPU_CORES=1
    fi
    ;;
esac
echo "NUM_CPU_CORES=$NUM_CPU_CORES"

# With an empty array printf would still emit a single NUL, i.e. one empty
# item for xargs, so skip the pipeline when there is nothing to do.
if [ "${#dirs_array[@]}" -gt 0 ]; then
  printf '%s\0' "${dirs_array[@]}" \
    | xargs -0 -n 1 -P "$NUM_CPU_CORES" bash -c 'func_call_by_name run_per_dir "$1"' _
fi
# Remove the temporary directory once all jobs are done. Single-job mode
# never creates one; the path is also checked for emptiness so that rm is
# never invoked with an empty argument.
if [ "$IS_SINGLE_JOB" != true ] && [ -n "${CURRENT_TMP_DIR:-}" ]; then
  rm -rf -- "$CURRENT_TMP_DIR"
fi
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
This works, but it's not pretty. There are some issues when running on OS X: the xargs arguments don't work there. Comments on how to make it work on OS X are welcome; it should be an easy fix.