Skip to content

Instantly share code, notes, and snippets.

@davetang
Created December 8, 2021 03:40
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save davetang/674dbe5886d0b0859dda65db4760948e to your computer and use it in GitHub Desktop.
Save davetang/674dbe5886d0b0859dda65db4760948e to your computer and use it in GitHub Desktop.
Run grep in parallel
#!/usr/bin/env bash
#
# Count the lines of a file that contain, as whole words, any fixed string
# taken from a list of search strings. The file is split into pieces, one
# grep is run per piece through parallel, and the per-piece counts are
# summed into a single number printed on stdout.
#
# All four options (-l, -f, -n, -p) are required; see usage() for the synopsis.
# External tools used include split, grep, parallel and perl.
#
# Strict mode: abort on a failing command (-e), on expansion of an unset
# variable (-u), and when any stage of a pipeline fails (pipefail).
set -euo pipefail
# Write the command-line synopsis to stderr and terminate the script with
# exit status 1. Nothing is written to stdout.
usage() {
  printf '%s\n' "Usage: $0 [ -l search_list ] [ -f file_to_grep ] [ -n split_num ] [ -p num_threads ]" 1>&2
  exit 1
}
# Parse and validate the command-line options.
#   -l  file holding the search strings (passed to grep -f later on)
#   -f  file to search
#   -n  number of pieces to split the searched file into (positive integer)
#   -p  number of jobs handed to parallel (positive integer)
# All four options are required. Each one starts out empty and is checked
# explicitly after parsing; counting OPTIND instead would reject valid
# attached forms such as "-n3" and would accept a repeated flag in place of
# a missing one.
list=
file=
num=
num_threads=
positive_int='^[1-9][0-9]*$'
while getopts ":l:f:n:p:" options; do
  case "${options}" in
    l)
      list=${OPTARG}
      ;;
    f)
      file=${OPTARG}
      ;;
    n)
      num=${OPTARG}
      [[ ${num} =~ ${positive_int} ]] || usage
      ;;
    p)
      num_threads=${OPTARG}
      [[ ${num_threads} =~ ${positive_int} ]] || usage
      ;;
    :)
      # diagnostics belong on stderr so they never mix with the result
      >&2 echo "Error: -${OPTARG} requires an argument."
      exit 1
      ;;
    *)
      usage
      ;;
  esac
done
# every option must have been supplied with a non-empty value
if [[ -z ${list} || -z ${file} || -z ${num} || -z ${num_threads} ]]; then
  usage
fi
# check if input files exist
# The expansions are quoted so that a path containing whitespace or glob
# characters is tested as one name instead of being split or expanded.
for check in "${list}" "${file}"; do
  if [[ ! -e "${check}" ]]; then
    printf '%s\n' "${check} does not exist" >&2
    exit 1
  fi
done
# Build the table of two-letter suffixes aa, ab, ..., zz in that order.
prefixes=()
for first_letter in {a..z}; do
  for second_letter in {a..z}; do
    prefixes+=("${first_letter}${second_letter}")
  done
done
# Refuse a requested split count that exceeds the number of suffixes.
num_prefix=${#prefixes[@]}
if (( num > num_prefix )); then
  printf '%s\n' "Please enter number less than ${num_prefix}" >&2
  exit 1
fi
# Split the searched file into chunks, count the matching lines of every
# chunk in parallel, print the grand total and remove all scratch files.

# get basename without its last extension; it names the scratch directory
base=$(basename -- "${file}")
base="${base%.*}"

# All scratch files live in a private directory created in the current
# directory, so chunks can never overwrite existing files, and the EXIT
# trap removes them on every exit path, including failures.
workdir=$(mktemp -d "./${base}.XXXXXX")
cleanup() {
  rm -rf -- "${workdir:?}"
}
trap cleanup EXIT

# calculate number of lines per split
# Integer division plus one guarantees lines * num > total, so split never
# produces more than num chunks, even when the last line has no newline.
total=$(wc -l < "${file}")
lines=$(( total / num + 1 ))

# split file to search
split -l "${lines}" -- "${file}" "${workdir}/chunk."

# Work on the chunks split actually produced: there can be fewer than num
# (for example 10 lines split 5 ways gives 4 chunks of 3 lines).
shopt -s nullglob
chunks=("${workdir}"/chunk.*)
shopt -u nullglob
if [[ ${#chunks[@]} -eq 0 ]]; then
  # empty input: nothing to search, hence nothing matched
  echo 0
  exit 0
fi

# file containing commands to run
cmd_txt="${workdir}/commands.txt"

# generate commands
#
# -w to prevent partial matches
# -F Interpret PATTERN as a list of fixed strings, separated by newlines, any of which is to be matched.
# -c count
#
# %q shell-quotes the paths so names containing spaces survive the shell
# that parallel starts for each line. grep exits with status 1 when a chunk
# has no match; that is not an error here, so it is mapped to success while
# real failures (status 2 or higher) still fail the job.
for chunk in "${chunks[@]}"; do
  printf 'grep -w -c -F -f %q -- %q || [ $? -eq 1 ]\n' "${list}" "${chunk}"
done > "${cmd_txt}"

# run the commands and add up the per-chunk counts
parallel -j "${num_threads}" < "${cmd_txt}" | perl -nle '$s += $_; END { print $s }'

# clean up is performed by the EXIT trap
exit 0
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment