mbland/count-matching-lines-driver.sh

## count-matching-lines-driver.sh
#! /bin/bash
#
# Driver for the count-matching-lines script
#
# Author: Mike Bland <mbland@acm.org>
#         https://mike-bland.com/, https://github.com/mbland/
# Date:   2016-08-19

usage() {
  local message
  read -r -d '' message <<END_USAGE
Driver for the count-matching-lines script

Usage:

  ${0##*/} <output directory> <pattern> file [file...]

This script will run every combination of flags possible for the
count-matching-lines script based on the number of input files and the version
of bash used to run this script. This means:

- If there is more than one input file, both the one-file-at-a-time and
  concatenated files input methods will run. If there's only one input file,
  there will not be a concatenated files run.

- If the bash version is 4.x and above, the bash 3.x implementation will run,
  followed by all bash 4.x implementations. If the bash version is below 4.x,
  only the bash 3.x implementation will run.

Each run will produce an output file in <output directory>, which will be
created if it doesn't exist, but must be empty if it does. When all runs have
completed, a "summary.txt" file will be generated in <output directory> and
output to the terminal.

If 'memusg' (https://gist.github.com/netj/526585) is in your \$PATH, the results
will contain the peak memory usage for each implementation.
END_USAGE
  echo "$message"
}

do_run() {
  local impl="$1"
  local outfile="$OUTPUT_DIR/"
  local memusg="$(command -v 'memusg')"

  # We need to ensure count-matching-lines runs with whatever version of bash
  # was used to launch this driver rather than relying on $PATH to find bash.

  local run_bash=("$BASH")
  local args=()

  if [[ -n $memusg ]]; then
    run_bash=("$memusg" "$BASH")
  fi

  case "$impl" in
  grep|bashv3|perl|python)
    outfile+="$impl"
    args+=("--$impl")
    ;;
  '')
    impl="bashv${BASH_VERSINFO[0]}"
    outfile+="$impl"
    ;;
  *)
    outfile+="bashv${BASH_VERSINFO[0]}-$impl"
    args+=("--$impl")
    ;;
  esac

  args+=("${ARGV[@]}")

  echo "Running $impl"

  # We have to enable job control because of memusg.
  #
  # This will be easier to understand after playing with a working example.
  # First create the following script; I'll call it "foo.sh":
  #
  #   memusg ls >"foo0.out" 2>&1
  #   memusg ls >"foo1.out" 2>&1
  #
  # Then:
  #
  # - Run it as 'bash foo.sh' and it will hang; run 'fg' to continue.
  # - Run it as 'bash -m foo.sh' and it will complete.
  #
  # Also try editing the output redirections per the following and running it
  # with 'bash foo.sh':
  #
  # - Remove the first "2>&1" redirection and it will hang.
  # - Remove the second "2>&1" redirection and it will complete.
  # - Remove both "2>&1" redirection and it will complete.
  # - Remove the first >"foo0.out" redirection and it will hang.
  # - Remove the second >"foo0.out" redirection and it will complete.
  #
  # This is what's happening to the best of my current knowledge:
  #
  # When it detects that it is running in the same process group as its parent,
  # memusg will try to launch its command arguments in a new interactive shell
  # ('exec bash -i -c "$cmd"', which seems contradictory) to make sure that the
  # command it's measuring is in its own process group. It uses this bash
  # invocation as a more portable alternative to the 'setsid' command, which is
  # available on Linux but not OS X, for example. See:
  #
  #   https://gist.github.com/netj/526585#gistcomment-11943
  #
  # This normally works fine on the command line; it even works fine in a script
  # when the command's standard error is not redirected, or when it's redirected
  # only for the first 'memusg' invocation. In these cases, standard input will
  # redirect to a file just fine and memusg's background process will print
  # "memusg=: peak=..." either to the terminal when standard error isn't
  # redirected, or to the file when it is.
  #
  # However, on the second 'memusg' invocation with standard error redirected,
  # this new shell will somehow try to read from the terminal that launched the
  # script. When that happens, the script will receive a SIGTTIN signal, which
  # will push it into the background. (See the "JOB CONTROL" section of the bash
  # man page.) At that point, the user must repeatedly bring the process back
  # into the foreground manually with 'fg' until the final 'memusg' invocation
  # completes.
  #
  # "set -m" enables "Monitor mode", i.e. job control. Normally it only makes
  # sense for interactive shells, but the side effect we need is that all
  # processes are then started in a separate process group. This keeps the 'exec
  # bash -i -c "$cmd"' invocation from happening, which in turn prevents memusg
  # from suspending the script. I validated this by adding the following to
  # memusg before the 'exec bash -i -c "$cmd"' call:
  #
  #   echo "EXECING FOR NEW PROCESS GROUP" >&2
  #
  # Then running the script with:
  #
  # - no job control and no standard error redirection:
  #   message appears, process completes
  # - no job control and with standard error redirection:
  #   message appears, process suspends
  # - with job control and no standard error redirection
  #   message doesn't appear, process completes
  # - with job control and with standard error redirection
  #   message doesn't appear, process completes
  #
  # The following reference helped me zero in on the need for "set -m":
  #
  #   https://stackoverflow.com/questions/690266/why-cant-i-use-job-control-in-a-bash-script/690297#690297
  #   (or https://stackoverflow.com/a/690297, but that redirects to http://.)
  #
  # Why this only happens starting with the second 'memusg' invocation with
  # standard error redirected to a file is still a mystery. If I find out, I'll
  # update this comment.

  set -m
  "${run_bash[@]}" "$COUNT_MATCHING_LINES" "${args[@]}" >"$outfile.txt" 2>&1

  if [[ $DO_CAT = 'true' ]]; then
    echo "Running $impl with concatenated files"
    "${run_bash[@]}" "$COUNT_MATCHING_LINES" --cat "${args[@]}" \
      >"$outfile-cat.txt" 2>&1
  fi
  set +m
}

summarize_timing() {
  printf "Timing:\n"

  grep 'real' "$OUTPUT_DIR"/* | sort -k 2 |
    sed -e "s#^$OUTPUT_DIR/##" -e 's/\.txt:real*//' |
    awk '{printf "  %s  %s\n", $2, $1}'
}

wait_for_memusg() {
  while ps -e -o command | grep -q "[m]emusg $BASH $COUNT_MATCHING_LINES"; do
    sleep 0.1
  done
}

summarize_memusg() {
  wait_for_memusg

  if ! grep -q 'memusg' "$OUTPUT_DIR"/*; then
    return
  fi

  printf "\nMemory usage:\n"
  grep 'memusg' "$OUTPUT_DIR"/* |
    sed -e "s#^$OUTPUT_DIR/##" -e 's/.txt:memusg: peak=/ /' | sort -n -k 2 |
    awk '{printf "  %s  %s\n", $2, $1}'
}

summarize_file_stats() {
  printf "\nInput file stats:\n"
  egrep "^(Total|Average) " $OUTPUT_DIR/grep.txt | awk '{ printf "  %s\n", $0 }'
}

summarize() {
  local summary_file="$OUTPUT_DIR/summary.txt"

  summarize_timing > "$summary_file"
  summarize_memusg >> "$summary_file"
  summarize_file_stats >> "$summary_file"

  cat "$summary_file"
  printf "\nAll output saved in: %s\nSummary saved in: %s\n\n" \
    "$OUTPUT_DIR" "$summary_file"
}

count_matching_lines_driver() {
  if [[ $# -eq 0 ]]; then
    usage
    return
  fi

  COUNT_MATCHING_LINES="${0%/*}/count-matching-lines"

  if [[ ! -f $COUNT_MATCHING_LINES ]]; then
    echo "count-matching-lines script should be in the same directory" >&2
    exit 1
  fi

  OUTPUT_DIR="$1"
  shift

  if [[ ! -d $OUTPUT_DIR ]] && ! mkdir "$OUTPUT_DIR"; then
    echo "Could not create output directory: $OUTPUT_DIR" >&2
    exit 1
  elif ! rmdir "$OUTPUT_DIR"; then
    echo "Remove all files from $OUTPUT_DIR and try again." >&2
    exit
  fi

  mkdir "$OUTPUT_DIR"

  # If there is more than one filename specified, also run with concatenated
  # input.
  if [[ $# -gt 2 ]]; then
    DO_CAT='true'
  fi

  ARGV=("$@")

  do_run 'grep'

  if [[ ${BASH_VERSINFO[0]} -ge 4 ]]; then
    do_run 'bashv3'
  fi

  do_run ''

  if [[ ${BASH_VERSINFO[0]} -ge 4 ]]; then
    do_run 'mapfile'
    do_run 'callback'
  fi

  local languages=('perl' 'python')

  for language in "${languages[@]}"; do
    if command -v "$language" >/dev/null; then
      do_run "$language"
    fi
  done

  printf "Done\n\n"
  summarize
}

count_matching_lines_driver "$@"

## count-matching-lines.sh
#! /bin/bash
#
# Proof-of-concept bash-only implementation of 'grep -o $exp | sort | uniq -c'
#
# When processing a single, relatively small file (around 200 lines), the
# performance of both the grep and bash-only implementations is fairly close.
# When processing multiple relatively small files, the bash-only version begins
# to pull ahead slightly, as it does not create a new process pipeline for each
# file.
#
# Beyond two hundred lines of input, or when concatenating multiple files into
# standard input, the grep pipeline is significantly faster. The advantage of
# the bash version becomes a potentially smaller memory footprint (since it
# doesn't launch new processes and only one copy of each matching string is
# stored in memory at any time) and portability to platforms that lack grep,
# sort, or uniq (unlikely if bash is installed, but possible).
#
# The bash 4.x version using an associative array significantly outperforms the
# 3.x version, and uses even less memory. Using 'mapfile' in Bash 4.x instead of
# 'while read -r' didn't significantly impact the performance on small files. On
# very large files, the 'mapfile' version
#
# Author: Mike Bland <mbland@acm.org>
#         https://mike-bland.com/, https://github.com/mbland/
# Date:   2016-08-18

usage() {
  local message
  read -r -d '' message <<END_USAGE
Proof-of-concept bash-only implementation of 'grep -o \$exp | sort | uniq -c'

Usage:

  ${0##*/} [--<flag>] <pattern> file [file...]

Flags:
  --cat        Concatenate all input files, rather than looping over them
  --grep       Use the 'grep -o <pattern> | sort | uniq -c' implementation
  --mapfile    In bash 4.x, replace the 'while read' loop with 'mapfile'
  --callback   In bash 4.x, use a callback with 'mapfile' instead of looping
  --bashv3     Use the bash 3.x implementation, even when running under 4.x
  --perl       Use the perl implementation
  --python     Use the python implementation

The default behavior is to loop over the files one-at-a-time, process them using
the bash-only implementation matching the shell version, then output the
matching lines and their frequency counts for each file.

The first line of output will indicate the implementation used, whether the
files were provided one-at-a-time or concatenated, and the number of lines and
bytes per file.

The process is then executed, the time reported, and the number of files, lines,
and bytes summarized.

NOTE ON BASH 4.X, PERL, AND PYTHON IMPLEMENTATIONS: The output will not
necessarily be sorted by matching string as with the other implementations, as
they use an associative array. The output from these implementations could then
be sorted with 'sort -k 2'.

END_USAGE
  echo "$message"
}

using_grep() {
  grep -o "$1" | sort | uniq -c
}

# Uses the '=~' operator and ${BASH_REMATCH[0]} to replace 'grep -o'.
# Uses an insertion sort to replace 'sort | uniq -c'.
using_bash_v3() {
  local pattern="$1"
  local matches=()
  local counts=()
  local line
  local match
  local i
  local j
  local current

  while read -r line; do
    if [[ ! $line =~ $pattern ]]; then
      continue
    fi

    match="${BASH_REMATCH[0]}"

    for ((i=${#matches[@]}; i != 0; --i)); do
      current="${matches[$((i-1))]}"

      if [[ $match = $current ]]; then
        ((--i))
        break
      elif [[ $match > $current ]]; then
        break
      fi
    done

    if [[ ${#matches[@]} -ne 0 && $match < ${matches[$i]} ]]; then
      for ((j=${#matches[@]}; j != i; --j)); do
        matches[$j]="${matches[$((j-1))]}"
        ((counts[$j]=counts[j-1]))
      done
      ((counts[i]=0))
    fi

    matches[$i]="$match"
    ((++counts[i]))
  done

  for ((i=0; i != "${#matches[@]}"; ++i)); do
    printf "%4d %s\n" "${counts[$i]}" "${matches[$i]}"
  done
}

# Uses the '=~' operator and ${BASH_REMATCH[0]} to replace 'grep -o'.
# Uses an associative array to replace 'sort | uniq -c'.
# The output lines aren't sorted by the match strings, but the output can be
# piped into 'sort -k 2' if that matters.
using_bash_v4() {
  local pattern="$1"
  local -A matches=()
  local line

  while read -r line; do
    if [[ $line =~ $pattern ]]; then
      ((++matches["${BASH_REMATCH[0]}"]))
    fi
  done

  for match in "${!matches[@]}"; do
    printf "%4d %s\n" "${matches[$match]}" "$match"
  done
}

# Same as above, but using mapfile instead of 'while read'
using_bash_v4_with_mapfile() {
  local pattern="$1"
  local -A matches=()
  local line
  local lines=()

  mapfile lines

  for line in "${lines[@]}"; do
    if [[ $line =~ $pattern ]]; then
      ((++matches["${BASH_REMATCH[0]}"]))
    fi
  done

  for match in "${!matches[@]}"; do
    printf "%4d %s\n" "${matches[$match]}" "$match"
  done
}

# Same as above, but using a callback with 'mapfile' instead of looping.
using_bash_v4_with_mapfile_callback() {
  local pattern="$1"
  local -A matches=()

  store_match() {
    if [[ $2 =~ $pattern ]]; then
      ((++matches["${BASH_REMATCH[0]}"]))
    fi
  }

  mapfile -c 1 -C store_match

  for match in "${!matches[@]}"; do
    printf "%4d %s\n" "${matches[$match]}" "$match"
  done
}

using_perl() {
  local pat="${1//\//\\/}"
  perl -e "%matches;while(<>) { if (/$pat/) { \$matches{\$&}++; } }" \
    -e 'while(($k, $v) = each %matches) { printf("%4d %s\n", $v, $k); }'
}

using_python() {
  local program
  read -r -d '' program<<END_PYTHON
import re, sys
p=re.compile('$1')
m={}
for l in sys.stdin:
  s = p.search(l)
  if s:
    g = s.group(0)
    if g not in m:
      m[g]=1
    else:
      m[g]+=1
for k in m:
  print "%4d %s" % (m[k], k)
END_PYTHON
  python -OO -c "$program"
}

one_at_a_time() {
  local impl="$1"
  local pattern="$2"
  shift
  shift

  for f in "$@"; do
    echo "$f:"
    "$impl" "$pattern" < "$f"
  done
}

concatenated() {
  local impl="$1"
  local pattern="$2"
  shift
  shift

  cat "$@" | "$impl" "$pattern"
}

check_is_bash_v4() {
  flag="$1"

  if [[ "${BASH_VERSINFO[0]}" -lt 4 ]]; then
    printf -- "$flag requires at least bash version 4.x; %s\n" \
      "this is version $BASH_VERSION." >&2
    return 1
  fi
}

count_matching_lines() {
  if [[ $# -eq 0 ]]; then
    usage
    return
  fi

  local impl="using_bash_v${BASH_VERSINFO[0]}"

  if ! command -v "$impl" >/dev/null; then
    printf "This requires bash version 3.x or 4.x; this is version %s.\n" \
      "$BASH_VERSION" >&2
    return 1
  fi

  local provide_files='one_at_a_time'
  local arg
  local pattern
  local num_impl_flags=0

  while [[ $# -ne 0 ]]; do
    arg="$1"
    shift

    case "$arg" in
    --cat)
      provide_files='concatenated'
      ;;
    --grep|--perl|--python)
      impl="using_${arg#--}"
      ((++num_impl_flags))
      ;;
    --mapfile)
      if ! check_is_bash_v4 "$arg"; then
        return 1
      fi
      impl='using_bash_v4_with_mapfile'
      ((++num_impl_flags))
      ;;
    --callback)
      if ! check_is_bash_v4 "$arg"; then
        return 1
      fi
      impl='using_bash_v4_with_mapfile_callback'
      ((++num_impl_flags))
      ;;
    --bashv3)
      impl='using_bash_v3'
      ((++num_impl_flags))
      ;;
    *)
      pattern="$arg"
      break
      ;;
    esac

    if [[ $num_impl_flags -gt 1 ]]; then
      printf "Only one of --grep, --mapfile, --callback, or --bash3 %s\n" \
        "may be specified." >&2
      return 1
    fi
  done

  local lines
  local bytes
  local total_lines
  local total_bytes

  local file_method="${provide_files//_/ }"
  local impl_name="${impl#using_}"
  local id="${impl_name//_ / } $file_method: $pattern"

  echo "START $id"

  for f in "$@"; do
    while read -r lines bytes; do
      echo "  $f ($lines lines, $bytes bytes)"
      ((total_lines+=lines))
      ((total_bytes+=bytes))
    done <<< "$(wc -lc $f | awk '{print $1, $2}')"
  done

  printf "\nRUN $id\n\n"
  { time "$provide_files" "$impl" "$pattern" "$@"; } 2>&1
  printf "\nFINISH $id\n\n"

  printf 'Total files: %d\nTotal lines: %d\nTotal bytes: %d\n' \
    "$#" "$total_lines" "$total_bytes"
  printf 'Average lines: %d\nAverage bytes: %d\n\n' \
    "$((total_lines/$#))" "$((total_bytes/$#))"
  echo "END $id"
}

count_matching_lines "$@"
	#! /bin/bash
	#
	# Driver for the count-matching-lines script
	#
	# Author: Mike Bland <mbland@acm.org>
	# https://mike-bland.com/, https://github.com/mbland/
	# Date: 2016-08-19

	usage() {
	local message
	read -r -d '' message <<END_USAGE
	Driver for the count-matching-lines script

	Usage:

	${0##*/} <output directory> <pattern> file [file...]

	This script will run every combination of flags possible for the
	count-matching-lines script based on the number of input files and the version
	of bash used to run this script. This means:

	- If there is more than one input file, both the one-file-at-a-time and
	concatenated files input methods will run. If there's only one input file,
	there will not be a concatenated files run.

	- If the bash version is 4.x and above, the bash 3.x implementation will run,
	followed by all bash 4.x implementations. If the bash version is below 4.x,
	only the bash 3.x implementation will run.

	Each run will produce an output file in <output directory>, which will be
	created if it doesn't exist, but must be empty if it does. When all runs have
	completed, a "summary.txt" file will be generated in <output directory> and
	output to the terminal.

	If 'memusg' (https://gist.github.com/netj/526585) is in your \$PATH, the results
	will contain the peak memory usage for each implementation.
	END_USAGE
	echo "$message"
	}

	do_run() {
	local impl="$1"
	local outfile="$OUTPUT_DIR/"
	local memusg="$(command -v 'memusg')"

	# We need to ensure count-matching-lines runs with whatever version of bash
	# was used to launch this driver rather than relying on $PATH to find bash.

	local run_bash=("$BASH")
	local args=()

	if [[ -n $memusg ]]; then
	run_bash=("$memusg" "$BASH")
	fi

	case "$impl" in
	grep\|bashv3\|perl\|python)
	outfile+="$impl"
	args+=("--$impl")
	;;
	'')
	impl="bashv${BASH_VERSINFO[0]}"
	outfile+="$impl"
	;;
	*)
	outfile+="bashv${BASH_VERSINFO[0]}-$impl"
	args+=("--$impl")
	;;
	esac

	args+=("${ARGV[@]}")

	echo "Running $impl"

	# We have to enable job control because of memusg.
	#
	# This will be easier to understand after playing with a working example.
	# First create the following script; I'll call it "foo.sh":
	#
	# memusg ls >"foo0.out" 2>&1
	# memusg ls >"foo1.out" 2>&1
	#
	# Then:
	#
	# - Run it as 'bash foo.sh' and it will hang; run 'fg' to continue.
	# - Run it as 'bash -m foo.sh' and it will complete.
	#
	# Also try editing the output redirections per the following and running it
	# with 'bash foo.sh':
	#
	# - Remove the first "2>&1" redirection and it will hang.
	# - Remove the second "2>&1" redirection and it will complete.
	# - Remove both "2>&1" redirection and it will complete.
	# - Remove the first >"foo0.out" redirection and it will hang.
	# - Remove the second >"foo0.out" redirection and it will complete.
	#
	# This is what's happening to the best of my current knowledge:
	#
	# When it detects that it is running in the same process group as its parent,
	# memusg will try to launch its command arguments in a new interactive shell
	# ('exec bash -i -c "$cmd"', which seems contradictory) to make sure that the
	# command it's measuring is in its own process group. It uses this bash
	# invocation as a more portable alternative to the 'setsid' command, which is
	# available on Linux but not OS X, for example. See:
	#
	# https://gist.github.com/netj/526585#gistcomment-11943
	#
	# This normally works fine on the command line; it even works fine in a script
	# when the command's standard error is not redirected, or when it's redirected
	# only for the first 'memusg' invocation. In these cases, standard input will
	# redirect to a file just fine and memusg's background process will print
	# "memusg=: peak=..." either to the terminal when standard error isn't
	# redirected, or to the file when it is.
	#
	# However, on the second 'memusg' invocation with standard error redirected,
	# this new shell will somehow try to read from the terminal that launched the
	# script. When that happens, the script will receive a SIGTTIN signal, which
	# will push it into the background. (See the "JOB CONTROL" section of the bash
	# man page.) At that point, the user must repeatedly bring the process back
	# into the foreground manually with 'fg' until the final 'memusg' invocation
	# completes.
	#
	# "set -m" enables "Monitor mode", i.e. job control. Normally it only makes
	# sense for interactive shells, but the side effect we need is that all
	# processes are then started in a separate process group. This keeps the 'exec
	# bash -i -c "$cmd"' invocation from happening, which in turn prevents memusg
	# from suspending the script. I validated this by adding the following to
	# memusg before the 'exec bash -i -c "$cmd"' call:
	#
	# echo "EXECING FOR NEW PROCESS GROUP" >&2
	#
	# Then running the script with:
	#
	# - no job control and no standard error redirection:
	# message appears, process completes
	# - no job control and with standard error redirection:
	# message appears, process suspends
	# - with job control and no standard error redirection
	# message doesn't appear, process completes
	# - with job control and with standard error redirection
	# message doesn't appear, process completes
	#
	# The following reference helped me zero in on the need for "set -m":
	#
	# https://stackoverflow.com/questions/690266/why-cant-i-use-job-control-in-a-bash-script/690297#690297
	# (or https://stackoverflow.com/a/690297, but that redirects to http://.)
	#
	# Why this only happens starting with the second 'memusg' invocation with
	# standard error redirected to a file is still a mystery. If I find out, I'll
	# update this comment.

	set -m
	"${run_bash[@]}" "$COUNT_MATCHING_LINES" "${args[@]}" >"$outfile.txt" 2>&1

	if [[ $DO_CAT = 'true' ]]; then
	echo "Running $impl with concatenated files"
	"${run_bash[@]}" "$COUNT_MATCHING_LINES" --cat "${args[@]}" \
	>"$outfile-cat.txt" 2>&1
	fi
	set +m
	}

	summarize_timing() {
	printf "Timing:\n"

	grep 'real' "$OUTPUT_DIR"/* \| sort -k 2 \|
	sed -e "s#^$OUTPUT_DIR/##" -e 's/\.txt:real*//' \|
	awk '{printf " %s %s\n", $2, $1}'
	}

	wait_for_memusg() {
	while ps -e -o command \| grep -q "[m]emusg $BASH $COUNT_MATCHING_LINES"; do
	sleep 0.1
	done
	}

	summarize_memusg() {
	wait_for_memusg

	if ! grep -q 'memusg' "$OUTPUT_DIR"/*; then
	return
	fi

	printf "\nMemory usage:\n"
	grep 'memusg' "$OUTPUT_DIR"/* \|
	sed -e "s#^$OUTPUT_DIR/##" -e 's/.txt:memusg: peak=/ /' \| sort -n -k 2 \|
	awk '{printf " %s %s\n", $2, $1}'
	}

	summarize_file_stats() {
	printf "\nInput file stats:\n"
	egrep "^(Total\|Average) " $OUTPUT_DIR/grep.txt \| awk '{ printf " %s\n", $0 }'
	}

	summarize() {
	local summary_file="$OUTPUT_DIR/summary.txt"

	summarize_timing > "$summary_file"
	summarize_memusg >> "$summary_file"
	summarize_file_stats >> "$summary_file"

	cat "$summary_file"
	printf "\nAll output saved in: %s\nSummary saved in: %s\n\n" \
	"$OUTPUT_DIR" "$summary_file"
	}

	count_matching_lines_driver() {
	if [[ $# -eq 0 ]]; then
	usage
	return
	fi

	COUNT_MATCHING_LINES="${0%/*}/count-matching-lines"

	if [[ ! -f $COUNT_MATCHING_LINES ]]; then
	echo "count-matching-lines script should be in the same directory" >&2
	exit 1
	fi

	OUTPUT_DIR="$1"
	shift

	if [[ ! -d $OUTPUT_DIR ]] && ! mkdir "$OUTPUT_DIR"; then
	echo "Could not create output directory: $OUTPUT_DIR" >&2
	exit 1
	elif ! rmdir "$OUTPUT_DIR"; then
	echo "Remove all files from $OUTPUT_DIR and try again." >&2
	exit
	fi

	mkdir "$OUTPUT_DIR"

	# If there is more than one filename specified, also run with concatenated
	# input.
	if [[ $# -gt 2 ]]; then
	DO_CAT='true'
	fi

	ARGV=("$@")

	do_run 'grep'

	if [[ ${BASH_VERSINFO[0]} -ge 4 ]]; then
	do_run 'bashv3'
	fi

	do_run ''

	if [[ ${BASH_VERSINFO[0]} -ge 4 ]]; then
	do_run 'mapfile'
	do_run 'callback'
	fi

	local languages=('perl' 'python')

	for language in "${languages[@]}"; do
	if command -v "$language" >/dev/null; then
	do_run "$language"
	fi
	done

	printf "Done\n\n"
	summarize
	}

	count_matching_lines_driver "$@"
	#! /bin/bash
	#
	# Proof-of-concept bash-only implementation of 'grep -o $exp \| sort \| uniq -c'
	#
	# When processing a single, relatively small file (around 200 lines), the
	# performance of both the grep and bash-only implementations is fairly close.
	# When processing multiple relatively small files, the bash-only version begins
	# to pull ahead slightly, as it does not create a new process pipeline for each
	# file.
	#
	# Beyond two hundred lines of input, or when concatenating multiple files into
	# standard input, the grep pipeline is significantly faster. The advantage of
	# the bash version becomes a potentially smaller memory footprint (since it
	# doesn't launch new processes and only one copy of each matching string is
	# stored in memory at any time) and portability to platforms that lack grep,
	# sort, or uniq (unlikely if bash is installed, but possible).
	#
	# The bash 4.x version using an associative array significantly outperforms the
	# 3.x version, and uses even less memory. Using 'mapfile' in Bash 4.x instead of
	# 'while read -r' didn't significantly impact the performance on small files. On
	# very large files, the 'mapfile' version
	#
	# Author: Mike Bland <mbland@acm.org>
	# https://mike-bland.com/, https://github.com/mbland/
	# Date: 2016-08-18

	usage() {
	local message
	read -r -d '' message <<END_USAGE
	Proof-of-concept bash-only implementation of 'grep -o \$exp \| sort \| uniq -c'

	Usage:

	${0##*/} [--<flag>] <pattern> file [file...]

	Flags:
	--cat Concatenate all input files, rather than looping over them
	--grep Use the 'grep -o <pattern> \| sort \| uniq -c' implementation
	--mapfile In bash 4.x, replace the 'while read' loop with 'mapfile'
	--callback In bash 4.x, use a callback with 'mapfile' instead of looping
	--bashv3 Use the bash 3.x implementation, even when running under 4.x
	--perl Use the perl implementation
	--python Use the python implementation

	The default behavior is to loop over the files one-at-a-time, process them using
	the bash-only implementation matching the shell version, then output the
	matching lines and their frequency counts for each file.

	The first line of output will indicate the implementation used, whether the
	files were provided one-at-a-time or concatenated, and the number of lines and
	bytes per file.

	The process is then executed, the time reported, and the number of files, lines,
	and bytes summarized.

	NOTE ON BASH 4.X, PERL, AND PYTHON IMPLEMENTATIONS: The output will not
	necessarily be sorted by matching string as with the other implementations, as
	they use an associative array. The output from these implementations could then
	be sorted with 'sort -k 2'.

	END_USAGE
	echo "$message"
	}

	using_grep() {
	grep -o "$1" \| sort \| uniq -c
	}

	# Uses the '=~' operator and ${BASH_REMATCH[0]} to replace 'grep -o'.
	# Uses an insertion sort to replace 'sort \| uniq -c'.
	using_bash_v3() {
	local pattern="$1"
	local matches=()
	local counts=()
	local line
	local match
	local i
	local j
	local current

	while read -r line; do
	if [[ ! $line =~ $pattern ]]; then
	continue
	fi

	match="${BASH_REMATCH[0]}"

	for ((i=${#matches[@]}; i != 0; --i)); do
	current="${matches[$((i-1))]}"

	if [[ $match = $current ]]; then
	((--i))
	break
	elif [[ $match > $current ]]; then
	break
	fi
	done

	if [[ ${#matches[@]} -ne 0 && $match < ${matches[$i]} ]]; then
	for ((j=${#matches[@]}; j != i; --j)); do
	matches[$j]="${matches[$((j-1))]}"
	((counts[$j]=counts[j-1]))
	done
	((counts[i]=0))
	fi

	matches[$i]="$match"
	((++counts[i]))
	done

	for ((i=0; i != "${#matches[@]}"; ++i)); do
	printf "%4d %s\n" "${counts[$i]}" "${matches[$i]}"
	done
	}

	# Uses the '=~' operator and ${BASH_REMATCH[0]} to replace 'grep -o'.
	# Uses an associative array to replace 'sort \| uniq -c'.
	# The output lines aren't sorted by the match strings, but the output can be
	# piped into 'sort -k 2' if that matters.
	using_bash_v4() {
	local pattern="$1"
	local -A matches=()
	local line

	while read -r line; do
	if [[ $line =~ $pattern ]]; then
	((++matches["${BASH_REMATCH[0]}"]))
	fi
	done

	for match in "${!matches[@]}"; do
	printf "%4d %s\n" "${matches[$match]}" "$match"
	done
	}

	# Same as above, but using mapfile instead of 'while read'
	using_bash_v4_with_mapfile() {
	local pattern="$1"
	local -A matches=()
	local line
	local lines=()

	mapfile lines

	for line in "${lines[@]}"; do
	if [[ $line =~ $pattern ]]; then
	((++matches["${BASH_REMATCH[0]}"]))
	fi
	done

	for match in "${!matches[@]}"; do
	printf "%4d %s\n" "${matches[$match]}" "$match"
	done
	}

	# Same as above, but using a callback with 'mapfile' instead of looping.
	using_bash_v4_with_mapfile_callback() {
	local pattern="$1"
	local -A matches=()

	store_match() {
	if [[ $2 =~ $pattern ]]; then
	((++matches["${BASH_REMATCH[0]}"]))
	fi
	}

	mapfile -c 1 -C store_match

	for match in "${!matches[@]}"; do
	printf "%4d %s\n" "${matches[$match]}" "$match"
	done
	}

	using_perl() {
	local pat="${1//\//\\/}"
	perl -e "%matches;while(<>) { if (/$pat/) { \$matches{\$&}++; } }" \
	-e 'while(($k, $v) = each %matches) { printf("%4d %s\n", $v, $k); }'
	}

	using_python() {
	local program
	read -r -d '' program<<END_PYTHON
	import re, sys
	p=re.compile('$1')
	m={}
	for l in sys.stdin:
	s = p.search(l)
	if s:
	g = s.group(0)
	if g not in m:
	m[g]=1
	else:
	m[g]+=1
	for k in m:
	print "%4d %s" % (m[k], k)
	END_PYTHON
	python -OO -c "$program"
	}

	one_at_a_time() {
	local impl="$1"
	local pattern="$2"
	shift
	shift

	for f in "$@"; do
	echo "$f:"
	"$impl" "$pattern" < "$f"
	done
	}

	concatenated() {
	local impl="$1"
	local pattern="$2"
	shift
	shift

	cat "$@" \| "$impl" "$pattern"
	}

	check_is_bash_v4() {
	flag="$1"

	if [[ "${BASH_VERSINFO[0]}" -lt 4 ]]; then
	printf -- "$flag requires at least bash version 4.x; %s\n" \
	"this is version $BASH_VERSION." >&2
	return 1
	fi
	}

	count_matching_lines() {
	if [[ $# -eq 0 ]]; then
	usage
	return
	fi

	local impl="using_bash_v${BASH_VERSINFO[0]}"

	if ! command -v "$impl" >/dev/null; then
	printf "This requires bash version 3.x or 4.x; this is version %s.\n" \
	"$BASH_VERSION" >&2
	return 1
	fi

	local provide_files='one_at_a_time'
	local arg
	local pattern
	local num_impl_flags=0

	while [[ $# -ne 0 ]]; do
	arg="$1"
	shift

	case "$arg" in
	--cat)
	provide_files='concatenated'
	;;
	--grep\|--perl\|--python)
	impl="using_${arg#--}"
	((++num_impl_flags))
	;;
	--mapfile)
	if ! check_is_bash_v4 "$arg"; then
	return 1
	fi
	impl='using_bash_v4_with_mapfile'
	((++num_impl_flags))
	;;
	--callback)
	if ! check_is_bash_v4 "$arg"; then
	return 1
	fi
	impl='using_bash_v4_with_mapfile_callback'
	((++num_impl_flags))
	;;
	--bashv3)
	impl='using_bash_v3'
	((++num_impl_flags))
	;;
	*)
	pattern="$arg"
	break
	;;
	esac

	if [[ $num_impl_flags -gt 1 ]]; then
	printf "Only one of --grep, --mapfile, --callback, or --bash3 %s\n" \
	"may be specified." >&2
	return 1
	fi
	done

	local lines
	local bytes
	local total_lines
	local total_bytes

	local file_method="${provide_files//_/ }"
	local impl_name="${impl#using_}"
	local id="${impl_name//_ / } $file_method: $pattern"

	echo "START $id"

	for f in "$@"; do
	while read -r lines bytes; do
	echo " $f ($lines lines, $bytes bytes)"
	((total_lines+=lines))
	((total_bytes+=bytes))
	done <<< "$(wc -lc $f \| awk '{print $1, $2}')"
	done

	printf "\nRUN $id\n\n"
	{ time "$provide_files" "$impl" "$pattern" "$@"; } 2>&1
	printf "\nFINISH $id\n\n"

	printf 'Total files: %d\nTotal lines: %d\nTotal bytes: %d\n' \
	"$#" "$total_lines" "$total_bytes"
	printf 'Average lines: %d\nAverage bytes: %d\n\n' \
	"$((total_lines/$#))" "$((total_bytes/$#))"
	echo "END $id"
	}

	count_matching_lines "$@"