Skip to content

Instantly share code, notes, and snippets.

@arzga
Last active September 13, 2022 15:42
Show Gist options
  • Save arzga/9b8fd97215c7570ebe103dad2f965e5a to your computer and use it in GitHub Desktop.
Save arzga/9b8fd97215c7570ebe103dad2f965e5a to your computer and use it in GitHub Desktop.
Analyze word error rate (WER) and other sentence metrics between REF and HYP transcripts using 'sclite'.
#!/bin/bash
# Check for requirements.
# Both external tools must be resolvable through PATH. When one is missing,
# an explanation is written to stderr (so it cannot be mistaken for report
# output on stdout) and the script stops with exit status 1.
if ! command -v sclite > /dev/null 2>&1; then
  {
    echo "Error: 'sclite' not found in path."
    echo
    echo "You need to install it for transcript substitution/insertion/deletion analysis."
    echo "See: https://github.com/usnistgov/SCTK"
  } >&2
  exit 1
fi
if ! command -v jq > /dev/null 2>&1; then
  {
    echo "Error: 'jq' not found in path."
    echo
    # jq is a JSON processor; the earlier wording ("jquery") named the wrong tool.
    echo "You need to install it for JSON parsing."
    echo "See: https://stedolan.github.io/jq/"
  } >&2
  exit 1
fi
# Parse optional arguments.
#
# FIELD1 / FIELD2 hold the jq paths read for the REF and HYP side.
# HELP and OUT_MODE get explicit starting values so that same-named variables
# inherited from the caller's environment cannot alter the run.
# Parsing stops at the first word that is not a known option; the remaining
# positional parameters stay in "$@" for the code that follows.
FIELD1=.transcript
FIELD2=.hypothesis
HELP=0
OUT_MODE=
while [ $# -gt 0 ]; do
  case "$1" in
    -tt)
      FIELD1=.transcript
      FIELD2=.transcript
      ;;
    -hh)
      FIELD1=.hypothesis
      FIELD2=.hypothesis
      ;;
    -th)
      FIELD1=.transcript
      FIELD2=.hypothesis
      ;;
    -h | --help)
      HELP=1
      ;;
    -m | --mode)
      # A trailing -m/--mode without a value is reported instead of being
      # swallowed silently.
      if [ $# -lt 2 ]; then
        echo "Error: option '$1' requires a <mode> argument." >&2
        exit 1
      fi
      OUT_MODE=$2
      shift
      ;;
    *)
      break
      ;;
  esac
  shift
done
# Print the help text and stop with exit status 0 when no positional argument
# is left after option parsing, or when help was requested explicitly.
# The text is kept in a quoted here-document so nothing inside it is expanded.
if (( $# == 0 )) || [[ $HELP == 1 ]]; then
  cat << 'USAGE'
Analyze word error rate (WER) and other sentence metrics between REF and HYP transcripts using 'sclite'.

usage: wer.sh [--help] [-th|-hh|-tt] [--mode <mode>]
 <ref.ac> [<hyp.ac>]

If only <ref.ac> is given, both REF and HYP fields are picked from it.
If both <ref.ac> and <hyp.ac> are given, REF field is picked from <ref.ac> and HYP from <hyp.ac>.

Select fields to compare:

-th Compare 'transcript' (REF) to 'hypothesis' (HYP) field (default)
-hh Compare 'hypothesis' (REF) to 'hypothesis' (HYP) field
-tt Compare 'transcript' (REF) to 'transcript' (HYP) field

Select the 'sclite' report to display:
-m <mode>, --mode <mode>
 'wer' for word error rate (WER) (default)
 'sum' for ins/del/substitution summary
 'snt' for sentence diff report
 'dtl' for detailed report
 'prf' for detailed report, no word wrap
 'err' above with errors only
USAGE
  exit 0
fi
# Pick the .ac input paths from the remaining positional arguments:
# the first one feeds the REF side; the HYP side uses the second one, or the
# first again when the second is missing or empty.
AC_FILE1=$1
AC_FILE2=${2:-$1}
# Prepare a temp folder and clean it up upon exit with trap
#######################################
# Remove the temporary working folder, if one has been recorded.
# Globals:   TEMP_FOLDER (read) - path of the folder to delete
# Arguments: none
# The path is quoted and preceded by '--' so that a folder whose name contains
# whitespace or glob characters, or begins with '-', is passed to rm as one
# single operand instead of being split into several unrelated paths.
#######################################
function finish {
  if [ -n "$TEMP_FOLDER" ]; then
    rm -rf -- "$TEMP_FOLDER"
  fi
}
# Create a private temp folder, then register the cleanup handler for it.
TEMP_FOLDER=$(mktemp -d) || exit 1
trap finish EXIT
# Prepare 'sclite' compatible transcript (.trn) files from jsonl (.ac) files.
# Each output line is the selected field followed by " (sp1-<utterance>)".
# $FIELD1 / $FIELD2 are spliced into the jq program as path expressions.
# All file paths are quoted so names containing whitespace survive, and a jq
# failure (e.g. unreadable or malformed input) stops the script here rather
# than letting the comparison run on empty or partial .trn files. The EXIT
# trap still runs on that path.
jq -r '. | ['"$FIELD1"'," (sp1-",.utterance,")"] | join("")' "$AC_FILE1" > "$TEMP_FOLDER/ref.trn" || exit 1
jq -r '. | ['"$FIELD2"'," (sp1-",.utterance,")"] | join("")' "$AC_FILE2" > "$TEMP_FOLDER/hyp.trn" || exit 1
# Compare and analyze with 'sclite'.
# The .trn paths are quoted so a temp folder whose path contains whitespace
# is still passed to sclite as a single argument.
ref_trn="$TEMP_FOLDER/ref.trn"
hyp_trn="$TEMP_FOLDER/hyp.trn"
if [[ -z "$OUT_MODE" || "$OUT_MODE" == "wer" ]]; then
  # Default: show only the total error line of the 'dtl' report.
  echo "Displaying word error rate (WER). Use 'wer.sh --help' for more options." >&2
  sclite -r "$ref_trn" -h "$hyp_trn" -i wsj -o dtl stdout | grep "Percent Total Error"
elif [[ "$OUT_MODE" == "err" ]]; then
  # Keep 'Eval:' lines that carry at least one non-blank marker, plus the five
  # lines before each. A POSIX bracket class replaces the '\S' shorthand,
  # which is not part of POSIX extended regular expressions.
  sclite -r "$ref_trn" -h "$hyp_trn" -i wsj -o prf stdout | grep -E -B 5 '^Eval:.*[^[:space:]]+.*$'
else
  # OUT_MODE is left unquoted on purpose: '-o' is given more than one word
  # above ('dtl stdout'), so a multi-word mode value keeps working as before.
  # shellcheck disable=SC2086
  sclite -r "$ref_trn" -h "$hyp_trn" -i wsj -o $OUT_MODE stdout
fi
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment