Skip to content

Instantly share code, notes, and snippets.

Show Gist options
  • Save markgir/68b751a73676aec5185f40f671978ec4 to your computer and use it in GitHub Desktop.
Save markgir/68b751a73676aec5185f40f671978ec4 to your computer and use it in GitHub Desktop.
Whisper ASR Webservice | File Processing Script | 2023/09/23 | v2.8
#!/bin/bash
printf '%s\n' \
  "Whisper ASR Webservice | File Proccessing Script | 2023/09/23 | v2.8" \
  "Author: Austin St. Aubin w/ a little help from ChatGPT." \
  "License: MIT License"
# Transcribes audio files through a Whisper ASR web service and writes the
# transcripts in one or more formats. The audio lives under SOURCE_PATH; the
# transcripts are written next to each audio file (or to --dest-dir) using the
# extensions listed in TRANSCRIPT_EXTENSIONS.
#
# References:
#   https://github.com/ahmetoner/whisper-asr-webservice/issues/93
#   https://gist.github.com/AustinSaintAubin/a50b29ce52de5501a6dd05bf5d24cd44
#
# Example:
#   bash /volume1/docker/whisper-asr-webservice/whisper-asr-webservice_file-processor.sh "/volume1/docker/whisper-asr-webservice/audio" --output "txt,tsv,srt,vtt,json" --gpu --reprocess
# -------------------------------------------------------
# Default Values

# -- Server endpoints (one instance per processing backend) --
WISPER_ASR_SERVER_IP=localhost # alternatives seen in use: "192.168.20.27", "192.168.10.91"
WISPER_ASR_SERVER_CPU="http://${WISPER_ASR_SERVER_IP}:9005" # instance for CPU processing
WISPER_ASR_SERVER_GPU="http://${WISPER_ASR_SERVER_IP}:9006" # instance for GPU processing
# WISPER_ASR_SERVER is intentionally left unset here; it is chosen later from
# --server / --cpu / --gpu.

# -- Request parameters sent to the service --
WISPER_ASR_TASK=transcribe
WISPER_ASR_LANG=en
WISPER_ASR_INITIAL_PROMPT='' # e.g. "- Hey how are you doing? - I'm doing good. How are you?"

# -- Output selection --
TRANSCRIPT_EXTENSIONS=txt,tsv,srt # supported transcript extensions: txt,tsv,srt,vtt,json

# -- Behaviour switches (true/false) --
DESTINATION_OUTPUT_PRINT=false     # true: print each destination file to the terminal when done
DESTINATION_REPROCESS=false        # true: overwrite a destination file that already exists
SOURCE_TEMP_TRANSCODING_KEEP=false # true: keep the temporary transcoding file; false: delete it
# User Output Helper
# Writes the option reference to stderr, followed by an example invocation on
# stdout. Takes no arguments; reads $0 and the current working directory.
helper_output() {
  local script_name
  script_name="$(basename "$0")"
  # The blank line before the terminator is part of the emitted usage text.
  cat >&2 <<EOF
Usage: ${script_name} AUDIO-FILE [OPTIONS]
NOTE: Path to the source audio file/folder is the only input required. Can be passed without flag. All other parrameaters/options are optional.
-i, --input Path to input audio file. (mp3|mp4|wav|m4a)
-o, --output List of transcription output types, listed by extention, comma seperated. (txt,tsv,srt,vtt,json)
-s, --server Server Address. (http://localhost:9000)
-t, --task Task for the server. (transcribe)
-l, --lang Language of source audio file (en)
-e,--initial-prompt Intital Prompt for use in adding contect to wisper proccesing.
-p, --print Print contents of destination file when done.
-r, --reprocess Delete and reproccess selected transcription types.
-k, --keep-temp Keep converted tempararty audio files from M4A transcoding.
-d, --dest-dir Destination dirrectory, if differs from source.
-c/-g, --cpu / --gpu Secifies which Wisper ASR Webservice Server to use... the one for CPU or GPU realted task.
--help Shows this guide.

EOF
  printf 'Example: bash %s "%s" --output "txt,srt" --print --gpu\n' \
    "$(realpath "$0")" "$(pwd)"
}
# Time duration
# Renders an elapsed time, given in whole seconds, both in words and as
# HH:MM:SS on a single line.
# Arguments: $1 - elapsed seconds (integer)
# Outputs:   "Duration: HH hours, MM minutes, SS seconds = HH:MM:SS" on stdout
format_time() {
  local elapsed=$1
  local total_minutes=$(( elapsed / 60 ))
  local hh=$(( total_minutes / 60 ))
  local mm=$(( total_minutes % 60 ))
  local ss=$(( elapsed % 60 ))
  printf 'Duration: %02d hours, %02d minutes, %02d seconds = %02d:%02d:%02d\n' \
    "${hh}" "${mm}" "${ss}" "${hh}" "${mm}" "${ss}"
}
# File size
# Lists the given path(s) with `ls` in long, human-readable form with the
# per-file size column enabled, and prints the first column of every output
# line.
# Arguments: $@ - path(s) handed to ls unchanged
format_file_size() {
  ls -l -a -h -s "$@" | awk '{ print $1 }'
}
# File audio duration
# Runs `ffmpeg -i` on the given file, keeps the output lines that mention
# 'Duration', and prints every HH:MM:SS match found on them (any fractional
# seconds are not part of the match).
# Arguments: $@ - passed to `ffmpeg -i` unchanged
format_audio_durration() {
  local probe_output
  # ffmpeg's report is captured from both stdout and stderr.
  probe_output="$(ffmpeg -i "$@" 2>&1)"
  grep 'Duration' <<< "${probe_output}" | grep -oE '[0-9]{2}:[0-9]{2}:[0-9]{2}'
}
# Prints a one-line summary for a finished task: how long it took, plus the
# size and audio duration of the file it worked on.
# Arguments: $1 - elapsed seconds, forwarded to format_time
#            $2 - path of the file, forwarded to format_file_size and
#                 format_audio_durration
# Outputs:   the summary line on stdout
function output_task_file_info() {
  local seconds=$1
  local filepath=$2
  # Every expansion is quoted so a path containing spaces (or any other IFS
  # character) reaches the helpers as a single argument.
  echo " └─ Task $(format_time "${seconds}") | File Size: $(format_file_size "${filepath}") | Audio Durration: $(format_audio_durration "${filepath}")"
}
# Parse Options
# getopt normalises the command line (short and long forms); its own error
# messages are suppressed with --quiet, so a failure is reported here instead.
if ! options=$(getopt --quiet -o i:o:s:t:l:e:d:prkcgh --long input:,output:,server:,task:,lang:,initial-prompt:,dest-dir:,print,reprocess,keep-temp,cpu,gpu,help -- "$@"); then
  echo "ERROR: UNKNOWN OPTIONS INPUT!"
  helper_output
  exit 1
fi

# Parser
# Replace the positional parameters with getopt's normalised list, then consume
# one option (plus its value, where it takes one) per iteration until "--".
eval set -- "$options"
while :; do
  case "$1" in
    # Options that carry a value
    -i|--input)          SOURCE_PATH="${2}"; shift 2 ;;
    -o|--output)         TRANSCRIPT_EXTENSIONS="${2}"; shift 2 ;;
    -s|--server)         WISPER_ASR_SERVER="${2}"; shift 2 ;;
    -t|--task)           WISPER_ASR_TASK="${2}"; shift 2 ;;
    -l|--lang)           WISPER_ASR_LANG="${2}"; shift 2 ;;
    -e|--initial-prompt) WISPER_ASR_INITIAL_PROMPT="${2}"; shift 2 ;;
    -d|--dest-dir)       USER_DEFINED_DESTINATION_PATH_DIRECTORY="${2}"; shift 2 ;;
    # Boolean switches
    -p|--print)          DESTINATION_OUTPUT_PRINT=true; shift ;;
    -r|--reprocess)      DESTINATION_REPROCESS=true; shift ;;
    -k|--keep-temp)      SOURCE_TEMP_TRANSCODING_KEEP=true; shift ;;
    # Server selection shortcuts
    -c|--cpu)            WISPER_ASR_SERVER="${WISPER_ASR_SERVER_CPU}"; shift ;;
    -g|--gpu)            WISPER_ASR_SERVER="${WISPER_ASR_SERVER_GPU}"; shift ;;
    -h|--help)
      helper_output
      exit 0
      ;;
    # End of options: whatever follows is positional
    --)
      shift
      break
      ;;
    *)
      echo "ERROR: UNKNOWN OPTIONS CASE!"
      helper_output
      exit 1
      ;;
  esac
done
# Handle remaining (positional) arguments
# The parser loop has already shifted every option away, so "$@" now holds the
# positional arguments only. No OPTIND-based shift is performed: OPTIND belongs
# to the getopts builtin, which this script does not use.
if [[ $# -eq 1 ]]; then
  if [[ -z "${SOURCE_PATH:-}" ]]; then
    SOURCE_PATH="${1}"
  else
    echo "ERROR: Too many arguments. Seems like two source file paths specifited."
    helper_output
    exit 1
  fi
elif [[ $# -gt 1 ]]; then
  echo "ERROR: Too many unflagged arguments"
  helper_output
  exit 1
fi

# Error Checking
# A source path is required, but it may come from -i/--input as well as from
# the single positional argument, so test the variable rather than "$#".
if [[ -z "${SOURCE_PATH:-}" ]]; then
  echo "ERROR: No source path provided."
  helper_output
  exit 1
fi

# define the transcript file extension(s): split the comma separated list
# (txt, vtt, srt, tsv, json) into an array
IFS=',' read -ra TRANSCRIPT_EXTENSIONS <<< "${TRANSCRIPT_EXTENSIONS}"

# check if any transcript extensions were provided
if [[ ${#TRANSCRIPT_EXTENSIONS[@]} -eq 0 ]]; then
  echo "No transcript extensions were provided."
  exit 1
fi

# check if the source path exists; -e is true for any existing path regardless
# of type (regular file, directory, socket, etc.)
if [[ ! -e "${SOURCE_PATH}" ]]; then
  echo "Source Path '${SOURCE_PATH}' does not exist"
  exit 1
fi
# set default server if not set (GPU instance unless --server/--cpu/--gpu chose one)
WISPER_ASR_SERVER="${WISPER_ASR_SERVER:-$WISPER_ASR_SERVER_GPU}"

# Format the initial prompt as URL-encoded text so it can be placed in the
# request's query string
WISPER_ASR_INITIAL_PROMPT_URL_ENCODED="$(jq -rn --arg x "${WISPER_ASR_INITIAL_PROMPT}" '$x|@uri')"

# Fetch the service's OpenAPI document once; both the version and the
# description shown in the header are read from this single response.
WISPER_ASR_OPENAPI_JSON="$(curl -s "${WISPER_ASR_SERVER}/openapi.json")"

# Print Header
echo " - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -"
echo "Whisper ASR Webservice | v$(jq -r '.info.version' <<< "${WISPER_ASR_OPENAPI_JSON}") "
printf '%s\n' "$(jq -r '.info.description' <<< "${WISPER_ASR_OPENAPI_JSON}")"
echo "Server: ${WISPER_ASR_SERVER}"
echo "Task: ${WISPER_ASR_TASK}"
echo "Lang: ${WISPER_ASR_LANG}"
echo "Initial Prompt: ${WISPER_ASR_INITIAL_PROMPT}"
# echo "Encoded Prompt: ${WISPER_ASR_INITIAL_PROMPT_URL_ENCODED}"
echo " - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -"
echo "Source Path: ${SOURCE_PATH}"
# Only shown when --dest-dir was given; labelled like the other settings.
if [[ -v USER_DEFINED_DESTINATION_PATH_DIRECTORY ]]; then
  echo "Destination Directory: ${USER_DEFINED_DESTINATION_PATH_DIRECTORY}"
fi
echo "Transcript Extensions: ${TRANSCRIPT_EXTENSIONS[*]}"
echo "Destination Output Print: ${DESTINATION_OUTPUT_PRINT}"
echo "Destination Reprocess: ${DESTINATION_REPROCESS}"
echo "Source Temp Transcoding Keep: ${SOURCE_TEMP_TRANSCODING_KEEP}"
echo "============================================================================"
# Find all audio files in the source path and its subdirectories, then produce
# every requested transcript for each of them.
#
# For each audio file:
#   1. .m4a sources are first transcoded to a temporary MP3 with the
#      jrottenberg/ffmpeg container (skipped when a non-empty temp file exists).
#   2. The (possibly transcoded) audio is POSTed to the ASR service once per
#      transcript extension and the response is saved next to the source file,
#      or in the --dest-dir directory when one was given.
#   3. The temporary MP3 is removed unless --keep-temp was given.
# Any transcoding or upload failure stops the script with exit status 1.

# Make the search root absolute (without resolving symlinks) so that every path
# printed by find contains a directory part; a bare relative file name would
# otherwise leave "${SOURCE_PATH_FILE%/*}" equal to the file itself.
case "${SOURCE_PATH}" in
  /*) source_root="${SOURCE_PATH}" ;;
  *) source_root="${PWD}/${SOURCE_PATH}" ;;
esac

# Collect the file list up front, NUL-delimited, so that file names survive
# byte-for-byte, the commands in the loop body cannot consume the list from
# stdin, and "exit" inside the loop ends the script itself. The script's own
# "*.temp.mp3" transcoding files are not treated as sources.
audio_files=()
while IFS= read -r -d '' audio_file; do
  audio_files+=("${audio_file}")
done < <(find "${source_root}" -type f \
  \( -name "*.mp3" -o -name "*.mp4" -o -name "*.wav" -o -name "*.m4a" \) \
  ! -name "*.temp.mp3" -print0)

for SOURCE_PATH_FILE in "${audio_files[@]}"; do
  # Path Variables
  SOURCE_PATH_DIRECTORY="${SOURCE_PATH_FILE%/*}"
  DESTINATION_PATH_DIRECTORY="${USER_DEFINED_DESTINATION_PATH_DIRECTORY:-$SOURCE_PATH_DIRECTORY}"
  SOURCE_PATH_BASENAME="$(basename -- "${SOURCE_PATH_FILE}")"
  SOURCE_PATH_FILENAME="${SOURCE_PATH_BASENAME%.*}"
  SOURCE_TEMP_TRANSCODING_BASENAME="${SOURCE_PATH_FILENAME}.temp.mp3"
  SOURCE_TEMP_TRANSCODING_PATH="${DESTINATION_PATH_DIRECTORY}/${SOURCE_TEMP_TRANSCODING_BASENAME}"

  # Loop through each transcript extension
  for TRANSCRIPT_EXTENSION in "${TRANSCRIPT_EXTENSIONS[@]}"; do
    # Remove any white space from TRANSCRIPT_EXTENSION; skip empty entries
    # such as the one produced by "txt,,srt"
    TRANSCRIPT_EXTENSION="${TRANSCRIPT_EXTENSION//[[:space:]]/}"
    [[ -n "${TRANSCRIPT_EXTENSION}" ]] || continue

    # Set the destination path for the current transcript extension
    DESTINATION_PATH="${DESTINATION_PATH_DIRECTORY}/${SOURCE_PATH_FILENAME}.${TRANSCRIPT_EXTENSION}"

    # If destination reprocessing is true and the destination file already exists, delete it
    if [[ "${DESTINATION_REPROCESS}" == true && -e "${DESTINATION_PATH}" ]]; then
      echo "Deleting old destination file: ${DESTINATION_PATH}"
      rm -- "${DESTINATION_PATH}"
    fi

    # then, check if a transcript file already exists for this audio file
    if [[ -f "${DESTINATION_PATH}" ]]; then
      echo "Transcript already exists: ${SOURCE_PATH_FILE} | ${TRANSCRIPT_EXTENSION}"
    else
      # Transcode m4a to mp3 so it is usable for transcribing, unless it was
      # already transcoded (temp file exists and is not empty).
      if [[ "${SOURCE_PATH_FILE}" == *.m4a && ( ! -f "${SOURCE_TEMP_TRANSCODING_PATH}" || ! -s "${SOURCE_TEMP_TRANSCODING_PATH}" ) ]]; then
        echo "Transcoding to MP3: ${SOURCE_PATH_FILE} -> ${SOURCE_TEMP_TRANSCODING_BASENAME} | ${TRANSCRIPT_EXTENSION}"
        # Mount the source directory, plus the destination directory only when
        # it is a different one, so the same mount is never requested twice.
        docker_volumes=(--volume "${SOURCE_PATH_DIRECTORY}:${SOURCE_PATH_DIRECTORY}")
        if [[ "${DESTINATION_PATH_DIRECTORY}" != "${SOURCE_PATH_DIRECTORY}" ]]; then
          docker_volumes+=(--volume "${DESTINATION_PATH_DIRECTORY}:${DESTINATION_PATH_DIRECTORY}")
        fi
        # convert the file to mp3 using FFmpeg container
        SECONDS=0
        if docker run --rm "${docker_volumes[@]}" \
          --workdir "${SOURCE_PATH_DIRECTORY}" jrottenberg/ffmpeg \
          -i "${SOURCE_PATH_BASENAME}" \
          -loglevel fatal -hide_banner -stats \
          -acodec libmp3lame -ac 1 -ar 16000 -ab 192k -y "${DESTINATION_PATH_DIRECTORY}/${SOURCE_TEMP_TRANSCODING_BASENAME}"; then
          # IFS is cleared for the call so no word splitting can occur on the
          # path inside the helper.
          IFS= output_task_file_info "${SECONDS}" "${SOURCE_PATH_FILE}"
        else
          echo "Failed to convert file to MP3."
          if [[ -e "${SOURCE_TEMP_TRANSCODING_PATH}" ]]; then
            rm -- "${SOURCE_TEMP_TRANSCODING_PATH}"
          fi
          exit 1
        fi
      fi

      # set asr source path to source path of raw file, or transcoded file
      if [[ "${SOURCE_PATH_FILE}" == *.m4a && -f "${SOURCE_TEMP_TRANSCODING_PATH}" ]]; then
        ASR_SOURCE_PATH="${SOURCE_TEMP_TRANSCODING_PATH}"
      else
        ASR_SOURCE_PATH="${SOURCE_PATH_FILE}"
      fi

      # Print the source and destination paths along with the transcript extension
      echo "Sending to ${WISPER_ASR_TASK^}: ${ASR_SOURCE_PATH} -> $(basename -- "${DESTINATION_PATH}") [$(format_file_size "${ASR_SOURCE_PATH}")|$(format_audio_durration "${ASR_SOURCE_PATH}")]"

      # curl treats ',' and ';' in a --form file name as separators unless the
      # name is wrapped in double quotes; inside those quotes, backslashes and
      # double quotes must themselves be backslash-escaped.
      curl_form_path="${ASR_SOURCE_PATH//\\/\\\\}"
      curl_form_path="${curl_form_path//\"/\\\"}"

      # Send the audio file to the web service for transcription.
      # --fail makes curl return non-zero on an HTTP error status instead of
      # saving the server's error body as if it were a transcript.
      SECONDS=0
      if ! curl --fail --progress-bar --request 'POST' \
        "${WISPER_ASR_SERVER}/asr?task=${WISPER_ASR_TASK}&language=${WISPER_ASR_LANG}&initial_prompt=${WISPER_ASR_INITIAL_PROMPT_URL_ENCODED}&output=${TRANSCRIPT_EXTENSION}" \
        --header 'accept: application/json' \
        --header 'Content-Type: multipart/form-data' \
        --form "audio_file=@\"${curl_form_path}\";type=audio/mpeg" \
        --output "${DESTINATION_PATH}"; then
        echo "Failed to transcribe file: ${DESTINATION_PATH}"
        # Drop any partial output so a later run does not mistake it for a
        # finished transcript.
        rm -f -- "${DESTINATION_PATH}"
        exit 1
      elif [[ -f "${DESTINATION_PATH}" ]]; then
        echo "Successful Transcription: ${DESTINATION_PATH}"
        IFS= output_task_file_info "${SECONDS}" "${SOURCE_PATH_FILE}"
      else
        echo "Unknown Transcription Failure!"
      fi

      # print output from destination file (only when there is one to print)
      if [[ "${DESTINATION_OUTPUT_PRINT}" == true && -f "${DESTINATION_PATH}" ]]; then
        # Print a separator
        echo " - - - - - - - - - - - - - - - - - - -"
        # Print the contents of the destination file
        cat -- "${DESTINATION_PATH}"
      fi
    fi

    # Print a separator
    if [[ ${#TRANSCRIPT_EXTENSIONS[@]} -gt 1 ]]; then
      echo " - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -"
    fi
  done

  # If source temporary transcoding file exist, delete it.
  if [[ -e "${SOURCE_TEMP_TRANSCODING_PATH}" && "${SOURCE_TEMP_TRANSCODING_KEEP}" != true ]]; then
    echo "Deleting Temp Transconding Audio File: ${SOURCE_TEMP_TRANSCODING_PATH}"
    rm -- "${SOURCE_TEMP_TRANSCODING_PATH}"
  fi

  # Print a separator
  echo "----------------------------------------------------------------------------"
done
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment