Voice 2 Text Script based on Whisper
Dockerfile
FROM devuan/devuan:chimaera

ARG WHISPER_MODEL=guillaumekln/faster-whisper-large-v2
ENV LOG_LEVEL=INFO

RUN apt-get -q update
RUN apt-get install -qqy python3-pip ffmpeg git-core
RUN pip3 install nvidia-pyindex
COPY requirements.txt .
RUN pip3 install -r requirements.txt
RUN pip3 install git+https://github.com/markizano/faster-whisper

COPY voice2text.py /usr/local/bin/voice2text
RUN chmod +x /usr/local/bin/voice2text

# Uncomment if you have CUDA drivers available.
#ENV LD_LIBRARY_PATH /usr/local/lib/python3.9/dist-packages/nvidia/cudnn/lib
#COPY cuda_11.6.2_510.47.03_linux.run .
#RUN ./cuda_11.6.2_510.47.03_linux.run --silent --toolkit --toolkitpath=/usr/local/cuda-11.6 --override && rm -v cuda_11.6.2_510.47.03_linux.run

# Run as an unprivileged user (uid/gid 1000) so files written to the
# bind-mounted workspace are not owned by root.
RUN adduser -q --system --gid=1000 --uid=1000 --shell=/bin/bash user
USER user
WORKDIR /tmp/workspace

# Download the Whisper model at build time so it is baked into the image.
RUN python3 -c "import faster_whisper as whisper; whisper.utils.download_model('${WHISPER_MODEL}')"

ENTRYPOINT ["/usr/local/bin/voice2text"]
CMD ["# (noop)"]
requirements.txt
ffmpeg-python
numba==0.56.0
PyYAML
kizano
nvidia-cudnn
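
To run voice2text.py directly on the host instead of inside the container, the Dockerfile's install steps can be mirrored locally. A sketch, assuming python3-pip and ffmpeg are already installed (as they are in the image):

pip3 install nvidia-pyindex
pip3 install -r requirements.txt
pip3 install git+https://github.com/markizano/faster-whisper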
voice2text.py
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
'''
This is a CLI wrapper around the [faster-whisper](https://github.com/guillaumekln/faster-whisper) project.
It takes one or more video files and renders each one as a new video with burned-in subtitles.
It can optionally emit just the .srt file instead of rendering a new video.

Usage:
  voice2text [-h]
             [--model {tiny.en,tiny,base.en,base,small.en,small,medium.en,medium,large-v1,large-v2}]
             [--device {cpu,cuda,auto}]
             [--compute_type COMPUTE_TYPE]
             [--output_dir OUTPUT_DIR] [--output_srt]
             [--srt_only] [--verbose]
             [--task {transcribe,translate}]
             [--language {auto,af,am,ar,as,az,ba,be,bg,bn,bo,br,bs,ca,cs,cy,da,de,el,en,es,et,eu,fa,fi,fo,fr,gl,gu,ha,haw,he,hi,hr,ht,hu,hy,id,is,it,ja,jw,ka,kk,km,kn,ko,la,lb,ln,lo,lt,lv,mg,mi,mk,ml,mn,mr,ms,mt,my,ne,nl,nn,no,oc,pa,pl,ps,pt,ro,ru,sa,sd,si,sk,sl,sn,so,sq,sr,su,sv,sw,ta,te,tg,th,tk,tl,tr,tt,uk,ur,uz,vi,yi,yo,zh}]
             video [video ...]

positional arguments:
  video                 paths to video files to transcribe

optional arguments:
  -h, --help            show this help message and exit
  --model {tiny.en,tiny,base.en,base,small.en,small,medium.en,medium,large-v1,large-v2}
                        name of the Whisper model to use
                        (default: guillaumekln/faster-whisper-large-v2)
  --device {cpu,cuda,auto}
                        device on which to run inference (default: auto)
  --compute_type COMPUTE_TYPE
                        type to use for computation; see
                        https://opennmt.net/CTranslate2/quantization.html
                        (default: auto)
  --output_dir OUTPUT_DIR, -o OUTPUT_DIR
                        directory to save the outputs (default: .)
  --output_srt          whether to output the .srt file along with the video
                        files (default: False)
  --srt_only            only generate the .srt file; do not render the
                        subtitled video (default: False)
  --verbose             whether to print out progress and debug messages
                        (default: False)
  --task {transcribe,translate}
                        whether to perform X->X speech recognition
                        ('transcribe') or X->English translation ('translate')
                        (default: transcribe)
  --language {auto,af,am,ar,as,az,ba,be,bg,bn,bo,br,bs,ca,cs,cy,da,de,el,en,es,et,eu,fa,fi,fo,fr,gl,gu,ha,haw,he,hi,hr,ht,hu,hy,id,is,it,ja,jw,ka,kk,km,kn,ko,la,lb,ln,lo,lt,lv,mg,mi,mk,ml,mn,mr,ms,mt,my,ne,nl,nn,no,oc,pa,pl,ps,pt,ro,ru,sa,sd,si,sk,sl,sn,so,sq,sr,su,sv,sw,ta,te,tg,th,tk,tl,tr,tt,uk,ur,uz,vi,yi,yo,zh}
                        origin language of the video; pass 'auto' to detect it
                        automatically (default: en)
'''
import os
import io
import argparse
import warnings
import tempfile
from typing import TextIO

import ffmpeg
from kizano import getLogger
from faster_whisper import WhisperModel
from faster_whisper.utils import _MODELS as AVAILABLE_MODELS, format_timestamp

log = getLogger(__name__)

WHISPER_MODEL = os.getenv('WHISPER_MODEL', 'guillaumekln/faster-whisper-large-v2')
DEVICES = ['cpu', 'cuda', 'auto']
LANGS = ["auto","af","am","ar","as","az","ba","be","bg","bn","bo","br","bs","ca",
    "cs","cy","da","de","el","en","es","et","eu","fa","fi","fo","fr","gl","gu","ha",
    "haw","he","hi","hr","ht","hu","hy","id","is","it","ja","jw","ka","kk","km","kn",
    "ko","la","lb","ln","lo","lt","lv","mg","mi","mk","ml","mn","mr","ms","mt","my",
    "ne","nl","nn","no","oc","pa","pl","ps","pt","ro","ru","sa","sd","si","sk","sl",
    "sn","so","sq","sr","su","sv","sw","ta","te","tg","th","tk","tl","tr","tt","uk",
    "ur","uz","vi","yi","yo","zh"]

def main():
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter
    )
    parser.add_argument("video",
        nargs="+",
        type=str,
        help="paths to video files to transcribe"
    )
    parser.add_argument("--model",
        default=WHISPER_MODEL,
        type=str,
        choices=AVAILABLE_MODELS,
        help="name of the Whisper model to use"
    )
    parser.add_argument("--device",
        default='auto',
        type=str,
        choices=DEVICES,
        help="device on which to run inference"
    )
    parser.add_argument("--compute_type",
        default="auto",
        type=str,
        help="type to use for computation; see https://opennmt.net/CTranslate2/quantization.html"
    )
    parser.add_argument("--output_dir", "-o",
        type=str,
        default=".",
        help="directory to save the outputs"
    )
    parser.add_argument("--output_srt",
        action='store_true',
        default=False,
        help="whether to output the .srt file along with the video files"
    )
    parser.add_argument("--srt_only",
        action='store_true',
        default=False,
        help="only generate the .srt file; do not render the subtitled video"
    )
    parser.add_argument("--verbose",
        action='store_true',
        default=False,
        help="whether to print out progress and debug messages"
    )
    parser.add_argument("--task",
        type=str,
        default="transcribe",
        choices=["transcribe", "translate"],
        help="whether to perform X->X speech recognition ('transcribe') or X->English translation ('translate')"
    )
    parser.add_argument("--language",
        type=str,
        default="en",
        choices=LANGS,
        help="origin language of the video; pass 'auto' to detect it automatically"
    )
    args = vars(parser.parse_args())

    model_name: str = args.pop("model")
    device: str = args.pop("device")
    compute_type: str = args.pop("compute_type")
    output_dir: str = args.pop("output_dir")
    output_srt: bool = args.pop("output_srt")
    srt_only: bool = args.pop("srt_only")
    language: str = args.pop("language")
    verbose: bool = args.pop("verbose")
    args['word_timestamps'] = True
    os.makedirs(output_dir, exist_ok=True)

    if model_name.endswith(".en"):
        warnings.warn(
            f"{model_name} is an English-only model, forcing English detection.")
        args["language"] = "en"
    # Pass the requested language through; 'auto' leaves it unset so
    # faster-whisper detects the language itself.
    elif language != "auto":
        args["language"] = language

    model = WhisperModel(model_name, device=device, compute_type=compute_type)
    audios = get_audio(args.pop("video"))
    subtitles = get_subtitles(
        audios,
        output_srt or srt_only,
        output_dir,
        lambda audio_path: model.transcribe(audio_path, **args),
        verbose
    )
    log.info('Collected subtitles!')
    if srt_only:
        return

    for path, srt_path in subtitles.items():
        out_path = os.path.join(output_dir, f"{filename(path)}.mp4")
        log.info(f"Adding subtitles to {filename(path)}...")
        video = ffmpeg.input(path)
        audio = video.audio
        ffmpeg.concat(
            video.filter('subtitles', srt_path, force_style="OutlineColour=&H40000000,BorderStyle=3"), audio, v=1, a=1
        ).output(out_path).run(quiet=True, overwrite_output=True)
        log.info(f"Saved subtitled video to {os.path.abspath(out_path)}.")
    log.info('Complete!')

def get_audio(paths: list) -> dict:
    temp_dir = tempfile.gettempdir()
    audio_paths = {}
    for path in paths:
        log.info(f"Extracting audio from {filename(path)}...")
        output_path = os.path.join(temp_dir, f"{filename(path)}.wav")
        # Whisper expects 16kHz mono PCM audio.
        ffmpeg.input(path).output(
            output_path,
            acodec="pcm_s16le", ac=1, ar="16k"
        ).run(quiet=True, overwrite_output=True)
        audio_paths[path] = output_path
    log.info('Done extracting audio!')
    return audio_paths

def write_srt(transcript, srt_file: TextIO, verbose: bool = False) -> None:
    log.info('Writing out SRT file...')
    i = 1
    srt_tpl = '%d\n%s --> %s\n%s\n\n'
    for segment in transcript:
        # Example segment:
        # Segment(id=1, seek=2940, start=0.0, end=1.28,
        #   text=' Why should I face reality?',
        #   tokens=[50364, 1545, 820, 286, 1851, 4103, 30, 50440],
        #   temperature=0.0,
        #   avg_logprob=-0.22900006071560913,
        #   compression_ratio=1.6572327044025157,
        #   no_speech_prob=0.010524801909923553,
        #   words=[
        #     Word(start=0.0, end=0.24, word=' Why', probability=0.48081842064857483),
        #     Word(start=0.24, end=0.44, word=' should', probability=0.8993381261825562),
        #     Word(start=0.44, end=0.56, word=' I', probability=0.971877932548523),
        #     Word(start=0.56, end=0.82, word=' face', probability=0.8046457171440125),
        #     Word(start=0.82, end=1.28, word=' reality?', probability=0.869381308555603)
        #   ]
        # )
        log.debug(segment)
        buffer = []
        if verbose:
            log.info(f"{i}[{segment.start} --> {segment.end}]: {segment.text}")
        for word in segment.words:
            buffer.append(word)
            text = ''.join([x.word for x in buffer]).strip().replace('-->', '->')
            charlen = len(text)
            stime = format_timestamp(buffer[0].start, always_include_hours=True)
            etime = format_timestamp(buffer[-1].end, always_include_hours=True)
            # Flush a cue once it grows past 6 words or 32 characters.
            if len(buffer) > 6 or charlen > 32:
                srt_file.write(srt_tpl % (i, stime, etime, text))
                i += 1
                buffer = []
        # Flush whatever words remain at the end of the segment.
        if len(buffer) > 0:
            srt_file.write(srt_tpl % (i, stime, etime, text))
            i += 1
    srt_file.flush()
    log.info('Done writing SRT file!')

def filename(path) -> str:
    return os.path.splitext(os.path.basename(path))[0]

def get_subtitles(audio_paths: dict, output_srt: bool, output_dir: str, transcribe: callable, verbose: bool = False) -> dict:
    subtitles_path = {}
    for path, audio_path in audio_paths.items():
        srt_path = output_dir if output_srt else tempfile.gettempdir()
        srt_path = os.path.join(srt_path, f"{filename(path)}.srt")
        log.info(f"Generating subtitles for {filename(path)}... This might take a while.")
        warnings.filterwarnings("ignore")
        result, info = transcribe(audio_path)
        warnings.filterwarnings("default")
        log.debug(info)
        log.info("Subtitles generated!")
        with io.open(srt_path, "w", encoding="utf-8") as srt:
            write_srt(result, srt_file=srt, verbose=verbose)
        subtitles_path[path] = srt_path
    return subtitles_path

if __name__ == '__main__':
    main()
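
For reference, the example Segment in the write_srt() comments would come out as a single cue, since "Why should I face reality?" is under both the 6-word and 32-character flush thresholds. Assuming format_timestamp's default period decimal marker (strict SRT uses a comma, though most players accept either), the output would look like:

1
00:00:00.000 --> 00:00:01.280
Why should I face reality?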
voice2text (host wrapper script)
#!/bin/bash
# Run as `voice2text ./video.mp4` and it will produce a video.srt with captions/subtitles.
# Takes the rough edges off docker and makes it look like this script runs natively on the system.
. common.sh

[[ "$1" =~ ^--?h(elp)?$ ]] && {
    echo "Usage: $0 [options] <video file>"
    echo "Options:"
    echo "  -h, --help: Show this help message"
    echo "  -l, --language: Language of the video file (default: en)"
    echo "  -o, --output: Output .srt file (default: <video file>.srt)"
    exit 0
}

while [[ $# -gt 0 ]]; do
    case "$1" in
        -l|--language)
            LANGUAGE="$2"
            shift
            ;;
        -o|--output)
            OUTPUT="$2"
            shift
            ;;
        *)
            AUDIO_FILE="$(realpath "$1")"
            ;;
    esac
    shift
done

log_info_msg "Operating on ${AUDIO_FILE}"
docker run --name voice2text -it --rm \
  -v "$(dirname "$AUDIO_FILE"):/tmp/workspace" \
  -w "/tmp/workspace" \
  faster-whisper --language "${LANGUAGE:-en}" --verbose --srt_only "/tmp/workspace/$(basename "$AUDIO_FILE")"

# Honor -o/--output by moving the generated .srt into place.
if [[ -n "$OUTPUT" ]]; then
    mv -v "$(dirname "$AUDIO_FILE")/$(basename "${AUDIO_FILE%.*}").srt" "$OUTPUT"
fi
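
The wrapper sources common.sh, which is not included in this gist; the only function it relies on here is log_info_msg. A minimal stand-in, assuming nothing about the real file beyond that one function:

#!/bin/bash
# common.sh (stub): the wrapper above only needs log_info_msg.
log_info_msg() {
    echo "[INFO] $*" >&2
}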