@markizano
Last active August 25, 2023 15:57
Voice 2 Text Script based on Whisper
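
This gist bundles four pieces: the Dockerfile below, the requirements.txt it copies in, the voice2text.py CLI that becomes the container's entrypoint, and a small shell wrapper around `docker run` at the end. A minimal build sketch, assuming you tag the image faster-whisper so it matches the name the wrapper script expects:

    docker build -t faster-whisper .
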
FROM devuan/devuan:chimaera
ARG WHISPER_MODEL=guillaumekln/faster-whisper-large-v2
ENV LOG_LEVEL INFO
RUN apt-get -q update
RUN apt-get install -qqqy python3-pip ffmpeg git-core
RUN pip3 install nvidia-pyindex
COPY requirements.txt .
RUN pip3 install -r requirements.txt
RUN pip3 install git+https://github.com/markizano/faster-whisper
COPY voice2text.py /usr/local/bin/voice2text
RUN chmod +x /usr/local/bin/voice2text
# Uncomment if you have cuda drivers available.
#ENV LD_LIBRARY_PATH /usr/local/lib/python3.9/dist-packages/nvidia/cudnn/lib
#COPY cuda_11.6.2_510.47.03_linux.run .
#RUN ./cuda_11.6.2_510.47.03_linux.run --silent --toolkit --toolkitpath=/usr/local/cuda-11.6 --override && rm -v cuda_11.6.2_510.47.03_linux.run
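# A hedged sketch of a GPU-enabled run, assuming the lines above are uncommented,
# the image is tagged faster-whisper as in the wrapper script, and the host has the
# NVIDIA container toolkit installed (an assumption about your setup, not something
# this gist provides). video.mp4 stands in for your own input file:
#   docker run --rm --gpus all -v "$PWD:/tmp/workspace" faster-whisper --device cuda --srt_only video.mp4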
# Used to prevent having to change permissions.
RUN adduser -q --system --gid=1000 --uid=1000 --shell=/bin/bash user
USER user
WORKDIR /tmp/workspace
# Pre-download the Whisper model during the build so the container does not fetch it at run time.
RUN python3 -c "import faster_whisper as whisper; whisper.utils.download_model('${WHISPER_MODEL}')"
ENTRYPOINT ["/usr/local/bin/voice2text"]
CMD ["# (noop)"]
ffmpeg-python  # used by voice2text.py to extract audio and burn in subtitles
numba==0.56.0
PyYAML
kizano         # provides the getLogger() helper imported by voice2text.py
nvidia-cudnn   # cuDNN wheels, resolved via the nvidia-pyindex package installed in the Dockerfile
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
'''
This is a CLI wrapper around the [faster-whisper](https://github.com/guillaumekln/faster-whisper) project.
It takes video files and writes out copies with the subtitles burned in.
With --srt_only it instead creates just the .srt file and leaves the video alone.

Usage:
  voice2text [-h]
             [--model {tiny.en,tiny,base.en,base,small.en,small,medium.en,medium,large-v1,large-v2}]
             [--device {cpu,cuda,auto}]
             [--compute_type COMPUTE_TYPE]
             [--output_dir OUTPUT_DIR] [--output_srt]
             [--srt_only] [--verbose]
             [--task {transcribe,translate}]
             [--language {auto,af,am,ar,as,az,ba,be,bg,bn,bo,br,bs,ca,cs,cy,da,de,el,en,es,et,eu,fa,fi,fo,fr,gl,gu,ha,haw,he,hi,hr,ht,hu,hy,id,is,it,ja,jw,ka,kk,km,kn,ko,la,lb,ln,lo,lt,lv,mg,mi,mk,ml,mn,mr,ms,mt,my,ne,nl,nn,no,oc,pa,pl,ps,pt,ro,ru,sa,sd,si,sk,sl,sn,so,sq,sr,su,sv,sw,ta,te,tg,th,tk,tl,tr,tt,uk,ur,uz,vi,yi,yo,zh}]
             video [video ...]

positional arguments:
  video                 paths to video files to transcribe

optional arguments:
  -h, --help            show this help message and exit
  --model {tiny.en,tiny,base.en,base,small.en,small,medium.en,medium,large-v1,large-v2}
                        name of the Whisper model to use
                        (default: guillaumekln/faster-whisper-large-v2)
  --device {cpu,cuda,auto}
                        device to run inference on (default: auto)
  --compute_type COMPUTE_TYPE
                        Type to use for computation. See
                        https://opennmt.net/CTranslate2/quantization.html.
                        (default: auto)
  --output_dir OUTPUT_DIR, -o OUTPUT_DIR
                        directory to save the outputs (default: .)
  --output_srt          whether to output the .srt file along with the video
                        files (default: False)
  --srt_only            only generate the .srt file and do not create the
                        overlaid video (default: False)
  --verbose             whether to print out the progress and debug messages
                        (default: False)
  --task {transcribe,translate}
                        whether to perform X->X speech recognition
                        ('transcribe') or X->English translation ('translate')
                        (default: transcribe)
  --language {auto,af,am,ar,as,az,ba,be,bg,bn,bo,br,bs,ca,cs,cy,da,de,el,en,es,et,eu,fa,fi,fo,fr,gl,gu,ha,haw,he,hi,hr,ht,hu,hy,id,is,it,ja,jw,ka,kk,km,kn,ko,la,lb,ln,lo,lt,lv,mg,mi,mk,ml,mn,mr,ms,mt,my,ne,nl,nn,no,oc,pa,pl,ps,pt,ro,ru,sa,sd,si,sk,sl,sn,so,sq,sr,su,sv,sw,ta,te,tg,th,tk,tl,tr,tt,uk,ur,uz,vi,yi,yo,zh}
                        Origin language of the video; use 'auto' to detect it
                        automatically. (default: en)
'''
import os, io
import ffmpeg
import argparse
import warnings
import tempfile
from kizano import getLogger
from typing import Iterator, TextIO
from faster_whisper import WhisperModel
from faster_whisper.utils import _MODELS as AVAILABLE_MODELS, format_timestamp
log = getLogger(__name__)
WHISPER_MODEL = os.getenv('WHISPER_MODEL', 'guillaumekln/faster-whisper-large-v2')
DEVICES = ['cpu', 'cuda', 'auto']
LANGS = ["auto","af","am","ar","as","az","ba","be","bg","bn","bo","br","bs","ca",
"cs","cy","da","de","el","en","es","et","eu","fa","fi","fo","fr","gl","gu","ha",
"haw","he","hi","hr","ht","hu","hy","id","is","it","ja","jw","ka","kk","km","kn",
"ko","la","lb","ln","lo","lt","lv","mg","mi","mk","ml","mn","mr","ms","mt","my",
"ne","nl","nn","no","oc","pa","pl","ps","pt","ro","ru","sa","sd","si","sk","sl",
"sn","so","sq","sr","su","sv","sw","ta","te","tg","th","tk","tl","tr","tt","uk",
"ur","uz","vi","yi","yo","zh"]

def main():
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter
    )
    parser.add_argument("video",
        nargs="+",
        type=str,
        help="paths to video files to transcribe"
    )
    parser.add_argument("--model",
        default=WHISPER_MODEL,
        type=str,
        choices=AVAILABLE_MODELS,
        help="name of the Whisper model to use"
    )
    parser.add_argument("--device",
        default='auto',
        type=str,
        choices=DEVICES,
        help="device to run inference on"
    )
    parser.add_argument("--compute_type",
        default="auto",
        type=str,
        help="Type to use for computation. See https://opennmt.net/CTranslate2/quantization.html."
    )
    parser.add_argument("--output_dir", "-o",
        type=str,
        default=".",
        help="directory to save the outputs"
    )
    parser.add_argument("--output_srt",
        action='store_true',
        default=False,
        help="whether to output the .srt file along with the video files"
    )
    parser.add_argument("--srt_only",
        action='store_true',
        default=False,
        help="only generate the .srt file and do not create the overlaid video"
    )
    parser.add_argument("--verbose",
        action='store_true',
        default=False,
        help="whether to print out the progress and debug messages"
    )
    parser.add_argument("--task",
        type=str,
        default="transcribe",
        choices=["transcribe", "translate"],
        help="whether to perform X->X speech recognition ('transcribe') or X->English translation ('translate')"
    )
    parser.add_argument("--language",
        type=str,
        default="en",
        choices=LANGS,
        help="Origin language of the video; use 'auto' to detect it automatically."
    )
    args = parser.parse_args().__dict__
    model_name: str = args.pop("model")
    device: str = args.pop("device")
    compute_type: str = args.pop("compute_type")
    output_dir: str = args.pop("output_dir")
    output_srt: bool = args.pop("output_srt")
    srt_only: bool = args.pop("srt_only")
    language: str = args.pop("language")
    verbose: bool = args.pop("verbose")
    args['word_timestamps'] = True
    os.makedirs(output_dir, exist_ok=True)

    if model_name.endswith(".en"):
        warnings.warn(
            f"{model_name} is an English-only model, forcing English detection.")
        args["language"] = "en"
    # if translate task used and language argument is set, then use it
    elif language != "auto":
        args["language"] = language

    model = WhisperModel(model_name, device=device, compute_type=compute_type)
    audios = get_audio(args.pop("video"))
    subtitles = get_subtitles(
        audios,
        output_srt or srt_only,
        output_dir,
        lambda audio_path: model.transcribe(audio_path, **args),
        verbose
    )
    log.info('Collected subtitles!')
    if srt_only:
        return

    for path, srt_path in subtitles.items():
        out_path = os.path.join(output_dir, f"{filename(path)}.mp4")
        log.info(f"Adding subtitles to {filename(path)}...")
        video = ffmpeg.input(path)
        audio = video.audio
        ffmpeg.concat(
            video.filter('subtitles', srt_path, force_style="OutlineColour=&H40000000,BorderStyle=3"), audio, v=1, a=1
        ).output(out_path).run(quiet=True, overwrite_output=True)
        log.info(f"Saved subtitled video to {os.path.abspath(out_path)}.")
    log.info('Complete!')

def get_audio(paths: list):
    temp_dir = tempfile.gettempdir()
    audio_paths = {}
    for path in paths:
        log.info(f"Extracting audio from {filename(path)}...")
        output_path = os.path.join(temp_dir, f"{filename(path)}.wav")
        ffmpeg.input(path).output(
            output_path,
            acodec="pcm_s16le", ac=1, ar="16k"
        ).run(quiet=True, overwrite_output=True)
        audio_paths[path] = output_path
    log.info('Done extracting audio!')
    return audio_paths

def write_srt(transcript, srt_file: TextIO, verbose: bool = False) -> None:
    log.info('Writing out SRT file...')
    i = 1
    srt_tpl = '%d\n%s --> %s\n%s\n\n'
    for segment in transcript:
        # Segment(id=1, seek=2940, start=0.0, end=1.28,
        #   text=' Why should I face reality?',
        #   tokens=[50364, 1545, 820, 286, 1851, 4103, 30, 50440],
        #   temperature=0.0,
        #   avg_logprob=-0.22900006071560913,
        #   compression_ratio=1.6572327044025157,
        #   no_speech_prob=0.010524801909923553,
        #   words=[
        #     Word(start=0.0, end=0.24, word=' Why', probability=0.48081842064857483),
        #     Word(start=0.24, end=0.44, word=' should', probability=0.8993381261825562),
        #     Word(start=0.44, end=0.56, word=' I', probability=0.971877932548523),
        #     Word(start=0.56, end=0.82, word=' face', probability=0.8046457171440125),
        #     Word(start=0.82, end=1.28, word=' reality?', probability=0.869381308555603)
        #   ]
        # )
        log.debug(segment)
        buffer = []
        if verbose:
            log.info(f"{i}[{segment.start} --> {segment.end}]: {segment.text}")
        for word in segment.words:
            buffer.append(word)
            text = ''.join([ x.word for x in buffer ]).strip().replace('-->', '->')
            charlen = len(text)
            stime = format_timestamp(buffer[0].start, always_include_hours=True)
            etime = format_timestamp(buffer[-1].end, always_include_hours=True)
            # Flush a subtitle block once it grows past 6 words or 32 characters.
            if len(buffer) > 6 or charlen > 32:
                srt_file.write( srt_tpl % ( i, stime, etime, text ) )
                i += 1
                buffer = []
        # Flush whatever words are left over at the end of the segment.
        if len(buffer) > 0:
            srt_file.write( srt_tpl % ( i, stime, etime, text ) )
            i += 1
    srt_file.flush()
    log.info('Done writing SRT file!')

def filename(path) -> str:
    return os.path.splitext(os.path.basename(path))[0]

def get_subtitles(audio_paths: dict, output_srt: bool, output_dir: str, transcribe: callable, verbose: bool = False):
    subtitles_path = {}
    for path, audio_path in audio_paths.items():
        srt_path = output_dir if output_srt else tempfile.gettempdir()
        srt_path = os.path.join(srt_path, f"{filename(path)}.srt")
        log.info(f"Generating subtitles for {filename(path)}... This might take a while.")
        warnings.filterwarnings("ignore")
        result, info = transcribe(audio_path)
        warnings.filterwarnings("default")
        log.debug(info)
        log.info("Subtitles generated!")
        with io.open(srt_path, "w", encoding="utf-8") as srt:
            write_srt(result, srt_file=srt, verbose=verbose)
        subtitles_path[path] = srt_path
    return subtitles_path

if __name__ == '__main__':
    main()
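
The script can also be run outside the container. A sketch mirroring the Dockerfile's install sequence, assuming the patched faster-whisper fork installs cleanly on your system and that lecture.mp4 stands in for your own input file:

    pip3 install nvidia-pyindex
    pip3 install -r requirements.txt
    pip3 install git+https://github.com/markizano/faster-whisper
    python3 voice2text.py --srt_only --language en lecture.mp4

With --srt_only and the default --output_dir of ".", this writes lecture.srt to the current directory and skips rendering the subtitled video.
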
#!/bin/bash
# Run just as `voice2text ./video.mp4` and it'll produce a video.srt with captions/subtitles.
# Takes the rough edges off docker and makes it look like this script runs natively in the system.
. common.sh  # provides log_info_msg
grep -qiP "\x2D-?h(?:elp)?" <<<"$1" && {
    echo "Usage: $0 [options] <audio file>"
    echo "Options:"
    echo "  -h, --help: Show this help message"
    echo "  -l, --language: Language of the audio file (default: en)"
    echo "  -o, --output: Output file (default: <audio file>.srt)"
    exit 0
}
while [[ $# -gt 0 ]]; do
    case "$1" in
        -l|--language)
            LANGUAGE="$2"
            shift
            ;;
        -o|--output)
            OUTPUT="$2"
            shift
            ;;
        *)
            AUDIO_FILE="$(realpath "$1")"
            ;;
    esac
    shift
done
log_info_msg "Operating on ${AUDIO_FILE}"
docker run --name voice2text -it --rm \
    -v "$(dirname "$AUDIO_FILE"):/tmp/workspace" \
    -w "/tmp/workspace" \
    faster-whisper --language "${LANGUAGE:-en}" --verbose --srt_only "/tmp/workspace/$(basename "$AUDIO_FILE")"
# Honor -o/--output by moving the generated .srt to the requested path.
if [ -n "$OUTPUT" ]; then
    mv -v "${AUDIO_FILE%.*}.srt" "$OUTPUT"
fi
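
A usage sketch for the wrapper above, assuming the image was built as faster-whisper and interview.mp4 stands in for your own file. Because the video's directory is mounted at /tmp/workspace and the script's default --output_dir is ".", the transcript lands next to the input:

    voice2text -l en ./interview.mp4
    # -> writes ./interview.srt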