Voice 2 Text Script based on Whisper
Dockerfile
FROM devuan/devuan:chimaera

ARG WHISPER_MODEL=guillaumekln/faster-whisper-large-v2
ENV LOG_LEVEL=INFO

RUN apt-get -q update
RUN apt-get install -qqy python3-pip ffmpeg git-core
RUN pip3 install nvidia-pyindex
COPY requirements.txt .
RUN pip3 install -r requirements.txt
RUN pip3 install git+https://github.com/markizano/faster-whisper

COPY voice2text.py /usr/local/bin/voice2text
RUN chmod +x /usr/local/bin/voice2text

# Uncomment if you have CUDA drivers available.
#ENV LD_LIBRARY_PATH /usr/local/lib/python3.9/dist-packages/nvidia/cudnn/lib
#COPY cuda_11.6.2_510.47.03_linux.run .
#RUN ./cuda_11.6.2_510.47.03_linux.run --silent --toolkit --toolkitpath=/usr/local/cuda-11.6 --override && rm -v cuda_11.6.2_510.47.03_linux.run

# Run as an unprivileged user (uid/gid 1000) so files written to the
# bind-mounted workspace are not owned by root.
RUN adduser -q --system --gid=1000 --uid=1000 --shell=/bin/bash user
USER user
WORKDIR /tmp/workspace

# Download the Whisper model at build time so it is baked into the image.
RUN python3 -c "import faster_whisper as whisper; whisper.utils.download_model('${WHISPER_MODEL}')"

ENTRYPOINT ["/usr/local/bin/voice2text"]
CMD ["# (noop)"]
requirements.txt
ffmpeg-python
numba==0.56.0
PyYAML
kizano
nvidia-cudnn
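
To run voice2text.py directly on the host instead of inside the container, the Dockerfile's install steps can be mirrored locally. A sketch, assuming python3-pip and ffmpeg are already installed (as they are in the image):

pip3 install nvidia-pyindex
pip3 install -r requirements.txt
pip3 install git+https://github.com/markizano/faster-whisper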
voice2text.py
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
'''
This is a CLI wrapper around the [faster-whisper](https://github.com/guillaumekln/faster-whisper) project.
It takes one or more video files and renders each one as a new video with burned-in subtitles.
It can optionally emit just the .srt file instead of rendering a new video.

Usage:
  voice2text [-h]
             [--model {tiny.en,tiny,base.en,base,small.en,small,medium.en,medium,large-v1,large-v2}]
             [--device {cpu,cuda,auto}]
             [--compute_type COMPUTE_TYPE]
             [--output_dir OUTPUT_DIR] [--output_srt]
             [--srt_only] [--verbose]
             [--task {transcribe,translate}]
             [--language {auto,af,am,ar,as,az,ba,be,bg,bn,bo,br,bs,ca,cs,cy,da,de,el,en,es,et,eu,fa,fi,fo,fr,gl,gu,ha,haw,he,hi,hr,ht,hu,hy,id,is,it,ja,jw,ka,kk,km,kn,ko,la,lb,ln,lo,lt,lv,mg,mi,mk,ml,mn,mr,ms,mt,my,ne,nl,nn,no,oc,pa,pl,ps,pt,ro,ru,sa,sd,si,sk,sl,sn,so,sq,sr,su,sv,sw,ta,te,tg,th,tk,tl,tr,tt,uk,ur,uz,vi,yi,yo,zh}]
             video [video ...]

positional arguments:
  video                 paths to video files to transcribe

optional arguments:
  -h, --help            show this help message and exit
  --model {tiny.en,tiny,base.en,base,small.en,small,medium.en,medium,large-v1,large-v2}
                        name of the Whisper model to use
                        (default: guillaumekln/faster-whisper-large-v2)
  --device {cpu,cuda,auto}
                        device on which to run inference (default: auto)
  --compute_type COMPUTE_TYPE
                        type to use for computation; see
                        https://opennmt.net/CTranslate2/quantization.html
                        (default: auto)
  --output_dir OUTPUT_DIR, -o OUTPUT_DIR
                        directory to save the outputs (default: .)
  --output_srt          whether to output the .srt file along with the video
                        files (default: False)
  --srt_only            only generate the .srt file; do not render the
                        subtitled video (default: False)
  --verbose             whether to print out progress and debug messages
                        (default: False)
  --task {transcribe,translate}
                        whether to perform X->X speech recognition
                        ('transcribe') or X->English translation ('translate')
                        (default: transcribe)
  --language {auto,af,am,ar,as,az,ba,be,bg,bn,bo,br,bs,ca,cs,cy,da,de,el,en,es,et,eu,fa,fi,fo,fr,gl,gu,ha,haw,he,hi,hr,ht,hu,hy,id,is,it,ja,jw,ka,kk,km,kn,ko,la,lb,ln,lo,lt,lv,mg,mi,mk,ml,mn,mr,ms,mt,my,ne,nl,nn,no,oc,pa,pl,ps,pt,ro,ru,sa,sd,si,sk,sl,sn,so,sq,sr,su,sv,sw,ta,te,tg,th,tk,tl,tr,tt,uk,ur,uz,vi,yi,yo,zh}
                        origin language of the video; pass 'auto' to detect it
                        automatically (default: en)
'''
import os
import io
import argparse
import warnings
import tempfile
from typing import TextIO

import ffmpeg
from kizano import getLogger
from faster_whisper import WhisperModel
from faster_whisper.utils import _MODELS as AVAILABLE_MODELS, format_timestamp

log = getLogger(__name__)

WHISPER_MODEL = os.getenv('WHISPER_MODEL', 'guillaumekln/faster-whisper-large-v2')
DEVICES = ['cpu', 'cuda', 'auto']
LANGS = ["auto","af","am","ar","as","az","ba","be","bg","bn","bo","br","bs","ca",
    "cs","cy","da","de","el","en","es","et","eu","fa","fi","fo","fr","gl","gu","ha",
    "haw","he","hi","hr","ht","hu","hy","id","is","it","ja","jw","ka","kk","km","kn",
    "ko","la","lb","ln","lo","lt","lv","mg","mi","mk","ml","mn","mr","ms","mt","my",
    "ne","nl","nn","no","oc","pa","pl","ps","pt","ro","ru","sa","sd","si","sk","sl",
    "sn","so","sq","sr","su","sv","sw","ta","te","tg","th","tk","tl","tr","tt","uk",
    "ur","uz","vi","yi","yo","zh"]

def main():
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter
    )
    parser.add_argument("video",
        nargs="+",
        type=str,
        help="paths to video files to transcribe"
    )
    parser.add_argument("--model",
        default=WHISPER_MODEL,
        type=str,
        choices=AVAILABLE_MODELS,
        help="name of the Whisper model to use"
    )
    parser.add_argument("--device",
        default='auto',
        type=str,
        choices=DEVICES,
        help="device on which to run inference"
    )
    parser.add_argument("--compute_type",
        default="auto",
        type=str,
        help="type to use for computation; see https://opennmt.net/CTranslate2/quantization.html"
    )
    parser.add_argument("--output_dir", "-o",
        type=str,
        default=".",
        help="directory to save the outputs"
    )
    parser.add_argument("--output_srt",
        action='store_true',
        default=False,
        help="whether to output the .srt file along with the video files"
    )
    parser.add_argument("--srt_only",
        action='store_true',
        default=False,
        help="only generate the .srt file; do not render the subtitled video"
    )
    parser.add_argument("--verbose",
        action='store_true',
        default=False,
        help="whether to print out progress and debug messages"
    )
    parser.add_argument("--task",
        type=str,
        default="transcribe",
        choices=["transcribe", "translate"],
        help="whether to perform X->X speech recognition ('transcribe') or X->English translation ('translate')"
    )
    parser.add_argument("--language",
        type=str,
        default="en",
        choices=LANGS,
        help="origin language of the video; pass 'auto' to detect it automatically"
    )
    args = vars(parser.parse_args())

    model_name: str = args.pop("model")
    device: str = args.pop("device")
    compute_type: str = args.pop("compute_type")
    output_dir: str = args.pop("output_dir")
    output_srt: bool = args.pop("output_srt")
    srt_only: bool = args.pop("srt_only")
    language: str = args.pop("language")
    verbose: bool = args.pop("verbose")
    args['word_timestamps'] = True
    os.makedirs(output_dir, exist_ok=True)

    if model_name.endswith(".en"):
        warnings.warn(
            f"{model_name} is an English-only model, forcing English detection.")
        args["language"] = "en"
    # Pass the requested language through; 'auto' leaves it unset so
    # faster-whisper detects the language itself.
    elif language != "auto":
        args["language"] = language

    model = WhisperModel(model_name, device=device, compute_type=compute_type)
    audios = get_audio(args.pop("video"))
    subtitles = get_subtitles(
        audios,
        output_srt or srt_only,
        output_dir,
        lambda audio_path: model.transcribe(audio_path, **args),
        verbose
    )
    log.info('Collected subtitles!')
    if srt_only:
        return

    for path, srt_path in subtitles.items():
        out_path = os.path.join(output_dir, f"{filename(path)}.mp4")
        log.info(f"Adding subtitles to {filename(path)}...")
        video = ffmpeg.input(path)
        audio = video.audio
        ffmpeg.concat(
            video.filter('subtitles', srt_path, force_style="OutlineColour=&H40000000,BorderStyle=3"), audio, v=1, a=1
        ).output(out_path).run(quiet=True, overwrite_output=True)
        log.info(f"Saved subtitled video to {os.path.abspath(out_path)}.")
    log.info('Complete!')

def get_audio(paths: list) -> dict:
    temp_dir = tempfile.gettempdir()
    audio_paths = {}
    for path in paths:
        log.info(f"Extracting audio from {filename(path)}...")
        output_path = os.path.join(temp_dir, f"{filename(path)}.wav")
        # Whisper expects 16kHz mono PCM audio.
        ffmpeg.input(path).output(
            output_path,
            acodec="pcm_s16le", ac=1, ar="16k"
        ).run(quiet=True, overwrite_output=True)
        audio_paths[path] = output_path
    log.info('Done extracting audio!')
    return audio_paths

def write_srt(transcript, srt_file: TextIO, verbose: bool = False) -> None:
    log.info('Writing out SRT file...')
    i = 1
    srt_tpl = '%d\n%s --> %s\n%s\n\n'
    for segment in transcript:
        # Example segment:
        # Segment(id=1, seek=2940, start=0.0, end=1.28,
        #   text=' Why should I face reality?',
        #   tokens=[50364, 1545, 820, 286, 1851, 4103, 30, 50440],
        #   temperature=0.0,
        #   avg_logprob=-0.22900006071560913,
        #   compression_ratio=1.6572327044025157,
        #   no_speech_prob=0.010524801909923553,
        #   words=[
        #     Word(start=0.0, end=0.24, word=' Why', probability=0.48081842064857483),
        #     Word(start=0.24, end=0.44, word=' should', probability=0.8993381261825562),
        #     Word(start=0.44, end=0.56, word=' I', probability=0.971877932548523),
        #     Word(start=0.56, end=0.82, word=' face', probability=0.8046457171440125),
        #     Word(start=0.82, end=1.28, word=' reality?', probability=0.869381308555603)
        #   ]
        # )
        log.debug(segment)
        buffer = []
        if verbose:
            log.info(f"{i}[{segment.start} --> {segment.end}]: {segment.text}")
        for word in segment.words:
            buffer.append(word)
            text = ''.join([x.word for x in buffer]).strip().replace('-->', '->')
            charlen = len(text)
            stime = format_timestamp(buffer[0].start, always_include_hours=True)
            etime = format_timestamp(buffer[-1].end, always_include_hours=True)
            # Flush a cue once it grows past 6 words or 32 characters.
            if len(buffer) > 6 or charlen > 32:
                srt_file.write(srt_tpl % (i, stime, etime, text))
                i += 1
                buffer = []
        # Flush whatever words remain at the end of the segment.
        if len(buffer) > 0:
            srt_file.write(srt_tpl % (i, stime, etime, text))
            i += 1
    srt_file.flush()
    log.info('Done writing SRT file!')

def filename(path) -> str:
    return os.path.splitext(os.path.basename(path))[0]

def get_subtitles(audio_paths: dict, output_srt: bool, output_dir: str, transcribe: callable, verbose: bool = False) -> dict:
    subtitles_path = {}
    for path, audio_path in audio_paths.items():
        srt_path = output_dir if output_srt else tempfile.gettempdir()
        srt_path = os.path.join(srt_path, f"{filename(path)}.srt")
        log.info(f"Generating subtitles for {filename(path)}... This might take a while.")
        warnings.filterwarnings("ignore")
        result, info = transcribe(audio_path)
        warnings.filterwarnings("default")
        log.debug(info)
        log.info("Subtitles generated!")
        with io.open(srt_path, "w", encoding="utf-8") as srt:
            write_srt(result, srt_file=srt, verbose=verbose)
        subtitles_path[path] = srt_path
    return subtitles_path

if __name__ == '__main__':
    main()
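
For reference, the example Segment in the write_srt() comments would come out as a single cue, since "Why should I face reality?" is under both the 6-word and 32-character flush thresholds. Assuming format_timestamp's default period decimal marker (strict SRT uses a comma, though most players accept either), the output would look like:

1
00:00:00.000 --> 00:00:01.280
Why should I face reality?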
voice2text (host wrapper script)
#!/bin/bash
# Run as `voice2text ./video.mp4` and it will produce a video.srt with captions/subtitles.
# Takes the rough edges off docker and makes it look like this script runs natively on the system.
. common.sh

[[ "$1" =~ ^--?h(elp)?$ ]] && {
    echo "Usage: $0 [options] <video file>"
    echo "Options:"
    echo "  -h, --help: Show this help message"
    echo "  -l, --language: Language of the video file (default: en)"
    echo "  -o, --output: Output .srt file (default: <video file>.srt)"
    exit 0
}

while [[ $# -gt 0 ]]; do
    case "$1" in
        -l|--language)
            LANGUAGE="$2"
            shift
            ;;
        -o|--output)
            OUTPUT="$2"
            shift
            ;;
        *)
            AUDIO_FILE="$(realpath "$1")"
            ;;
    esac
    shift
done

log_info_msg "Operating on ${AUDIO_FILE}"
docker run --name voice2text -it --rm \
  -v "$(dirname "$AUDIO_FILE"):/tmp/workspace" \
  -w "/tmp/workspace" \
  faster-whisper --language "${LANGUAGE:-en}" --verbose --srt_only "/tmp/workspace/$(basename "$AUDIO_FILE")"

# Honor -o/--output by moving the generated .srt into place.
if [[ -n "$OUTPUT" ]]; then
    mv -v "$(dirname "$AUDIO_FILE")/$(basename "${AUDIO_FILE%.*}").srt" "$OUTPUT"
fi
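
The wrapper sources common.sh, which is not included in this gist; the only function it relies on here is log_info_msg. A minimal stand-in, assuming nothing about the real file beyond that one function:

#!/bin/bash
# common.sh (stub): the wrapper above only needs log_info_msg.
log_info_msg() {
    echo "[INFO] $*" >&2
}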