Last active
September 21, 2021 04:34
-
-
Save solace/abbabb693f37d49ccbd9348178c5963b to your computer and use it in GitHub Desktop.
Google Cloud Speech-to-Text SRT Generator
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
""" | |
Check out STEAM Powered (https://steampoweredshow.com/) where I have conversations | |
with women in STEAM to learn a bit about what they do and who they are. | |
https://steampoweredshow.com/learn-more/ | |
""" | |
""" | |
Requirements | |
************ | |
* Install `ffmpeg <https://www.ffmpeg.org/>`_ | |
* Setup virtual env `<https://cloud.google.com/python/docs/setup>`_ | |
* `pip install google-cloud-speech` | |
* `pip install pydub` | |
* `pip install srt` | |
You will also need API Credentials with Cloud Speech-to-Text API enabled. | |
Instructions here: `<https://cloud.google.com/speech-to-text/docs/quickstart-client-libraries>`_ | |
**Disclaimer:** Be aware that this is technically not a free service. Check Google's pricing to confirm whether this service | |
is suitable for your requirements. | |
Notes | |
***** | |
* This has been tested with .mov, and that's it. | |
* Accuracy determined by the quality of the source video, and the service. | |
* Video length cannot exceed 60 seconds with this Google service. The `ffmpeg` command will only send the first 60 seconds. | |
YMMV | |
Usage | |
***** | |
usage: makesrt.py [-h] [-b BIN] -c CREDENTIALS [-l [LANG]] [-m [{command_and_search,phone_call,video,default}]] [-r LINEBREAK] [filename]
positional arguments: | |
filename video filename | |
optional arguments: | |
-h, --help show this help message and exit | |
-b BIN, --bin BIN caption length, defaults to 3 seconds | |
-c CREDENTIALS, --credentials CREDENTIALS | |
Google credentials JSON | |
-l [LANG], --lang [LANG] | |
language code, defaults to en-US. See https://cloud.google.com/speech-to-text/docs/languages | |
-m [{command_and_search,phone_call,video,default}], --model [{command_and_search,phone_call,video,default}] | |
transcription model, defaults to default. Note: video is a premium model. | |
-r LINEBREAK, --linebreak LINEBREAK
line break length, defaults to 37 characters
Acknowledgements | |
**************** | |
Cobbled together from - | |
* `autosub <https://github.com/agermanidis/autosub/>`_ | |
* `<https://github.com/darshan-majithiya/Generate-SRT-File-using-Google-Cloud-s-Speech-to-Text-API>`_ | |
""" | |
import argparse, datetime, io, os, subprocess, sys, tempfile, textwrap | |
from google.cloud import speech | |
from pydub.utils import mediainfo | |
import srt | |
def _wrap_caption(words, line_width):
    """Join *words* with single spaces and wrap the text at *line_width* columns."""
    return '\n'.join(textwrap.wrap(' '.join(words), width=line_width))


def build_subs(response, bin_seconds=None, line_width=None):
    """Convert a Speech-to-Text recognition response into SRT text.

    Words are grouped into captions: a word joins the current caption while
    its end time falls before the caption start plus the bin length;
    otherwise the current caption is emitted and a new one starts at that word.

    Args:
        response: recognition response; each entry of ``response.results``
            has ``alternatives`` whose ``words`` carry ``word``,
            ``start_time`` and ``end_time`` (the times are used as
            ``datetime.timedelta`` values).
        bin_seconds: caption length in seconds. Defaults to the module-level
            ``args.bin`` when omitted.
        line_width: maximum characters per caption line. Defaults to the
            module-level ``args.linebreak`` when omitted.

    Returns:
        The composed SRT document as a string (``srt.compose`` output).
    """
    if bin_seconds is None:
        bin_seconds = args.bin
    if line_width is None:
        line_width = args.linebreak
    # Command-line values may arrive as strings; normalise them once here.
    bin_length = datetime.timedelta(seconds=float(bin_seconds))
    line_width = int(line_width)

    transcript = []
    for result in response.results:
        for alternative in result.alternatives:
            # Nothing to caption without word-level timing.
            if not alternative or not alternative.words:
                continue

            first_word = alternative.words[0]
            # ``.seconds`` is 0 for any offset under one second, in which
            # case the first caption is anchored at the very start.
            if first_word.start_time.seconds:
                start_time = first_word.start_time
            else:
                start_time = datetime.timedelta(seconds=0, microseconds=0)

            pending = []
            previous_word = None
            for word in alternative.words:
                # Only flush when there is something to flush: a first word
                # that alone overruns the bin must open the caption rather
                # than be dropped.
                if pending and word.end_time >= start_time + bin_length:
                    transcript.append(
                        srt.Subtitle(
                            len(transcript) + 1,
                            start_time,
                            previous_word.end_time,
                            _wrap_caption(pending, line_width),
                        )
                    )
                    start_time = word.start_time
                    pending = []
                pending.append(word.word)
                previous_word = word

            # Emit the trailing caption, wrapped like every other one.
            transcript.append(
                srt.Subtitle(
                    len(transcript) + 1,
                    start_time,
                    previous_word.end_time,
                    _wrap_caption(pending, line_width),
                )
            )
    return srt.compose(transcript)
# Command-line interface. Numeric options are converted by argparse so the
# rest of the script receives numbers, not strings.
parser = argparse.ArgumentParser()
parser.add_argument('-b', '--bin', help='caption length, defaults to 3 seconds', type=float, default=3)
parser.add_argument('-c', '--credentials', help='Google credentials JSON', required=True)
parser.add_argument(
    '-l',
    '--lang',
    help='language code, defaults to en-US. See https://cloud.google.com/speech-to-text/docs/languages',
    nargs='?',
    const='en-US',  # a bare "-l" falls back to the default instead of None
    default='en-US'
)
parser.add_argument(
    '-m',
    '--model',
    help='transcription model, defaults to default. Note: video is a premium model.',
    nargs='?',
    choices=['command_and_search', 'phone_call', 'video', 'default'],
    const='default',  # a bare "-m" falls back to the default instead of None
    default='default'
)
parser.add_argument('-r', '--linebreak', help='line break length, defaults to 37 characters', type=int, default=37)
parser.add_argument('filename', help='video filename', nargs='?', type=str)
args = parser.parse_args()
# The positional is declared optional, but nothing below can run without it;
# report that as a usage error instead of failing later on a None filename.
if args.filename is None:
    parser.error('a video filename is required')
# The Google client library reads the credentials path from this variable.
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = args.credentials
file_name = args.filename

# Generate the temporary audio file, reusing the source's audio parameters.
video_data = mediainfo(file_name)
channels = video_data["channels"]
bit_rate = video_data["bit_rate"]
sample_rate = video_data["sample_rate"]

temp_file = tempfile.NamedTemporaryFile(suffix='.flac', delete=False)
# Only the reserved name is needed; close our handle so ffmpeg can write to it.
temp_file.close()
try:
    # Argument list (no shell) so unusual characters in the filename are
    # passed through verbatim. "-y" is required because the temporary file
    # already exists and ffmpeg would otherwise ask before overwriting it.
    command = [
        "ffmpeg", "-y",
        "-i", file_name,
        "-ss", "0",
        "-t", "60",
        "-b:a", str(bit_rate),
        "-ac", str(channels),
        "-ar", str(sample_rate),
        "-vn",
        temp_file.name,
    ]
    # Stop here if the conversion fails rather than uploading an empty file.
    subprocess.run(command, check=True)

    # Prepare the audio
    with io.open(temp_file.name, "rb") as audio_file:
        content = audio_file.read()
finally:
    # The temporary file was created with delete=False; remove it ourselves.
    os.remove(temp_file.name)

audio = speech.RecognitionAudio(content=content)
config = speech.RecognitionConfig(
    language_code=args.lang,
    sample_rate_hertz=int(sample_rate),
    encoding="FLAC",
    audio_channel_count=int(channels),
    enable_word_time_offsets=True,
    model=args.model,
    enable_automatic_punctuation=True
)
client = speech.SpeechClient()
response = client.recognize(request={"config": config, "audio": audio})

# Build and output SRT next to the source file as <basename>.<lang>.srt
subs = build_subs(response)
basename, ext = os.path.splitext(file_name)
lang = args.lang.replace('-', '_')
# Explicit UTF-8 so non-ASCII transcripts are written correctly regardless
# of the platform's default encoding.
with open("{basename}.{lang}.srt".format(basename=basename, lang=lang), "w", encoding="utf-8") as f:
    f.write(subs)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment