Skip to content

Instantly share code, notes, and snippets.

@solace
Last active September 21, 2021 04:34
Show Gist options
  • Save solace/abbabb693f37d49ccbd9348178c5963b to your computer and use it in GitHub Desktop.
Save solace/abbabb693f37d49ccbd9348178c5963b to your computer and use it in GitHub Desktop.
Google Cloud Speech-to-Text SRT Generator
"""
Check out STEAM Powered (https://steampoweredshow.com/) where I have conversations
with women in STEAM to learn a bit about what they do and who they are.
https://steampoweredshow.com/learn-more/
"""
"""
Requirements
************
* Install `ffmpeg <https://www.ffmpeg.org/>`_
* Setup virtual env `<https://cloud.google.com/python/docs/setup>`_
* `pip install google-cloud-speech`
* `pip install pydub`
* `pip install srt`
You will also need API Credentials with Cloud Speech-to-Text API enabled.
Instructions here: `<https://cloud.google.com/speech-to-text/docs/quickstart-client-libraries>`_
**Disclaimer:** Be aware that this is technically not a free service. Check Google's pricing to confirm whether this service
is suitable for your requirements.
Notes
*****
* This has been tested with .mov, and that's it.
* Accuracy determined by the quality of the source video, and the service.
* Video length cannot exceed 60 seconds with this Google service. The `ffmpeg` command will only send the first 60 seconds.
YMMV
Usage
*****
usage: makesrt.py [-h] [-b BIN] -c CREDENTIALS [-l [LANG]] [-m [{command_and_search,phone_call,video,default}]] [-r LINEBREAK] [filename]
positional arguments:
filename video filename
optional arguments:
-h, --help show this help message and exit
-b BIN, --bin BIN caption length, defaults to 3 seconds
-c CREDENTIALS, --credentials CREDENTIALS
Google credentials JSON
-l [LANG], --lang [LANG]
language code, defaults to en-US. See https://cloud.google.com/speech-to-text/docs/languages
-m [{command_and_search,phone_call,video,default}], --model [{command_and_search,phone_call,video,default}]
transcription model, defaults to default. Note: video is a premium model.
-r LINEBREAK, --linebreak LINEBREAK
line break length, defaults to 37 characters
Acknowledgements
****************
Cobbled together from -
* `autosub <https://github.com/agermanidis/autosub/>`_
* `<https://github.com/darshan-majithiya/Generate-SRT-File-using-Google-Cloud-s-Speech-to-Text-API>`_
"""
import argparse, datetime, io, os, subprocess, sys, tempfile, textwrap
from google.cloud import speech
from pydub.utils import mediainfo
import srt
def _compose_caption(index, start, end, words, width):
    """Return one ``srt.Subtitle`` whose text is *words* wrapped to *width* columns."""
    text = textwrap.fill(" ".join(words), width=width)
    return srt.Subtitle(index, start, end, text)


def build_subs(response, caption_seconds=None, line_width=None):
    """Turn a Speech-to-Text recognition response into SRT text.

    Words are grouped into captions: a caption is closed as soon as the next
    word would end at or after ``caption start + caption_seconds``, and that
    word opens the following caption.

    Args:
        response: Recognition response; every alternative in
            ``response.results[*].alternatives`` is read for its ``words``,
            each of which supplies ``word``, ``start_time`` and ``end_time``
            (time offsets are compared against ``datetime.timedelta``).
        caption_seconds: Maximum caption duration in seconds. Defaults to the
            command-line ``args.bin`` value.
        line_width: Column at which caption text is wrapped. Defaults to the
            command-line ``args.linebreak`` value.

    Returns:
        The composed SRT document as a string (``srt.compose`` output).
    """
    # Fall back to the parsed CLI options so existing one-argument calls keep
    # working; coerce because argparse may hand these over as strings.
    if caption_seconds is None:
        caption_seconds = args.bin
    if line_width is None:
        line_width = args.linebreak
    window = datetime.timedelta(seconds=float(caption_seconds))
    width = int(line_width)

    transcript = []
    for result in response.results:
        for alternative in result.alternatives:
            if not alternative:
                continue
            words = list(alternative.words)
            if not words:
                # No word timings for this alternative, so nothing to caption.
                continue

            start_time = words[0].start_time
            caption = []
            previous_word = None
            for word in words:
                # Close the running caption once this word no longer fits in
                # its time window. A caption always keeps at least one word,
                # so an over-long first word is never dropped.
                if caption and word.end_time >= start_time + window:
                    transcript.append(
                        _compose_caption(
                            len(transcript) + 1,
                            start_time,
                            previous_word.end_time,
                            caption,
                            width,
                        )
                    )
                    start_time = word.start_time
                    caption = []
                caption.append(word.word)
                previous_word = word

            # Flush the trailing caption, wrapped the same way as the others.
            transcript.append(
                _compose_caption(
                    len(transcript) + 1,
                    start_time,
                    previous_word.end_time,
                    caption,
                    width,
                )
            )
    return srt.compose(transcript)
# Command-line interface. Numeric options declare an explicit ``type`` so that
# values supplied on the command line are converted like the defaults are;
# without it argparse passes them through as strings.
parser = argparse.ArgumentParser()
parser.add_argument(
    '-b',
    '--bin',
    help='caption length, defaults to 3 seconds',
    type=float,
    default=3,
)
parser.add_argument('-c', '--credentials', help='Google credentials JSON', required=True)
parser.add_argument(
    '-l',
    '--lang',
    help='language code, defaults to en-US. See https://cloud.google.com/speech-to-text/docs/languages',
    nargs='?',
    default='en-US',
)
parser.add_argument(
    '-m',
    '--model',
    help='transcription model, defaults to default. Note: video is a premium model.',
    nargs='?',
    choices=['command_and_search', 'phone_call', 'video', 'default'],
    default='default',
)
parser.add_argument(
    '-r',
    '--linebreak',
    help='line break length, defaults to 37 characters',
    type=int,
    default=37,
)
parser.add_argument('filename', help='video filename', nargs='?', type=str)
args = parser.parse_args()
# Expose the credentials file through the environment variable set below,
# before the speech client is created.
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = args.credentials

file_name = args.filename
if file_name is None:
    # The positional is declared optional (nargs='?'), but nothing below can
    # run without it: exit with a usage message instead of a traceback.
    parser.error("a video filename is required")

# Generate the temporary audio file, keeping the source's audio parameters.
video_data = mediainfo(file_name)
channels = video_data["channels"]
bit_rate = video_data["bit_rate"]
sample_rate = video_data["sample_rate"]

temp_file = tempfile.NamedTemporaryFile(suffix='.flac', delete=False)
# Only the reserved path is needed; release the handle so ffmpeg can write it.
temp_file.close()
try:
    # Argument list instead of a shell string: the filename is passed through
    # verbatim, so quotes or shell metacharacters in it cannot break or
    # inject into the command.
    command = [
        "ffmpeg",
        "-y",  # the temp file already exists; overwrite without prompting
        "-i", file_name,
        "-ss", "0",
        "-t", "60",  # only the first 60 seconds are sent for recognition
        "-b:a", str(bit_rate),
        "-ac", str(channels),
        "-ar", str(sample_rate),
        "-vn",
        temp_file.name,
    ]
    # check=True: stop here if extraction failed rather than uploading an
    # empty or partial file.
    subprocess.run(command, check=True)

    # Prepare the audio.
    with open(temp_file.name, "rb") as audio_file:
        content = audio_file.read()
finally:
    # delete=False means the temp file is ours to clean up.
    if os.path.exists(temp_file.name):
        os.remove(temp_file.name)

audio = speech.RecognitionAudio(content=content)
config = speech.RecognitionConfig(
    language_code=args.lang,
    sample_rate_hertz=int(sample_rate),
    encoding="FLAC",
    audio_channel_count=int(channels),
    enable_word_time_offsets=True,  # build_subs reads per-word timings
    model=args.model,
    enable_automatic_punctuation=True,
)
client = speech.SpeechClient()
response = client.recognize(request={"config": config, "audio": audio})

# Build and output the SRT next to the source file: <basename>.<lang>.srt
subs = build_subs(response)
basename, ext = os.path.splitext(file_name)
lang = args.lang.replace('-', '_')
# Explicit UTF-8 so non-ASCII transcripts do not depend on the platform's
# default encoding.
with open(f"{basename}.{lang}.srt", "w", encoding="utf-8") as f:
    f.write(subs)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment