Google Cloud Speech-to-Text SRT Generator
* Install `ffmpeg <https://ffmpeg.org/download.html>`_
* Set up a virtual environment: `venv <https://docs.python.org/3/library/venv.html>`_
* `pip install google-cloud-speech`
* `pip install pydub`
* `pip install srt`
You will also need API Credentials with Cloud Speech-to-Text API enabled.
Instructions here: `Cloud Speech-to-Text documentation <https://cloud.google.com/speech-to-text/docs>`_
**Disclaimer:** Be aware that this is technically not a free service. Check Google's pricing to confirm whether this service
is suitable for your requirements.
* This has been tested with .mov, and that's it.
* Accuracy is determined by the quality of the source video and by the service.
* Video length cannot exceed 60 seconds with this Google service. The `ffmpeg` command will only send the first 60 seconds.
usage: [-h] [-b BIN] -c CREDENTIALS [-l [LANG]] [-m [{command_and_search,phone_call,video,default}]] [-r LINEBREAK] [filename]
positional arguments:
filename video filename
optional arguments:
-h, --help show this help message and exit
-b BIN, --bin BIN caption length, defaults to 3 seconds
-c CREDENTIALS, --credentials CREDENTIALS
Google credentials JSON
-l [LANG], --lang [LANG]
language code, defaults to en-US. See https://cloud.google.com/speech-to-text/docs/languages
-m [{command_and_search,phone_call,video,default}], --model [{command_and_search,phone_call,video,default}]
transcription model, defaults to default. Note: video is a premium model.
-r LINEBREAK, --linebreak LINEBREAK
line break length, defaults to 37 characters
Cobbled together from -
* `autosub <https://github.com/agermanidis/autosub>`_
* `<>`_
import argparse
import datetime
import io
import os
import subprocess
import sys
import tempfile
import textwrap

from google.cloud import speech
from pydub.utils import mediainfo
import srt
def _wrap_caption(words, width):
    """Join caption words with single spaces and wrap the text to `width` columns."""
    return '\n'.join(textwrap.wrap(' '.join(words), width=width))


def build_subs(response, bin_seconds=None, line_width=None):
    """Build SRT-formatted text from a speech recognition response.

    The words of every alternative are grouped into captions. A word joins
    the current caption while its end time falls before the caption start
    plus the bin length; otherwise the current caption is emitted (ending at
    the previous word's end time) and the word opens a new caption.

    Args:
        response: object exposing ``results``; each result exposes
            ``alternatives`` and each alternative exposes ``words`` whose
            items carry ``word``, ``start_time`` and ``end_time``. The time
            values are compared with and added to ``datetime.timedelta``
            objects, so they are assumed to be timedeltas -- TODO confirm
            against the client library version in use.
        bin_seconds: maximum caption length in seconds. When omitted, the
            module-level ``args.bin`` is used.
        line_width: maximum characters per caption line. When omitted, the
            module-level ``args.linebreak`` is used.

    Returns:
        The result of ``srt.compose`` over the generated subtitles.
    """
    if bin_seconds is None:
        bin_seconds = args.bin
    if line_width is None:
        line_width = args.linebreak

    # Command-line values arrive as strings unless the option declares a
    # type, so coerce here rather than trusting the caller.
    bin_length = datetime.timedelta(seconds=float(bin_seconds))
    line_width = int(line_width)

    transcript = []

    def emit(start, end, caption_words):
        # Subtitle indices are 1-based and continue across alternatives.
        transcript.append(
            srt.Subtitle(
                len(transcript) + 1,
                start,
                end,
                _wrap_caption(caption_words, line_width),
            )
        )

    for result in response.results:
        for alternative in result.alternatives:
            if not alternative:
                continue

            words = list(alternative.words)
            if not words:
                # Nothing to time; skip instead of relying on IndexError.
                continue

            # Initialise timing: a first word that begins inside the first
            # whole second anchors the caption at zero.
            first_start = words[0].start_time
            if first_start.seconds:
                start_time = first_start
            else:
                start_time = datetime.timedelta(seconds=0, microseconds=0)

            caption_words = []
            previous_word = None
            for word in words:
                fits = word.end_time < start_time + bin_length
                # Only close a caption that already holds words, so a word
                # longer than the bin is kept instead of being dropped.
                if caption_words and not fits:
                    emit(start_time, previous_word.end_time, caption_words)
                    start_time = word.start_time
                    caption_words = []
                caption_words.append(word.word)
                previous_word = word

            # Last caption of this alternative, wrapped like the others.
            if caption_words:
                emit(start_time, previous_word.end_time, caption_words)

    return srt.compose(transcript)
# Command-line interface. `args` stays at module level because build_subs()
# reads it as a global.
parser = argparse.ArgumentParser()
parser.add_argument('-b', '--bin', help='caption length, defaults to 3 seconds', type=float, default=3)
parser.add_argument('-c', '--credentials', help='Google credentials JSON', required=True)
parser.add_argument(
    '-l', '--lang',
    help='language code, defaults to en-US. See https://cloud.google.com/speech-to-text/docs/languages',
    nargs='?',
    const='en-US',  # a bare "-l" falls back to the default instead of None
    default='en-US',
)
parser.add_argument(
    '-m', '--model',
    help='transcription model, defaults to default. Note: video is a premium model.',
    nargs='?',
    choices=['command_and_search', 'phone_call', 'video', 'default'],
    const='default',  # a bare "-m" falls back to the default instead of None
    default='default',
)
parser.add_argument('-r', '--linebreak', help='line break length, defaults to 37 characters', type=int, default=37)
parser.add_argument('filename', help='video filename', nargs='?', type=str)
args = parser.parse_args()

os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = args.credentials
file_name = args.filename
if not file_name:
    # The positional is declared optional; fail with a usage message rather
    # than passing None on to the media tools.
    parser.error('a video filename is required')

# Generate the temporary audio file
video_data = mediainfo(file_name)
channels = video_data["channels"]
bit_rate = video_data["bit_rate"]
sample_rate = video_data["sample_rate"]

temp_file = tempfile.NamedTemporaryFile(suffix='.flac', delete=False)
# Close our handle so ffmpeg can write to the path on every platform.
temp_file.close()
try:
    # Argument list instead of a shell string: file names containing quotes
    # or shell metacharacters are passed through verbatim. "-y" is required
    # because the temporary file already exists.
    subprocess.run(
        [
            "ffmpeg", "-y",
            "-i", file_name,
            "-ss", "0",
            "-t", "60",
            "-b:a", str(bit_rate),
            "-ac", str(channels),
            "-ar", str(sample_rate),
            "-vn",
            temp_file.name,
        ],
        check=True,
    )

    # Prepare the audio
    with open(temp_file.name, "rb") as audio_file:
        content = audio_file.read()
finally:
    # delete=False means nobody else removes the temporary file.
    os.unlink(temp_file.name)

audio = speech.RecognitionAudio(content=content)
config = speech.RecognitionConfig(
    language_code=args.lang,
    sample_rate_hertz=int(sample_rate),
    encoding=speech.RecognitionConfig.AudioEncoding.FLAC,
    audio_channel_count=int(channels),
    enable_word_time_offsets=True,
    model=args.model,
    enable_automatic_punctuation=True,
)
client = speech.SpeechClient()
response = client.recognize(request={"config": config, "audio": audio})

# Build and output SRT
subs = build_subs(response)
basename, _ext = os.path.splitext(file_name)
lang = args.lang.replace('-', '_')
with open(f"{basename}.{lang}.srt", "w", encoding="utf-8") as f:
    f.write(subs)
