# solace/makesrt.py

## makesrt.py
"""
Check out STEAM Powered (https://steampoweredshow.com/) where I have conversations
with women in STEAM to learn a bit about what they do and who they are.
https://steampoweredshow.com/learn-more/
"""

"""
Requirements
************

* Install `ffmpeg <https://www.ffmpeg.org/>`_
* Setup virtual env `<https://cloud.google.com/python/docs/setup>`_
* `pip install google-cloud-speech`
* `pip install pydub`
* `pip install srt`

You will also need API Credentials with Cloud Speech-to-Text API enabled.
Instructions here: `<https://cloud.google.com/speech-to-text/docs/quickstart-client-libraries>`_

**Disclaimer:** Be aware that this is technically not a free service. Check Google's pricing to confirm whether this service
is suitable for your requirements.

Notes
*****

* This has been tested with .mov, and that's it.
* Accuracy is determined by the quality of the source video and by the service.
* Video length cannot exceed 60 seconds with this Google service. The `ffmpeg` command will only send the first 60 seconds.

YMMV


Usage
*****

usage: makesrt.py [-h] [-b BIN] -c CREDENTIALS [-l [LANG]] [-m [{command_and_search,phone_call,video,default}]] [-r LINEBREAK] [filename]

positional arguments:
  filename              video filename

optional arguments:
  -h, --help            show this help message and exit
  -b BIN, --bin BIN     caption length, defaults to 3 seconds
  -c CREDENTIALS, --credentials CREDENTIALS
                        Google credentials JSON
  -l [LANG], --lang [LANG]
                        language code, defaults to en-US. See https://cloud.google.com/speech-to-text/docs/languages
  -m [{command_and_search,phone_call,video,default}], --model [{command_and_search,phone_call,video,default}]
                        transcription model, defaults to default. Note: video is a premium model.
  -r LINEBREAK, --linebreak LINEBREAK
                        line break length, defaults to 37 characters


Acknowledgements
****************

Cobbled together from -

* `autosub <https://github.com/agermanidis/autosub/>`_
* `<https://github.com/darshan-majithiya/Generate-SRT-File-using-Google-Cloud-s-Speech-to-Text-API>`_
"""

import argparse, datetime, io, os, subprocess, sys, tempfile, textwrap

from google.cloud import speech
from pydub.utils import mediainfo
import srt

def _wrap_caption(words, width):
    """Join *words* with single spaces and wrap the text at *width* columns."""
    return '\n'.join(textwrap.wrap(' '.join(words), width=width))


def build_subs(response):
    """Convert a speech recognition response into SRT-formatted text.

    The words of every alternative of every result are grouped into captions.
    A caption is closed as soon as the next word would end ``args.bin`` seconds
    or more after the caption started; that word then opens the next caption.
    Caption text is wrapped at ``args.linebreak`` characters.

    Reads the module-level ``args`` namespace (``bin`` and ``linebreak``).

    :param response: object exposing ``results``; each result exposes
        ``alternatives`` and each alternative a ``words`` sequence whose items
        carry ``word``, ``start_time`` and ``end_time``.
    :returns: the captions serialised by ``srt.compose``.
    """
    # Values typed on the command line may still be strings, so normalise them
    # here instead of handing them to timedelta/textwrap unchecked.
    caption_length = datetime.timedelta(seconds=float(args.bin))
    line_width = int(args.linebreak)

    transcript = []

    for result in response.results:
        for alternative in result.alternatives:
            # Alternatives without timed words cannot be captioned.
            if not alternative or not alternative.words:
                continue

            words = alternative.words

            # NOTE(review): a first word that starts within the first second is
            # pinned to 0:00, dropping its sub-second offset. Kept from the
            # original behaviour; confirm this is intended.
            if words[0].start_time.seconds:
                start_time = words[0].start_time
            else:
                start_time = datetime.timedelta(seconds=0, microseconds=0)

            pending = []
            previous_word = None
            for word in words:
                # Only close a caption that already holds text; a single word
                # longer than the bin still gets a caption of its own instead
                # of being dropped.
                if pending and word.end_time >= start_time + caption_length:
                    transcript.append(
                        srt.Subtitle(
                            len(transcript) + 1,
                            start_time,
                            previous_word.end_time,
                            _wrap_caption(pending, line_width),
                        )
                    )
                    start_time = word.start_time
                    pending = []
                pending.append(word.word)
                previous_word = word

            # Flush whatever is left, wrapped like every other caption.
            transcript.append(
                srt.Subtitle(
                    len(transcript) + 1,
                    start_time,
                    previous_word.end_time,
                    _wrap_caption(pending, line_width),
                )
            )

    return srt.compose(transcript)

# Command-line interface. The parsed namespace is kept in the module-level
# ``args`` because build_subs() reads it directly.
parser = argparse.ArgumentParser()
parser.add_argument(
    '-b',
    '--bin',
    help='caption length, defaults to 3 seconds',
    type=float,  # without a type, a user-supplied value would stay a string
    default=3
)
parser.add_argument('-c', '--credentials', help='Google credentials JSON', required=True)
parser.add_argument(
    '-l',
    '--lang',
    help='language code, defaults to en-US. See https://cloud.google.com/speech-to-text/docs/languages',
    nargs='?',
    const='en-US',  # a bare "-l" falls back to the default instead of None
    default='en-US'
)
parser.add_argument(
    '-m',
    '--model',
    help='transcription model, defaults to default. Note: video is a premium model.',
    nargs='?',
    choices=['command_and_search', 'phone_call', 'video', 'default'],
    const='default',  # a bare "-m" falls back to the default instead of None
    default='default'
)
parser.add_argument(
    '-r',
    '--linebreak',
    help='line break length, defaults to 37 characters',
    type=int,
    default=37
)
parser.add_argument('filename', help='video filename', nargs='?', type=str)
args = parser.parse_args()

# The positional is declared optional, but nothing downstream can run without
# it; fail with a usage message rather than crashing later on None.
if args.filename is None:
    parser.error('a video filename is required')

# Hand the credentials file given on the command line to the Google client
# library through the environment.
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = args.credentials
file_name = args.filename

# Probe the source so the extracted audio keeps its channel count, bit rate
# and sample rate.
video_data = mediainfo(file_name)
channels = video_data["channels"]
bit_rate = video_data["bit_rate"]
sample_rate = video_data["sample_rate"]

# Extract the audio into a scratch directory that is removed on exit. Writing
# to a path that does not exist yet also keeps ffmpeg from stopping at its
# "overwrite?" prompt.
with tempfile.TemporaryDirectory() as temp_dir:
    audio_path = os.path.join(temp_dir, "audio.flac")
    # An argument list (no shell) keeps unusual file names from being
    # interpreted by the shell. Only the first 60 seconds are extracted.
    command = [
        "ffmpeg",
        "-i", file_name,
        "-ss", "0",
        "-t", "60",
        "-b:a", str(bit_rate),
        "-ac", str(channels),
        "-ar", str(sample_rate),
        "-vn",
        audio_path,
    ]
    # check=True stops here if ffmpeg fails instead of continuing with a
    # missing or empty audio file.
    subprocess.run(command, check=True)

    with io.open(audio_path, "rb") as audio_file:
        content = audio_file.read()

# Prepare the recognition request.
audio = speech.RecognitionAudio(content=content)
config = speech.RecognitionConfig(
    language_code=args.lang,
    sample_rate_hertz=int(sample_rate),
    encoding="FLAC",
    audio_channel_count=int(channels),
    enable_word_time_offsets=True,  # build_subs() needs per-word timings
    model=args.model,
    enable_automatic_punctuation=True
)

client = speech.SpeechClient()
response = client.recognize(request={"config": config, "audio": audio})

# Build the SRT text and write it next to the source as
# <basename>.<lang>.srt, with "-" in the language code replaced by "_".
subs = build_subs(response)

basename = os.path.splitext(file_name)[0]
lang = args.lang.replace('-', '_')
# Explicit UTF-8 so non-ASCII transcripts do not depend on the platform's
# default encoding.
with open(f"{basename}.{lang}.srt", "w", encoding="utf-8") as f:
    f.write(subs)