# royshil/google_cloud_speech_json_to_srt.py

## google_cloud_speech_json_to_srt.py
#!/usr/local/bin/python3

import os
import json
import sys
import argparse
import math
import subprocess
from tqdm import tqdm

# Command-line interface: one or more JSON transcript paths plus two optional
# boolean switches (both off unless given).
parser = argparse.ArgumentParser(
    description='Convert google ml speech .json to .srt subtitles.')
parser.add_argument(
    'files', metavar='F', type=str, nargs='+',
    help='json files for converting')
parser.add_argument(
    '--concat', dest='concat', action='store_true',
    help='concatenate the subtitles to a single .srt output file')
parser.add_argument(
    '--fix_timestamps', '-t', dest='fix_timestamps', action='store_true',
    help='fix timestamps of consequent files')

# Parsed at module level; the rest of the script reads `args` directly.
args = parser.parse_args()


def mktime(x):
    """Format a time offset in seconds as an SRT timestamp ``HH:MM:SS,mmm``.

    Args:
        x: Offset in seconds (int or float), expected to be non-negative.

    Returns:
        The timestamp with every field zero-padded and a comma before the
        milliseconds, e.g. ``5.5`` -> ``"00:00:05,500"``.
    """
    # Work in whole milliseconds so that rounding carries into the larger
    # fields (59.9996 s becomes 00:01:00,000 instead of 00:00:60,000) and the
    # seconds field is always two digits wide.
    total_ms = int(round(x * 1000))
    hours, remainder = divmod(total_ms, 3600 * 1000)
    minutes, remainder = divmod(remainder, 60 * 1000)
    seconds, millis = divmod(remainder, 1000)
    return "%02d:%02d:%02d,%03d" % (hours, minutes, seconds, millis)


# File object that write_sub() writes to. It stays None here unless --concat
# is given, in which case a single shared output file is opened right away.
output_file = None

if args.concat and args.files:
    # The combined .srt is named after the first input file.
    output_filename = os.path.splitext(args.files[0])[0] + '.srt'
    # Write UTF-8 explicitly instead of relying on the platform default
    # encoding, which may be unable to represent non-ASCII transcript text.
    output_file = open(output_filename, 'w', encoding='utf-8')

# Running subtitle number; write_sub() increments it before writing each entry.
sub_id = 0

# a single subtitle line template: number, "start --> end", text, blank line
sub_line_template = """%d
%s --> %s
%s

"""


def write_sub(sub, sub_start_time, sub_end_time):
    """Write one numbered subtitle entry to the module-level ``output_file``.

    Args:
        sub: Sequence of word strings; they are joined with single spaces.
        sub_start_time: Start of the entry, passed to ``mktime`` for formatting.
        sub_end_time: End of the entry, passed to ``mktime`` for formatting.

    Side effects:
        Increments the global ``sub_id`` and writes to ``output_file``.
    """
    global sub_id

    text = ' '.join(sub)
    sub_id += 1

    start_stamp = mktime(sub_start_time)
    # Same post-processing as the start stamp plus an extra dot-to-comma pass.
    end_stamp = mktime(sub_end_time).replace('.', ',')

    entry = sub_line_template % (sub_id, start_stamp, end_stamp, text)
    output_file.write(entry)


# Limits for a single subtitle entry: a new entry is started once the current
# one holds this many words, or once the next word starts this many seconds
# (or more) after the entry's first word.
MAX_WORDS_PER_SUB = 10
MAX_SECONDS_PER_SUB = 10


def _offset_seconds(stamp):
    """Parse a word time offset string into float seconds.

    The final character is dropped before parsing (presumably a unit suffix
    such as the "s" in "1.300s" -- confirm against the input JSON).
    """
    return float(stamp[:-1])


def _probe_start_timestamp(flac_file):
    """Return the first frame timestamp that ffprobe prints for ``flac_file``.

    Raises:
        RuntimeError: if ffprobe produced no output (e.g. the file is missing
            or ffprobe failed), instead of an opaque IndexError.
    """
    # NOTE(review): newer ffprobe releases may no longer expose the
    # "pkt_pts_time" frame entry -- confirm against the installed version.
    p = subprocess.run(["ffprobe", "-i", flac_file, "-show_frames", "-show_entries",
                        "frame=pkt_pts_time", "-of", "csv=p=0", "-hide_banner", "-v", "0"],
                       capture_output=True)
    lines = p.stdout.splitlines()
    if not lines:
        raise RuntimeError("ffprobe returned no frame timestamps for %s" % flac_file)
    return float(lines[0])


def _write_result_subs(results, start_timestamp):
    """Split each result's words into subtitle entries and write them out.

    Only the first alternative of every result is used. Results without
    alternatives or without word timings are skipped rather than crashing.
    ``start_timestamp`` is added to every word's start and end time.
    """
    for result in results:
        alternatives = result.get('alternatives') or []
        words = alternatives[0].get('words') if alternatives else None
        if not words:
            continue

        sub = []
        sub_start_time = sub_end_time = 0.0
        for word in words:
            word_start = _offset_seconds(word['startTime']) + start_timestamp
            word_end = _offset_seconds(word['endTime']) + start_timestamp

            limit_hit = (len(sub) >= MAX_WORDS_PER_SUB
                         or word_start - sub_start_time >= MAX_SECONDS_PER_SUB)
            if sub and limit_hit:
                # write a sub without this word; the word opens the next sub
                write_sub(sub, sub_start_time, sub_end_time)
                sub = []

            if not sub:
                sub_start_time = word_start
            sub.append(word['word'])
            sub_end_time = word_end

        # write out the remaining words of this result
        write_sub(sub, sub_start_time, sub_end_time)


# Convert every input file. With --concat all entries go to the output file
# that is already open; otherwise each input gets its own .srt next to it and
# the numbering restarts from 1.
try:
    for file_name in tqdm(args.files):
        with open(file_name, 'r', encoding='utf-8') as in_file:
            trans_json = json.load(in_file)

        # sometimes the results are in /results and sometimes in /response/results
        results_element = trans_json['results'] if 'results' in trans_json else \
            trans_json['response']['results']

        base_name = os.path.splitext(file_name)[0]

        start_timestamp = 0
        if args.fix_timestamps:
            # Shift all times by the first frame timestamp of the matching .flac.
            start_timestamp = _probe_start_timestamp(base_name + '.flac')

        if args.concat:
            _write_result_subs(results_element, start_timestamp)
            continue

        output_filename = base_name + '.srt'
        output_file = open(output_filename, 'w', encoding='utf-8')
        sub_id = 0
        try:
            _write_result_subs(results_element, start_timestamp)
        finally:
            # Close the per-file output even if conversion fails midway.
            output_file.close()
finally:
    # The shared --concat output is closed once, after all inputs are done.
    if args.concat and output_file is not None:
        output_file.close()
	#!/usr/local/bin/python3

	import os
	import json
	import sys
	import argparse
	import math
	import subprocess
	from tqdm import tqdm

	# Command-line interface: one or more JSON transcript paths plus two
	# optional boolean switches (both off unless given). The continuation
	# lines of this copy had lost their indentation; it is restored here.
	parser = argparse.ArgumentParser(
	    description='Convert google ml speech .json to .srt subtitles.')
	parser.add_argument(
	    'files', metavar='F', type=str, nargs='+',
	    help='json files for converting')
	parser.add_argument(
	    '--concat', dest='concat', action='store_true',
	    help='concatenate the subtitles to a single .srt output file')
	parser.add_argument(
	    '--fix_timestamps', '-t', dest='fix_timestamps', action='store_true',
	    help='fix timestamps of consequent files')

	# Parsed right away; the rest of the script reads `args` directly.
	args = parser.parse_args()


	def mktime(x):
	    """Format a time offset in seconds as an SRT timestamp ``HH:MM:SS,mmm``.

	    Args:
	        x: Offset in seconds (int or float), expected to be non-negative.

	    Returns:
	        The timestamp with every field zero-padded and a comma before the
	        milliseconds, e.g. ``5.5`` -> ``"00:00:05,500"``.
	    """
	    # Work in whole milliseconds so that rounding carries into the larger
	    # fields (59.9996 s becomes 00:01:00,000 instead of 00:00:60,000) and
	    # the seconds field is always two digits wide.
	    total_ms = int(round(x * 1000))
	    hours, remainder = divmod(total_ms, 3600 * 1000)
	    minutes, remainder = divmod(remainder, 60 * 1000)
	    seconds, millis = divmod(remainder, 1000)
	    return "%02d:%02d:%02d,%03d" % (hours, minutes, seconds, millis)


	# File object that write_sub() writes to. It stays None here unless
	# --concat is given, in which case a single shared output file is opened.
	output_file = None

	if args.concat and args.files:
	    # The combined .srt is named after the first input file.
	    output_filename = os.path.splitext(args.files[0])[0] + '.srt'
	    # Write UTF-8 explicitly instead of relying on the platform default
	    # encoding, which may be unable to represent non-ASCII transcript text.
	    output_file = open(output_filename, 'w', encoding='utf-8')

	# Running subtitle number; write_sub() increments it before each entry.
	sub_id = 0

	# a single subtitle line template: number, "start --> end", text, blank
	# line. Written with explicit newlines so that the surrounding indentation
	# cannot leak tab characters into the emitted subtitles.
	sub_line_template = "%d\n%s --> %s\n%s\n\n"


	def write_sub(sub, sub_start_time, sub_end_time):
	    """Write one numbered subtitle entry to the module-level ``output_file``.

	    Args:
	        sub: Sequence of word strings; they are joined with single spaces.
	        sub_start_time: Start of the entry, passed to ``mktime``.
	        sub_end_time: End of the entry, passed to ``mktime``.

	    Side effects:
	        Increments the global ``sub_id`` and writes to ``output_file``.
	    """
	    global sub_id

	    sentence = ' '.join(sub)
	    sub_id += 1
	    output_file.write(sub_line_template % (sub_id,
	                                           mktime(sub_start_time),
	                                           mktime(sub_end_time).replace('.', ','),
	                                           sentence))


	# Limits for a single subtitle entry: a new entry is started once the
	# current one holds this many words, or once the next word starts this many
	# seconds (or more) after the entry's first word.
	MAX_WORDS_PER_SUB = 10
	MAX_SECONDS_PER_SUB = 10


	def _offset_seconds(stamp):
	    """Parse a word time offset string into float seconds.

	    The final character is dropped before parsing (presumably a unit suffix
	    such as the "s" in "1.300s" -- confirm against the input JSON).
	    """
	    return float(stamp[:-1])


	def _probe_start_timestamp(flac_file):
	    """Return the first frame timestamp that ffprobe prints for ``flac_file``.

	    Raises:
	        RuntimeError: if ffprobe produced no output (e.g. the file is
	            missing or ffprobe failed), instead of an opaque IndexError.
	    """
	    # NOTE(review): newer ffprobe releases may no longer expose the
	    # "pkt_pts_time" frame entry -- confirm against the installed version.
	    p = subprocess.run(["ffprobe", "-i", flac_file, "-show_frames", "-show_entries",
	                        "frame=pkt_pts_time", "-of", "csv=p=0", "-hide_banner", "-v", "0"],
	                       capture_output=True)
	    lines = p.stdout.splitlines()
	    if not lines:
	        raise RuntimeError("ffprobe returned no frame timestamps for %s" % flac_file)
	    return float(lines[0])


	def _write_result_subs(results, start_timestamp):
	    """Split each result's words into subtitle entries and write them out.

	    Only the first alternative of every result is used. Results without
	    alternatives or without word timings are skipped rather than crashing.
	    ``start_timestamp`` is added to every word's start and end time.
	    """
	    for result in results:
	        alternatives = result.get('alternatives') or []
	        words = alternatives[0].get('words') if alternatives else None
	        if not words:
	            continue

	        sub = []
	        sub_start_time = sub_end_time = 0.0
	        for word in words:
	            word_start = _offset_seconds(word['startTime']) + start_timestamp
	            word_end = _offset_seconds(word['endTime']) + start_timestamp

	            limit_hit = (len(sub) >= MAX_WORDS_PER_SUB
	                         or word_start - sub_start_time >= MAX_SECONDS_PER_SUB)
	            if sub and limit_hit:
	                # write a sub without this word; the word opens the next sub
	                write_sub(sub, sub_start_time, sub_end_time)
	                sub = []

	            if not sub:
	                sub_start_time = word_start
	            sub.append(word['word'])
	            sub_end_time = word_end

	        # write out the remaining words of this result
	        write_sub(sub, sub_start_time, sub_end_time)


	# Convert every input file. With --concat all entries go to the output file
	# that is already open; otherwise each input gets its own .srt next to it
	# and the numbering restarts from 1.
	try:
	    for file_name in tqdm(args.files):
	        with open(file_name, 'r', encoding='utf-8') as in_file:
	            trans_json = json.load(in_file)

	        # sometimes the results are in /results and sometimes in /response/results
	        results_element = trans_json['results'] if 'results' in trans_json else \
	            trans_json['response']['results']

	        base_name = os.path.splitext(file_name)[0]

	        start_timestamp = 0
	        if args.fix_timestamps:
	            # Shift all times by the first frame timestamp of the matching .flac.
	            start_timestamp = _probe_start_timestamp(base_name + '.flac')

	        if args.concat:
	            _write_result_subs(results_element, start_timestamp)
	            continue

	        output_filename = base_name + '.srt'
	        output_file = open(output_filename, 'w', encoding='utf-8')
	        sub_id = 0
	        try:
	            _write_result_subs(results_element, start_timestamp)
	        finally:
	            # Close the per-file output even if conversion fails midway.
	            output_file.close()
	finally:
	    # The shared --concat output is closed once, after all inputs are done.
	    if args.concat and output_file is not None:
	        output_file.close()