sigio/fixsubs.py

## fixsubs.py
#!/usr/bin/env python3
#
# Fix (hidive) srt subtitles for mkv player
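# These subs seem to store each line of a multi-line caption as its own
# cue, with start times 1 ms apart and the same end time.
# mpv then prints these lines in the wrong order (issue #7070).
# This script merges such cues into a single caption so mpv shows the
# lines in the right order.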
#
# Public domain (created mostly with ChatGPT)
#
import argparse

def read_srt(input_file):
    """Parse an SRT subtitle file into a list of caption dicts.

    Captions are numbered sequentially from 1 in file order; the cue
    numbers stored in the file are discarded.

    Args:
        input_file: Path of the .srt file. It is decoded as UTF-8 and a
            leading byte-order mark, if present, is dropped.

    Returns:
        A list of dicts with the keys "index" (str), "start" (str),
        "end" (str) and "text" (list of stripped, non-empty caption lines).

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file is not valid UTF-8.
    """
    # 'utf-8-sig' makes the result independent of the platform's default
    # encoding and keeps a BOM from sticking to the first line.
    with open(input_file, 'r', encoding='utf-8-sig') as f:
        lines = [raw.strip() for raw in f]

    captions = []
    current_caption = None

    for position, line in enumerate(lines):
        if not line:
            continue

        if '-->' in line:
            # Split on the bare arrow so timing lines without the usual
            # surrounding spaces do not raise ValueError.
            start, _, end = line.partition('-->')
            current_caption = {
                "index": str(len(captions) + 1),
                "start": start.strip(),
                "end": end.strip(),
                "text": [],
            }
            captions.append(current_caption)
            continue

        if line.isdigit():
            # A digits-only line is a cue number only when the timing line
            # follows directly; otherwise it is caption text such as "42"
            # and must be kept.
            has_next = position + 1 < len(lines)
            if has_next and '-->' in lines[position + 1]:
                continue

        # Anything else belongs to the caption being read; text that appears
        # before the first timing line has no caption and is ignored.
        if current_caption is not None:
            current_caption["text"].append(line)

    return captions

def write_srt(output_file, captions):
    """Write captions to an SRT file, numbering the cues from 1.

    Each written cue consists of a number line, a "start --> end" timing
    line, the caption's text lines and a terminating blank line. Cues are
    numbered by their position in the output rather than by their stored
    "index", so the numbering stays gap-free after captions were merged.

    Args:
        output_file: Path of the file to create or overwrite. It is written
            as UTF-8 so non-ASCII text does not depend on the platform's
            default encoding.
        captions: Iterable of dicts with the keys "index", "start", "end"
            and "text" (list of str). Captions whose "index" is missing or
            None are skipped.

    Raises:
        OSError: If the file cannot be opened for writing.
    """
    with open(output_file, 'w', encoding='utf-8') as f:
        number = 0
        for caption in captions:
            # Skip captions without a valid index
            if caption.get("index") is None:
                continue

            number += 1
            cue_lines = [
                str(number),
                caption["start"] + ' --> ' + caption["end"],
            ]
            cue_lines.extend(caption["text"])
            # One newline ends the last line, the second is the blank
            # separator between cues.
            f.write('\n'.join(cue_lines) + '\n\n')


def _timestamp_to_ms(timestamp):
    """Return an SRT timestamp ("HH:MM:SS,mmm") as total milliseconds.

    Returns None if the value cannot be parsed, so the caller can fall back
    to comparing the raw strings.
    """
    try:
        clock, _, millis = timestamp.strip().replace('.', ',').partition(',')
        hours, minutes, seconds = (int(part) for part in clock.split(':'))
        return ((hours * 60 + minutes) * 60 + seconds) * 1000 + int(millis or 0)
    except (AttributeError, ValueError):
        return None


def merge_captions(captions):
    """Merge consecutive captions that belong to the same on-screen cue.

    A caption is folded into the one before it when both have the same
    "end" value and their "start" values are identical or at most 1 ms
    apart. The comparison uses the complete timestamp, so two starts that
    straddle a second or minute boundary are still recognised as 1 ms
    apart, and captions that merely share the same seconds field are not
    merged. The merged caption keeps the index and timing of the first
    caption in the group and collects the text lines in input order.

    Args:
        captions: Iterable of caption dicts with "start", "end" and "text"
            keys. Entries lacking "start" or "end" are ignored. The input
            dicts and their text lists are not modified.

    Returns:
        A new list of caption dicts (copies of the input entries).
    """
    merged_captions = []
    previous_caption = None
    previous_start_ms = None

    for caption in captions:
        # Skip captions without required keys
        if "start" not in caption or "end" not in caption:
            continue

        start_ms = _timestamp_to_ms(caption["start"])

        if previous_caption is not None and previous_caption["end"] == caption["end"]:
            same_start = previous_caption["start"] == caption["start"]
            # Only compare numerically when both timestamps parsed;
            # otherwise the string comparison above is all we can trust.
            if not same_start and None not in (previous_start_ms, start_ms):
                same_start = abs(previous_start_ms - start_ms) <= 1
            if same_start:
                previous_caption["text"].extend(caption.get("text", []))
                continue

        if previous_caption is not None:
            merged_captions.append(previous_caption)

        # Work on a copy with its own text list so extending it later does
        # not change the caller's data.
        previous_caption = dict(caption, text=list(caption.get("text", [])))
        previous_start_ms = start_ms

    # Append the last (possibly merged) caption
    if previous_caption is not None:
        merged_captions.append(previous_caption)

    return merged_captions

if __name__ == "__main__":
    # Script entry point: parse the two file paths, then run the
    # read -> merge -> write pipeline.
    cli = argparse.ArgumentParser(
        description="Merge subtitles based on start times within 1 ms and same end time"
    )
    cli.add_argument(
        "input_file",
        type=str,
        help="Input .srt subtitle file",
    )
    cli.add_argument(
        "output_file",
        type=str,
        help="Output .srt subtitle file with merged captions",
    )
    options = cli.parse_args()

    write_srt(
        options.output_file,
        merge_captions(read_srt(options.input_file)),
    )
	#!/usr/bin/env python3
	#
	# Fix (hidive) srt subtitles for mkv player
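	# These subs seem to store each line of a multi-line caption as its own
	# cue, with start times 1 ms apart and the same end time.
	# mpv then prints these lines in the wrong order (issue #7070).
	# This script merges such cues into a single caption so mpv shows the
	# lines in the right order.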
	#
	# Public domain (created mostly with ChatGPT)
	#
	import argparse

	def read_srt(input_file):
	    """Parse an SRT subtitle file into a list of caption dicts.

	    Captions are numbered sequentially from 1 in file order; the cue
	    numbers stored in the file are discarded.

	    Args:
	        input_file: Path of the .srt file. It is decoded as UTF-8 and a
	            leading byte-order mark, if present, is dropped.

	    Returns:
	        A list of dicts with the keys "index" (str), "start" (str),
	        "end" (str) and "text" (list of stripped, non-empty caption lines).

	    Raises:
	        OSError: If the file cannot be opened.
	        UnicodeDecodeError: If the file is not valid UTF-8.
	    """
	    # 'utf-8-sig' makes the result independent of the platform's default
	    # encoding and keeps a BOM from sticking to the first line.
	    with open(input_file, 'r', encoding='utf-8-sig') as f:
	        lines = [raw.strip() for raw in f]

	    captions = []
	    current_caption = None

	    for position, line in enumerate(lines):
	        if not line:
	            continue

	        if '-->' in line:
	            # Split on the bare arrow so timing lines without the usual
	            # surrounding spaces do not raise ValueError.
	            start, _, end = line.partition('-->')
	            current_caption = {
	                "index": str(len(captions) + 1),
	                "start": start.strip(),
	                "end": end.strip(),
	                "text": [],
	            }
	            captions.append(current_caption)
	            continue

	        if line.isdigit():
	            # A digits-only line is a cue number only when the timing line
	            # follows directly; otherwise it is caption text such as "42"
	            # and must be kept.
	            has_next = position + 1 < len(lines)
	            if has_next and '-->' in lines[position + 1]:
	                continue

	        # Anything else belongs to the caption being read; text that appears
	        # before the first timing line has no caption and is ignored.
	        if current_caption is not None:
	            current_caption["text"].append(line)

	    return captions

	def write_srt(output_file, captions):
	    """Write captions to an SRT file, numbering the cues from 1.

	    Each written cue consists of a number line, a "start --> end" timing
	    line, the caption's text lines and a terminating blank line. Cues are
	    numbered by their position in the output rather than by their stored
	    "index", so the numbering stays gap-free after captions were merged.

	    Args:
	        output_file: Path of the file to create or overwrite. It is written
	            as UTF-8 so non-ASCII text does not depend on the platform's
	            default encoding.
	        captions: Iterable of dicts with the keys "index", "start", "end"
	            and "text" (list of str). Captions whose "index" is missing or
	            None are skipped.

	    Raises:
	        OSError: If the file cannot be opened for writing.
	    """
	    with open(output_file, 'w', encoding='utf-8') as f:
	        number = 0
	        for caption in captions:
	            # Skip captions without a valid index
	            if caption.get("index") is None:
	                continue

	            number += 1
	            cue_lines = [
	                str(number),
	                caption["start"] + ' --> ' + caption["end"],
	            ]
	            cue_lines.extend(caption["text"])
	            # One newline ends the last line, the second is the blank
	            # separator between cues.
	            f.write('\n'.join(cue_lines) + '\n\n')


	def _timestamp_to_ms(timestamp):
	    """Return an SRT timestamp ("HH:MM:SS,mmm") as total milliseconds.

	    Returns None if the value cannot be parsed, so the caller can fall back
	    to comparing the raw strings.
	    """
	    try:
	        clock, _, millis = timestamp.strip().replace('.', ',').partition(',')
	        hours, minutes, seconds = (int(part) for part in clock.split(':'))
	        return ((hours * 60 + minutes) * 60 + seconds) * 1000 + int(millis or 0)
	    except (AttributeError, ValueError):
	        return None


	def merge_captions(captions):
	    """Merge consecutive captions that belong to the same on-screen cue.

	    A caption is folded into the one before it when both have the same
	    "end" value and their "start" values are identical or at most 1 ms
	    apart. The comparison uses the complete timestamp, so two starts that
	    straddle a second or minute boundary are still recognised as 1 ms
	    apart, and captions that merely share the same seconds field are not
	    merged. The merged caption keeps the index and timing of the first
	    caption in the group and collects the text lines in input order.

	    Args:
	        captions: Iterable of caption dicts with "start", "end" and "text"
	            keys. Entries lacking "start" or "end" are ignored. The input
	            dicts and their text lists are not modified.

	    Returns:
	        A new list of caption dicts (copies of the input entries).
	    """
	    merged_captions = []
	    previous_caption = None
	    previous_start_ms = None

	    for caption in captions:
	        # Skip captions without required keys
	        if "start" not in caption or "end" not in caption:
	            continue

	        start_ms = _timestamp_to_ms(caption["start"])

	        if previous_caption is not None and previous_caption["end"] == caption["end"]:
	            same_start = previous_caption["start"] == caption["start"]
	            # Only compare numerically when both timestamps parsed;
	            # otherwise the string comparison above is all we can trust.
	            if not same_start and None not in (previous_start_ms, start_ms):
	                same_start = abs(previous_start_ms - start_ms) <= 1
	            if same_start:
	                previous_caption["text"].extend(caption.get("text", []))
	                continue

	        if previous_caption is not None:
	            merged_captions.append(previous_caption)

	        # Work on a copy with its own text list so extending it later does
	        # not change the caller's data.
	        previous_caption = dict(caption, text=list(caption.get("text", [])))
	        previous_start_ms = start_ms

	    # Append the last (possibly merged) caption
	    if previous_caption is not None:
	        merged_captions.append(previous_caption)

	    return merged_captions

	if __name__ == "__main__":
	    # Script entry point: parse the two file paths, then run the
	    # read -> merge -> write pipeline.
	    cli = argparse.ArgumentParser(
	        description="Merge subtitles based on start times within 1 ms and same end time"
	    )
	    cli.add_argument(
	        "input_file",
	        type=str,
	        help="Input .srt subtitle file",
	    )
	    cli.add_argument(
	        "output_file",
	        type=str,
	        help="Output .srt subtitle file with merged captions",
	    )
	    options = cli.parse_args()

	    write_srt(
	        options.output_file,
	        merge_captions(read_srt(options.input_file)),
	    )