abu-co/subtitle_overlap_remover.py

## subtitle_overlap_remover.py
#!/usr/bin/env python
# -*- coding: utf-8 -*-

from enum import Enum
import argparse

import pysubs2

__author__ = "abuco (https://github.com/abu-co)"


def process(
    path: str,
    output_path: str,
    *,
    sep: str = "\\N",
    encoding: str = "utf-8",
    echo: bool = False,
    verbosity: int = 0
):
    '''
    Rewrites a subtitle file so that no two events overlap in time.

    The timeline is cut at every distinct start/end time found in the
    input events. For every resulting interval during which at least one
    event is active, a single event is emitted whose text is the texts of
    all active events joined by `sep`, in the order in which those events
    became active.

    Events with empty text are ignored. Events that end before they start
    are skipped (and reported when `echo` is on), because they cannot be
    placed on the timeline.

    Only the "Default" style is kept in the output; if the input has no
    such style, a fresh `pysubs2.SSAStyle()` is used instead.

    Parameters:
        path: path of the input subtitle file; should contain an extension.
        output_path: path of the output file; should contain an extension.
        sep: separator inserted between the texts of merged events,
            defaulting to a line break "\\N".
        encoding: encoding used both for loading and for saving.
        echo: whether progress messages are printed at all.
        verbosity: highest message verbosity level that is still printed.
    '''

    subs = pysubs2.load(path, encoding)

    def echo_text(text: str, text_verbosity: int = 0):
        '''Prints text if `echo` is set and `text_verbosity` <= `verbosity`.'''
        if echo and text_verbosity <= verbosity:
            print(text)

    class Timepoint:
        '''The moment at which `line` starts or stops being displayed.'''

        class Type(Enum):
            START = 0
            END = 1

        def __init__(self, kind: Type, time: int, line: pysubs2.SSAEvent) -> None:
            self.kind = kind
            self.time = time
            self.line = line

        def __str__(self) -> str:
            return f"[{self.kind}] ({self.time}) {self.line}"

    # Maps each distinct time to the start/end markers that fall on it.
    raw_timepoints: dict[int, list[Timepoint]] = {}

    line: pysubs2.SSAEvent
    for line in subs.events:
        if line.text == "":
            continue
        if line.end < line.start:
            # Its END marker would be processed before its START marker,
            # so the event could never be removed from the active list.
            echo_text(f"Skipping event that ends before it starts: {line}")
            continue
        # START is registered before END so that a zero-length event is
        # activated and deactivated, in that order, at the same timepoint.
        raw_timepoints.setdefault(line.start, []).append(
            Timepoint(Timepoint.Type.START, line.start, line))
        raw_timepoints.setdefault(line.end, []).append(
            Timepoint(Timepoint.Type.END, line.end, line))

    timepoints = sorted(raw_timepoints.items(), key=lambda t: t[0])

    events: list[pysubs2.SSAEvent] = []

    timepoint_count = len(timepoints)

    echo_text(f"Found {timepoint_count} distinct timepoints.")

    active_events: list[pysubs2.SSAEvent] = []
    for i, (moment, points) in enumerate(timepoints):
        if verbosity >= 2:
            echo_text(f"\t[{i}]: ({moment}) {[str(p) for p in points]}", 2)

        # Bring the list of currently displayed events up to date.
        for point in points:
            if point.kind == Timepoint.Type.START:
                active_events.append(point.line)
            else:
                active_events.remove(point.line)

        echo_text(f"\t\tActive Events: {active_events}", 2)

        if not active_events:
            continue

        # The merged event lasts until the next timepoint, where the set
        # of active events changes again.
        start = moment
        end = (timepoints[i + 1][0]
               if i + 1 < timepoint_count
               else points[0].line.end)

        # Timepoints are distinct and sorted, so the interval is non-empty.
        assert end - start > 0

        result_line = pysubs2.SSAEvent(
            start=start, end=end,
            text=sep.join(active.text for active in active_events))

        echo_text(f"\t\tResult Line: {result_line}", 2)

        events.append(result_line)

    echo_text(f"Converted {len(subs)} into {len(events)} individual events.")

    # Merged events are created without a style of their own, so only the
    # "Default" style is carried over to the output.
    default_style = subs.styles.get("Default", pysubs2.SSAStyle())
    subs.styles.clear()
    subs.styles["Default"] = default_style

    echo_text(f"Using default style: {default_style}", 1)

    subs.events = events

    subs.save(output_path, encoding)


if __name__ == "__main__":
    # Command-line entry point: read the options and run the conversion.
    parser = argparse.ArgumentParser()

    # Positional arguments: where to read from and where to write to.
    parser.add_argument("input_path", help="the input file's path")
    parser.add_argument("output_path", help="the output file's path")

    # Optional tuning of the merge separator and of the console output.
    parser.add_argument(
        "-s", "--sep", default="\\N",
        help="the separator to be inserted between merged events (default is line break)")
    parser.add_argument(
        "--silent", action="store_true",
        help="disable console output")
    parser.add_argument(
        "--verbosity", type=int, default=0,
        help="the verbosity of console output (default is 0)")

    options = parser.parse_args()

    # Console output is on unless --silent was given.
    process(
        str(options.input_path),
        str(options.output_path),
        sep=str(options.sep),
        echo=(not bool(options.silent)),
        verbosity=int(options.verbosity),
    )
	#!/usr/bin/env python
	# -*- coding: utf-8 -*-

	from enum import Enum
	import argparse

	import pysubs2

	__author__ = "abuco (https://github.com/abu-co)"


	def process(
	    path: str,
	    output_path: str,
	    *,
	    sep: str = "\\N",
	    encoding: str = "utf-8",
	    echo: bool = False,
	    verbosity: int = 0
	):
	    '''
	    Rewrites a subtitle file so that no two events overlap in time.

	    The timeline is cut at every distinct start/end time found in the
	    input events. For every resulting interval during which at least one
	    event is active, a single event is emitted whose text is the texts of
	    all active events joined by `sep`, in the order in which those events
	    became active.

	    Events with empty text are ignored. Events that end before they start
	    are skipped (and reported when `echo` is on), because they cannot be
	    placed on the timeline.

	    Only the "Default" style is kept in the output; if the input has no
	    such style, a fresh `pysubs2.SSAStyle()` is used instead.

	    Parameters:
	        path: path of the input subtitle file; should contain an extension.
	        output_path: path of the output file; should contain an extension.
	        sep: separator inserted between the texts of merged events,
	            defaulting to a line break "\\N".
	        encoding: encoding used both for loading and for saving.
	        echo: whether progress messages are printed at all.
	        verbosity: highest message verbosity level that is still printed.
	    '''

	    subs = pysubs2.load(path, encoding)

	    def echo_text(text: str, text_verbosity: int = 0):
	        '''Prints text if `echo` is set and `text_verbosity` <= `verbosity`.'''
	        if echo and text_verbosity <= verbosity:
	            print(text)

	    class Timepoint:
	        '''The moment at which `line` starts or stops being displayed.'''

	        class Type(Enum):
	            START = 0
	            END = 1

	        def __init__(self, kind: Type, time: int, line: pysubs2.SSAEvent) -> None:
	            self.kind = kind
	            self.time = time
	            self.line = line

	        def __str__(self) -> str:
	            return f"[{self.kind}] ({self.time}) {self.line}"

	    # Maps each distinct time to the start/end markers that fall on it.
	    raw_timepoints: dict[int, list[Timepoint]] = {}

	    line: pysubs2.SSAEvent
	    for line in subs.events:
	        if line.text == "":
	            continue
	        if line.end < line.start:
	            # Its END marker would be processed before its START marker,
	            # so the event could never be removed from the active list.
	            echo_text(f"Skipping event that ends before it starts: {line}")
	            continue
	        # START is registered before END so that a zero-length event is
	        # activated and deactivated, in that order, at the same timepoint.
	        raw_timepoints.setdefault(line.start, []).append(
	            Timepoint(Timepoint.Type.START, line.start, line))
	        raw_timepoints.setdefault(line.end, []).append(
	            Timepoint(Timepoint.Type.END, line.end, line))

	    timepoints = sorted(raw_timepoints.items(), key=lambda t: t[0])

	    events: list[pysubs2.SSAEvent] = []

	    timepoint_count = len(timepoints)

	    echo_text(f"Found {timepoint_count} distinct timepoints.")

	    active_events: list[pysubs2.SSAEvent] = []
	    for i, (moment, points) in enumerate(timepoints):
	        if verbosity >= 2:
	            echo_text(f"\t[{i}]: ({moment}) {[str(p) for p in points]}", 2)

	        # Bring the list of currently displayed events up to date.
	        for point in points:
	            if point.kind == Timepoint.Type.START:
	                active_events.append(point.line)
	            else:
	                active_events.remove(point.line)

	        echo_text(f"\t\tActive Events: {active_events}", 2)

	        if not active_events:
	            continue

	        # The merged event lasts until the next timepoint, where the set
	        # of active events changes again.
	        start = moment
	        end = (timepoints[i + 1][0]
	               if i + 1 < timepoint_count
	               else points[0].line.end)

	        # Timepoints are distinct and sorted, so the interval is non-empty.
	        assert end - start > 0

	        result_line = pysubs2.SSAEvent(
	            start=start, end=end,
	            text=sep.join(active.text for active in active_events))

	        echo_text(f"\t\tResult Line: {result_line}", 2)

	        events.append(result_line)

	    echo_text(f"Converted {len(subs)} into {len(events)} individual events.")

	    # Merged events are created without a style of their own, so only the
	    # "Default" style is carried over to the output.
	    default_style = subs.styles.get("Default", pysubs2.SSAStyle())
	    subs.styles.clear()
	    subs.styles["Default"] = default_style

	    echo_text(f"Using default style: {default_style}", 1)

	    subs.events = events

	    subs.save(output_path, encoding)


	if __name__ == "__main__":
	    # Command-line entry point: read the options and run the conversion.
	    arg_parser = argparse.ArgumentParser()

	    # Positional arguments: where to read from and where to write to.
	    arg_parser.add_argument("input_path", help="the input file's path")
	    arg_parser.add_argument("output_path", help="the output file's path")

	    # Optional tuning of the merge separator and of the console output.
	    arg_parser.add_argument("-s", "--sep", default="\\N",
	                            help="the separator to be inserted between merged events (default is line break)")

	    arg_parser.add_argument("--silent", action="store_true",
	                            help="disable console output")
	    arg_parser.add_argument("--verbosity", type=int, default=0,
	                            help="the verbosity of console output (default is 0)")

	    arg = arg_parser.parse_args()

	    # Console output is on unless --silent was given.
	    process(str(arg.input_path), str(arg.output_path),
	            sep=str(arg.sep),
	            echo=(not bool(arg.silent)), verbosity=int(arg.verbosity))