Skip to content

Instantly share code, notes, and snippets.

@abu-co
Created January 30, 2022 04:03
Show Gist options
  • Save abu-co/43324afa923b5c3d25297564c36ac8d4 to your computer and use it in GitHub Desktop.
Save abu-co/43324afa923b5c3d25297564c36ac8d4 to your computer and use it in GitHub Desktop.
Removes & merges overlapped events in subtitles without losing the original content & timings.
#!/usr/bin/env python
# -*- coding: utf-8 -*-
from enum import Enum
import argparse
import pysubs2
__author__ = "abuco (https://github.com/abu-co)"
def process(
    path: str,
    output_path: str,
    *,
    sep: str = "\\N",
    encoding: str = "utf-8",
    echo: bool = False,
    verbosity: int = 0
):
    '''
    Processes the input subtitle file so that no events overlap, by
    merging overlapping content into the same event.

    The timeline is cut at every start and end time found in the input.
    For each resulting segment during which at least one event is active,
    one output event is written whose text is the text of all events
    active in that segment, joined by `sep` in the order they became active.

    Parameters `path` and `output_path` should contain extensions.
    The separator specified by `sep`, defaulting to a line break "\\N",
    is inserted between events when they are being merged.
    `encoding` is used both for reading `path` and for writing `output_path`.
    Messages are printed only when `echo` is `True`, and only those whose
    level does not exceed `verbosity`.

    Events with empty text, and events that do not end after they start,
    are ignored. Only the "Default" style is kept in the output (a fresh
    `pysubs2.SSAStyle()` is used if the input has none). Output events are
    created from start, end and text only; every other event field is left
    at whatever `pysubs2.SSAEvent` defaults to.
    '''
    subs = pysubs2.load(path, encoding)

    def echo_text(text: str, text_verbosity: int = 0):
        '''Prints text if `echo` is set and `text_verbosity` <= `verbosity`.'''
        if echo and text_verbosity <= verbosity:
            print(text)

    class Timepoint:
        '''A start or end boundary of a single subtitle event.'''

        class Type(Enum):
            START = 0
            END = 1

        def __init__(self, kind: Type, time: int, line: pysubs2.SSAEvent) -> None:
            self.kind = kind
            self.time = time
            self.line = line

        def __str__(self) -> str:
            return f"[{self.kind}] ({self.time}) {self.line}"

    # Group the boundaries of every usable event by the time they occur at.
    raw_timepoints: dict[int, list[Timepoint]] = {}
    line: pysubs2.SSAEvent
    for line in subs.events:
        if line.text == "":
            continue
        if line.end <= line.start:
            # Such an event is never visible. A reversed one would also have
            # its END boundary processed before its START boundary below.
            echo_text(f"\tSkipping event without positive duration: {line}", 1)
            continue
        raw_timepoints.setdefault(line.start, []).append(
            Timepoint(Timepoint.Type.START, line.start, line))
        raw_timepoints.setdefault(line.end, []).append(
            Timepoint(Timepoint.Type.END, line.end, line))

    timepoints = sorted(raw_timepoints.items(), key=lambda t: t[0])
    events: list[pysubs2.SSAEvent] = []

    echo_text(f"Found {len(timepoints)} distinct timepoints.")

    # Sweep over the timeline, tracking which events are currently shown.
    active_events: list[pysubs2.SSAEvent] = []
    for i, (time, boundaries) in enumerate(timepoints):
        if verbosity >= 2:
            text = (f"\t[{i}]: ({time}) " +
                    f"{[str(b) for b in boundaries]}")
            echo_text(text, 2)

        for boundary in boundaries:
            if boundary.kind == Timepoint.Type.START:
                active_events.append(boundary.line)
            else:
                # Drop exactly this event object (identity, not equality), so
                # the result does not depend on how SSAEvent defines `==`.
                active_events[:] = [
                    e for e in active_events if e is not boundary.line]

        echo_text(f"\t\tActive Events: {active_events}", 2)

        if not active_events:
            continue

        # Every tracked event ends strictly after it starts, so anything
        # still active here ends at a later timepoint: a successor always
        # exists, and because the keys are sorted and distinct it is later.
        next_time = timepoints[i + 1][0]

        result_line = pysubs2.SSAEvent(
            start=time, end=next_time,
            text=sep.join(e.text for e in active_events))

        echo_text(f"\t\tResult Line: {result_line}", 2)

        events.append(result_line)

    echo_text(f"Converted {len(subs)} into {len(events)} individual events.")

    # Keep only the "Default" style in the output file.
    default_style = subs.styles.get("Default", pysubs2.SSAStyle())
    subs.styles.clear()
    subs.styles["Default"] = default_style

    echo_text(f"Using default style: {default_style}", 1)

    subs.events = events
    subs.save(output_path, encoding)
if __name__ == "__main__":
    # Command-line entry point: parse the arguments and hand them to process().
    cli = argparse.ArgumentParser()

    # Positional arguments: where to read from and where to write to.
    cli.add_argument("input_path", help="the input file's path")
    cli.add_argument("output_path", help="the output file's path")

    # Optional switches.
    cli.add_argument(
        "-s", "--sep",
        default="\\N",
        help="the separator to be inserted between merged events (default is line break)",
    )
    cli.add_argument(
        "--silent",
        action="store_true",
        help="disable console output",
    )
    cli.add_argument(
        "--verbosity",
        type=int,
        default=0,
        help="the verbosity of console output (default is 0)",
    )

    options = cli.parse_args()

    # Console output is on unless --silent was given.
    process(
        options.input_path,
        options.output_path,
        sep=options.sep,
        echo=not options.silent,
        verbosity=options.verbosity,
    )
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment