Created
January 30, 2022 04:03
-
-
Save abu-co/43324afa923b5c3d25297564c36ac8d4 to your computer and use it in GitHub Desktop.
Removes and merges overlapping events in subtitles without losing the original content and timings.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
# -*- coding: utf-8 -*- | |
from enum import Enum | |
import argparse | |
import pysubs2 | |
__author__ = "abuco (https://github.com/abu-co)" | |
def process(
    path: str,
    output_path: str,
    *,
    sep: str = "\\N",
    encoding: str = "utf-8",
    echo: bool = False,
    verbosity: int = 0
):
    '''
    Rewrites a subtitle file so that no two events overlap in time.

    Every start and end time of the input events is treated as a boundary.
    For each span between two consecutive boundaries during which at least
    one event is active, a single output event is written whose text is the
    text of all active events (in the order they became active) joined
    with `sep`.

    Parameters:
        path: Input subtitle file; should contain an extension.
        output_path: Output subtitle file; should contain an extension.
        sep: Separator inserted between the texts of merged events,
            defaulting to a line break "\\N".
        encoding: Text encoding used for both loading and saving.
        echo: If `True`, progress messages are printed to stdout.
        verbosity: Only messages whose level is <= this value are printed.

    Events with empty text are ignored. Events whose end time precedes
    their start time are skipped as well, since they cannot be active
    during any span.

    Raises:
        RuntimeError: If events are still active after the last boundary
            (an internal invariant violation; not expected to happen).
    '''
    subs = pysubs2.load(path, encoding=encoding)

    def echo_text(text: str, text_verbosity: int = 0):
        '''Prints text if `echo` is set and `text_verbosity <= verbosity`.'''
        if echo and text_verbosity <= verbosity:
            print(text)

    class Timepoint:
        '''A single boundary (start or end) of one source event.'''

        class Type(Enum):
            START = 0
            END = 1

        def __init__(self, kind: Type, time: int, line: pysubs2.SSAEvent) -> None:
            self.kind = kind
            self.time = time
            self.line = line

        def __str__(self) -> str:
            return f"[{self.kind}] ({self.time}) {self.line}"

    # Group all boundaries by timestamp. Within one timestamp the boundaries
    # keep file order, so a zero-length event contributes START then END.
    raw_timepoints: dict[int, list[Timepoint]] = {}
    line: pysubs2.SSAEvent
    for line in subs.events:
        if line.text == "":
            continue
        if line.end < line.start:
            # Its END boundary would be processed before its START boundary,
            # so there would be nothing to deactivate.
            echo_text(f"Skipping event that ends before it starts: {line}", 1)
            continue
        raw_timepoints.setdefault(line.start, []).append(
            Timepoint(Timepoint.Type.START, line.start, line))
        raw_timepoints.setdefault(line.end, []).append(
            Timepoint(Timepoint.Type.END, line.end, line))

    timepoints = sorted(raw_timepoints.items(), key=lambda item: item[0])
    timepoint_count = len(timepoints)
    echo_text(f"Found {timepoint_count} distinct timepoints.")

    events: list[pysubs2.SSAEvent] = []
    # Events that are active right after the current boundary has been applied.
    active_events: list[pysubs2.SSAEvent] = []

    for i, (time, boundaries) in enumerate(timepoints):
        if verbosity >= 2:
            descriptions = [str(boundary) for boundary in boundaries]
            echo_text(f"\t[{i}]: ({time}) {descriptions}", 2)

        for boundary in boundaries:
            if boundary.kind == Timepoint.Type.START:
                active_events.append(boundary.line)
                continue
            # Deactivate exactly this event object. Matching by identity
            # keeps the result independent of how SSAEvent defines `==`.
            for index, active in enumerate(active_events):
                if active is boundary.line:
                    del active_events[index]
                    break

        echo_text(f"\t\tActive Events: {active_events}", 2)

        if not active_events:
            continue
        if i + 1 >= timepoint_count:
            # Every kept event has an END boundary at or before the last
            # timepoint, so nothing should be active here.
            raise RuntimeError(
                f"events still active after the last timepoint ({time})")

        # Timepoints are distinct and sorted, so this span has positive length.
        result_line = pysubs2.SSAEvent(
            start=time,
            end=timepoints[i + 1][0],
            text=sep.join(active.text for active in active_events))
        echo_text(f"\t\tResult Line: {result_line}", 2)
        events.append(result_line)

    echo_text(f"Converted {len(subs)} into {len(events)} individual events.")

    # Keep only the "Default" style (or a fresh one if the file has none).
    # The merged events are built from start/end/text only, so presumably
    # they all refer to that style -- verify against pysubs2's defaults.
    default_style = subs.styles.get("Default", pysubs2.SSAStyle())
    subs.styles.clear()
    subs.styles["Default"] = default_style
    echo_text(f"Using default style: {default_style}", 1)

    subs.events = events
    subs.save(output_path, encoding=encoding)
if __name__ == "__main__":
    # Command-line entry point: parse the arguments and forward them to
    # `process`. Console output is on unless --silent is given.
    parser = argparse.ArgumentParser()
    parser.add_argument("input_path", help="the input file's path")
    parser.add_argument("output_path", help="the output file's path")
    parser.add_argument(
        "-s", "--sep",
        default="\\N",
        help="the separator to be inserted between merged events (default is line break)",
    )
    parser.add_argument(
        "--silent",
        action="store_true",
        help="disable console output",
    )
    parser.add_argument(
        "--verbosity",
        type=int,
        default=0,
        help="the verbosity of console output (default is 0)",
    )
    options = parser.parse_args()

    # argparse already yields str / bool / int for these options.
    process(
        options.input_path,
        options.output_path,
        sep=options.sep,
        echo=not options.silent,
        verbosity=options.verbosity,
    )
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment