Skip to content

Instantly share code, notes, and snippets.

Created June 6, 2023 20:07
Show Gist options
  • Save metaist/b10433ccc6795d4ed82ef42e0b70a209 to your computer and use it in GitHub Desktop.
Save metaist/b10433ccc6795d4ed82ef42e0b70a209 to your computer and use it in GitHub Desktop.
Convert a subtitle file into a transcript.
#!/usr/bin/env python
"""Convert a subtitles file into a transcript.

Usage: srt2txt [-h] [--version]
    [-i FILE] [-o FILE] [--md]

Options:
  -h, --help              show this message and exit
  --version               show version and exit
  -i FILE, --input FILE   subtitle file [default:]
  -o FILE, --output FILE  transcript file [default: output.txt]
  --md                    markdown format
"""
# native
from __future__ import annotations
from dataclasses import dataclass
from pathlib import Path
from typing import List
# lib
from attrbox import AttrDict
from attrbox import parse_docopt
# Version string handed to `parse_docopt` in `main()` (shown for `--version`).
__version__ = "0.1.0"
@dataclass
class Subtitle:
    """Represents a single subtitle.

    Attributes:
        idx: sequence number of the subtitle within the file
        beg: start timestamp, stored as text
        end: end timestamp, stored as text
        who: name of the speaker (empty when unknown)
        txt: text of the subtitle

    Note:
        When two subtitles are combined, `beg`/`end` are compared as plain
        strings, so the result is only chronologically correct when the
        timestamps are fixed-width and zero-padded (e.g. `00:01:02,345`).
    """

    idx: int = 0
    beg: str = ""
    end: str = ""
    who: str = ""
    txt: str = ""

    def __str__(self) -> str:
        """Return a string representation of this subtitle."""
        return f"{self.idx}\n{self.beg} --> {self.end}\n[{self.who}]: {self.txt}\n"

    def __add__(self, other: Subtitle) -> Subtitle:
        """Return a new subtitle spanning both `self` and `other`.

        The speaker of the left operand is kept, mirroring `__iadd__`.
        """
        return Subtitle(
            idx=min(self.idx, other.idx),
            beg=min(self.beg, other.beg),
            end=max(self.end, other.end),
            who=self.who,
            txt=self.txt + "\n" + other.txt,
        )

    def __iadd__(self, other: Subtitle) -> Subtitle:
        """Extend this subtitle in place to also cover `other`."""
        self.idx = min(self.idx, other.idx)
        self.beg = min(self.beg, other.beg)
        self.end = max(self.end, other.end)
        self.txt += "\n" + other.txt
        return self
def parse(lines: List[str]) -> Subtitle:
    """Convert a list of lines into a `Subtitle`.

    Args:
        lines (List[str]): the stripped, non-empty lines of one subtitle
            entry: an index line, a `BEG --> END` timing line, then one or
            more text lines. The first text line may carry a speaker tag of
            the form `[NAME]: text`.

    Returns:
        Subtitle: the parsed subtitle. Fields whose line is absent keep
            their defaults.
    """
    result = Subtitle()
    text: List[str] = []
    seen_timing = False
    for line in lines:
        if not seen_timing and line.isdecimal():
            # Only a number *before* the timing line is the index; a purely
            # numeric line after it is spoken text (e.g. "2023").
            result.idx = int(line)
        elif not seen_timing and "-->" in line:
            beg, _, rest = line.partition("-->")
            result.beg = beg.strip()
            # Keep only the first field after the arrow so that trailing
            # fields on the timing line do not break the parse.
            fields = rest.split()
            result.end = fields[0] if fields else ""
            seen_timing = True
        elif not text and line.startswith("[") and "]: " in line:
            # Speaker tag on the first text line: "[NAME]: text".
            result.who, _, said = line[1:].partition("]: ")
            text.append(said)
        else:
            # Plain text, including bracketed lines without a "]: " such as
            # "[music]", which are not speaker tags.
            text.append(line)
    # Wrapped lines of one subtitle are joined with a space so words on
    # adjacent lines do not run together.
    result.txt = " ".join(text)
    return result
def parse_srt(text: str) -> List[Subtitle]:
    """Convert a subtitle file text into a list of `Subtitle`.

    Entries are separated by one or more blank lines; each run of non-blank
    lines is handed to `parse`.

    Args:
        text (str): full contents of the subtitle file

    Returns:
        List[Subtitle]: the parsed subtitles in file order (empty when the
            text contains no non-blank lines)
    """
    result: List[Subtitle] = []
    lines: List[str] = []
    # A leading byte-order mark would otherwise stick to the first index line
    # and stop it from being recognized as a number.
    for raw in text.lstrip("\ufeff").split("\n"):
        line = raw.strip()  # also removes the "\r" of CRLF line endings
        if line:
            lines.append(line)
        elif lines:
            # A blank line closes the entry collected so far.
            result.append(parse(lines))
            lines = []
    if lines:
        # The file ended without a trailing blank line; flush the last entry.
        result.append(parse(lines))
    return result
def merge_subtitles(subs: List[Subtitle]) -> List[Subtitle]:
    """Merge adjacent subtitles with the same speaker.

    Args:
        subs (List[Subtitle]): subtitles in playback order

    Returns:
        List[Subtitle]: one subtitle per run of consecutive entries that
            share the same `who`. The first subtitle of each run is reused
            and extended in place with `+=`, so those input objects are
            modified.
    """
    result: List[Subtitle] = []
    prev = None
    for curr in subs:
        if prev is None or prev.who != curr.who:
            # Speaker changed (or first entry): start a new run.
            prev = curr
            result.append(prev)
        else:
            # Same speaker as the previous entry: fold it into the run.
            prev += curr
    return result
def srt2txt(srt: Path, txt: Path, md: bool = False) -> None:
    """Convert a subtitle file to a transcript.

    The subtitles are parsed with `parse_srt`, adjacent entries by the same
    speaker are combined with `merge_subtitles`, and the result is written
    to `txt`.

    Args:
        srt (Path): path to subtitle file
        txt (Path): path to output file
        md (bool, optional): whether to render output as markdown instead of
            a subtitle-format. Defaults to `False`.
    """
    # "utf-8-sig" reads plain UTF-8 as well and drops a leading BOM if present.
    subs = parse_srt(srt.read_text(encoding="utf-8-sig"))
    subs = merge_subtitles(subs)
    with txt.open("w", encoding="utf8") as out:
        for sub in subs:
            if md:
                out.write(f"**{sub.who}**: {sub.txt}\n")
            else:
                # `str(sub)` already ends with a newline; the extra one
                # leaves a blank line between entries.
                out.write(f"{sub}\n")
def main() -> None:
    """Main entry point.

    Parses the command line from the module docstring and converts the input
    subtitle file into the output transcript.

    Raises:
        SystemExit: if the input file cannot be found. The message is
            printed to stderr and the process exits with a non-zero status.
    """
    args = parse_docopt(__doc__, version=__version__)
    args.input = Path(args.input or "")
    args.output = Path(args.output or "output.txt")

    # `Path("")` is the current directory, which "exists"; require a regular
    # file so a missing `--input` is reported here rather than failing later
    # when the path is read.
    if not args.input.is_file():
        raise SystemExit(f"ERROR: Cannot find file: {args.input}")

    # NOTE(review): assumes `parse_docopt` exposes the `--md` flag as
    # `args.md`, the same way `--input`/`--output` are exposed -- confirm.
    srt2txt(args.input, args.output, md=bool(args.md))
# Run the command-line interface only when executed as a script.
if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment