Skip to content

Instantly share code, notes, and snippets.

@arturmartins
Created October 26, 2023 10:53
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save arturmartins/1c78de3e8c21ffce81a17dc2f2181de4 to your computer and use it in GitHub Desktop.
Save arturmartins/1c78de3e8c21ffce81a17dc2f2181de4 to your computer and use it in GitHub Desktop.
Converts WEBVTT subtitles (vtt) to plain text.
#!/usr/bin/env python3
"""
Converts WEBVTT subtitles (vtt) to plain text.
It removes all time-related information, as well as empty lines and consecutive duplicate lines.
"""
# Author: Artur Martins <arturmartins@gmail.com>
# Version: 1.0
# Date: 2023-Oct-25
import argparse
import html
import os
import re
ENCODING_TYPE = "utf-8"
HEADER = "WEBVTT"
# Matches any WebVTT markup tag, e.g. <c.yellow>, </v> or inline <00:00:01.500>.
_TAG_PATTERN = re.compile(r"<.*?>")
# Matches a WebVTT timestamp: mm:ss.ttt with an optional leading hours field
# of two or more digits (hh:mm:ss.ttt).
_TIMESTAMP_PATTERN = re.compile(r"(?:\d{2,}:)?\d{2}:\d{2}\.\d{3}")


def clean_line(line: str) -> str:
    """
    Remove all WebVTT tags and time codes from the given line.

    Markup tags are removed first, then any remaining timestamps (with or
    without an hours field). Character references such as ``&amp;`` or
    ``&lt;`` are decoded afterwards so the result is real plain text.

    Args:
        line (str): The line of text to clean.

    Returns:
        str: The cleaned line with all tags and time codes removed, character
        references decoded and leading/trailing whitespace stripped.
    """
    without_tags = _TAG_PATTERN.sub("", line)
    without_times = _TIMESTAMP_PATTERN.sub("", without_tags)
    # Decode last: doing it earlier would turn an escaped "&lt;b&gt;" into a
    # literal "<b>" that the tag pattern would then wrongly delete.
    return html.unescape(without_times).strip()
def convert_webvtt_to_text(input_path: str, output_path: str, verbose: bool) -> None:
    """
    Convert a WebVTT file to plain text.

    The input is read line by line. The WebVTT header line (with an optional
    leading byte order mark and optional trailing text), any header metadata
    lines that directly follow it, cue timing lines (those containing
    ``-->``) and empty lines are skipped. Every other line is cleaned with
    ``clean_line`` and written to the output unless it is empty after
    cleaning or identical to the previously written line.

    Args:
        input_path (str): The path to the WebVTT input file.
        output_path (str): The path to the output text file.
        verbose (bool): If True, print the cleaned lines as they are written.

    Returns:
        None
    """
    previous_text = ""
    in_header_block = False
    with open(input_path, "r", encoding=ENCODING_TYPE) as infile, open(
        output_path, "w", encoding=ENCODING_TYPE
    ) as outfile:
        # Iterate the file object directly instead of loading every line
        # into memory with readlines().
        for index, raw_line in enumerate(infile):
            if index == 0:
                # A UTF-8 byte order mark is not whitespace, so strip() does
                # not remove it; drop it explicitly so the header is
                # recognised and never leaks into the output.
                raw_line = raw_line.lstrip("\ufeff")
            line = raw_line.strip()

            # The header line may carry trailing text after a space or tab
            # (e.g. "WEBVTT - my title").
            if index == 0 and (
                line == HEADER or line.startswith((HEADER + " ", HEADER + "\t"))
            ):
                in_header_block = True
                continue

            if in_header_block:
                # Lines directly after the header are metadata, not subtitle
                # text. The header block ends at the first blank line, or at
                # a timing line if the blank line is missing.
                if line and "-->" not in line:
                    continue
                in_header_block = False

            # Skip empty lines, time lines and any stray WebVTT header
            if not line or "-->" in line or line == HEADER:
                continue

            text = clean_line(line)
            if not text or text == previous_text:
                continue
            if verbose:
                print(f"Writing: {text}")
            outfile.write(text + "\n")
            previous_text = text
if __name__ == "__main__":
    # Command-line entry point: parse the arguments, derive a default output
    # path when none is given, then run the conversion.
    parser = argparse.ArgumentParser(description="Convert WebVTT to text.")
    parser.add_argument(
        "-i", "--input", required=True, help="Path to the input WebVTT file."
    )
    parser.add_argument(
        "-o",
        "--output",
        help="Path to the output text file. Defaults to input file name with .txt extension.",
    )
    parser.add_argument(
        "-v", "--verbose", action="store_true", help="Enable verbose output."
    )
    args = parser.parse_args()

    if args.output is None:
        base_name = os.path.splitext(args.input)[0]
        args.output = f"{base_name}.txt"

    # The converter opens the output for writing before it reads the input,
    # so identical paths (e.g. an input that already ends in ".txt") would
    # truncate the input and lose its contents. Refuse to run in that case.
    if os.path.realpath(args.input) == os.path.realpath(args.output):
        parser.error(
            f"output path {args.output!r} is the same file as the input; "
            "choose a different output with -o/--output."
        )

    if args.verbose:
        print(f"Converting {args.input} to {args.output}...")

    convert_webvtt_to_text(args.input, args.output, args.verbose)
@epogrebnyak
Copy link

epogrebnyak commented Oct 26, 2023

Nice code - can I reuse this in a package?

What would be proper attribution?

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment