lucidyan/README.md

## README.md

      
    Raw
  

              README.md
            
          
    Process yt-dlp Extracted Subtitles Script

Description

This Python script is designed to process .vtt subtitle files obtained using yt-dlp from YouTube or similar platforms. It merges subtitles with overlapping segments and cleans the text by removing excess whitespace. The script outputs the processed subtitles into a new text file with a timestamped filename.
Features


Subtitle Merging: Combines multiple subtitle entries into a single entry, considering overlaps.
Text Cleaning: Cleans subtitle text by replacing newline characters with spaces and collapsing runs of whitespace into a single space.
Output: Generates a single text file that contains one cleaned and merged section, headed by a title taken from the file name, for each .vtt file in the specified directory.

Requirements


Python 3
webvtt-py library for parsing .vtt files

Usage


Obtain Subtitles: First, download the subtitles using yt-dlp with the following commands:

For automatic subtitles (machine-generated):
yt-dlp --write-auto-sub --sub-lang ru --skip-download YOUR_URL


For manual subtitles (provided by the uploader):
yt-dlp --write-subs --sub-lang ru --skip-download YOUR_URL


Script Execution: Run the script from a directory one level below the directory containing the .vtt files: it reads the subtitles from, and writes its output to, the parent directory (..) of the current working directory (or modify the path variable as needed).

Output

The script outputs processed subtitles in a new file named with the pattern content_YYYY-MM-DD-HH-MM-SS.txt, where the timestamp corresponds to the script execution time.
Error Handling

Errors during the processing of .vtt files (such as malformed files) are logged to the console.


## script.py
import os
import webvtt
import re
import json
import datetime

def merge_subtitles(subtitles):
    """Merge consecutive caption strings into one text, dropping repeats.

    Rolling captions repeat the tail of the previous caption at the start
    of the next one.  For every caption after the first, the longest prefix
    that the merged text already ends with is skipped and only the rest is
    appended.  Captions that share no overlap are joined with a single
    space so that their boundary words are not fused together.

    Args:
        subtitles: Sequence of caption strings in playback order.

    Returns:
        The merged text, or "" when ``subtitles`` is empty.
    """
    if not subtitles:
        return ""

    merged = subtitles[0]
    for subtitle in subtitles[1:]:
        # Largest j such that the first j characters of `subtitle` are
        # already the last j characters of `merged`; 0 means no overlap.
        overlap = next(
            (
                j
                for j in range(min(len(merged), len(subtitle)), 0, -1)
                if merged.endswith(subtitle[:j])
            ),
            0,
        )
        remainder = subtitle[overlap:]
        # With no overlap the caption is a separate piece of text: add a
        # separator unless one side already supplies whitespace (or is empty).
        if (
            overlap == 0
            and merged
            and remainder
            and not merged[-1].isspace()
            and not remainder[0].isspace()
        ):
            merged += " "
        merged += remainder
    return merged

def clean_text(text):
    """Return ``text`` on one line with whitespace runs collapsed.

    Newlines become spaces first; afterwards every run of two or more
    whitespace characters is replaced by a single space.  A lone whitespace
    character (including leading or trailing ones) is left untouched.
    """
    single_line = text.replace("\n", " ")
    collapsed = re.sub(r"\s{2,}", " ", single_line)
    return collapsed

def process_subtitles_from_directory(path):
    """Read every .vtt file directly inside ``path`` and merge its captions.

    Args:
        path: Directory to scan; sub-directories are not visited.

    Returns:
        Dict mapping a title derived from each file name to that file's
        cleaned, merged and stripped caption text.  Files for which webvtt
        raises ``MalformedFileError`` are reported on stdout and skipped.
    """
    content = {}
    for filename in os.listdir(path):
        full_path = os.path.join(path, filename)
        if not (filename.endswith(".vtt") and os.path.isfile(full_path)):
            continue
        try:
            captions = [clean_text(caption.text) for caption in webvtt.read(full_path)]
        except webvtt.MalformedFileError as e:
            print(f"Error processing {filename}: {e}")
            continue
        # File names are expected to look like "Title [video_id].lang.vtt".
        # Cut at the LAST " [" so a title that itself contains brackets is
        # kept whole instead of being truncated (and possibly colliding with
        # another file that shares the same prefix).
        title = filename.rsplit(" [", 1)[0]
        content[title] = merge_subtitles(captions).strip()
    return content

def save_content_to_file(content, path, filename):
    """Write all titles and their texts to ``path/filename`` as UTF-8.

    Entries are written in sorted order.  Each entry is a ``# title``
    heading, a blank line, the text, and another blank line.

    Args:
        content: Dict mapping title to merged subtitle text.
        path: Directory that receives the output file.
        filename: Name of the output file inside ``path``.
    """
    # Explicit UTF-8: subtitle text is frequently non-ASCII, and the locale
    # default encoding may be unable to represent it.
    with open(os.path.join(path, filename), "w", encoding="utf-8") as f:
        for title, text in sorted(content.items()):
            f.write(f"# {title}\n\n{text}\n\n")

def main(path=".."):
    """Merge all .vtt subtitles found in ``path`` into one timestamped file.

    Args:
        path: Directory that is scanned for .vtt files and that also
            receives the output file.  Defaults to the parent of the
            current working directory.
    """
    content = process_subtitles_from_directory(path)
    # Local wall-clock time of this run, down to the second, so that runs
    # started at different times produce different output names.
    timestamp = datetime.datetime.now().strftime("%Y-%m-%d-%H-%M-%S")
    output_filename = f'content_{timestamp}.txt'
    save_content_to_file(content, path, output_filename)
    print(f"Processed subtitles saved to {output_filename}")


if __name__ == "__main__":
    main()
	import os
	import webvtt
	import re
	import json
	import datetime

	def merge_subtitles(subtitles):
	    """Merge consecutive caption strings into one text, dropping repeats.

	    For every caption after the first, the longest prefix that the merged
	    text already ends with is skipped and only the rest is appended.
	    Captions that share no overlap are joined with a single space so that
	    their boundary words are not fused together.

	    Args:
	        subtitles: Sequence of caption strings in playback order.

	    Returns:
	        The merged text, or "" when ``subtitles`` is empty.
	    """
	    if not subtitles:
	        return ""

	    merged = subtitles[0]
	    for subtitle in subtitles[1:]:
	        # Largest j such that the first j characters of `subtitle` are
	        # already the last j characters of `merged`; 0 means no overlap.
	        overlap = next(
	            (
	                j
	                for j in range(min(len(merged), len(subtitle)), 0, -1)
	                if merged.endswith(subtitle[:j])
	            ),
	            0,
	        )
	        remainder = subtitle[overlap:]
	        # With no overlap the caption is a separate piece of text: add a
	        # separator unless one side already supplies whitespace (or is empty).
	        if (
	            overlap == 0
	            and merged
	            and remainder
	            and not merged[-1].isspace()
	            and not remainder[0].isspace()
	        ):
	            merged += " "
	        merged += remainder
	    return merged

	def clean_text(text):
	    """Return ``text`` on one line with whitespace runs collapsed.

	    Newlines become spaces first; afterwards every run of two or more
	    whitespace characters is replaced by a single space.
	    """
	    return re.sub(r"\s{2,}", " ", text.replace("\n", " "))

	def process_subtitles_from_directory(path):
	    """Read every .vtt file directly inside ``path`` and merge its captions.

	    Args:
	        path: Directory to scan; sub-directories are not visited.

	    Returns:
	        Dict mapping a title derived from each file name to that file's
	        cleaned, merged and stripped caption text.  Files for which webvtt
	        raises ``MalformedFileError`` are reported on stdout and skipped.
	    """
	    content = {}
	    for filename in os.listdir(path):
	        full_path = os.path.join(path, filename)
	        if not (filename.endswith(".vtt") and os.path.isfile(full_path)):
	            continue
	        try:
	            captions = [clean_text(caption.text) for caption in webvtt.read(full_path)]
	        except webvtt.MalformedFileError as e:
	            print(f"Error processing {filename}: {e}")
	            continue
	        # Cut at the LAST " [" so a title that itself contains brackets is
	        # kept whole instead of being truncated at its first bracket.
	        title = filename.rsplit(" [", 1)[0]
	        content[title] = merge_subtitles(captions).strip()
	    return content

	def save_content_to_file(content, path, filename):
	    """Write all titles and their texts to ``path/filename`` as UTF-8.

	    Entries are written in sorted order.  Each entry is a ``# title``
	    heading, a blank line, the text, and another blank line.

	    Args:
	        content: Dict mapping title to merged subtitle text.
	        path: Directory that receives the output file.
	        filename: Name of the output file inside ``path``.
	    """
	    # Explicit UTF-8: subtitle text is frequently non-ASCII, and the locale
	    # default encoding may be unable to represent it.
	    with open(os.path.join(path, filename), "w", encoding="utf-8") as f:
	        for k, v in sorted(content.items()):
	            print(f"# {k}", file=f)
	            print("", file=f)
	            print(v, file=f)
	            print("", file=f)

	def main(path=".."):
	    """Merge all .vtt subtitles found in ``path`` into one timestamped file.

	    Args:
	        path: Directory that is scanned for .vtt files and that also
	            receives the output file.  Defaults to the parent of the
	            current working directory.
	    """
	    content = process_subtitles_from_directory(path)
	    # Local wall-clock time of this run, down to the second.
	    timestamp = datetime.datetime.now().strftime("%Y-%m-%d-%H-%M-%S")
	    output_filename = f'content_{timestamp}.txt'
	    save_content_to_file(content, path, output_filename)
	    print(f"Processed subtitles saved to {output_filename}")


	if __name__ == "__main__":
	    main()