Created
March 8, 2023 21:34
-
-
Save zoharbabin/482ce5954a72aa9cc4028fb61a8c7059 to your computer and use it in GitHub Desktop.
A Python script that strips an SRT captions file down to a non-timed TXT transcript. Given an SRT file as input, the script preserves the caption-block structure of the file while merging consecutive lines inside each caption block. It then removes all SRT formatting and produces a clean TXT file without line index markers, timing…
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import sys | |
import re | |
def merge_lines(srt_file, encoding=None):
    """Read an SRT file and collapse every caption block onto a single line.

    A caption block is an index line (digits only), an optional
    ``HH:MM:SS,mmm --> HH:MM:SS,mmm`` timing line, and the caption text up
    to the next blank line or the end of the file.  The index and timing
    lines are dropped; every newline in the remainder of the block
    (including the blank-line separator, when present) is replaced by a
    space and a single newline is appended.  A caption terminated by a blank
    line therefore comes out as its text followed by two spaces and a
    newline.

    Args:
        srt_file: Path of the SRT file to read.
        encoding: Text encoding handed to ``open``.  ``None`` (the default)
            keeps the platform default encoding.

    Returns:
        The text of the file with one caption per line.
    """
    # Read in the SRT file
    with open(srt_file, 'r', encoding=encoding) as f:
        srt = f.read()

    # A UTF-8 byte-order mark decodes to U+FEFF in front of the first index
    # line; drop it so it neither defeats the line-anchored match below nor
    # leaks into the transcript.
    if srt.startswith('\ufeff'):
        srt = srt[1:]

    # Merge consecutive lines in each caption block.
    #  * '^' (with re.M) pins the index to the start of a line, so digits
    #    that merely end a line of caption text are never taken for an index.
    #  * '\Z' lets the final block match even when the file does not end
    #    with a blank line.
    block = re.compile(
        r'^\d+\n'
        r'(?:\d\d:\d\d:\d\d,\d+ --> \d\d:\d\d:\d\d,\d+\n)?'
        r'(.*?(?:\n\n|\Z))',
        flags=re.S | re.M,
    )
    return block.sub(lambda m: m.group(1).replace('\n', ' ') + '\n', srt)
def clean_srt(srt):
    """Remove SRT formatting from caption text and return the plain text.

    The following are removed, in this order:

    1. every index line that is immediately followed by a
       ``HH:MM:SS,mmm --> HH:MM:SS,mmm`` timing line, together with that
       timing line;
    2. every occurrence of the ``'>> '`` speaker marker, wherever it
       appears in a line;
    3. every line that consists solely of digits and is preceded by a
       newline (a leftover index marker);
    4. spaces and tabs at the end of each line.

    Args:
        srt: Caption text, either raw SRT or already merged one caption per
            line.

    Returns:
        The cleaned transcript text.
    """
    # Remove SRT formatting (index markers, timing lines, speaker indication lines)
    text = re.sub(r'\d+\n\d\d:\d\d:\d\d,\d+ --> \d\d:\d\d:\d\d,\d+\n', '', srt)
    # A single unanchored pass also covers markers at the start of a line.
    text = text.replace('>> ', '')
    text = re.sub(r'\n\d+\n', '\n', text)
    # Trim padding last: a digits-only caption that still carries trailing
    # spaces must not be mistaken for an index marker by the step above.
    return re.sub(r'[ \t]+$', '', text, flags=re.M)
if __name__ == '__main__':
    import os

    # Get the SRT file path from the command line arguments; exit with a
    # usage message instead of an IndexError traceback when it is missing.
    if len(sys.argv) < 2:
        sys.exit('usage: {} FILE.srt'.format(sys.argv[0]))
    srt_file = sys.argv[1]

    # Merge consecutive lines in each caption block
    merged_srt = merge_lines(srt_file)

    # Remove SRT formatting (index markers, timing lines, speaker indication lines).
    # The result gets its own name so the clean_srt function is not rebound.
    transcript = clean_srt(merged_srt)

    # Write the cleaned text to a new file with the same name as the SRT file,
    # but with the extension changed to .txt.  splitext replaces whatever
    # extension is present (or none) rather than blindly cutting four
    # characters off the path.
    txt_file = os.path.splitext(srt_file)[0] + '.txt'
    with open(txt_file, 'w') as f:
        f.write(transcript)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment