Skip to content

Instantly share code, notes, and snippets.

@zoharbabin
Created March 8, 2023 21:34
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save zoharbabin/482ce5954a72aa9cc4028fb61a8c7059 to your computer and use it in GitHub Desktop.
Save zoharbabin/482ce5954a72aa9cc4028fb61a8c7059 to your computer and use it in GitHub Desktop.
A Python script that strips an SRT captions file down to a non-timed TXT transcript. Given an SRT file as input, the script preserves the caption-block structure of the file while merging consecutive lines inside each caption block. It then removes all SRT formatting and produces a clean TXT file without line index markers, timing…
import os
import re
import sys
def merge_lines(srt_file):
    """Read an SRT file and collapse each caption block into a single line.

    For every caption block the index line and, when present, the timing
    line are dropped, and the block's text lines are joined with single
    spaces and terminated by one newline. Text that does not match the
    caption-block pattern is returned unchanged.

    Args:
        srt_file: Path of the SRT file to read.

    Returns:
        The file's text with each matched caption block replaced by its
        merged text line.
    """
    # Read in the SRT file
    with open(srt_file, 'r') as f:
        srt = f.read()

    # A block is: index line, optional timing line, then text up to a blank
    # line OR the end of the file. Accepting end-of-file keeps the final
    # caption from being skipped when the file does not end with an empty
    # line. The terminator is matched outside the captured text so that it
    # is not converted into trailing spaces by the newline replacement below.
    block_pattern = (
        r'\d+\n'
        r'(?:\d\d:\d\d:\d\d,\d+ --> \d\d:\d\d:\d\d,\d+\n)?'
        r'(.*?)(?:\n\n|\n?\Z)'
    )

    # Merge consecutive lines in each caption block
    return re.sub(
        block_pattern,
        lambda match: match.group(1).replace('\n', ' ') + '\n',
        srt,
        flags=re.S,
    )
def clean_srt(srt):
    """Strip SRT formatting from caption text and return the result.

    Applies a fixed sequence of regular-expression substitutions that remove
    index/timing line pairs, ">> " speaker markers, and digits-only lines.

    Args:
        srt: Caption text to clean.

    Returns:
        The text after all substitutions have been applied in order.
    """
    # Ordered (pattern, replacement, flags) steps; each one runs on the
    # output of the previous step.
    substitutions = (
        # An index line immediately followed by its timing line
        (r'\d+\n\d\d:\d\d:\d\d,\d+ --> \d\d:\d\d:\d\d,\d+\n', '', 0),
        # A ">> " speaker marker at the start of a line
        (r'^>> ', '', re.M),
        # Any ">> " speaker marker left elsewhere in the text
        (r'>> ', '', 0),
        # A digits-only line enclosed by newlines collapses to one newline
        (r'\n\d+\n', '\n', 0),
    )

    text = srt
    for pattern, replacement, flags in substitutions:
        text = re.sub(pattern, replacement, text, flags=flags)
    return text
if __name__ == '__main__':
    # Get the SRT file path from the command line arguments; exit with a
    # usage message rather than an IndexError traceback when it is missing
    if len(sys.argv) < 2:
        sys.exit('Usage: {} <captions.srt>'.format(sys.argv[0]))
    srt_file = sys.argv[1]

    # Merge consecutive lines in each caption block
    merged_srt = merge_lines(srt_file)

    # Remove SRT formatting (index markers, timing lines, speaker indication lines).
    # The result gets its own name so the clean_srt function is not rebound.
    transcript = clean_srt(merged_srt)

    # Write the cleaned text to a new file with the same name as the SRT file,
    # but with the extension changed to .txt. splitext handles inputs with any
    # extension length (or none) instead of blindly dropping four characters.
    txt_file = os.path.splitext(srt_file)[0] + '.txt'
    with open(txt_file, 'w') as f:
        f.write(transcript)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment