Skip to content

Instantly share code, notes, and snippets.

@hshrews
Last active April 25, 2024 14:25
Show Gist options
  • Save hshrews/e7b4f54edebf661c3dcde33d5d97c4cb to your computer and use it in GitHub Desktop.
Save hshrews/e7b4f54edebf661c3dcde33d5d97c4cb to your computer and use it in GitHub Desktop.
Python script to convert .srt file to .txt file containing just the text as a data blob.
"""
About:
Looks in the given directory for any .srt (subtitle/transcript) files and converts each
one to a single line of plain text, with the timestamps and cue numbers removed.
Command line execution:
python srt_to_txt.py <source_directory> [<file_encoding>]
<source_directory>: the directory containing the .srt file(s) (required)
<file_encoding>: the encoding of the .srt file(s) (optional; defaults to 'utf-8')
Output:
The result is that any file of type .srt in the directory is converted to a .txt file
of the same name, placed in a (dedicated) nested directory created to store such files.
Assumptions:
- lines beginning with lowercase letters or commas are part of the previous line
- lines beginning with any other character are new lines
Tested with Python 3.7
"""
import os
import re
import sys
def is_timestamp(l):
    """Return True if the line looks like an SRT timestamp line.

    A line is treated as a timestamp when its first two characters are
    numeric and the third character is a colon, e.g.
    ``00:00:01,000 --> 00:00:04,000``.

    :param l: a single line of text
    :return: True for timestamp-like lines, False otherwise
    """
    # Slice (rather than index) the third character so that lines shorter
    # than three characters, such as "5" or "12", yield False instead of
    # raising IndexError.
    return l[:2].isnumeric() and l[2:3] == ':'
def is_text_content(line):
    """Return True if the line contains at least one letter.

    Letters are detected with ``str.isalpha`` so that subtitles written in
    non-Latin scripts (Cyrillic, Greek, CJK, ...) count as text too; an
    ASCII-only check would silently drop such lines from the output.

    :param line: a single line of text
    :return: True if any character in the line is alphabetic, else False
    """
    return any(ch.isalpha() for ch in line)
def has_no_text(line):
    """Decide whether a line should be dropped from the transcript.

    A line carries no text when it is empty, is purely numeric, looks like
    a timestamp, is entirely wrapped in parentheses, or contains no text
    content according to ``is_text_content``.

    :param line: a single line of text
    :return: True if the line should be skipped, False if it should be kept
    """
    if not line:
        return True
    wrapped_in_parens = line.startswith('(') and line.endswith(')')
    # The purely-numeric test runs before the timestamp test, matching the
    # order in which the checks short-circuit.
    return (
        line.isnumeric()
        or is_timestamp(line)
        or wrapped_in_parens
        or not is_text_content(line)
    )
def filter_lines(lines):
    """Collapse the lines of an .srt file into one string of text.

    The first element of ``lines`` is always skipped. Each remaining line
    is stripped of surrounding whitespace, and lines for which
    ``has_no_text`` is true are discarded.

    :param lines: the lines of the file, as returned by ``readlines()``
    :return: the kept lines joined with single spaces
    """
    stripped = (raw.strip() for raw in lines[1:])
    return ' '.join(text for text in stripped if not has_no_text(text))
def file_srt_to_txt(file_name, cur_dir, new_dir, encoding):
    """Convert one .srt file into a .txt file containing only its text.

    :param file_name: name of the .srt file inside ``cur_dir``
    :param cur_dir: directory that contains the source file
    :param new_dir: name of the sub-directory of ``cur_dir`` that receives
        the converted file; it must already exist
    :param encoding: encoding used to decode the source file
    """
    src_path = os.path.join(cur_dir, file_name)
    # Undecodable bytes are replaced with U+FFFD rather than aborting.
    with open(src_path, 'r', encoding=encoding, errors='replace') as f:
        data = filter_lines(f.readlines())
    # splitext removes the extension without assuming it is 4 characters.
    base_name = os.path.splitext(file_name)[0]
    new_file_name = os.path.join(cur_dir, new_dir, base_name + '.txt')
    # Write UTF-8 explicitly: the platform default encoding (e.g. cp1252)
    # cannot represent U+FFFD or many non-Latin characters, which would
    # raise UnicodeEncodeError part-way through the write.
    with open(new_file_name, 'w', encoding='utf-8') as f:
        f.write(data)
def main(args):
    """Convert every .srt file in a directory to a .txt file.

    The converted files are written to a ``srt_to_txt`` sub-directory of
    the source directory, which is created if it does not exist yet.

    :param args: command-line arguments in ``sys.argv`` layout:
        ``args[1]`` is the source directory and the optional ``args[2]`` is
        the encoding of the .srt files (defaults to ``'utf-8'``)
    :raises SystemExit: with status 1 when the directory argument is
        missing or is not an existing directory
    """
    # Guard against a missing argument instead of failing with IndexError.
    if len(args) < 2:
        print('Usage: python srt_to_txt.py <source_directory> [<file_encoding>]')
        sys.exit(1)
    # Get the source directory name
    dir_path = args[1]
    if not os.path.isdir(dir_path):
        print('Enter a valid directory path')
        # sys.exit is always available (the bare exit() helper comes from
        # the site module); a non-zero status reports the failure.
        sys.exit(1)
    # NOTE: If a lot of [?]s appear in the output, try setting file_encoding to 'cp1252'
    encoding = 'utf-8' if len(args) < 3 else args[2]
    # Create a dedicated directory for the converted files
    new_dir = 'srt_to_txt'
    os.makedirs(os.path.join(dir_path, new_dir), exist_ok=True)
    with os.scandir(dir_path) as dir_it:
        for file_entry in dir_it:
            if file_entry.name.endswith(".srt") and file_entry.is_file():
                file_srt_to_txt(file_entry.name, dir_path, new_dir, encoding)
# Run the conversion only when executed as a script, not when imported.
if __name__ == "__main__":
    main(sys.argv)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment