Last active
April 25, 2024 14:25
-
-
Save hshrews/e7b4f54edebf661c3dcde33d5d97c4cb to your computer and use it in GitHub Desktop.
Python script to convert .srt file to .txt file containing just the text as a data blob.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
""" | |
About: | |
Looks in the given directory for any .srt (transcript) files to convert to a simple, | |
single line of text with timestamps and numbered lines removed. | |
Command line execution: | |
python srt_to_txt.py <source_directory> [<file_encoding>]
:param <source_directory>: the location of the .srt file(s)
:param <file_encoding>: optional encoding of the .srt file(s); defaults to 'utf-8'
Output: | |
The result is that any file of type .srt in the directory is converted to a .txt file | |
of the same name, placed in a (dedicated) nested directory created to store such files. | |
Assumptions: | |
- lines beginning with lowercase letters or commas are part of the previous line | |
- lines beginning with any other character are new lines | |
Tested with Python 3.7 | |
""" | |
import os | |
import re | |
import sys | |
def is_timestamp(l):
    """Return True if the line looks like an SRT timestamp line.

    A line is treated as a timestamp when its first two characters are
    numeric and the third character is a colon (e.g. ``00:01:02,500``).

    :param l: a single line of text
    :return: True for timestamp-like lines, False otherwise
    """
    # Slice instead of indexing so that lines shorter than three characters
    # (e.g. "1" or "12") yield False rather than raising IndexError.
    return l[:2].isnumeric() and l[2:3] == ':'
def is_text_content(line):
    """Report whether the line contains at least one ASCII letter.

    :param line: text to inspect
    :return: True if any character in a-z or A-Z occurs in the line, else False
    """
    return re.search('[a-zA-Z]', line) is not None
def has_no_text(line):
    """Decide whether a line carries no transcript text and should be dropped.

    A line counts as text-free when it is empty, purely numeric, a
    timestamp, wrapped entirely in parentheses, or has no ASCII letter.

    :param line: a line of the transcript (the caller strips it first)
    :return: True if the line should be skipped, False if it holds text
    """
    # The empty case is handled first so the remaining checks always see
    # at least one character.
    if not line:
        return True
    return (
        line.isnumeric()
        or is_timestamp(line)
        or (line.startswith('(') and line.endswith(')'))
        or not is_text_content(line)
    )
def filter_lines(lines):
    """Collapse raw .srt lines into one space-separated string of text.

    The first element of ``lines`` is always skipped. Every remaining line
    is stripped of surrounding whitespace and kept only if ``has_no_text``
    reports that it contains text.

    :param lines: list of lines as read from the .srt file
    :return: the kept lines joined by single spaces
    """
    stripped = (raw.strip() for raw in lines[1:])
    return ' '.join(text for text in stripped if not has_no_text(text))
def file_srt_to_txt(file_name, cur_dir, new_dir, encoding):
    """Convert one .srt file into a .txt file holding only its text.

    The result is written to ``<cur_dir>/<new_dir>/<base name>.txt``.

    :param file_name: name of the .srt file inside ``cur_dir``
    :param cur_dir: directory containing the source file
    :param new_dir: sub-directory of ``cur_dir`` that receives the .txt file
    :param encoding: encoding used to decode the .srt file
    """
    src_path = os.path.join(cur_dir, file_name)
    # Undecodable bytes are replaced rather than aborting the conversion.
    with open(src_path, 'r', encoding=encoding, errors='replace') as f:
        data = filter_lines(f.readlines())

    # splitext removes the actual extension instead of assuming it is
    # exactly four characters long.
    base_name = os.path.splitext(file_name)[0]
    new_file_name = os.path.join(cur_dir, new_dir, base_name + '.txt')

    # Write UTF-8 explicitly: the platform default encoding may be unable to
    # represent the decoded text (or the replacement character), which would
    # raise UnicodeEncodeError.
    with open(new_file_name, 'w', encoding='utf-8') as f:
        f.write(data)
def main(args):
    """Convert every .srt file in the given directory to a .txt file.

    The converted files are placed in a ``srt_to_txt`` sub-directory of the
    source directory, which is created if it does not exist yet.

    :param args: command-line argument list; ``args[1]`` is the source
        directory and the optional ``args[2]`` is the .srt file encoding
    :raises SystemExit: with status 1 if the directory argument is missing
        or does not name an existing directory
    """
    # Guard against a missing argument instead of failing with IndexError.
    if len(args) < 2:
        print('Usage: python srt_to_txt.py <source_directory> [<file_encoding>]')
        sys.exit(1)

    # Get the source directory name
    dir_path = args[1]
    if not os.path.isdir(dir_path):
        print('Enter a valid directory path')
        # sys.exit is always available (the bare exit() helper comes from
        # the site module) and a non-zero status signals the failure.
        sys.exit(1)

    # NOTE: If a lot of [?]s appear in the output, try setting file_encoding to 'cp1252'
    encoding = 'utf-8' if len(args) < 3 else args[2]

    # Create a dedicated directory for the converted files; an already
    # existing directory is fine.
    new_dir = 'srt_to_txt'
    os.makedirs(os.path.join(dir_path, new_dir), exist_ok=True)

    with os.scandir(dir_path) as dir_it:
        for file_entry in dir_it:
            if file_entry.name.endswith(".srt") and file_entry.is_file():
                file_srt_to_txt(file_entry.name, dir_path, new_dir, encoding)
# Script entry point: run the conversion only when executed directly,
# passing the raw command-line arguments through to main().
if __name__ == '__main__':
    main(sys.argv)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment