Skip to content

Instantly share code, notes, and snippets.

@eunicode
Forked from ndunn219/srt_to_txt.py
Created July 9, 2021 21:42
Show Gist options
  • Save eunicode/43cac5a60faf71f342b331a89719c107 to your computer and use it in GitHub Desktop.
Save eunicode/43cac5a60faf71f342b331a89719c107 to your computer and use it in GitHub Desktop.
Simple Python Script for Extracting Text from an SRT File
"""
Creates readable text file from SRT file.
"""
import re, sys
def is_time_stamp(l):
    """
    Tell whether a line looks like the start of a time stamp.

    A line qualifies when its first two characters are numeric and the
    third character is a colon, e.g. "00:00:01,000 --> 00:00:04,000".

    l: the line to inspect.
    Returns True or False. Strings shorter than three characters are never
    time stamps.
    """
    # Compare a slice instead of indexing l[2]: a short numeric string such
    # as "12" would otherwise raise IndexError.
    return l[:2].isnumeric() and l[2:3] == ':'
def has_letters(line):
    """
    Tell whether a line contains at least one letter.

    Any alphabetic character counts, not only ASCII a-z / A-Z, so lines
    written entirely in accented or non-Latin letters are still recognised
    as text.

    line: the string to inspect.
    Returns True if any character is alphabetic, otherwise False.
    """
    # str.isalpha() is Unicode-aware; an ASCII-only check would report no
    # letters for a line such as "Привет".
    return any(ch.isalpha() for ch in line)
def has_no_text(line):
    """
    Tell whether a line should be discarded because it carries no text.

    A line is considered text-free when, after stripping surrounding
    whitespace, it is empty, entirely numeric, starts like a time stamp,
    or is wrapped in parentheses; or when the line contains no letters.

    line: one raw line read from the SRT file.
    Returns True if the line should be skipped, otherwise False.
    """
    stripped = line.strip()
    # The order matters: the empty check must come before any indexing.
    if not stripped or stripped.isnumeric() or is_time_stamp(stripped):
        return True
    wrapped_in_parens = stripped[0] == '(' and stripped[-1] == ')'
    return wrapped_in_parens or not has_letters(line)
def is_lowercase_letter_or_comma(letter):
    """
    Tell whether a character is a comma or a letter that equals its own
    lowercase form.

    letter: the character to test.
    Returns True or False.
    """
    if letter == ',':
        return True
    # Compare against lower() rather than calling islower(): alphabetic
    # characters without case equal their own lowercase form and must
    # still be accepted.
    return letter.isalpha() and letter == letter.lower()
def clean_up(lines):
    """
    Drop every line without text and join text that was split across
    several lines.

    lines: the lines of the SRT file; the first entry is always skipped.
    Returns a new list of text lines. A kept line that starts with a
    lowercase letter or a comma is appended to the previously kept line
    instead of starting a new one.
    """
    merged = []
    for text in lines[1:]:
        if has_no_text(text):
            continue
        continues_previous = bool(merged) and is_lowercase_letter_or_comma(text[0])
        if continues_previous:
            # Glue onto the previous line, replacing its trailing
            # whitespace/newline with a single space.
            merged[-1] = ' '.join((merged[-1].strip(), text))
        else:
            merged.append(text)
    return merged
def main(args):
    """
    Convert an SRT file into a plain-text file written next to it.

    args[1]: file name
    args[2]: encoding used to read the SRT file. Default: utf-8.
    - If you get a lot of [?]s replacing characters,
    - you probably need to change file_encoding to 'cp1252'

    The result is written to the input path with its extension replaced
    by '.txt', encoded as UTF-8.
    """
    import os  # local import: only needed for the extension handling below

    file_name = args[1]
    file_encoding = args[2] if len(args) >= 3 else 'utf-8'
    with open(file_name, encoding=file_encoding, errors='replace') as f:
        lines = f.readlines()
    new_lines = clean_up(lines)
    # splitext swaps only the real extension, so names that do not end in a
    # four-character suffix are not truncated.
    new_file_name = os.path.splitext(file_name)[0] + '.txt'
    # Write UTF-8 explicitly: the platform default encoding may be unable to
    # encode the text, including the replacement characters that
    # errors='replace' can introduce while reading.
    with open(new_file_name, 'w', encoding='utf-8') as f:
        f.writelines(new_lines)
# Run the conversion only when executed as a script, not when imported.
if __name__ == '__main__':
    main(sys.argv)
"""
NOTES
* Run from command line as
** python srt_to_txt.py file_name.srt cp1252
* Creates file_name.txt with extracted text from file_name.srt
* Script assumes that lines beginning with lowercase letters or commas
* are part of the previous line and lines beginning with any other character
* are new lines. This won't always be correct.
"""
@eunicode
Copy link
Author

eunicode commented Jul 9, 2021

To use:
python3 srt_to_txt.py <file_name.srt> [encoding]
The encoding argument is optional and defaults to utf-8; pass cp1252 if many characters come out replaced.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment