Last active
February 1, 2020 19:38
-
-
Save abdusco/0bed8e42f0eadb6b954fa28a667e0d51 to your computer and use it in GitHub Desktop.
Convert TTML to SRT
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
bs4
lxml
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import re | |
import sys | |
from textwrap import dedent | |
from typing import List | |
import bs4 | |
# TTML clock-time without a frame field, e.g. "00:01:02.5" or "00:01:02".
_CLOCK_TIME = re.compile(r'^(\d+):(\d{2}):(\d{2})(?:\.(\d+))?$')
# TTML offset-time in wall-clock units, e.g. "12.5s", "1500ms", "2m", "1h".
_OFFSET_TIME = re.compile(r'^(\d+(?:\.\d+)?)(h|ms|m|s)$')
# Matches a line-break tag with or without a namespace prefix ("br", "tt:br").
_BR_TAG = re.compile(r'(?:^|:)br$')
_MS_PER_UNIT = {'h': 3_600_000, 'm': 60_000, 's': 1_000, 'ms': 1}


def _to_srt_time(value: str) -> str:
    """Convert a TTML time expression to SRT's ``HH:MM:SS,mmm`` form.

    Clock-times (``HH:MM:SS[.fraction]``) and offset-times measured in
    ``h``/``m``/``s``/``ms`` are converted. Anything else (frame- or
    tick-based expressions, which need document-level rate metadata) is
    returned unchanged.
    """
    stripped = value.strip()

    clock = _CLOCK_TIME.match(stripped)
    if clock:
        hours, minutes, seconds, fraction = clock.groups()
        # Pad or truncate the fractional part to exactly three digits.
        millis = ((fraction or '') + '000')[:3]
        return f'{int(hours):02d}:{minutes}:{seconds},{millis}'

    offset = _OFFSET_TIME.match(stripped)
    if offset:
        amount, unit = offset.groups()
        total_ms = round(float(amount) * _MS_PER_UNIT[unit])
        hours, rest = divmod(total_ms, 3_600_000)
        minutes, rest = divmod(rest, 60_000)
        seconds, millis = divmod(rest, 1_000)
        return f'{hours:02d}:{minutes:02d}:{seconds:02d},{millis:03d}'

    return value


def extract_subs(xml_path: str) -> List[str]:
    """Read a TTML file and return its cues as SRT-formatted strings.

    Every element carrying a ``begin`` attribute is treated as one cue and
    numbered from 1 in document order. Each returned item has the shape
    ``"<index>\\n<begin> --> <end>\\n<text>\\n\\n"``.

    Args:
        xml_path: Path of the TTML/XML file to read.

    Returns:
        One SRT block per matched element, ready to be concatenated.

    Raises:
        KeyError: If an element has a ``begin`` attribute but no ``end``.
        OSError: If the file cannot be opened or read.
    """
    # Hand raw bytes to BeautifulSoup so it can pick the encoding from the
    # BOM / XML declaration instead of relying on the platform default.
    with open(xml_path, 'rb') as fin:
        soup = bs4.BeautifulSoup(fin.read(), 'lxml')

    subs = []
    for i, s in enumerate(soup.select('[begin]'), start=1):
        # .text ignores <br/>, which would glue two subtitle lines together;
        # put an explicit newline in front of every line-break tag.
        for br in s.find_all(_BR_TAG):
            br.insert_before('\n')

        begin, end = _to_srt_time(s['begin']), _to_srt_time(s['end'])

        # Drop empty / whitespace-only lines: a blank line inside a cue would
        # be read as the end of that cue by SRT parsers.
        lines = (line.strip() for line in s.text.splitlines())
        text = '\n'.join(line for line in lines if line)

        # The trailing '\n' element yields the blank separator line.
        subs.append('\n'.join([str(i), f'{begin} --> {end}', text, '\n']))
    return subs
def main():
    """Command-line entry point: convert a TTML file to an SRT file.

    Expects exactly two command-line arguments, the source TTML path and
    the destination SRT path. With any other argument count the usage
    text is printed and the function returns without doing anything.
    """
    # Require exactly two arguments; a "< 3" check lets extra arguments
    # through and the unpacking below would then raise ValueError.
    if len(sys.argv) != 3:
        print(dedent('''\
            usage: python ttml2srt.py src.xml out.srt
            '''))
        return

    src_path, out_path = sys.argv[1:]
    subs = extract_subs(src_path)

    # Write UTF-8 explicitly so non-Latin subtitles do not fail on platforms
    # whose default encoding cannot represent them.
    with open(out_path, 'w', encoding='utf-8') as fout:
        fout.writelines(subs)
# Run the converter only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment