YouTube XML Caption Format to SRT
#!/usr/bin/env python3
# -*- encoding:utf-8 -*-
# Install: pip install lxml
# Usage: python youtube_xml_to_srt.py INPUT_FILE.xml > OUTPUT_FILE.srt
import re | |
import sys | |
from decimal import Decimal | |
import html | |
from lxml import html as lxml_html | |
# Path of the caption XML file: the first command-line argument.
filename = sys.argv[1]

# The file is read as bytes and handed to lxml's HTML parser in one piece.
with open(filename, 'rb') as xml_file:
    raw_xml = xml_file.read()
root = lxml_html.fromstring(raw_xml)
def extract_s_ms(value):
    """Split a decimal-seconds string into whole seconds and milliseconds.

    The fractional part is a decimal fraction of a second, not a count of
    milliseconds: "1.5" means 1 s 500 ms. It is therefore right-padded with
    zeros and truncated to exactly three digits before conversion.

    Args:
        value: Timestamp or duration in seconds as text, e.g. "12.34" or "7".

    Returns:
        A ``(seconds, milliseconds)`` tuple of ``Decimal`` values, with
        milliseconds in the range 0-999.
    """
    whole, _, fraction = value.strip().partition('.')
    # "5" -> "500", "021" -> "021", "123456" -> "123" (sub-ms digits dropped).
    millis = (fraction + '000')[:3]
    # An input such as ".5" has an empty integer part; treat it as zero.
    return Decimal(whole or '0'), Decimal(millis)
def convert_to_h_m_s(value):
    """Break a number of seconds into an (hours, minutes, seconds) tuple."""
    # Peel off whole hours first, then split what is left into minutes/seconds.
    hours, within_hour = divmod(value, 3600)
    minutes, seconds = divmod(within_hour, 60)
    return hours, minutes, seconds
# Emit one SRT cue per child element of the parsed document.
# SRT cue counters start at 1, hence start=1.
for i, element in enumerate(root, start=1):
    start_time_s, start_time_ms = extract_s_ms(element.attrib['start'])
    duration_s, duration_ms = extract_s_ms(element.attrib['dur'])

    # The two millisecond parts can add up to 1000 or more; carry the excess
    # into the seconds so the end stamp never shows an out-of-range ms field.
    carry_s, end_time_ms = divmod(start_time_ms + duration_ms, 1000)
    end_time_s = start_time_s + duration_s + carry_s

    # Convert to int before formatting: a width/precision spec applied to a
    # Decimal falls back to exponent notation once the value needs more
    # digits than the precision allows.
    s_h, s_m, s_s = (int(part) for part in convert_to_h_m_s(start_time_s))
    e_h, e_m, e_s = (int(part) for part in convert_to_h_m_s(end_time_s))

    # element.text is None for an element without text; emit an empty cue
    # body instead of failing in html.unescape.
    text = html.unescape(element.text or '')

    # Cue layout: counter, timing line, text, then a blank separator line
    # (the trailing "\n" plus print's own newline).
    print(
        f"{i}\n"
        f"{s_h:02d}:{s_m:02d}:{s_s:02d},{int(start_time_ms):03d} --> "
        f"{e_h:02d}:{e_m:02d}:{e_s:02d},{int(end_time_ms):03d}\n"
        f"{text}\n"
    )
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment