Skip to content

Instantly share code, notes, and snippets.

@dlo
Last active January 25, 2018 17:37
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save dlo/f4ff67278eb683419222ad711c82b87d to your computer and use it in GitHub Desktop.
Save dlo/f4ff67278eb683419222ad711c82b87d to your computer and use it in GitHub Desktop.
YouTube XML Caption Format to SRT
#!/usr/bin/env python3
# -*- encoding:utf-8 -*-
# Install: pip install lxml
# Usage: python youtube_xml_to_srt.py INPUT_FILE.xml > OUTPUT_FILE.srt
import re
import sys
from decimal import Decimal
import html
from lxml import html as lxml_html
# The caption file path is the first command-line argument. Exit with a usage
# message on stderr (stdout is typically redirected into the .srt file)
# instead of crashing with a bare IndexError when it is missing.
if len(sys.argv) < 2:
    sys.exit("Usage: python youtube_xml_to_srt.py INPUT_FILE.xml > OUTPUT_FILE.srt")
filename = sys.argv[1]

# Read the raw bytes and let lxml build the element tree; `root` is the
# element whose children are iterated as individual captions further down.
with open(filename, 'rb') as f:
    root = lxml_html.fromstring(f.read())
def extract_s_ms(value):
    """Split a decimal-seconds string into whole seconds and milliseconds.

    The part after the decimal point is treated as a fraction of a second,
    so ``"12.34"`` yields 340 milliseconds (not 34) and ``"1.5"`` yields 500.
    Digits beyond millisecond precision are truncated.

    Args:
        value: Time in seconds as text, e.g. ``"12"`` or ``"12.34"``.

    Returns:
        A ``(seconds, milliseconds)`` tuple of ``Decimal`` values with
        ``0 <= milliseconds <= 999``.

    Raises:
        ValueError: If the fractional part contains non-digit characters
            (for example a second decimal point).
        decimal.InvalidOperation: If the whole-seconds part is not numeric.
    """
    whole, _, fraction = value.strip().partition('.')
    if fraction and not fraction.isdecimal():
        raise ValueError(f"invalid time value: {value!r}")
    # Pad or truncate to exactly three digits: ".5" -> "500", ".12345" -> "123".
    millis = fraction[:3].ljust(3, '0')
    # An empty whole part (".5") means zero seconds.
    return Decimal(whole or '0'), Decimal(millis)
def convert_to_h_m_s(value):
    """Break a number of seconds into hours, minutes and seconds.

    Args:
        value: Total seconds; any number type that supports ``divmod``
            (``int``, ``Decimal``, ...).

    Returns:
        An ``(hours, minutes, seconds)`` tuple in the same numeric type as
        ``value``; minutes and seconds are each the remainder of a division
        by 60.
    """
    total_minutes, seconds = divmod(value, 60)
    hours, minutes = divmod(total_minutes, 60)
    return hours, minutes, seconds
def _srt_timestamp(seconds, millis):
    """Render whole seconds plus milliseconds as an ``HH:MM:SS,mmm`` stamp."""
    # Convert to int so the zero-padded integer format applies; formatting a
    # Decimal with a precision spec can switch to scientific notation.
    hours, minutes, secs = (int(part) for part in convert_to_h_m_s(seconds))
    return f"{hours:02d}:{minutes:02d}:{secs:02d},{int(millis):03d}"


# Emit one SRT cue per child element of the parsed document.
# SRT cue counters start at 1, hence start=1.
for i, element in enumerate(root, start=1):
    start_time_s, start_time_ms = extract_s_ms(element.attrib['start'])
    duration_s, duration_ms = extract_s_ms(element.attrib['dur'])

    # Carry any millisecond overflow into the seconds so the end stamp never
    # shows more than 999 ms.
    carry_s, end_time_ms = divmod(start_time_ms + duration_ms, 1000)
    end_time_s = start_time_s + duration_s + carry_s

    # element.text is None for an empty caption; fall back to '' so
    # html.unescape does not raise TypeError. unescape decodes any HTML
    # entities still present in the caption text.
    text = html.unescape(element.text or '')

    # The trailing "\n" plus print's own newline leaves the blank line that
    # separates consecutive cues.
    print(
        f"{i}\n"
        f"{_srt_timestamp(start_time_s, start_time_ms)} --> "
        f"{_srt_timestamp(end_time_s, end_time_ms)}\n"
        f"{text}\n"
    )
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment