Skip to content

Instantly share code, notes, and snippets.

Embed
What would you like to do?
Convert Google's audio transcription XML to SBV
#!/usr/bin/env python
# encoding: utf-8
import sys
import os.path
import HTMLParser
from xml.sax.saxutils import unescape
from xml.dom import pulldom
MINS = 60  # seconds per minute (also minutes per hour)


def fmt_time(t):
    """Format a time offset in seconds as an SBV timestamp ``H:MM:SS.mmm``.

    The value is rounded to whole milliseconds first and then split with
    integer arithmetic, so the seconds field is always zero-padded to two
    digits and can never be rendered as ``60.000`` because of float rounding.

    t -- non-negative offset in seconds (int or float).
    """
    total_ms = int(round(t * 1000))
    total_seconds, millis = divmod(total_ms, 1000)
    total_minutes, seconds = divmod(total_seconds, MINS)
    hours, minutes = divmod(total_minutes, MINS)
    return "%d:%02d:%02d.%03d" % (hours, minutes, seconds, millis)
def read_elements(filename):
    """Yield ``(start_second, end_second, text)`` for each ``<text>`` element.

    filename -- handed straight to ``pulldom.parse``.

    ``start`` and ``dur`` are read from the element's attributes as floats;
    the end time is their sum. A ``ValueError`` is raised if either attribute
    is missing or not a number.
    """
    events = pulldom.parse(filename)
    # ``chunks`` is a list while we are inside a <text> element and None
    # otherwise, so stray character data between elements is ignored.
    chunks = None
    start = end = None
    for event, node in events:
        if event == pulldom.START_ELEMENT:
            if node.tagName == 'text':
                start = float(node.getAttribute('start'))
                end = start + float(node.getAttribute('dur'))
                chunks = []
        elif event == pulldom.CHARACTERS:
            # The parser may deliver one caption in several pieces.
            if chunks is not None:
                chunks.append(node.data)
        elif event == pulldom.END_ELEMENT:
            # Only the end of a <text> closes a caption; the closing tag of
            # the enclosing root element must not re-emit the last one.
            if node.tagName == 'text' and chunks is not None:
                yield start, end, "".join(chunks)
                chunks = None
def convert_times(elements):
    """Lazily replace the numeric start/end of each element with timestamps.

    elements -- iterable of ``(start, end, text)`` triples.
    Yields ``(fmt_time(start), fmt_time(end), text)`` for each triple.
    """
    for entry in elements:
        begin, finish, caption = entry
        begin_stamp = fmt_time(begin)
        finish_stamp = fmt_time(finish)
        yield begin_stamp, finish_stamp, caption
def write_elements(elements, outfile):
    """Write captions to *outfile* in SBV layout, encoded as UTF-8.

    elements -- iterable of ``(start_stamp, end_stamp, text)`` triples whose
                stamps are already formatted strings.
    outfile  -- path of the file to create or overwrite.

    Each caption is written as ``start,end``, a newline, the caption text with
    HTML entities unescaped, and a blank line.
    """
    # Local import, in the same style as read_url, so this block is
    # self-contained.
    import io

    p = HTMLParser.HTMLParser()
    # An explicit encoding is required: a plain open(..., 'w') encodes unicode
    # text with the ASCII codec on Python 2 and fails on the first accented
    # character.
    with io.open(outfile, 'w', encoding='utf-8') as f:
        for s, e, txt in elements:
            # Build a single unicode record; io text files accept only unicode.
            f.write(u"%s,%s\n%s\n\n" % (s, e, p.unescape(txt)))
TEST_FILE = "testfile_transscript.xml"  # sample transcript for manual testing


def read_url(url, outfile=None):
    """Download *url* and return the path of the file it was saved to.

    url     -- address of the transcript XML.
    outfile -- path to save to; when omitted a named temporary ``.xml`` file
               is created and the caller is responsible for deleting it.

    Raises ``requests.HTTPError`` when the server answers with an error
    status, so an error page is never saved as if it were a transcript.
    """
    # Imported lazily so converting a local file needs neither module.
    import tempfile
    import requests

    response = requests.get(url)
    response.raise_for_status()
    if outfile:
        f = open(outfile, 'wb')
    else:
        # delete=False: the file is read back by name after it is closed.
        f = tempfile.NamedTemporaryFile(suffix='.xml', delete=False)
        outfile = f.name
    with f:
        # Save the raw bytes so the XML keeps its original encoding.
        f.write(response.content)
    return outfile


def main(args):
    """Convert the transcript named in *args* to SBV; return an exit status.

    args -- ``sys.argv``-style list: program name, a transcript file name or
            http(s) URL, and an optional output path (default ``outfile.sbv``).

    Returns 0 on success and 1 on a usage error or a missing input file.
    """
    if len(args) < 2:
        sys.stderr.write("Usage: %s <filename.xml | url> [outfile.sbv]\n"
                         % os.path.basename(args[0] if args else "script"))
        return 1

    source = args[1]
    outfile = args[2] if len(args) > 2 else "outfile.sbv"

    # Match the scheme, not just "http", so a local file such as
    # "http_notes.xml" is still treated as a file.
    downloaded = source.startswith(("http://", "https://"))
    filename = read_url(source) if downloaded else source

    if not os.path.exists(filename):
        sys.stderr.write("No such file: " + filename + "\n")
        return 1

    try:
        elems = read_elements(filename)
        elems = convert_times(elems)
        write_elements(elems, outfile)
    finally:
        # The temporary download belongs to this run; do not leave it behind.
        if downloaded:
            os.remove(filename)
    return 0


if __name__ == '__main__':
    sys.exit(main(sys.argv))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment