Created
April 5, 2017 13:37
-
-
Save benosteen/d5b3e3a8a83d43ff277f2213c7c77c08 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from xml.etree import ElementTree as ET | |
import re | |
def format_t(time_string, frac_sep=".", output_frag_sep="."):
    """Reformat a clock-style time string as ``HH:MM:SS<sep>mmm``.

    The input is split on ``":"``. The last field is the seconds, optionally
    followed by ``frac_sep`` and a fractional part. With two fields the first
    is minutes; with three fields they are hours and minutes. With any other
    number of fields only the last (seconds) field is used.

    Args:
        time_string: Time to reformat, e.g. ``"75.5"``, ``"5:07"`` or
            ``"1:02:03.25"``.
        frac_sep: Separator between whole and fractional seconds in the input.
        output_frag_sep: Separator written before the milliseconds in the
            returned string.

    Returns:
        The time as zero-padded ``HH:MM:SS`` followed by ``output_frag_sep``
        and three millisecond digits. Seconds and minutes of 60 or more are
        carried into the next larger unit.

    Raises:
        ValueError: If the seconds field contains more than one ``frac_sep``
            or any numeric field cannot be parsed.
    """
    fields = time_string.split(":")
    sec_parts = fields[-1].split(frac_sep)
    if len(sec_parts) > 2:
        # Previously this path left the seconds unassigned and crashed with
        # an UnboundLocalError further down.
        raise ValueError(
            "More than one fractional separator in time string: {0!r}".format(
                time_string
            )
        )

    s = int(sec_parts[0])
    ms = 0
    if len(sec_parts) == 2:
        # small trick - "2.1" should be 2.100, not 2.001
        ms = int(float("0." + sec_parts[1]) * 1000)

    h = m = 0
    if len(fields) == 2:
        m = int(fields[0])
    elif len(fields) == 3:
        h = int(fields[0])
        m = int(fields[1])

    # Carry overflow upwards so e.g. "75.5" becomes 00:01:15.500 rather than
    # the out-of-range 00:00:75.500. In-range values are left untouched.
    if s >= 60:
        m += s // 60
        s %= 60
    if m >= 60:
        h += m // 60
        m %= 60

    return "{0:02d}:{1:02d}:{2:02d}{3}{4:03d}".format(h, m, s, output_frag_sep, ms)
def assert_time_format(time_string):
    """Check that ``time_string`` is exactly ``HH:MM:SS.mmm`` or ``HH:MM:SS,mmm``.

    Args:
        time_string: The candidate timestamp.

    Returns:
        True when the whole string is two-digit hours, minutes and seconds
        followed by ``.`` or ``,`` and exactly three digits; False otherwise.
    """
    # Raw string: "\d" in a normal string literal is an invalid escape.
    # "\Z" anchors the end, so trailing characters (e.g. a fourth fractional
    # digit) are rejected instead of being silently ignored by match().
    return re.match(r"\d{2}:\d{2}:\d{2}[.,]\d{3}\Z", time_string) is not None
def convert_to_srt(xmlfile, srtfile):
    """Convert the annotations in an XML file to numbered subtitle cues.

    Each ``annotations/annotation`` element that has a ``TEXT`` child with
    text and exactly two ``segment/movingRegion/rectRegion`` descendants is
    written as one cue: a blank line, the running cue number, a
    ``start --> end`` line built from the ``t`` attributes of the first and
    second region, then the text. Annotations that do not fit this shape are
    reported on stdout and skipped without consuming a cue number.

    Args:
        xmlfile: Path of the XML file to read.
        srtfile: Path of the subtitle file to write (overwritten).
    """
    # Parse before opening the destination so that malformed XML does not
    # leave behind a truncated or empty output file.
    with open(xmlfile, "r") as srcf:
        doc = ET.fromstring(srcf.read())

    with open(srtfile, "w") as dstf:
        anno_count = 1
        for anno in doc.findall("annotations/annotation"):
            text_e = anno.find("TEXT")
            # find() returns None when there is no TEXT child at all; treat
            # that the same as a TEXT element without text.
            text = text_e.text if text_e is not None else None
            t_rects = anno.findall("segment/movingRegion/rectRegion")

            if text is None or len(t_rects) != 2:
                print("Annotation found, but couldn't parse it for some reason:")
                if text is None:
                    print("No text found in the TEXT element")
                if len(t_rects) != 2:
                    print(
                        "Expected exactly 2 rect regions, found {0}".format(
                            len(t_rects)
                        )
                    )
                continue

            # add new annotation
            dstf.write("\n{0}\n".format(anno_count))
            time1 = format_t(t_rects[0].attrib["t"])
            time2 = format_t(t_rects[1].attrib["t"])
            if not assert_time_format(time1) or not assert_time_format(time2):
                # Only a warning: the cue is still written below.
                print("Time is formatted incorrectly for annotation {0}:".format(anno_count))
                print("time1: {0}".format(time1))
                print("time2: {0}".format(time2))
            dstf.write("{0} --> {1}\n".format(time1, time2))
            dstf.write(text)
            dstf.write("\n")
            anno_count += 1
# Command-line entry point: expects the input XML path and the output SRT
# path as the only two arguments.
if __name__ == "__main__":
    import sys

    cli_args = sys.argv[1:]
    if len(cli_args) == 2:
        convert_to_srt(cli_args[0], cli_args[1])
    else:
        # Any other argument count just prints the usage line.
        print("Usage: python convert_to_srt.py xmlfilename.xml srtfilename.srt")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment