Skip to content

Instantly share code, notes, and snippets.

@Norod
Last active July 16, 2020 15:46
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
Star You must be signed in to star a gist
Save Norod/e109a023fa29e1f375bd9f630e7e84e1 to your computer and use it in GitHub Desktop.
Read xml in OpenSubtitles format, output text intended for training machine learning models
#Read xml in OpenSubtitles format, output text intended for training machine learning models
import sys
from os import path
import time
import xml.etree.ElementTree as ET
# Marker printed between documents in the generated training text.
END_OF_TEXT = "<|endoftext|>"
# A pause longer than this (in milliseconds) between two consecutive timed
# sentences starts a new document.
SCENE_GAP_MS = 10000


def timestamp_to_ms(value_time):
    """Convert an ``HH:MM:SS,mmm`` subtitle timestamp to milliseconds.

    Args:
        value_time: Timestamp string such as ``"00:01:02,345"``.

    Returns:
        The timestamp as an integer number of milliseconds.

    Raises:
        ValueError: If ``value_time`` does not match ``%H:%M:%S,%f``.
        IndexError: If ``value_time`` contains no comma.
    """
    # struct_time carries no sub-second field, so the part after the comma
    # is read separately.
    ms = int(value_time.split(",")[1])
    time_struct = time.strptime(value_time, "%H:%M:%S,%f")
    seconds = (
        time_struct.tm_hour * 3600  # was 36000, which inflated hours tenfold
        + time_struct.tm_min * 60
        + time_struct.tm_sec
    )
    return seconds * 1000 + ms


def convert(input_xml_file, out=None):
    """Print the text of every ``<s>`` element, one sentence per line.

    An end-of-text marker is written before the first sentence, after the
    last one, and whenever two consecutive timed sentences are more than
    ``SCENE_GAP_MS`` milliseconds apart.

    Args:
        input_xml_file: Path (or file object) accepted by ``ET.parse``.
        out: Writable text stream; ``None`` means ``sys.stdout``.
    """
    root = ET.parse(input_xml_file).getroot()
    print("\n<|endoftext|>\n", file=out)

    # None (not 0) marks "no timestamp seen yet": 0 is a valid timestamp.
    previous_timestamp = None
    for sentence in root.findall('s'):
        all_time = sentence.findall('time')
        if all_time:
            # The last <time> child of the sentence is the one compared.
            timestamp = timestamp_to_ms(all_time[-1].get('value'))
            if (previous_timestamp is not None
                    and timestamp - previous_timestamp > SCENE_GAP_MS):
                print(END_OF_TEXT, file=out)
            previous_timestamp = timestamp
        # NOTE(review): the original indentation was lost; the text is assumed
        # to be printed for every <s>, including those without a <time> child.
        print(''.join(sentence.itertext()).replace("\n", ""), file=out)

    print(END_OF_TEXT, file=out)


def main(argv=None):
    """Command-line entry point.

    Args:
        argv: Argument vector including the program name; defaults to
            ``sys.argv``.

    Returns:
        ``0`` on success, ``-1`` on wrong usage, ``-2`` if the input file
        does not exist.
    """
    argv = sys.argv if argv is None else argv
    if len(argv) != 2:
        print("Usage: " + str(argv[0]) + " input.xml")
        return -1

    input_xml_file = argv[1]
    if not path.exists(input_xml_file):
        print("Error: " + str(argv[1]) + " not found")
        return -2

    convert(input_xml_file)
    return 0


if __name__ == "__main__":
    sys.exit(main())
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment