Created
May 9, 2016 15:22
-
-
Save macdems/f69d5addc6c199ce9b7a7d773c89a07c to your computer and use it in GitHub Desktop.
Subtitles format converter
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
# -*- coding: utf-8 -*- | |
import sys | |
import re | |
import codecs | |
from cStringIO import StringIO | |
# Ordered (compiled pattern, replacement) pairs that rewrite inline subtitle
# markup as HTML-like tags (<i>, <b>, <u>, <font color=...>).
#
# The readers apply every pair repeatedly until it no longer matches, then
# strip any remaining "{...}" control tag and turn "|" into a newline.  The
# order matters: the style tags must already be converted to <b>/<u>/<i>
# before the "<tag>{C:...}" colour patterns can match.
#
# Upper-case "{Y:...}" tags are re-emitted after the "|" so that the repeated
# substitution carries the style over to the following line of the same cue;
# lower-case tags affect only the line they appear in.
#
# NOTE(review): the upper-case colour patterns re-emit "{<C:...}", which none
# of the patterns matches, so a colour is not carried over to following
# lines -- confirm whether that is intended.
# NOTE(review): the six colour digits are copied verbatim into the HTML
# colour; MicroDVD colours are commonly documented as $BBGGRR, so the byte
# order may need swapping -- verify against real files.
FORMAT_PATTERNS = (
    # A "/" at the start of a line (of the cue or after "|") means italics.
    (re.compile(r'^(\s*)/([^|]+)'), r'\1<i>\2</i>'),
    (re.compile(r'\|(\s*)/([^|]+)'), r'|\1<i>\2</i>'),
    # Single style tag: bold, underline or italics.
    (re.compile(r'\{Y:([bui])\}([^|]+)\|'), r'<\1>\2</\1>|{Y:\1}'),
    (re.compile(r'\{Y:([bui])\}([^|]+)$'), r'<\1>\2</\1>'),
    (re.compile(r'\{y:([bui])\}([^|]+)'), r'<\1>\2</\1>'),
    # Two comma-separated styles in one tag.
    (re.compile(r'\{Y:([bui]),([bui])\}([^|]+)\|'), r'<\1><\2>\3</\2></\1>|{Y:\1}'),
    (re.compile(r'\{Y:([bui]),([bui])\}([^|]+)$'), r'<\1><\2>\3</\2></\1>'),
    (re.compile(r'\{y:([bui]),([bui])\}([^|]+)'), r'<\1><\2>\3</\2></\1>'),
    # Colour tag nested inside an already converted style tag.
    # Colours are exactly six hexadecimal digits (0-9, a-f).
    (re.compile(r'<([bui])>\{C:\$([0-9a-fA-F]{6})\}([^|]+)</\1>\|'), r'<\1><font color="#\2">\3</font></\1>|{<C:\2}'),
    (re.compile(r'<([bui])>\{C:\$([0-9a-fA-F]{6})\}([^|]+)</\1>$'), r'<\1><font color="#\2">\3</font></\1>'),
    (re.compile(r'<([bui])>\{c:\$([0-9a-fA-F]{6})\}([^|]+)</\1>'), r'<\1><font color="#\2">\3</font></\1>'),
    # Stand-alone colour tag.
    (re.compile(r'\{C:\$([0-9a-fA-F]{6})\}([^|]+)\|'), r'<font color="#\1">\2</font>|{<C:\1}'),
    (re.compile(r'\{C:\$([0-9a-fA-F]{6})\}([^|]+)$'), r'<font color="#\1">\2</font>'),
    (re.compile(r'\{c:\$([0-9a-fA-F]{6})\}([^|]+)'), r'<font color="#\1">\2</font>'),
)
def _remove_undefined(text, coding): | |
if coding == 'cp1250': | |
return text.replace('\x81','').replace('\x83','').replace('\x88','').replace('\x90','').replace('\x98','') | |
return text | |
class MicroDVD(object):
    """Reader for subtitle lines of the form ``{start}{end}text``.

    ``start`` and ``end`` are frame numbers; they are divided by the frame
    rate given to the constructor to obtain times in seconds.
    """

    # {start}{end}text -- each number may be followed by an "l"; the end
    # frame may be left empty.
    pattern = re.compile(r'^\{(\d+)l?\}\{(\d*)l?\}(.*)$')
    format_patterns = FORMAT_PATTERNS
    # Any "{...}" control tag still present after markup conversion.
    remove_pattern = re.compile(r'\{[^}]*\}')

    def __init__(self, framerate):
        self.framerate = framerate

    def _to_html(self, body):
        """Convert inline markup to tags and drop the leftover control tags."""
        for regex, replacement in self.format_patterns:
            # Re-apply each pattern until it stops matching, so that tags
            # carried over "|" reach every line of the cue.
            while True:
                body, hits = regex.subn(replacement, body)
                if not hits:
                    break
        body = self.remove_pattern.sub('', body)
        return body.replace('|', '\n')

    @staticmethod
    def _clip_overlaps(cues):
        """Cut each cue's end to 1 ms before the next cue's start if needed."""
        for idx in range(len(cues) - 1):
            begin, finish, body = cues[idx]
            next_begin = cues[idx + 1][0]
            if finish >= next_begin:
                cues[idx] = (begin, next_begin - 0.001, body)

    def read(self, ifile, coding='cp1250'):
        """Parse *ifile* and return a list of ``(start, end, text)`` tuples.

        Times are in seconds and the text is decoded from *coding*.  A cue
        without an end frame lasts 0.07 s per character of its converted
        text, but at least 1.5 s.

        Raises ValueError for a non-blank line that does not match
        ``pattern``.
        """
        cues = []
        for raw in ifile:
            raw = raw.strip()  # strip \r\n from the end of the line
            if not raw:
                continue
            found = self.pattern.match(raw)
            if found is None:
                raise ValueError("Line '%s' does not match the pattern." % raw)
            first_frame, last_frame, body = found.groups()
            begin = float(first_frame) / self.framerate
            body = self._to_html(body)
            body = _remove_undefined(body, coding).decode(coding)
            if last_frame == "":
                finish = begin + max(0.07 * len(body), 1.5)
            else:
                finish = float(last_frame) / self.framerate
            cues.append((begin, finish, body))
        # Unscramble possibly scrambled subtitles
        self._clip_overlaps(cues)
        return cues
class MPL2(object):
    """Reader for subtitle lines of the form ``[start][end]text``.

    ``start`` and ``end`` are divided by ten to obtain seconds; the frame
    rate is stored but not used when reading.
    """

    # [start][end]text -- the end value may be left empty.
    pattern = re.compile(r'^\[(\d+)]\[(\d*)](.*)$')
    format_patterns = FORMAT_PATTERNS
    # Any "{...}" control tag still present after markup conversion.
    remove_pattern = re.compile(r'\{[^}]*\}')

    def __init__(self, framerate):
        self.framerate = framerate

    def read(self, ifile, coding='cp1250'):
        """Parse *ifile* and return a list of ``(start, end, text)`` tuples.

        Times are in seconds and the text is decoded from *coding*.  A cue
        without an end value lasts 0.07 s per character of its raw
        (unconverted) text, but at least 1.5 s.

        Raises ValueError for a non-blank line that does not match
        ``pattern``.
        """
        cues = []
        for raw in ifile:
            raw = raw.strip()  # strip \r\n from the end of the line
            if not raw:
                continue
            found = self.pattern.match(raw)
            if found is None:
                raise ValueError("Line '%s' does not match the pattern." % raw)
            begin, finish, body = found.groups()
            begin = float(begin) / 10.
            if finish == "":
                # The duration estimate uses the text as written in the file.
                finish = begin + max(0.07 * len(body), 1.5)
            else:
                finish = float(finish) / 10.
            for regex, replacement in self.format_patterns:
                # Re-apply each pattern until it stops matching.
                hits = 1
                while hits:
                    body, hits = regex.subn(replacement, body)
            body = self.remove_pattern.sub('', body).replace('|', '\n')
            body = _remove_undefined(body, coding).decode(coding)
            cues.append((begin, finish, body))
        # Unscramble possibly scrambled subtitles: a cue must end before the
        # following one starts.
        for idx in range(len(cues) - 1):
            begin, finish, body = cues[idx]
            next_begin = cues[idx + 1][0]
            if finish >= next_begin:
                cues[idx] = (begin, next_begin - 0.001, body)
        return cues
class TMP(object):
    """Reader for subtitle lines of the form ``hh:mm:ss:text``.

    Only a start time is present in this format, so every end time is
    estimated from the length of the text.  The frame rate is stored but not
    used when reading.
    """

    pattern = re.compile(r'^(\d+):(\d+):(\d+):(.*)$')
    format_patterns = FORMAT_PATTERNS
    # Any "{...}" control tag still present after markup conversion.
    remove_pattern = re.compile(r'\{[^}]*\}')

    def __init__(self, framerate):
        self.framerate = framerate

    def read(self, ifile, coding='cp1250'):
        """Parse *ifile* and return a list of ``(start, end, text)`` tuples.

        Times are in seconds and the text is decoded from *coding*.  Each cue
        lasts 0.07 s per character of its raw (unconverted) text, but at
        least 1.5 s, and is cut short if it would overlap the next cue.

        Raises ValueError for a non-blank line that does not match
        ``pattern``.
        """
        lines = []
        for line in ifile:
            line = line.strip()  # strip \r\n from the end of the line
            if not line:
                continue
            match = self.pattern.match(line)
            if not match:
                raise ValueError("Line '%s' does not match the pattern." % line)
            h, m, s, text = match.groups()
            # The pattern only lets digit strings through, so int() cannot
            # fail here and needs no exception handler.
            start = float(3600 * int(h) + 60 * int(m) + int(s))
            end = start + max(0.07 * len(text), 1.5)
            for p, r in self.format_patterns:
                # Re-apply each pattern until it stops matching.
                n = 1
                while n:
                    text, n = p.subn(r, text)
            text = self.remove_pattern.sub('', text)
            text = text.replace('|', '\n')
            text = _remove_undefined(text, coding)
            text = text.decode(coding)
            lines.append((start, end, text))
        # Unscramble possibly scrambled subtitles
        for i, (start, end, text) in enumerate(lines[:-1]):
            if end >= lines[i + 1][0]:
                end = lines[i + 1][0] - 0.001
                lines[i] = start, end, text
        return lines
class SubRip(object):
    """Reader and writer for SubRip (.srt) subtitles.

    A cue is a number line, a ``hh:mm:ss,mmm --> hh:mm:ss,mmm`` timing line
    and one or more text lines terminated by a blank line.  The frame rate is
    stored but not used.
    """

    tpattern = re.compile(r'(\d{2}):(\d{2}):(\d{2}),(\d{3}) --> (\d{2}):(\d{2}):(\d{2}),(\d{3})')

    def __init__(self, framerate):
        self.framerate = framerate

    def read(self, ifile, coding='cp1250'):
        """Parse *ifile* and return a list of ``(start, end, text)`` tuples.

        Times are in seconds and the text is decoded from *coding*.

        Raises ValueError when a cue does not start with an integer line,
        when its timing line is missing or malformed.  Input that ends in the
        middle of a cue's text is accepted with the text read so far.
        """
        # One shared iterator lets the nested loops consume lines in sequence.
        stream = iter(ifile)
        lines = []
        for line in stream:
            if not line.strip():
                continue
            # A non-numeric line raises ValueError, which is how other
            # formats are rejected during format detection.
            no = int(line)
            try:
                line = next(stream)
            except StopIteration:
                # Report truncated input as a parse error instead of letting
                # StopIteration escape to the caller.
                raise ValueError("Subtitle %d has no timing line." % no)
            match = self.tpattern.match(line)
            if not match:
                raise ValueError("Timing line '%s' does not match the pattern." % line)
            sh, sm, ss, sms, eh, em, es, ems = map(int, match.groups())
            start = 3600. * sh + 60. * sm + ss + 0.001 * sms
            end = 3600. * eh + 60. * em + es + 0.001 * ems
            # Collect text lines up to the next blank line or end of input.
            parts = []
            for text_line in stream:
                text_line = text_line.strip()
                if not text_line:
                    break
                parts.append(text_line)
            text = _remove_undefined("\n".join(parts), coding)
            lines.append((start, end, text.decode(coding)))
        # Unscramble possibly scrambled subtitles
        for i, (start, end, text) in enumerate(lines[:-1]):
            if end >= lines[i + 1][0]:
                end = lines[i + 1][0] - 0.001
                lines[i] = start, end, text
        return lines

    @staticmethod
    def format(time):
        """Return *time* (seconds) formatted as ``hh:mm:ss,mmm``.

        The value is rounded to whole milliseconds first, so that a carry
        propagates through seconds, minutes and hours (59.9996 s becomes
        ``00:01:00,000`` rather than ``00:00:60,000``).
        """
        total_ms = int(round(1000 * time))
        total_s, ms = divmod(total_ms, 1000)
        total_m, s = divmod(total_s, 60)
        h, m = divmod(total_m, 60)
        return "%02d:%02d:%02d,%03d" % (h, m, s, ms)

    def write(self, ofile, lines, coding='utf8'):
        """Write ``(start, end, text)`` cues to *ofile* in SubRip format.

        Cues are numbered from 1 and written with CRLF line endings, with the
        text encoded to *coding*.  A UTF-8 byte order mark is written first
        when *coding* is ``'utf8'`` and the file position is 0.
        """
        if coding == 'utf8' and ofile.tell() == 0:
            ofile.write(codecs.BOM_UTF8)
        for n, (start, end, text) in enumerate(lines, 1):
            text = text.encode(coding)
            ofile.write("%d\r\n" % n)
            ofile.write("%s --> %s\r\n" % (self.format(start), self.format(end)))
            # The text is followed by its own line ending and a blank line.
            ofile.write(text.replace("\n", "\r\n") + "\r\n\r\n")
def read_format(ifile, framerate, coding='cp1250', logger=None):
    """Detect the subtitle format of *ifile* and parse it.

    *ifile* is a seekable file-like object, or a ``str`` holding the file
    contents.  If the data starts with a UTF-8 byte order mark, the mark is
    skipped and *coding* is replaced by ``'utf8'``.  The MicroDVD, MPL2, TMP
    and SubRip readers are tried in that order; the first one that does not
    raise ValueError wins.

    Returns ``(lines, reader)`` on success.  If no reader accepts the data,
    the collected error messages go to ``logger.error`` (or to stderr when
    no logger is given) and ``(raw_contents, None)`` is returned.
    """
    if type(ifile) is str:
        ifile = StringIO(ifile)
    bom_length = len(codecs.BOM_UTF8)
    if ifile.read(bom_length) == codecs.BOM_UTF8:
        coding = 'utf8'
        start = bom_length
    else:
        start = 0
    failures = []
    for Format in (MicroDVD, MPL2, TMP, SubRip):
        try:
            informat = Format(framerate)
            ifile.seek(start)  # every reader starts right after the BOM
            lines = informat.read(ifile, coding)
        except ValueError as e:
            failures.append(Format.__name__ + ": " + str(e) + "\n")
        else:
            return lines, informat
    # No reader accepted the input: report why and hand back the raw data.
    error = "".join(failures)
    if logger is None:
        sys.stderr.write(error + "\n")
    else:
        logger.error(error)
    ifile.seek(0)
    return ifile.read(), None
def read(ifile, framerate, coding='cp1250', logger=None):
    """Parse subtitles from *ifile* and return the ``(start, end, text)`` list.

    Takes the same arguments as ``read_format``, which does the format
    detection and parsing.

    Raises ValueError('Unrecognized format') when no reader accepts the
    input.
    """
    data, informat = read_format(ifile, framerate, coding, logger)
    if informat is None:
        raise ValueError('Unrecognized format')
    # Return the parsed cues instead of discarding them.
    return data
if __name__ == "__main__":
    # Command-line entry point: convert every file given as an argument to
    # SubRip, writing the result next to the input with an ".srt" extension.
    import os
    from optparse import OptionParser

    # Options parser
    parser = OptionParser(usage = r"usage: %prog [options] file1.txt [file2.txt ...]")
    parser.add_option("-f", "--framerate", dest="framerate", type="float", default=23.976, help=r"Framerate of the video (default: %default)")
    parser.add_option("-u", "--unicode", action="store_true", dest="unicode", default=False, help=r"Use unicode for srt file (default: %default)")
    options, args = parser.parse_args()
    outformat = SubRip(options.framerate)
    coding = 'utf8' if options.unicode else 'cp1250'
    if not args:
        sys.stderr.write(parser.get_usage() + "\n")
        sys.exit(1)
    for infilename in args:
        # Replace whatever extension the input has (or none) with ".srt".
        outfilename = os.path.splitext(infilename)[0] + '.srt'
        try:
            # Opening is inside the try so that a missing or unreadable file
            # is reported instead of aborting the whole run.  Binary mode
            # keeps the bytes and seek offsets exact on every platform.
            infile = open(infilename, 'rb')
            try:
                lines, informat = read_format(infile, options.framerate)
            finally:
                infile.close()
        except IOError:
            sys.stderr.write("Could not read '%s'\n" % infilename)
            continue
        if informat is None:
            sys.stderr.write("Could not recognize format of '%s'\n" % infilename)
            continue
        # Binary mode: the writer emits its own "\r\n" and a raw BOM, which
        # text mode would mangle on platforms that translate newlines.
        outfile = open(outfilename, 'wb')
        try:
            outformat.write(outfile, lines, coding)
        finally:
            outfile.close()
        sys.stdout.write("'%s' detected as %s and converted to '%s'\n" % (infilename, informat.__class__.__name__, outfilename))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment