Skip to content

Instantly share code, notes, and snippets.

@macdems
Created May 9, 2016 15:22
Show Gist options
  • Save macdems/f69d5addc6c199ce9b7a7d773c89a07c to your computer and use it in GitHub Desktop.
Save macdems/f69d5addc6c199ce9b7a7d773c89a07c to your computer and use it in GitHub Desktop.
Subtitles format converter
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import sys
import re
import codecs
from cStringIO import StringIO
# Ordered (compiled regex, replacement) rules that turn inline subtitle markup
# into HTML-like tags (<b>/<u>/<i>, <font color="#...">). "|" separates the
# display lines of one subtitle. The reader classes apply each rule repeatedly
# until it no longer matches, and the rules are order-dependent: the colour
# rules that expect an enclosing <b>/<u>/<i> element rely on the style rules
# above them having already run.
FORMAT_PATTERNS = (
# A "/" at the start of the subtitle, or right after a "|", puts that line in italics.
(re.compile(r'^(\s*)/([^|]+)'), r'\1<i>\2</i>'),
(re.compile(r'\|(\s*)/([^|]+)'), r'|\1<i>\2</i>'),
# {Y:x} (x is b, u or i) styles the current line and, when a "|" follows,
# re-emits the tag after the bar so the next line is styled on the next pass.
(re.compile(r'\{Y:([bui])\}([^|]+)\|'), r'<\1>\2</\1>|{Y:\1}'),
(re.compile(r'\{Y:([bui])\}([^|]+)$'), r'<\1>\2</\1>'),
# {y:x} styles only the line it appears on.
(re.compile(r'\{y:([bui])\}([^|]+)'), r'<\1>\2</\1>'),
# Two comma-separated styles, e.g. {Y:b,i}.
# NOTE(review): the tag re-emitted after "|" carries only the first style.
(re.compile(r'\{Y:([bui]),([bui])\}([^|]+)\|'), r'<\1><\2>\3</\2></\1>|{Y:\1}'),
(re.compile(r'\{Y:([bui]),([bui])\}([^|]+)$'), r'<\1><\2>\3</\2></\1>'),
(re.compile(r'\{y:([bui]),([bui])\}([^|]+)'), r'<\1><\2>\3</\2></\1>'),
# Colour tag sitting directly inside a <b>/<u>/<i> element: the <font> element
# is nested inside the style element.
# NOTE(review): the character class a-h/A-H also accepts g and h, which are
# not hexadecimal digits -- confirm whether a-f/A-F was intended.
# NOTE(review): the "{<C:...}" marker emitted after "|" is not matched by any
# rule in this table, so the colour is not carried over to the following line
# -- confirm whether "{C:$...}" was intended.
(re.compile(r'<([bui])>\{C:\$([0-9a-hA-H]{6})\}([^|]+)</\1>\|'), r'<\1><font color="#\2">\3</font></\1>|{<C:\2}'),
(re.compile(r'<([bui])>\{C:\$([0-9a-hA-H]{6})\}([^|]+)</\1>$'), r'<\1><font color="#\2">\3</font></\1>'),
(re.compile(r'<([bui])>\{c:\$([0-9a-hA-H]{6})\}([^|]+)</\1>'), r'<\1><font color="#\2">\3</font></\1>'),
# Bare colour tag: wrap the line in <font color="#...">.
(re.compile(r'\{C:\$([0-9a-hA-H]{6})\}([^|]+)\|'), r'<font color="#\1">\2</font>|{<C:\1}'),
(re.compile(r'\{C:\$([0-9a-hA-H]{6})\}([^|]+)$'), r'<font color="#\1">\2</font>'),
(re.compile(r'\{c:\$([0-9a-hA-H]{6})\}([^|]+)'), r'<font color="#\1">\2</font>'),
)
def _remove_undefined(text, coding):
if coding == 'cp1250':
return text.replace('\x81','').replace('\x83','').replace('\x88','').replace('\x90','').replace('\x98','')
return text
class MicroDVD(object):
    """Reader for ``{start}{end}text`` subtitle lines timed in video frames.

    Frame numbers are converted to seconds by dividing by ``framerate``.
    """

    # {start}{end}text; either number may be followed by "l", end may be empty.
    pattern = re.compile(r'^\{(\d+)l?\}\{(\d*)l?\}(.*)$')
    format_patterns = FORMAT_PATTERNS
    # Any {...} tag still present after the format rules have run.
    remove_pattern = re.compile(r'\{[^}]*\}')

    def __init__(self, framerate):
        self.framerate = framerate

    def read(self, ifile, coding='cp1250'):
        """Parse ``ifile`` into a list of ``(start, end, text)`` tuples.

        ``start`` and ``end`` are seconds as floats and ``text`` is decoded
        with ``coding``; "|" separators become newlines. A missing end frame
        is replaced by an estimate based on the length of the final text.

        Raises ValueError for a non-blank line that does not match ``pattern``.
        """
        entries = []
        for raw in ifile:
            raw = raw.strip()  # drops the trailing \r\n as well
            if not raw:
                continue
            parsed = self.pattern.match(raw)
            if parsed is None:
                raise ValueError("Line '%s' does not match the pattern." % raw)
            first_frame, last_frame, body = parsed.groups()
            begin = float(first_frame) / self.framerate

            # Apply every rule until it stops matching before moving on to
            # the next one; a rule may re-emit a tag for the following line.
            for regex, replacement in self.format_patterns:
                while True:
                    body, hits = regex.subn(replacement, body)
                    if not hits:
                        break
            body = self.remove_pattern.sub('', body)
            body = body.replace('|', '\n')
            body = _remove_undefined(body, coding).decode(coding)

            if last_frame == "":
                # No end frame: 0.07 s per character, at least 1.5 s.
                finish = begin + max(0.07 * len(body), 1.5)
            else:
                finish = float(last_frame) / self.framerate
            entries.append((begin, finish, body))

        # Clip every end time that reaches the start of the next subtitle.
        for idx in range(len(entries) - 1):
            begin, finish, body = entries[idx]
            next_begin = entries[idx + 1][0]
            if finish >= next_begin:
                entries[idx] = (begin, next_begin - 0.001, body)
        return entries
class MPL2(object):
    """Reader for ``[start][end]text`` subtitle lines.

    The bracketed numbers are divided by 10 to obtain seconds. ``framerate``
    is stored but not used by this reader.
    """

    # [start][end]text; the end value may be empty.
    pattern = re.compile(r'^\[(\d+)]\[(\d*)](.*)$')
    format_patterns = FORMAT_PATTERNS
    # Any {...} tag still present after the format rules have run.
    remove_pattern = re.compile(r'\{[^}]*\}')

    def __init__(self, framerate):
        self.framerate = framerate

    def _convert_markup(self, text, coding):
        """Rewrite inline tags, split on "|" and decode ``text`` with ``coding``."""
        for regex, replacement in self.format_patterns:
            hits = 1
            while hits:  # repeat each rule until it no longer matches
                text, hits = regex.subn(replacement, text)
        text = self.remove_pattern.sub('', text).replace('|', '\n')
        return _remove_undefined(text, coding).decode(coding)

    def read(self, ifile, coding='cp1250'):
        """Parse ``ifile`` into a list of ``(start, end, text)`` tuples.

        ``start`` and ``end`` are seconds as floats and ``text`` is decoded
        with ``coding``. A missing end value is replaced by an estimate based
        on the length of the raw text (markup included).

        Raises ValueError for a non-blank line that does not match ``pattern``.
        """
        entries = []
        for raw in ifile:
            raw = raw.strip()  # drops the trailing \r\n as well
            if not raw:
                continue
            parsed = self.pattern.match(raw)
            if parsed is None:
                raise ValueError("Line '%s' does not match the pattern." % raw)
            begin, finish, body = parsed.groups()
            begin = float(begin) / 10.
            if finish == "":
                # No end value: 0.07 s per character, at least 1.5 s.
                finish = begin + max(0.07 * len(body), 1.5)
            else:
                finish = float(finish) / 10.
            entries.append((begin, finish, self._convert_markup(body, coding)))

        # Clip every end time that reaches the start of the next subtitle.
        for idx in range(len(entries) - 1):
            begin, finish, body = entries[idx]
            next_begin = entries[idx + 1][0]
            if finish >= next_begin:
                entries[idx] = (begin, next_begin - 0.001, body)
        return entries
class TMP(object):
    """Reader for ``hh:mm:ss:text`` subtitle lines.

    The format carries no end time, so one is always estimated from the text
    length. ``framerate`` is stored but not used by this reader.
    """

    # hours:minutes:seconds:text
    pattern = re.compile(r'^(\d+):(\d+):(\d+):(.*)$')
    format_patterns = FORMAT_PATTERNS
    # Any {...} tag still present after the format rules have run.
    remove_pattern = re.compile(r'\{[^}]*\}')

    def __init__(self, framerate):
        self.framerate = framerate

    def read(self, ifile, coding='cp1250'):
        """Parse ``ifile`` into a list of ``(start, end, text)`` tuples.

        ``start`` and ``end`` are seconds as floats and ``text`` is decoded
        with ``coding``; "|" separators become newlines.

        Raises ValueError for a non-blank line that does not match ``pattern``.
        """
        lines = []
        for line in ifile:
            line = line.strip()  # strip \r\n from the end of the line
            if not line:
                continue
            match = self.pattern.match(line)
            if not match:
                raise ValueError("Line '%s' does not match the pattern." % line)
            hours, minutes, seconds, text = match.groups()
            # The pattern only captures digit runs, so int() cannot fail here.
            start = float(3600 * int(hours) + 60 * int(minutes) + int(seconds))
            # Estimated display time: 0.07 s per raw character, at least 1.5 s.
            end = start + max(0.07 * len(text), 1.5)

            # Apply every rule until it stops matching before moving on to
            # the next one; a rule may re-emit a tag for the following line.
            for regex, replacement in self.format_patterns:
                count = 1
                while count:
                    text, count = regex.subn(replacement, text)
            text = self.remove_pattern.sub('', text)
            text = text.replace('|', '\n')
            text = _remove_undefined(text, coding)
            text = text.decode(coding)
            lines.append((start, end, text))

        # Clip every end time that reaches the start of the next subtitle.
        for i, (start, end, text) in enumerate(lines[:-1]):
            next_start = lines[i + 1][0]
            if end >= next_start:
                lines[i] = (start, next_start - 0.001, text)
        return lines
class SubRip(object):
    """Reader and writer for SubRip (``.srt``) subtitles.

    Each entry is a sequence number line, a timing line of the form
    ``hh:mm:ss,mmm --> hh:mm:ss,mmm``, one or more text lines and a blank line.
    """

    tpattern = re.compile(r'(\d{2}):(\d{2}):(\d{2}),(\d{3}) --> (\d{2}):(\d{2}):(\d{2}),(\d{3})')

    def __init__(self, framerate):
        # Stored for a uniform constructor signature; not used by this class.
        self.framerate = framerate

    def read(self, ifile, coding='cp1250'):
        """Parse ``ifile`` into a list of ``(start, end, text)`` tuples.

        ``start`` and ``end`` are seconds as floats and ``text`` is decoded
        with ``coding``, its lines joined by "\\n".

        Raises ValueError when a sequence number line is not an integer or a
        timing line is missing or malformed (including input that ends right
        after a sequence number).
        """
        lines = []
        source = iter(ifile)
        for line in source:
            if not line.strip():
                continue
            # The value is not needed, but a non-numeric line must raise
            # ValueError so that callers can reject the input as non-SubRip.
            int(line)
            # A default avoids StopIteration escaping on truncated input; the
            # empty string then fails the timing match below.
            line = next(source, '')
            match = self.tpattern.match(line)
            if not match:
                raise ValueError("Timing line '%s' does not match the pattern." % line)
            sh, sm, ss, sms, eh, em, es, ems = map(int, match.groups())
            start = 3600. * sh + 60. * sm + ss + 0.001 * sms
            end = 3600. * eh + 60. * em + es + 0.001 * ems

            # Text runs up to the next blank line or the end of the input.
            rows = []
            for line in source:
                line = line.strip()
                if not line:
                    break
                rows.append(line)
            text = _remove_undefined('\n'.join(rows), coding)
            lines.append((start, end, text.decode(coding)))

        # Clip every end time that reaches the start of the next subtitle.
        for i, (start, end, text) in enumerate(lines[:-1]):
            next_start = lines[i + 1][0]
            if end >= next_start:
                lines[i] = (start, next_start - 0.001, text)
        return lines

    @staticmethod
    def format(time):
        """Return ``time`` (seconds) as an ``hh:mm:ss,mmm`` timestamp.

        The value is rounded to whole milliseconds first, so a carry
        propagates through seconds, minutes and hours (59.9996 becomes
        ``00:01:00,000``). Negative values are clamped to zero because the
        timestamp format cannot express them.
        """
        total_ms = max(0, int(round(time * 1000)))
        seconds, ms = divmod(total_ms, 1000)
        minutes, s = divmod(seconds, 60)
        h, m = divmod(minutes, 60)
        return "%02d:%02d:%02d,%03d" % (h, m, s, ms)

    def write(self, ofile, lines, coding='utf8'):
        """Write ``lines`` to the binary file object ``ofile`` as SubRip entries.

        ``lines`` is an iterable of ``(start, end, text)`` with times in
        seconds and ``text`` as unicode. Entries are numbered from 1, use
        "\\r\\n" line endings and are encoded with ``coding``. A UTF-8 BOM is
        written first when ``coding`` is 'utf8' and the file position is 0.
        """
        if coding == 'utf8' and ofile.tell() == 0:
            ofile.write(codecs.BOM_UTF8)
        for n, (start, end, text) in enumerate(lines, 1):
            # Build the whole entry as text and encode once, so encoded bytes
            # and text literals are never mixed.
            entry = u"%d\r\n%s --> %s\r\n%s\r\n\r\n" % (
                n,
                self.format(start),
                self.format(end),
                text.replace(u"\n", u"\r\n"),
            )
            ofile.write(entry.encode(coding))
def read_format(ifile, framerate, coding='cp1250', logger=None):
    """Detect the subtitle format of ``ifile`` and parse it.

    ``ifile`` is a seekable file object or a string holding the file content.
    A leading UTF-8 BOM is skipped and overrides ``coding`` with 'utf8'.
    MicroDVD, MPL2, TMP and SubRip are tried in that order; the first reader
    that does not raise ValueError wins.

    Returns ``(lines, reader)`` on success, where ``lines`` is the reader's
    list of ``(start, end, text)`` tuples. When every reader fails, the
    collected error messages go to ``logger.error`` (or to stderr when no
    logger is given) and ``(raw_content, None)`` is returned.
    """
    if isinstance(ifile, str):
        ifile = StringIO(ifile)

    bom = ifile.read(len(codecs.BOM_UTF8))
    if bom == codecs.BOM_UTF8:
        coding = 'utf8'
        start = len(codecs.BOM_UTF8)
    else:
        start = 0

    errors = []
    for Format in (MicroDVD, MPL2, TMP, SubRip):
        informat = Format(framerate)
        ifile.seek(start)  # every attempt restarts right after the BOM
        try:
            lines = informat.read(ifile, coding)
        except ValueError as e:
            errors.append(Format.__name__ + ": " + str(e) + "\n")
        else:
            return lines, informat

    error = "".join(errors)
    if logger is not None:
        logger.error(error)
    else:
        sys.stderr.write(error + "\n")
    ifile.seek(0)
    return ifile.read(), None
def read(ifile, framerate, coding='cp1250', logger=None):
    """Parse ``ifile`` and return its list of ``(start, end, text)`` tuples.

    Takes the same arguments as ``read_format``, which does the detection.

    Raises ValueError('Unrecognized format') when no reader accepts the input.
    """
    data, informat = read_format(ifile, framerate, coding, logger)
    if informat is None:
        raise ValueError('Unrecognized format')
    # The parsed subtitles were previously dropped, so callers got None.
    return data
if __name__ == "__main__":
    # Command-line entry point: convert each given subtitle file to a SubRip
    # file with the same base name and a .srt extension.
    import os
    from optparse import OptionParser

    # Options parser
    parser = OptionParser(usage = r"usage: %prog [options] file1.txt [file2.txt ...]")
    parser.add_option("-f", "--framerate", dest="framerate", type="float", default=23.976, help=r"Framerate of the video (default: %default)")
    parser.add_option("-u", "--unicode", action="store_true", dest="unicode", default=False, help=r"Use unicode for srt file (default: %default)")
    options, args = parser.parse_args()

    outformat = SubRip(options.framerate)
    coding = 'utf8' if options.unicode else 'cp1250'

    if not args:
        sys.stderr.write(parser.get_usage() + "\n")
        sys.exit(1)

    for infilename in args:
        # splitext copes with extensions that are not exactly three characters.
        outfilename = os.path.splitext(infilename)[0] + '.srt'
        try:
            # Opening inside the try lets a missing or unreadable file reach
            # the handler below. Binary mode keeps the bytes exactly as
            # stored; the readers decode them explicitly.
            with open(infilename, 'rb') as infile:
                lines, informat = read_format(infile, options.framerate)
        except IOError:
            sys.stderr.write("Could not read '%s'\n" % infilename)
            continue
        if informat is None:
            sys.stderr.write("Could not recognize format of '%s'\n" % infilename)
            continue
        # Binary mode: the writer emits its own "\r\n" endings and BOM bytes.
        with open(outfilename, 'wb') as outfile:
            outformat.write(outfile, lines, coding)
        sys.stdout.write("'%s' detected as %s and converted to '%s'\n" % (infilename, informat.__class__.__name__, outfilename))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment