macdems/subtitles.py

## subtitles.py
#!/usr/bin/env python
# -*- coding: utf-8 -*-

import sys
import re
import codecs
from cStringIO import StringIO


# Ordered table of (compiled pattern, replacement) pairs that rewrite inline
# formatting codes into <i>/<b>/<u>/<font> tags.  Within a subtitle the '|'
# character separates display lines.  The readers in this file apply each
# pair with subn() repeatedly until it no longer matches, so a replacement
# may re-emit a control code on purpose to be picked up by the next pass.
FORMAT_PATTERNS = (
    # A '/' (after optional whitespace) italicises the text up to the next
    # '|': first at the very start of the subtitle, then after each '|'.
    (re.compile(r'^(\s*)/([^|]+)'), r'\1<i>\2</i>'),
    (re.compile(r'\|(\s*)/([^|]+)'), r'|\1<i>\2</i>'),

    # {Y:x}, x in b/u/i: wrap the text up to the next '|' and re-emit {Y:x}
    # after that '|' so the repeated substitution carries the style onto the
    # following line; the '$' variant closes the last line.
    # {y:x}: wrap only the text up to the next '|'.
    (re.compile(r'\{Y:([bui])\}([^|]+)\|'), r'<\1>\2</\1>|{Y:\1}'),
    (re.compile(r'\{Y:([bui])\}([^|]+)$'), r'<\1>\2</\1>'),
    (re.compile(r'\{y:([bui])\}([^|]+)'), r'<\1>\2</\1>'),
    # Two-style forms produce nested tags.  Only the first style is re-emitted
    # after the '|' by the {Y:a,b} rule.
    (re.compile(r'\{Y:([bui]),([bui])\}([^|]+)\|'), r'<\1><\2>\3</\2></\1>|{Y:\1}'),
    (re.compile(r'\{Y:([bui]),([bui])\}([^|]+)$'), r'<\1><\2>\3</\2></\1>'),
    (re.compile(r'\{y:([bui]),([bui])\}([^|]+)'), r'<\1><\2>\3</\2></\1>'),

    # Colour code found directly inside a style tag (as produced by the rules
    # above): move the text into a <font> element inside that tag.
    # The '{<C:...}' marker written after '|' is not matched by any pattern in
    # this table, so the colour is not carried over to the next line.
    # NOTE(review): [0-9a-hA-H] also admits g/h, which are not hex digits --
    # confirm whether a-f was intended.
    (re.compile(r'<([bui])>\{C:\$([0-9a-hA-H]{6})\}([^|]+)</\1>\|'), r'<\1><font color="#\2">\3</font></\1>|{<C:\2}'),
    (re.compile(r'<([bui])>\{C:\$([0-9a-hA-H]{6})\}([^|]+)</\1>$'), r'<\1><font color="#\2">\3</font></\1>'),
    (re.compile(r'<([bui])>\{c:\$([0-9a-hA-H]{6})\}([^|]+)</\1>'), r'<\1><font color="#\2">\3</font></\1>'),

    # Colour code outside any style tag: wrap the text up to the next '|'
    # (or the end of the subtitle) in a <font> element.
    (re.compile(r'\{C:\$([0-9a-hA-H]{6})\}([^|]+)\|'), r'<font color="#\1">\2</font>|{<C:\1}'),
    (re.compile(r'\{C:\$([0-9a-hA-H]{6})\}([^|]+)$'), r'<font color="#\1">\2</font>'),
    (re.compile(r'\{c:\$([0-9a-hA-H]{6})\}([^|]+)'), r'<font color="#\1">\2</font>'),
)


def _remove_undefined(text, coding):
    if coding == 'cp1250':
        return text.replace('\x81','').replace('\x83','').replace('\x88','').replace('\x90','').replace('\x98','')
    return text


class MicroDVD(object):
    """Reader for frame-based ``{start}{end}text`` subtitle lines."""

    # Start frame, optional end frame (each optionally followed by 'l'), text.
    pattern = re.compile(r'^\{(\d+)l?\}\{(\d*)l?\}(.*)$')
    format_patterns = FORMAT_PATTERNS
    # Any control code left over after the formatting rules have run.
    remove_pattern = re.compile(r'\{[^}]*\}')

    def __init__(self, framerate):
        # Frames per second, used to convert frame numbers into seconds.
        self.framerate = framerate

    def read(self, ifile, coding='cp1250'):
        """Parse subtitle lines from *ifile*.

        Blank lines are skipped.  Frame numbers are divided by the frame
        rate; when the end frame is missing the end time is estimated from
        the length of the cleaned text (at least 1.5 s).  An entry that ends
        at or after the start of the next one is cut to 1 ms before it.

        Returns a list of ``(start, end, text)`` tuples with times in
        seconds and text decoded with *coding*.  Raises ValueError for a
        line that does not match ``pattern``.
        """
        entries = []

        for raw in ifile:
            raw = raw.strip()  # drops the trailing \r\n as well
            if not raw:
                continue

            found = self.pattern.match(raw)
            if found is None:
                raise ValueError("Line '%s' does not match the pattern." % raw)
            first_frame, last_frame, text = found.groups()

            begin = float(first_frame) / self.framerate

            # Apply every rule until it stops matching before moving on.
            for regex, replacement in self.format_patterns:
                while True:
                    text, count = regex.subn(replacement, text)
                    if not count:
                        break
            text = self.remove_pattern.sub('', text).replace('|', '\n')
            text = _remove_undefined(text, coding).decode(coding)

            if last_frame == "":
                finish = begin + max(0.07 * len(text), 1.5)
            else:
                finish = float(last_frame) / self.framerate

            entries.append((begin, finish, text))

        # Unscramble possibly overlapping subtitles.
        for idx in range(len(entries) - 1):
            begin, finish, text = entries[idx]
            next_begin = entries[idx + 1][0]
            if finish >= next_begin:
                entries[idx] = (begin, next_begin - 0.001, text)

        return entries


class MPL2(object):
    """Reader for ``[start][end]text`` subtitle lines timed in 0.1 s units."""

    pattern = re.compile(r'^\[(\d+)]\[(\d*)](.*)$')
    format_patterns = FORMAT_PATTERNS
    # Any control code left over after the formatting rules have run.
    remove_pattern = re.compile(r'\{[^}]*\}')

    def __init__(self, framerate):
        # Stored for a uniform constructor; read() does not use it.
        self.framerate = framerate

    def read(self, ifile, coding='cp1250'):
        """Parse subtitle lines from *ifile*.

        Blank lines are skipped.  Both time stamps are divided by ten to get
        seconds; a missing end stamp is estimated from the length of the raw
        (not yet cleaned) text, with a minimum of 1.5 s.  An entry that ends
        at or after the start of the next one is cut to 1 ms before it.

        Returns a list of ``(start, end, text)`` tuples with text decoded
        with *coding*.  Raises ValueError for a line that does not match
        ``pattern``.
        """
        entries = []

        for raw in ifile:
            raw = raw.strip()  # drops the trailing \r\n as well
            if not raw:
                continue

            found = self.pattern.match(raw)
            if found is None:
                raise ValueError("Line '%s' does not match the pattern." % raw)
            first, last, text = found.groups()

            begin = float(first) / 10.
            if last == "":
                finish = begin + max(0.07 * len(text), 1.5)
            else:
                finish = float(last) / 10.

            # Apply every rule until it stops matching before moving on.
            for regex, replacement in self.format_patterns:
                while True:
                    text, count = regex.subn(replacement, text)
                    if not count:
                        break
            text = self.remove_pattern.sub('', text).replace('|', '\n')
            text = _remove_undefined(text, coding).decode(coding)

            entries.append((begin, finish, text))

        # Unscramble possibly overlapping subtitles.
        for idx in range(len(entries) - 1):
            begin, finish, text = entries[idx]
            next_begin = entries[idx + 1][0]
            if finish >= next_begin:
                entries[idx] = (begin, next_begin - 0.001, text)

        return entries


class TMP(object):
    """Reader for ``H:M:S:text`` subtitle lines (start time only)."""

    pattern = re.compile(r'^(\d+):(\d+):(\d+):(.*)$')
    format_patterns = FORMAT_PATTERNS
    # Any control code left over after the formatting rules have run.
    remove_pattern = re.compile(r'\{[^}]*\}')

    def __init__(self, framerate):
        # Stored for a uniform constructor; read() does not use it.
        self.framerate = framerate

    def read(self, ifile, coding='cp1250'):
        """Parse subtitle lines from *ifile*.

        Blank lines are skipped.  The format carries no end time, so it is
        always estimated from the length of the raw text (minimum 1.5 s).
        An entry that ends at or after the start of the next one is cut to
        1 ms before it.

        Returns a list of ``(start, end, text)`` tuples with times in
        seconds and text decoded with *coding*.  Raises ValueError for a
        line that does not match ``pattern``.
        """
        lines = []

        for line in ifile:
            line = line.strip()  # strip \r\n from the end of the line
            if not line:
                continue

            match = self.pattern.match(line)
            if not match:
                raise ValueError("Line '%s' does not match the pattern." % line)
            h, m, s, text = match.groups()

            try:
                h, m, s = map(int, (h, m, s))
            except ValueError:
                # int() signals a bad numeric string with ValueError, not
                # TypeError, so this is the exception that has to be caught.
                raise ValueError("Line '%s' does not match the pattern." % line)

            start = float(3600 * h + 60 * m + s)
            end = start + max(0.07 * len(text), 1.5)

            # Apply every rule until it stops matching before moving on.
            for p, r in self.format_patterns:
                n = 1
                while n:
                    text, n = p.subn(r, text)
            text = self.remove_pattern.sub('', text)
            text = text.replace('|', '\n')
            text = _remove_undefined(text, coding)
            text = text.decode(coding)

            lines.append((start, end, text))

        # Unscramble possibly scrambled subtitles
        for i, (start, end, text) in enumerate(lines[:-1]):
            if end >= lines[i + 1][0]:
                end = lines[i + 1][0] - 0.001
                lines[i] = start, end, text

        return lines


class SubRip(object):
    """Reader and writer for SubRip (``.srt``) subtitles.

    Each cue is a number line, a ``HH:MM:SS,mmm --> HH:MM:SS,mmm`` timing
    line and text lines terminated by a blank line.
    """

    tpattern = re.compile(r'(\d{2}):(\d{2}):(\d{2}),(\d{3}) --> (\d{2}):(\d{2}):(\d{2}),(\d{3})')

    def __init__(self, framerate):
        # Stored for a uniform constructor; this class never reads it.
        self.framerate = framerate

    def read(self, ifile, coding='cp1250'):
        """Parse SubRip cues from *ifile*.

        Args:
            ifile: iterable of byte-string lines (e.g. an open file).
            coding: encoding used to decode the cue text.

        Returns:
            List of ``(start, end, text)`` tuples with times in seconds.  An
            entry that ends at or after the start of the next one is cut to
            1 ms before it.

        Raises:
            ValueError: if a cue number or timing line is malformed, or the
                input ends right after a cue number.
        """
        lines = []
        stream = iter(ifile)
        for line in stream:
            if not line.strip():
                continue
            # The cue number is not used; int() only validates it so that
            # non-SubRip input is rejected with ValueError.
            int(line)
            timing = next(stream, None)
            if timing is None:
                # Report truncated input as a format error instead of
                # leaking StopIteration to the caller.
                raise ValueError("Cue '%s' has no timing line." % line.strip())
            match = self.tpattern.match(timing)
            if not match:
                raise ValueError("Timing line '%s' does not match the pattern." % timing)
            sh, sm, ss, sms, eh, em, es, ems = map(int, match.groups())
            start = 3600. * sh + 60. * sm + ss + 0.001 * sms
            end = 3600. * eh + 60. * em + es + 0.001 * ems

            # Collect text lines up to the next blank line or end of input.
            parts = []
            for chunk in stream:
                chunk = chunk.strip()
                if not chunk:
                    break
                parts.append(chunk)
            text = _remove_undefined('\n'.join(parts), coding)
            lines.append((start, end, text.decode(coding)))

        # Unscramble possibly scrambled subtitles
        for i, (start, end, text) in enumerate(lines[:-1]):
            if end >= lines[i + 1][0]:
                end = lines[i + 1][0] - 0.001
                lines[i] = start, end, text
        return lines

    @staticmethod
    def format(time):
        """Return *time* (seconds) as a ``HH:MM:SS,mmm`` time stamp.

        The value is rounded to whole milliseconds first so that a carry
        propagates through seconds, minutes and hours (59.9996 becomes
        ``00:01:00,000``).  Negative values are clamped to zero.
        """
        total_ms = max(0, int(round(time * 1000)))
        secs, ms = divmod(total_ms, 1000)
        mins, secs = divmod(secs, 60)
        hours, mins = divmod(mins, 60)
        return "%02d:%02d:%02d,%03d" % (hours, mins, secs, ms)

    def write(self, ofile, lines, coding='utf8'):
        """Write *lines* to *ofile* as numbered SubRip cues with CRLF endings.

        Args:
            ofile: writable binary file object supporting ``tell()``.
            lines: iterable of ``(start, end, text)`` with text as unicode.
            coding: output encoding; a UTF-8 BOM is written first when it is
                ``'utf8'`` and the file position is 0.
        """
        if coding == 'utf8' and ofile.tell() == 0:
            ofile.write(codecs.BOM_UTF8)
        for number, (start, end, text) in enumerate(lines, 1):
            # Build the whole cue as text and encode it once, so the file
            # only ever receives encoded bytes.
            record = u"%d\r\n%s --> %s\r\n%s\r\n\r\n" % (
                number, self.format(start), self.format(end),
                text.replace("\n", "\r\n"))
            ofile.write(record.encode(coding))


def read_format(ifile, framerate, coding='cp1250', logger=None):
    """Detect the subtitle format of *ifile* and parse it.

    MicroDVD, MPL2, TMP and SubRip are tried in that order; the first reader
    that parses the whole input wins.

    Args:
        ifile: seekable file object, or a string holding the file contents.
        framerate: passed to the constructor of every reader.
        coding: text encoding; overridden with ``'utf8'`` when the input
            starts with a UTF-8 byte order mark.
        logger: optional object with an ``error()`` method; when omitted the
            collected error messages go to stderr.

    Returns:
        ``(lines, reader)`` on success, where *lines* is the list returned
        by the reader, or ``(raw_contents, None)`` when no reader accepted
        the input.
    """
    if isinstance(ifile, str):
        ifile = StringIO(ifile)

    # Skip a UTF-8 BOM if present and let it decide the encoding.
    bom = ifile.read(len(codecs.BOM_UTF8))
    if bom == codecs.BOM_UTF8:
        coding = 'utf8'
        start = len(codecs.BOM_UTF8)
    else:
        start = 0

    errors = []
    for Format in (MicroDVD, MPL2, TMP, SubRip):
        try:
            informat = Format(framerate)
            ifile.seek(start)
            lines = informat.read(ifile, coding)
        except (ValueError, StopIteration) as e:
            # StopIteration can escape a reader that pulls extra lines from
            # a truncated file; treat it as "not this format" as well.
            errors.append(Format.__name__ + ": " + str(e) + "\n")
        else:
            return lines, informat

    error = "".join(errors)
    if logger is not None:
        logger.error(error)
    else:
        sys.stderr.write(error + "\n")
    ifile.seek(0)
    return ifile.read(), None


def read(ifile, framerate, coding='cp1250', logger=None):
    """Parse *ifile* with read_format() and return the parsed result.

    Takes the same arguments as read_format().

    Returns:
        The first element of the pair returned by read_format().

    Raises:
        ValueError: if read_format() reports no matching format.
    """
    data, informat = read_format(ifile, framerate, coding, logger)
    if informat is None:
        raise ValueError('Unrecognized format')
    return data


if __name__ == "__main__":
    # Command-line entry point: convert every given subtitle file to
    # '<name>.srt' next to the input file.
    import os
    from optparse import OptionParser

    # Options parser
    parser = OptionParser(usage=r"usage: %prog [options] file1.txt [file2.txt ...]")
    parser.add_option("-f", "--framerate", dest="framerate", type="float", default=23.976, help=r"Framerate of the video (default: %default)")
    parser.add_option("-u", "--unicode", action="store_true", dest="unicode", default=False, help=r"Use unicode for srt file (default: %default)")
    options, args = parser.parse_args()

    if not args:
        sys.stderr.write(parser.get_usage() + "\n")
        sys.exit(1)

    outformat = SubRip(options.framerate)
    coding = 'utf8' if options.unicode else 'cp1250'

    for infilename in args:
        # splitext copes with extensions of any length (or none at all).
        outfilename = os.path.splitext(infilename)[0] + '.srt'

        try:
            # Binary mode keeps the bytes untouched on every platform; the
            # open() is inside the try so a missing file is reported too.
            with open(infilename, 'rb') as infile:
                lines, informat = read_format(infile, options.framerate)
        except IOError:
            sys.stderr.write("Could not read '%s'\n" % infilename)
            continue

        if informat is None:
            sys.stderr.write("Could not recognize format of '%s'\n" % infilename)
            continue

        # Binary mode: the writer emits explicit \r\n line endings itself.
        with open(outfilename, 'wb') as outfile:
            outformat.write(outfile, lines, coding)
        sys.stdout.write("'%s' detected as %s and converted to '%s'\n" % (infilename, informat.__class__.__name__, outfilename))
	#!/usr/bin/env python
	# -- coding: utf-8 --

	import sys
	import re
	import codecs
	from cStringIO import StringIO


	# Ordered table of (compiled pattern, replacement) pairs that rewrite inline
	# formatting codes into <i>/<b>/<u>/<font> tags.  Within a subtitle the '|'
	# character separates display lines.  Each pair is meant to be applied with
	# subn() repeatedly until it no longer matches, so a replacement may re-emit
	# a control code on purpose to be picked up by the next pass.
	FORMAT_PATTERNS = (
	    # A '/' (after optional whitespace) italicises the text up to the next
	    # '|': first at the very start of the subtitle, then after each '|'.
	    (re.compile(r'^(\s*)/([^|]+)'), r'\1<i>\2</i>'),
	    (re.compile(r'\|(\s*)/([^|]+)'), r'|\1<i>\2</i>'),

	    # {Y:x}, x in b/u/i: wrap the text up to the next '|' and re-emit {Y:x}
	    # after that '|' so the style carries onto the following line; the '$'
	    # variant closes the last line.  {y:x} wraps only up to the next '|'.
	    (re.compile(r'\{Y:([bui])\}([^|]+)\|'), r'<\1>\2</\1>|{Y:\1}'),
	    (re.compile(r'\{Y:([bui])\}([^|]+)$'), r'<\1>\2</\1>'),
	    (re.compile(r'\{y:([bui])\}([^|]+)'), r'<\1>\2</\1>'),
	    # Two-style forms produce nested tags; only the first style is
	    # re-emitted after the '|'.
	    (re.compile(r'\{Y:([bui]),([bui])\}([^|]+)\|'), r'<\1><\2>\3</\2></\1>|{Y:\1}'),
	    (re.compile(r'\{Y:([bui]),([bui])\}([^|]+)$'), r'<\1><\2>\3</\2></\1>'),
	    (re.compile(r'\{y:([bui]),([bui])\}([^|]+)'), r'<\1><\2>\3</\2></\1>'),

	    # Colour code found directly inside a style tag: move the text into a
	    # <font> element inside that tag.  The '{<C:...}' marker written after
	    # '|' is not matched by any pattern in this table.
	    # NOTE(review): [0-9a-hA-H] also admits g/h, which are not hex digits --
	    # confirm whether a-f was intended.
	    (re.compile(r'<([bui])>\{C:\$([0-9a-hA-H]{6})\}([^|]+)</\1>\|'), r'<\1><font color="#\2">\3</font></\1>|{<C:\2}'),
	    (re.compile(r'<([bui])>\{C:\$([0-9a-hA-H]{6})\}([^|]+)</\1>$'), r'<\1><font color="#\2">\3</font></\1>'),
	    (re.compile(r'<([bui])>\{c:\$([0-9a-hA-H]{6})\}([^|]+)</\1>'), r'<\1><font color="#\2">\3</font></\1>'),

	    # Colour code outside any style tag: wrap the text up to the next '|'
	    # (or the end of the subtitle) in a <font> element.
	    (re.compile(r'\{C:\$([0-9a-hA-H]{6})\}([^|]+)\|'), r'<font color="#\1">\2</font>|{<C:\1}'),
	    (re.compile(r'\{C:\$([0-9a-hA-H]{6})\}([^|]+)$'), r'<font color="#\1">\2</font>'),
	    (re.compile(r'\{c:\$([0-9a-hA-H]{6})\}([^|]+)'), r'<font color="#\1">\2</font>'),
	)


	def _remove_undefined(text, coding):
	if coding == 'cp1250':
	return text.replace('\x81','').replace('\x83','').replace('\x88','').replace('\x90','').replace('\x98','')
	return text


	class MicroDVD(object):
	    """Reader for frame-based ``{start}{end}text`` subtitle lines."""

	    # Start frame, optional end frame (each optionally followed by 'l'), text.
	    pattern = re.compile(r'^\{(\d+)l?\}\{(\d*)l?\}(.*)$')
	    format_patterns = FORMAT_PATTERNS
	    # Any control code left over after the formatting rules have run.
	    remove_pattern = re.compile(r'\{[^}]*\}')

	    def __init__(self, framerate):
	        # Frames per second, used to convert frame numbers into seconds.
	        self.framerate = framerate

	    def read(self, ifile, coding='cp1250'):
	        """Parse subtitle lines from *ifile*.

	        Blank lines are skipped.  Frame numbers are divided by the frame
	        rate; when the end frame is missing the end time is estimated from
	        the length of the cleaned text (at least 1.5 s).  An entry that ends
	        at or after the start of the next one is cut to 1 ms before it.

	        Returns a list of ``(start, end, text)`` tuples with times in
	        seconds and text decoded with *coding*.  Raises ValueError for a
	        line that does not match ``pattern``.
	        """
	        entries = []

	        for raw in ifile:
	            raw = raw.strip()  # drops the trailing \r\n as well
	            if not raw:
	                continue

	            found = self.pattern.match(raw)
	            if found is None:
	                raise ValueError("Line '%s' does not match the pattern." % raw)
	            first_frame, last_frame, text = found.groups()

	            begin = float(first_frame) / self.framerate

	            # Apply every rule until it stops matching before moving on.
	            for regex, replacement in self.format_patterns:
	                while True:
	                    text, count = regex.subn(replacement, text)
	                    if not count:
	                        break
	            text = self.remove_pattern.sub('', text).replace('|', '\n')
	            text = _remove_undefined(text, coding).decode(coding)

	            if last_frame == "":
	                finish = begin + max(0.07 * len(text), 1.5)
	            else:
	                finish = float(last_frame) / self.framerate

	            entries.append((begin, finish, text))

	        # Unscramble possibly overlapping subtitles.
	        for idx in range(len(entries) - 1):
	            begin, finish, text = entries[idx]
	            next_begin = entries[idx + 1][0]
	            if finish >= next_begin:
	                entries[idx] = (begin, next_begin - 0.001, text)

	        return entries


	class MPL2(object):
	    """Reader for ``[start][end]text`` subtitle lines timed in 0.1 s units."""

	    pattern = re.compile(r'^\[(\d+)]\[(\d*)](.*)$')
	    format_patterns = FORMAT_PATTERNS
	    # Any control code left over after the formatting rules have run.
	    remove_pattern = re.compile(r'\{[^}]*\}')

	    def __init__(self, framerate):
	        # Stored for a uniform constructor; read() does not use it.
	        self.framerate = framerate

	    def read(self, ifile, coding='cp1250'):
	        """Parse subtitle lines from *ifile*.

	        Blank lines are skipped.  Both time stamps are divided by ten to get
	        seconds; a missing end stamp is estimated from the length of the raw
	        (not yet cleaned) text, with a minimum of 1.5 s.  An entry that ends
	        at or after the start of the next one is cut to 1 ms before it.

	        Returns a list of ``(start, end, text)`` tuples with text decoded
	        with *coding*.  Raises ValueError for a line that does not match
	        ``pattern``.
	        """
	        entries = []

	        for raw in ifile:
	            raw = raw.strip()  # drops the trailing \r\n as well
	            if not raw:
	                continue

	            found = self.pattern.match(raw)
	            if found is None:
	                raise ValueError("Line '%s' does not match the pattern." % raw)
	            first, last, text = found.groups()

	            begin = float(first) / 10.
	            if last == "":
	                finish = begin + max(0.07 * len(text), 1.5)
	            else:
	                finish = float(last) / 10.

	            # Apply every rule until it stops matching before moving on.
	            for regex, replacement in self.format_patterns:
	                while True:
	                    text, count = regex.subn(replacement, text)
	                    if not count:
	                        break
	            text = self.remove_pattern.sub('', text).replace('|', '\n')
	            text = _remove_undefined(text, coding).decode(coding)

	            entries.append((begin, finish, text))

	        # Unscramble possibly overlapping subtitles.
	        for idx in range(len(entries) - 1):
	            begin, finish, text = entries[idx]
	            next_begin = entries[idx + 1][0]
	            if finish >= next_begin:
	                entries[idx] = (begin, next_begin - 0.001, text)

	        return entries


	class TMP(object):
	    """Reader for ``H:M:S:text`` subtitle lines (start time only)."""

	    pattern = re.compile(r'^(\d+):(\d+):(\d+):(.*)$')
	    format_patterns = FORMAT_PATTERNS
	    # Any control code left over after the formatting rules have run.
	    remove_pattern = re.compile(r'\{[^}]*\}')

	    def __init__(self, framerate):
	        # Stored for a uniform constructor; read() does not use it.
	        self.framerate = framerate

	    def read(self, ifile, coding='cp1250'):
	        """Parse subtitle lines from *ifile*.

	        Blank lines are skipped.  The format carries no end time, so it is
	        always estimated from the length of the raw text (minimum 1.5 s).
	        An entry that ends at or after the start of the next one is cut to
	        1 ms before it.

	        Returns a list of ``(start, end, text)`` tuples with times in
	        seconds and text decoded with *coding*.  Raises ValueError for a
	        line that does not match ``pattern``.
	        """
	        lines = []

	        for line in ifile:
	            line = line.strip()  # strip \r\n from the end of the line
	            if not line:
	                continue

	            match = self.pattern.match(line)
	            if not match:
	                raise ValueError("Line '%s' does not match the pattern." % line)
	            h, m, s, text = match.groups()

	            try:
	                h, m, s = map(int, (h, m, s))
	            except ValueError:
	                # int() signals a bad numeric string with ValueError, not
	                # TypeError, so this is the exception that has to be caught.
	                raise ValueError("Line '%s' does not match the pattern." % line)

	            start = float(3600 * h + 60 * m + s)
	            end = start + max(0.07 * len(text), 1.5)

	            # Apply every rule until it stops matching before moving on.
	            for p, r in self.format_patterns:
	                n = 1
	                while n:
	                    text, n = p.subn(r, text)
	            text = self.remove_pattern.sub('', text)
	            text = text.replace('|', '\n')
	            text = _remove_undefined(text, coding)
	            text = text.decode(coding)

	            lines.append((start, end, text))

	        # Unscramble possibly scrambled subtitles
	        for i, (start, end, text) in enumerate(lines[:-1]):
	            if end >= lines[i + 1][0]:
	                end = lines[i + 1][0] - 0.001
	                lines[i] = start, end, text

	        return lines


	class SubRip(object):
	    """Reader and writer for SubRip (``.srt``) subtitles.

	    Each cue is a number line, a ``HH:MM:SS,mmm --> HH:MM:SS,mmm`` timing
	    line and text lines terminated by a blank line.
	    """

	    tpattern = re.compile(r'(\d{2}):(\d{2}):(\d{2}),(\d{3}) --> (\d{2}):(\d{2}):(\d{2}),(\d{3})')

	    def __init__(self, framerate):
	        # Stored for a uniform constructor; this class never reads it.
	        self.framerate = framerate

	    def read(self, ifile, coding='cp1250'):
	        """Parse SubRip cues from *ifile*.

	        Args:
	            ifile: iterable of byte-string lines (e.g. an open file).
	            coding: encoding used to decode the cue text.

	        Returns:
	            List of ``(start, end, text)`` tuples with times in seconds.  An
	            entry that ends at or after the start of the next one is cut to
	            1 ms before it.

	        Raises:
	            ValueError: if a cue number or timing line is malformed, or the
	                input ends right after a cue number.
	        """
	        lines = []
	        stream = iter(ifile)
	        for line in stream:
	            if not line.strip():
	                continue
	            # The cue number is not used; int() only validates it so that
	            # non-SubRip input is rejected with ValueError.
	            int(line)
	            timing = next(stream, None)
	            if timing is None:
	                # Report truncated input as a format error instead of
	                # leaking StopIteration to the caller.
	                raise ValueError("Cue '%s' has no timing line." % line.strip())
	            match = self.tpattern.match(timing)
	            if not match:
	                raise ValueError("Timing line '%s' does not match the pattern." % timing)
	            sh, sm, ss, sms, eh, em, es, ems = map(int, match.groups())
	            start = 3600. * sh + 60. * sm + ss + 0.001 * sms
	            end = 3600. * eh + 60. * em + es + 0.001 * ems

	            # Collect text lines up to the next blank line or end of input.
	            parts = []
	            for chunk in stream:
	                chunk = chunk.strip()
	                if not chunk:
	                    break
	                parts.append(chunk)
	            text = _remove_undefined('\n'.join(parts), coding)
	            lines.append((start, end, text.decode(coding)))

	        # Unscramble possibly scrambled subtitles
	        for i, (start, end, text) in enumerate(lines[:-1]):
	            if end >= lines[i + 1][0]:
	                end = lines[i + 1][0] - 0.001
	                lines[i] = start, end, text
	        return lines

	    @staticmethod
	    def format(time):
	        """Return *time* (seconds) as a ``HH:MM:SS,mmm`` time stamp.

	        The value is rounded to whole milliseconds first so that a carry
	        propagates through seconds, minutes and hours (59.9996 becomes
	        ``00:01:00,000``).  Negative values are clamped to zero.
	        """
	        total_ms = max(0, int(round(time * 1000)))
	        secs, ms = divmod(total_ms, 1000)
	        mins, secs = divmod(secs, 60)
	        hours, mins = divmod(mins, 60)
	        return "%02d:%02d:%02d,%03d" % (hours, mins, secs, ms)

	    def write(self, ofile, lines, coding='utf8'):
	        """Write *lines* to *ofile* as numbered SubRip cues with CRLF endings.

	        Args:
	            ofile: writable binary file object supporting ``tell()``.
	            lines: iterable of ``(start, end, text)`` with text as unicode.
	            coding: output encoding; a UTF-8 BOM is written first when it is
	                ``'utf8'`` and the file position is 0.
	        """
	        if coding == 'utf8' and ofile.tell() == 0:
	            ofile.write(codecs.BOM_UTF8)
	        for number, (start, end, text) in enumerate(lines, 1):
	            # Build the whole cue as text and encode it once, so the file
	            # only ever receives encoded bytes.
	            record = u"%d\r\n%s --> %s\r\n%s\r\n\r\n" % (
	                number, self.format(start), self.format(end),
	                text.replace("\n", "\r\n"))
	            ofile.write(record.encode(coding))


	def read_format(ifile, framerate, coding='cp1250', logger=None):
	    """Detect the subtitle format of *ifile* and parse it.

	    MicroDVD, MPL2, TMP and SubRip are tried in that order; the first reader
	    that parses the whole input wins.

	    Args:
	        ifile: seekable file object, or a string holding the file contents.
	        framerate: passed to the constructor of every reader.
	        coding: text encoding; overridden with ``'utf8'`` when the input
	            starts with a UTF-8 byte order mark.
	        logger: optional object with an ``error()`` method; when omitted the
	            collected error messages go to stderr.

	    Returns:
	        ``(lines, reader)`` on success, where *lines* is the list returned
	        by the reader, or ``(raw_contents, None)`` when no reader accepted
	        the input.
	    """
	    if isinstance(ifile, str):
	        ifile = StringIO(ifile)

	    # Skip a UTF-8 BOM if present and let it decide the encoding.
	    bom = ifile.read(len(codecs.BOM_UTF8))
	    if bom == codecs.BOM_UTF8:
	        coding = 'utf8'
	        start = len(codecs.BOM_UTF8)
	    else:
	        start = 0

	    errors = []
	    for Format in (MicroDVD, MPL2, TMP, SubRip):
	        try:
	            informat = Format(framerate)
	            ifile.seek(start)
	            lines = informat.read(ifile, coding)
	        except (ValueError, StopIteration) as e:
	            # StopIteration can escape a reader that pulls extra lines from
	            # a truncated file; treat it as "not this format" as well.
	            errors.append(Format.__name__ + ": " + str(e) + "\n")
	        else:
	            return lines, informat

	    error = "".join(errors)
	    if logger is not None:
	        logger.error(error)
	    else:
	        sys.stderr.write(error + "\n")
	    ifile.seek(0)
	    return ifile.read(), None


	def read(ifile, framerate, coding='cp1250', logger=None):
	    """Parse *ifile* with read_format() and return the parsed result.

	    Takes the same arguments as read_format().

	    Returns:
	        The first element of the pair returned by read_format().

	    Raises:
	        ValueError: if read_format() reports no matching format.
	    """
	    data, informat = read_format(ifile, framerate, coding, logger)
	    if informat is None:
	        raise ValueError('Unrecognized format')
	    return data


	if __name__ == "__main__":
	    # Command-line entry point: convert every given subtitle file to
	    # '<name>.srt' next to the input file.
	    import os
	    from optparse import OptionParser

	    # Options parser
	    parser = OptionParser(usage=r"usage: %prog [options] file1.txt [file2.txt ...]")
	    parser.add_option("-f", "--framerate", dest="framerate", type="float", default=23.976, help=r"Framerate of the video (default: %default)")
	    parser.add_option("-u", "--unicode", action="store_true", dest="unicode", default=False, help=r"Use unicode for srt file (default: %default)")
	    options, args = parser.parse_args()

	    if not args:
	        sys.stderr.write(parser.get_usage() + "\n")
	        sys.exit(1)

	    outformat = SubRip(options.framerate)
	    coding = 'utf8' if options.unicode else 'cp1250'

	    for infilename in args:
	        # splitext copes with extensions of any length (or none at all).
	        outfilename = os.path.splitext(infilename)[0] + '.srt'

	        try:
	            # Binary mode keeps the bytes untouched on every platform; the
	            # open() is inside the try so a missing file is reported too.
	            with open(infilename, 'rb') as infile:
	                lines, informat = read_format(infile, options.framerate)
	        except IOError:
	            sys.stderr.write("Could not read '%s'\n" % infilename)
	            continue

	        if informat is None:
	            sys.stderr.write("Could not recognize format of '%s'\n" % infilename)
	            continue

	        # Binary mode: the writer emits explicit \r\n line endings itself.
	        with open(outfilename, 'wb') as outfile:
	            outformat.write(outfile, lines, coding)
	        sys.stdout.write("'%s' detected as %s and converted to '%s'\n" % (infilename, informat.__class__.__name__, outfilename))