# mbrezu/SRTfix.py

## SRTfix.py
# Tries to fix an SRT file so that the subtitle times match the actual dialog times.
# Inputs:
#  - the srt file
#  - a list of (srt time, spoken dialog time) pairs that will be used
#    to infer the parameters for the srt time adjustment
# Outputs:
#  - the corrected srt file to standard output
#  - the correction parameters to standard error

import sys
import datetime
import re
import math

class SrtEntry(object):
    """One subtitle entry: sequence number, start/end times and text.

    Times are stored as numbers of seconds (floats are expected so that
    milliseconds can be represented).
    """

    def __init__(self, number, startTime, endTime, text):
        """Store the sequence number, start/end time (seconds) and text."""
        self.number = number
        self.startTime = startTime
        self.endTime = endTime
        self.text = text

    def __repr__(self):
        return "SrtEntry(%d, %lf, %lf, %s)" % (self.number,
                                               self.startTime,
                                               self.endTime,
                                               repr(self.text))

    def __str__(self):
        """Render the entry as an SRT block.

        The block is the number line, the "HH:MM:SS,mmm --> HH:MM:SS,mmm"
        timing line and the text, followed by a newline.
        """
        def breakTime(t):
            # Convert to whole milliseconds first.  Truncating the float
            # fraction (the previous approach) turned e.g. 1.001 s into
            # ",000" because 1.001 - 1 is slightly less than 0.001.
            # Negative times cannot be written as an SRT timestamp, so
            # they are clamped to zero.
            totalMs = max(0, int(round(t * 1000)))
            s, ms = divmod(totalMs, 1000)
            m, s = divmod(s, 60)
            h, m = divmod(m, 60)
            return [h, m, s, ms]

        args = [self.number] + breakTime(self.startTime) + breakTime(self.endTime) + [self.text]
        return "%d\n%02d:%02d:%02d,%03d --> %02d:%02d:%02d,%03d\n%s\n" % tuple(args)

def parseSubtitle(lines):
    """Build an SrtEntry from the raw lines of one subtitle block.

    lines[0] is the sequence number, lines[1] the timing line
    ("H:MM:SS,mmm --> H:MM:SS,mmm") and any remaining lines are the
    subtitle text (each line is stripped and re-joined with "\\n").

    Raises ValueError if the block has no timing line, if the timing
    line does not match the expected format, or if lines[0] is not an
    integer.
    """
    if len(lines) < 2:
        raise ValueError("incomplete subtitle block: %r" % (lines,))
    number = int(lines[0])
    # Raw string: "\d" in a plain string literal is an invalid escape.
    pattern = r'(\d+):(\d\d):(\d\d),(\d\d\d) --> (\d+):(\d\d):(\d\d),(\d\d\d)'
    match = re.search(pattern, lines[1])
    if match is None:
        raise ValueError("malformed timing line in subtitle %d: %r"
                         % (number, lines[1]))
    fields = [int(group) for group in match.groups()]

    def toSeconds(h, m, s, ms):
        return h * 3600 + m * 60 + s + ms / 1000.0

    text = "\n".join([line.strip() for line in lines[2:]])
    return SrtEntry(number,
                    toSeconds(*fields[:4]),
                    toSeconds(*fields[4:]),
                    text)

def parseFile(fileName):
    """Read an SRT file and return a list of SrtEntry objects.

    The file is split into blocks separated by one or more blank
    (whitespace-only) lines; each non-empty block is handed to
    parseSubtitle.
    """
    # open() works on both Python 2 and 3 (file() is Python 2 only), and
    # the with-statement closes the handle even if reading fails.
    with open(fileName) as f:
        content = f.readlines()
    subtitles = []
    currentSubtitle = []
    for line in content:
        if line.strip():
            currentSubtitle.append(line)
        elif currentSubtitle:
            # A blank line terminates the block collected so far.
            subtitles.append(currentSubtitle)
            currentSubtitle = []
    # The last block may not be followed by a blank line.
    if currentSubtitle:
        subtitles.append(currentSubtitle)
    return [parseSubtitle(lines) for lines in subtitles]

def parseOneCorrection(line):
    """Parse one correction line "H:MM:SS,mmm --> H:MM:SS,mmm".

    The left timestamp is the time found in the srt file, the right one
    the time at which the dialog is actually spoken.

    Returns a (srtTime, correctTime) tuple, both in seconds.
    Raises ValueError if the line does not contain such a pair.
    """
    # Raw string avoids invalid "\d" escapes.  The hour field accepts one
    # or more digits, the same as the timing-line pattern used for
    # subtitles, so "1:02:03,000" and "100:00:00,000" are read correctly.
    pattern = r'(\d+):(\d\d):(\d\d),(\d\d\d) --> (\d+):(\d\d):(\d\d),(\d\d\d)'
    match = re.search(pattern, line)
    if match is None:
        raise ValueError("malformed correction line: %r" % (line,))
    fields = [int(group) for group in match.groups()]

    def toSeconds(h, m, s, ms):
        return h * 3600 + m * 60 + s + ms / 1000.0

    srtTime = toSeconds(*fields[:4])
    correctTime = toSeconds(*fields[4:])
    return (srtTime, correctTime)

def parseCorrections(fileName):
    """Read the corrections file and return a list of time pairs.

    Every non-blank line is parsed with parseOneCorrection, so the
    result is a list of (srtTime, correctTime) tuples in file order.
    """
    # open() works on both Python 2 and 3 (file() is Python 2 only), and
    # the with-statement closes the handle even if reading fails.
    with open(fileName) as f:
        content = f.readlines()
    return [parseOneCorrection(line) for line in content if line.strip() != ""]

def leastSquares(corrections):
    """Fit y = a * x + b to a list of (x, y) pairs by least squares.

    Returns the tuple (a, b).

    With exactly one pair the line is under-determined; in that case the
    slope is taken to be 1 and the pair is used as a pure offset, i.e.
    (1.0, y - x) is returned.

    Raises ValueError if the list is empty, or if several pairs are
    given but they all share the same x (no line can be fitted).
    """
    points = [(float(x), float(y)) for (x, y) in corrections]
    n = len(points)
    if n == 0:
        raise ValueError("at least one correction pair is required")
    if n == 1:
        x, y = points[0]
        return (1.0, y - x)
    # Checking for distinct x values is more reliable than testing the
    # denominator against zero, which rounding can make slightly non-zero.
    if len(set([x for (x, _) in points])) < 2:
        raise ValueError("correction pairs must have at least two "
                         "different srt times")

    sum_x = 0.0
    sum_y = 0.0
    sum_xx = 0.0
    sum_xy = 0.0
    for (x, y) in points:
        sum_x += x
        sum_y += y
        sum_xx += x * x
        sum_xy += x * y

    denominator = n * sum_xx - sum_x * sum_x
    a = (n * sum_xy - sum_x * sum_y) / denominator
    b = (sum_xx * sum_y - sum_x * sum_xy) / denominator
    return (a, b)

def processSub(sub, a, b):
    """Return a new SrtEntry whose times are a * t + b of sub's times.

    The number and text are copied from sub unchanged.
    """
    newStart = sub.startTime * a + b
    newEnd = sub.endTime * a + b
    return SrtEntry(sub.number, newStart, newEnd, sub.text)

if __name__ == "__main__":
    # Command line: <srt file> <corrections file>
    if len(sys.argv) < 3:
        sys.exit("usage: %s SRT_FILE CORRECTIONS_FILE" % sys.argv[0])
    subs = parseFile(sys.argv[1])
    corrections = parseCorrections(sys.argv[2])
    # Fit correctTime = a * srtTime + b and report the parameters.
    a, b = leastSquares(corrections)
    sys.stderr.write("%lf, %lf\n" % (a, b))
    fixedSubs = [processSub(sub, a, b) for sub in subs]
    for sub in fixedSubs:
        # Equivalent to the Python 2 statement `print sub`, but also valid
        # on Python 3: str(sub) plus the newline print would append.
        sys.stdout.write(str(sub) + "\n")
	# Tries to fix a srt file to match the sub time with the actual dialog time.
	# Inputs:
	# - the srt file
	# - a list of (srt time, spoken dialog time) pairs that will be used
	# to infer the parameters for the srt time adjustment
	# Outputs:
	# - the corrected srt file to standard output
	# - the correction parameters to standard error

	import sys
	import datetime
	import re
	import math

	class SrtEntry(object):
	    """One subtitle entry: sequence number, start/end times and text.

	    Times are stored as numbers of seconds (floats are expected so that
	    milliseconds can be represented).
	    """

	    def __init__(self, number, startTime, endTime, text):
	        """Store the sequence number, start/end time (seconds) and text."""
	        self.number = number
	        self.startTime = startTime
	        self.endTime = endTime
	        self.text = text

	    def __repr__(self):
	        return "SrtEntry(%d, %lf, %lf, %s)" % (self.number,
	                                               self.startTime,
	                                               self.endTime,
	                                               repr(self.text))

	    def __str__(self):
	        """Render the entry as an SRT block.

	        The block is the number line, the "HH:MM:SS,mmm --> HH:MM:SS,mmm"
	        timing line and the text, followed by a newline.
	        """
	        def breakTime(t):
	            # Convert to whole milliseconds first.  Truncating the float
	            # fraction turned e.g. 1.001 s into ",000" because 1.001 - 1
	            # is slightly less than 0.001.  Negative times cannot be
	            # written as an SRT timestamp, so they are clamped to zero.
	            totalMs = max(0, int(round(t * 1000)))
	            s, ms = divmod(totalMs, 1000)
	            m, s = divmod(s, 60)
	            h, m = divmod(m, 60)
	            return [h, m, s, ms]

	        args = [self.number] + breakTime(self.startTime) + breakTime(self.endTime) + [self.text]
	        return "%d\n%02d:%02d:%02d,%03d --> %02d:%02d:%02d,%03d\n%s\n" % tuple(args)

	def parseSubtitle(lines):
	    """Build an SrtEntry from the raw lines of one subtitle block.

	    lines[0] is the sequence number, lines[1] the timing line
	    ("H:MM:SS,mmm --> H:MM:SS,mmm") and any remaining lines are the
	    subtitle text (each line is stripped and re-joined with "\\n").

	    Raises ValueError if the block has no timing line, if the timing
	    line does not match the expected format, or if lines[0] is not an
	    integer.
	    """
	    if len(lines) < 2:
	        raise ValueError("incomplete subtitle block: %r" % (lines,))
	    number = int(lines[0])
	    # Raw string: "\d" in a plain string literal is an invalid escape.
	    pattern = r'(\d+):(\d\d):(\d\d),(\d\d\d) --> (\d+):(\d\d):(\d\d),(\d\d\d)'
	    match = re.search(pattern, lines[1])
	    if match is None:
	        raise ValueError("malformed timing line in subtitle %d: %r"
	                         % (number, lines[1]))
	    fields = [int(group) for group in match.groups()]

	    def toSeconds(h, m, s, ms):
	        return h * 3600 + m * 60 + s + ms / 1000.0

	    text = "\n".join([line.strip() for line in lines[2:]])
	    return SrtEntry(number,
	                    toSeconds(*fields[:4]),
	                    toSeconds(*fields[4:]),
	                    text)

	def parseFile(fileName):
	    """Read an SRT file and return a list of SrtEntry objects.

	    The file is split into blocks separated by one or more blank
	    (whitespace-only) lines; each non-empty block is handed to
	    parseSubtitle.
	    """
	    # open() works on both Python 2 and 3 (file() is Python 2 only), and
	    # the with-statement closes the handle even if reading fails.
	    with open(fileName) as f:
	        content = f.readlines()
	    subtitles = []
	    currentSubtitle = []
	    for line in content:
	        if line.strip():
	            currentSubtitle.append(line)
	        elif currentSubtitle:
	            # A blank line terminates the block collected so far.
	            subtitles.append(currentSubtitle)
	            currentSubtitle = []
	    # The last block may not be followed by a blank line.
	    if currentSubtitle:
	        subtitles.append(currentSubtitle)
	    return [parseSubtitle(lines) for lines in subtitles]

	def parseOneCorrection(line):
	    """Parse one correction line "H:MM:SS,mmm --> H:MM:SS,mmm".

	    The left timestamp is the time found in the srt file, the right one
	    the time at which the dialog is actually spoken.

	    Returns a (srtTime, correctTime) tuple, both in seconds.
	    Raises ValueError if the line does not contain such a pair.
	    """
	    # Raw string avoids invalid "\d" escapes.  The hour field accepts one
	    # or more digits, the same as the timing-line pattern used for
	    # subtitles, so "1:02:03,000" and "100:00:00,000" are read correctly.
	    pattern = r'(\d+):(\d\d):(\d\d),(\d\d\d) --> (\d+):(\d\d):(\d\d),(\d\d\d)'
	    match = re.search(pattern, line)
	    if match is None:
	        raise ValueError("malformed correction line: %r" % (line,))
	    fields = [int(group) for group in match.groups()]

	    def toSeconds(h, m, s, ms):
	        return h * 3600 + m * 60 + s + ms / 1000.0

	    srtTime = toSeconds(*fields[:4])
	    correctTime = toSeconds(*fields[4:])
	    return (srtTime, correctTime)

	def parseCorrections(fileName):
	    """Read the corrections file and return a list of time pairs.

	    Every non-blank line is parsed with parseOneCorrection, so the
	    result is a list of (srtTime, correctTime) tuples in file order.
	    """
	    # open() works on both Python 2 and 3 (file() is Python 2 only), and
	    # the with-statement closes the handle even if reading fails.
	    with open(fileName) as f:
	        content = f.readlines()
	    return [parseOneCorrection(line) for line in content if line.strip() != ""]

	def leastSquares(corrections):
	    """Fit y = a * x + b to a list of (x, y) pairs by least squares.

	    Returns the tuple (a, b).

	    With exactly one pair the line is under-determined; in that case the
	    slope is taken to be 1 and the pair is used as a pure offset, i.e.
	    (1.0, y - x) is returned.

	    Raises ValueError if the list is empty, or if several pairs are
	    given but they all share the same x (no line can be fitted).
	    """
	    points = [(float(x), float(y)) for (x, y) in corrections]
	    n = len(points)
	    if n == 0:
	        raise ValueError("at least one correction pair is required")
	    if n == 1:
	        x, y = points[0]
	        return (1.0, y - x)
	    # Checking for distinct x values is more reliable than testing the
	    # denominator against zero, which rounding can make slightly non-zero.
	    if len(set([x for (x, _) in points])) < 2:
	        raise ValueError("correction pairs must have at least two "
	                         "different srt times")

	    sum_x = 0.0
	    sum_y = 0.0
	    sum_xx = 0.0
	    sum_xy = 0.0
	    for (x, y) in points:
	        sum_x += x
	        sum_y += y
	        sum_xx += x * x
	        sum_xy += x * y

	    denominator = n * sum_xx - sum_x * sum_x
	    a = (n * sum_xy - sum_x * sum_y) / denominator
	    b = (sum_xx * sum_y - sum_x * sum_xy) / denominator
	    return (a, b)

	def processSub(sub, a, b):
	    """Return a new SrtEntry whose times are a * t + b of sub's times.

	    The number and text are copied from sub unchanged.
	    """
	    return SrtEntry(sub.number,
	                    sub.startTime * a + b,
	                    sub.endTime * a + b,
	                    sub.text)

	if __name__ == "__main__":
	    # Command line: <srt file> <corrections file>
	    if len(sys.argv) < 3:
	        sys.exit("usage: %s SRT_FILE CORRECTIONS_FILE" % sys.argv[0])
	    subs = parseFile(sys.argv[1])
	    corrections = parseCorrections(sys.argv[2])
	    # Fit correctTime = a * srtTime + b and report the parameters.
	    a, b = leastSquares(corrections)
	    sys.stderr.write("%lf, %lf\n" % (a, b))
	    fixedSubs = [processSub(sub, a, b) for sub in subs]
	    for sub in fixedSubs:
	        # Equivalent to the Python 2 statement `print sub`, but also valid
	        # on Python 3: str(sub) plus the newline print would append.
	        sys.stdout.write(str(sub) + "\n")