merge OCR fuck-ups in SRT file
#!/usr/bin/env python
# Merges subtitle items that a faulty OCR pass emitted as two separate,
# back-to-back items for the same text. The two copies may differ only in
# markup (e.g. one is in italics and the other is not) or in upper/lower case.
import sys
from pysrt import SubRipFile
def main():
    """Merge back-to-back duplicate subtitle items in the SRT file argv[1].

    Two consecutive items are treated as one OCR'ed line that was split in
    two when the second starts less than 5 ms after the first ends and
    their tag-stripped, lower-cased texts are equal. In that case the second
    item is dropped and the end time of the last kept item is extended to
    cover it. The indexes of every merged pair are printed to stdout.

    The result is renumbered and written next to the input file with a
    '.new' suffix appended to its name.
    """
    if len(sys.argv) < 2:
        sys.exit('usage: %s FILE.srt' % sys.argv[0])
    srtfile = sys.argv[1]

    # NOTE(review): the load call, the two append steps and the save call
    # were reconstructed from a truncated source -- confirm against pysrt.
    subs = SubRipFile.open(srtfile)
    newsubs = SubRipFile()

    prev = None
    for cur in subs:
        if prev is None:
            # The first item has no predecessor to compare against.
            newsubs.append(cur)
            prev = cur
            continue

        gap = cur.start - prev.end
        if (gap.ordinal < 5 and
                prev.text_without_tags.lower() == cur.text_without_tags.lower()):
            # Same text again: stretch the last kept item up to this
            # item's end time instead of keeping a second copy.
            newsubs[-1].end.ordinal = cur.end.ordinal
            print('%s + %s' % (prev.index, cur.index))
        else:
            newsubs.append(cur)

        # Always compare against the immediately preceding input item, even
        # when that item was merged away.
        prev = cur

    # fix up the index numbers. for easy diffing, comment this out
    newsubs.clean_indexes()
    newsubs.save(srtfile + '.new')
# Run the merge only when executed as a script, not when imported.
if __name__ == '__main__':
    main()