Skip to content

Instantly share code, notes, and snippets.

Embed
What would you like to do?
merge OCR fuck-ups in SRT file
#!/usr/bin/env python
#
# Merges consecutive subtitle items that a faulty OCR pass generated for the
# same text. The duplicates may differ only in markup (e.g. italics on one of
# them) or in upper/lower case.
#
import sys
from pysrt import SubRipFile
def main():
    """Merge consecutive duplicate subtitles of an SRT file.

    Reads the SRT file named by the first command-line argument and walks
    its items in order. An item is folded into the last kept item when it
    starts less than 5 time units (``ordinal``; presumably milliseconds in
    pysrt -- confirm) after the previous item ends and both have the same
    tag-less text, ignoring case. Folding extends the end time of the last
    kept item and prints the two merged index numbers to stdout.

    The result is written next to the input as ``<srtfile>.new``; the
    input file itself is not rewritten.

    Exits with a usage message when no file argument is given.
    """
    if len(sys.argv) < 2:
        sys.exit('usage: %s SRTFILE' % sys.argv[0])

    srtfile = sys.argv[1]
    subs = SubRipFile.open(srtfile)
    newsubs = SubRipFile()

    previous = None
    for current in subs:
        is_duplicate = False
        if previous is not None:
            # Always compare against the immediately preceding input item
            # (not the last kept one), so a run of 3+ duplicates chains
            # into a single merged subtitle.
            gap = current.start - previous.end
            is_duplicate = (
                gap.ordinal < 5
                and previous.text_without_tags.lower()
                == current.text_without_tags.lower()
            )

        if is_duplicate:
            # Stretch the last kept item so it also covers this one.
            newsubs[-1].end.ordinal = current.end.ordinal
            # Single-argument print() emits the same "a + b" line on both
            # Python 2 and Python 3.
            print('%s + %s' % (previous.index, current.index))
        else:
            newsubs.append(current)

        previous = current

    # Fix up the index numbers. For easy diffing against the input, comment
    # this call out.
    newsubs.clean_indexes()
    newsubs.save(srtfile + '.new')
# Run the merge only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
You can’t perform that action at this time.