Skip to content

Instantly share code, notes, and snippets.

@arktrin
Last active September 11, 2015 11:32
Show Gist options
  • Save arktrin/b02959d5f0ce5f7406c0 to your computer and use it in GitHub Desktop.
Save arktrin/b02959d5f0ce5f7406c0 to your computer and use it in GitHub Desktop.
Find the longest common substring for every pair of subtitle (.srt) files located in the same folder as this script.
import itertools, glob, os, re, difflib
import timeit as ti
# Substrings removed from subtitle text before comparison, so that punctuation,
# italics markup, whitespace and line breaks cannot interrupt a match.
# '\r' is included so files with CRLF line endings compare equal to LF files.
_REPLACEMENTS = (
    (',', ''), ('</i>', ''), ('<i>', ''), ('\n', ''), ('\r', ''),
    ('-', ''), (' ', ''), (':', ''), ('#', ''),
)

# Every line that begins with a digit is dropped.  In an .srt file this removes
# the cue numbers and the "00:00:01,000 --> 00:00:02,000" timing lines (and, as
# a side effect, any dialogue line that happens to start with a digit).
_DIGIT_LINE_RE = re.compile(r'(?m)^\d.*\n?')

# A pair is reported only when its longest match is strictly longer than this.
_MIN_REPORT_SIZE = 8

# A progress counter is printed after every this many compared pairs.
_PROGRESS_EVERY = 50


def _normalize(text):
    """Return *text* lower-cased with digit-led lines and filler removed."""
    text = _DIGIT_LINE_RE.sub('', text).lower()
    for old, new in _REPLACEMENTS:
        text = text.replace(old, new)
    return text


def _read_normalized(path):
    """Read the file at *path* and return its normalised contents."""
    # The context manager closes the handle even if read() raises.
    with open(path, 'r') as handle:
        return _normalize(handle.read())


def _display_name(path):
    """Return the file name of *path* without directory or extension."""
    return os.path.splitext(os.path.basename(path))[0]


def _longest_common(first, second):
    """Return ``(match, text)`` for the longest block shared by two strings.

    *match* is the ``Match(a, b, size)`` tuple from difflib and *text* is
    exactly the ``size`` matched characters taken from *first*.
    """
    # autojunk must be off: with the default heuristic, any character that
    # makes up more than 1% of a 200+ character sequence is dropped from the
    # index, so on ordinary text the reported block is not the longest one.
    # Disabling it is slower on large inputs but required for a correct answer.
    matcher = difflib.SequenceMatcher(None, first, second, autojunk=False)
    match = matcher.find_longest_match(0, len(first), 0, len(second))
    return match, first[match.a:match.a + match.size]


def main(directory=None):
    """Print the longest shared passage for each pair of .srt files.

    Args:
        directory: Folder to scan for ``*.srt`` files.  Defaults to the
            folder containing this script.

    Output, in order: the number of file pairs; then, for every pair whose
    longest match exceeds ``_MIN_REPORT_SIZE`` characters, one line with the
    two file names and one line with the match tuple and the matched text;
    plus a running pair count every ``_PROGRESS_EVERY`` pairs.
    """
    if directory is None:
        directory = os.path.dirname(os.path.abspath(__file__))

    # Sorted so the pairing order (and therefore the output) is deterministic.
    paths = sorted(glob.glob(os.path.join(directory, '*.srt')))
    pairs = list(itertools.combinations(paths, 2))
    print(len(pairs))

    # Read and normalise every file once rather than once per pair.
    texts = dict((path, _read_normalized(path)) for path in paths)

    for done, (first, second) in enumerate(pairs, 1):
        match, shared = _longest_common(texts[first], texts[second])
        if match.size > _MIN_REPORT_SIZE:
            print('%s %s' % (_display_name(first), _display_name(second)))
            print('%s %s' % (match, shared))
        if done % _PROGRESS_EVERY == 0:
            print(done)


if __name__ == '__main__':
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment