Skip to content

Instantly share code, notes, and snippets.

@kylebgorman
Created July 10, 2011 17:45
Show Gist options
  • Save kylebgorman/1074747 to your computer and use it in GitHub Desktop.
Save kylebgorman/1074747 to your computer and use it in GitHub Desktop.
Demonstration of using the difflib built-in class in Python to compute approximately Levenshtein-optimal alignments, with examples from my past-tense learning experiments
#!/usr/bin/env python
# difflib_demo.py
# Kyle Gorman <kgorman@ling.upenn.edu>
from difflib import SequenceMatcher
if __name__ == '__main__':
from sys import argv
for file in argv[1:]:
for line in open(file, 'r'):
(input, output) = line.rstrip().split()
q = SequenceMatcher(None, input, output)
x_string = []
y_string = []
change_string = []
for (tag, i1, i2, j1, j2) in q.get_opcodes():
if tag == 'insert':
change_len = j2 - j1
x_string.append('*' * change_len)
y_string.append(output[j1:j2])
change_string.append('v' * change_len)
elif tag == 'delete':
change_len = i2 - i1
x_string.append(input[i1:i2])
y_string.append('*' * change_len)
change_string.append('^' * change_len)
elif tag == 'replace':
i_len = i2 - i1
diff_len = (j2 - j1) - (i_len)
x_string.append(input[i1:i2])
y_string.append(output[j1:j2])
if diff_len == 0:
# i think this case is the most common, hence this
# unorthodox arrangement of test cases
change_string.append('!' * i_len)
elif diff_len > 0: # j is longer than i
# in the next two cases, however, the sequence-matcher
# doesn't tell us how to align "replaced" sequences that
# aren't the same length on the input and output side.
# how should we take care of this? i can think of a few
# linguistic cases of interest. One is ablaut verbs in
# English, which are a mixed bag:
#
# laIt 'light'
# l It 'lit'
#
# wITst&nd 'withstand'
# wItstU d 'withstood'
#
# bIsitS 'beseech'
# besOt 'besought'
#
# 'beseech' should be fine. withstand and withstood are
# opposites. i suggest aligning actual material to the
# left, mark it all as '=', and call it a day.
x_string.append('*' * diff_len)
change_string.append('!' * i_len)
change_string.append('v' * diff_len)
else: # diff_len < 0: # i is longer than j
diff_len = -diff_len
y_string.append('*' * diff_len)
change_string.append('!' * (i_len - diff_len))
change_string.append('^' * diff_len)
else: # tag == 'equal'
x_string.append(input[i1:i2])
y_string.append(output[j1:j2])
change_string.append('=' * (i2 - i1))
print ''.join(x_string)
print ''.join(change_string)
print ''.join(y_string)
print
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment