kylebgorman/difflib_demo.py

## difflib_demo.py
#!/usr/bin/env python
# difflib_demo.py
# Kyle Gorman <kgorman@ling.upenn.edu>

from difflib import SequenceMatcher

if __name__ == '__main__':

    from sys import argv
    for file in argv[1:]:
        for line in open(file, 'r'):
            (input, output) = line.rstrip().split()
            q = SequenceMatcher(None, input, output)
            x_string = []
            y_string = []
            change_string = []
            for (tag, i1, i2, j1, j2) in q.get_opcodes():
                if tag == 'insert':
                    change_len = j2 - j1
                    x_string.append('*' * change_len)
                    y_string.append(output[j1:j2])
                    change_string.append('v' * change_len)
                elif tag == 'delete':
                    change_len = i2 - i1
                    x_string.append(input[i1:i2])
                    y_string.append('*' * change_len)
                    change_string.append('^' * change_len)
                elif tag == 'replace':
                    i_len = i2 - i1
                    diff_len = (j2 - j1) - (i_len)
                    x_string.append(input[i1:i2])
                    y_string.append(output[j1:j2])
                    if diff_len == 0:
                        # i think this case is the most common, hence this
                        # unorthodox arrangement of test cases
                        change_string.append('!' * i_len)
                    elif diff_len > 0: # j is longer than i
                        # in the next two cases, however, the sequence-matcher
                        # doesn't tell us how to align "replaced" sequences that
                        # aren't the same length on the input and output side.
                        # how should we take care of this? i can think of a few
                        # linguistic cases of interest. One is ablaut verbs in
                        # English, which are a mixed bag:
                        #
                        # laIt 'light'
                        # l It 'lit'
                        #
                        # wITst&nd 'withstand'
                        # wItstU d 'withstood'
                        #
                        # bIsitS    'beseech'
                        # besOt     'besought'
                        #
                        # 'beseech' should be fine. withstand and withstood are
                        # opposites. i suggest aligning actual material to the
                        # left, mark it all as '=', and call it a day.
                        x_string.append('*' * diff_len)
                        change_string.append('!' * i_len)
                        change_string.append('v' * diff_len)
                    else: # diff_len < 0: # i is longer than j
                        diff_len = -diff_len
                        y_string.append('*' * diff_len)
                        change_string.append('!' * (i_len - diff_len))
                        change_string.append('^' * diff_len)
                else: # tag == 'equal'
                    x_string.append(input[i1:i2])
                    y_string.append(output[j1:j2])
                    change_string.append('=' * (i2 - i1))
            print ''.join(x_string)
            print ''.join(change_string)
            print ''.join(y_string)
            print
	#!/usr/bin/env python
	# difflib_demo.py
	# Kyle Gorman <kgorman@ling.upenn.edu>

	from difflib import SequenceMatcher

	if __name__ == '__main__':

	from sys import argv
	for file in argv[1:]:
	for line in open(file, 'r'):
	(input, output) = line.rstrip().split()
	q = SequenceMatcher(None, input, output)
	x_string = []
	y_string = []
	change_string = []
	for (tag, i1, i2, j1, j2) in q.get_opcodes():
	if tag == 'insert':
	change_len = j2 - j1
	x_string.append('' change_len)
	y_string.append(output[j1:j2])
	change_string.append('v' * change_len)
	elif tag == 'delete':
	change_len = i2 - i1
	x_string.append(input[i1:i2])
	y_string.append('' change_len)
	change_string.append('^' * change_len)
	elif tag == 'replace':
	i_len = i2 - i1
	diff_len = (j2 - j1) - (i_len)
	x_string.append(input[i1:i2])
	y_string.append(output[j1:j2])
	if diff_len == 0:
	# i think this case is the most common, hence this
	# unorthodox arrangement of test cases
	change_string.append('!' * i_len)
	elif diff_len > 0: # j is longer than i
	# in the next two cases, however, the sequence-matcher
	# doesn't tell us how to align "replaced" sequences that
	# aren't the same length on the input and output side.
	# how should we take care of this? i can think of a few
	# linguistic cases of interest. One is ablaut verbs in
	# English, which are a mixed bag:
	#
	# laIt 'light'
	# l It 'lit'
	#
	# wITst&nd 'withstand'
	# wItstU d 'withstood'
	#
	# bIsitS 'beseech'
	# besOt 'besought'
	#
	# 'beseech' should be fine. withstand and withstood are
	# opposites. i suggest aligning actual material to the
	# left, mark it all as '=', and call it a day.
	x_string.append('' diff_len)
	change_string.append('!' * i_len)
	change_string.append('v' * diff_len)
	else: # diff_len < 0: # i is longer than j
	diff_len = -diff_len
	y_string.append('' diff_len)
	change_string.append('!' * (i_len - diff_len))
	change_string.append('^' * diff_len)
	else: # tag == 'equal'
	x_string.append(input[i1:i2])
	y_string.append(output[j1:j2])
	change_string.append('=' * (i2 - i1))
	print ''.join(x_string)
	print ''.join(change_string)
	print ''.join(y_string)
	print