Skip to content

Instantly share code, notes, and snippets.

@binux
Created November 22, 2011 02:10
Show Gist options
  • Save binux/1384677 to your computer and use it in GitHub Desktop.
Save binux/1384677 to your computer and use it in GitHub Desktop.
a diff function base on Advanced Dynamic Programming
#!/usr/bin/python
# -*- coding: utf-8 -*-
'''
Created on 2011-6-7
Modified on 2011-11-22
a diff function base on Advanced Dynamic Programming
http://www.avatar.se/molbioinfo2001/dynprog/adv_dynamic.html
@author: binux
'''
__all__ = ['DP_diff']
from itertools import product
from difflib import SequenceMatcher
from multiprocessing import Pool
def _build_matrix(isjunk, a, b, delete_w=-1.5, insert_w=0, replace_w=-1, min_similar = 0.8):
def _init_matrix():
_matrix = [[0 for col in range(len(a)+1)] for row in range(len(b)+1)]
_opt = [[0 for col in range(len(a)+1)] for row in range(len(b)+1)]
for j in range(1, len(a)+1):
_matrix[0][j] = j*delete_w
_opt[0][j] = "delete"
for i in range(1, len(b)+1):
_matrix[i][0] = i*insert_w
_opt[i][0] = "insert"
return _matrix, _opt
def _fill_matrix(m, o):
_seq2 = -1
_seq_cache = None
def _similar(i, j, _seq2, _seq_cache):
i, j = i-1, j-1
if _seq2 == i:
s = _seq_cache
s.set_seq1(a[j])
else:
s = SequenceMatcher(isjunk, a[j], b[i])
_seq2 = i
_seq_cache = s
ratio = s.quick_ratio()
if ratio < min_similar:
return replace_w
if a[j] in b[i] or b[i] in a[j]:
ratio = (1-(1-ratio)/2)
return ratio*2
for i, j in product(range(1, len(b)+1), range(1, len(a)+1)):
score = _similar(i, j, _seq2, _seq_cache)
a1 = m[i-1][j-1]+score
a2 = m[i-1][j]+insert_w
a3 = m[i][j-1]+delete_w
m[i][j] = max(a1, a2, a3)
if m[i][j] == a1 and score > 0:
o[i][j] = "equal"
elif m[i][j] == a1 and score < 0:
o[i][j] = "replace"
elif m[i][j] == a2:
o[i][j] = "insert" # go down
elif m[i][j] == a3:
o[i][j] = "delete" # go right
return m, o
return _fill_matrix(*_init_matrix())
GlobalPool = Pool()
def build_matrix(*args, **kwargs):
return GlobalPool.apply(_build_matrix, args, kwargs)
class DP_diff:
"""a diff function base on Advanced Dynamic Programming
>>> s = DP_diff(None, "GAATTCAGTTA", "GGATCGA")
>>> s._matrix
[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 2.0, 0.0, -1, -1, -1, -1, -1, 2.0, 0.0, -1, -1], [0, 2.0, 1.0, -1.0, -2, -2, -2, -2, 1.0, 1.0, -1.0, -2], [0, 0.0, 4.0, 3.0, 1.0, -1.0, -3, 0.0, -1.0, 0.0, 0.0, 1.0], [0, -1, 2.0, 3.0, 5.0, 3.0, 1.0, -1.0, -1.0, 1.0, 2.0, 0.0], [0, -1, 0.0, 1.0, 3.0, 4.0, 5.0, 3.0, 1.0, -1.0, 0.0, 1.0], [0, 2.0, 0.0, -1.0, 1.0, 2.0, 3.0, 4.0, 5.0, 3.0, 1.0, -1.0], [0, 0.0, 4.0, 2.0, 0.0, 0.0, 1.0, 5.0, 3.0, 4.0, 2.0, 3.0]]
>>> s.get_opcodes()
[('equal', 0, 1, 0, 1), ('replace', 1, 2, 1, 2), ('equal', 2, 3, 2, 3), ('delete', 3, 4, 3, 3), ('equal', 4, 6, 3, 5), ('delete', 6, 7, 5, 5), ('equal', 7, 8, 5, 6), ('delete', 8, 10, 6, 6), ('equal', 10, 11, 6, 7)]
>>> DP_diff(None, "", "AABBCC").get_opcodes()
[('insert', 0, 0, 0, 6)]
>>> DP_diff(None, "AABBCC", "").get_opcodes()
[('delete', 0, 6, 0, 0)]
"""
def __init__(self, isjunk, a, b, delete_w=-1.5, insert_w=0, replace_w=-1, min_similar = 0.8):
self.isjunk = isjunk
self.a = a
self.b = b
self.delete_w = delete_w
self.insert_w = insert_w
self.replace_w = replace_w
self.min_similar = min_similar
self._build_matrix()
def _build_matrix(self):
if len(self.a) > 10 and len(self.b) > 10:
self._matrix, self._opt = build_matrix(
isjunk = self.isjunk,
a = self.a,
b = self.b,
delete_w = self.delete_w,
insert_w = self.insert_w,
replace_w = self.replace_w,
min_similar = self.min_similar,
)
else:
self._matrix, self._opt = _build_matrix(
isjunk = self.isjunk,
a = self.a,
b = self.b,
delete_w = self.delete_w,
insert_w = self.insert_w,
replace_w = self.replace_w,
min_similar = self.min_similar,
)
def get_opcodes(self):
m = self._matrix
o = self._opt
result = []
a2, b2 = i, j = len(m)-1, len(m[0])-1
while o[i][j]:
opt = o[i][j]
if opt == "equal" or o[i][j] == "replace":
while(o[i][j] == opt):
i, j = i-1, j-1
elif opt == "insert":
while(o[i][j] == opt):
i, j = i-1, j
elif opt == "delete":
while(o[i][j] == opt):
i, j = i, j-1
else:
break
result.append( (opt, j, b2, i, a2) )
a2, b2 = i, j
result.reverse()
return result
def ratio(self):
return self._matrix[-1][-1]
if __name__ == "__main__":
import doctest
doctest.testmod()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment