Instantly share code, notes, and snippets.

# memononen/diff.cpp

Last active November 20, 2021 19:01
Show Gist options
• Save memononen/ff90a46ff55c20fc4655952c899239ff to your computer and use it in GitHub Desktop.
O(NP) diff with backtracking
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode characters
 #include #include #include // Based on "An O(NP) Sequence Comparison Algorithm" by Sun Wu, Udi Manber and Gene Myers struct Edit { enum Type { Insert, Delete, Common }; Edit() = default; Edit(Type _type, int _a, int _b, int _n) : type(_type), a(_a), b(_b), n(_n) {} Type type = Insert; // type of edit int a = 0; // index in string a int b = 0; // index in string b int n = 0; // number of characters }; struct Diag { Diag() = default; Diag(int _x, int _y, int _len, int _next) : x(_x), y(_y), len(_len), next(_next) {} int x = 0; // start position int y = 0; int len = 0; // length int next = -1; // next diagonal towards the start }; struct Pt { Pt() = default; Pt(int _y, int _diag) : y(_y), diag(_diag) {} int y = 0; // furthest y int diag = -1; // nearest diagonal }; static void snake(const int k, const char* a, const char* b, const int m, const int n, std::vector& fp, const int fp0, std::vector& diags) { const Pt& pa = fp[fp0 + k-1]; const Pt& pb = fp[fp0 + k+1]; const bool below = (pa.y+1) > pb.y; const int diag = below ? pa.diag : pb.diag; int y = below ? (pa.y+1) : pb.y; int x = y - k; int len = 0; while (x < m && y < n && a[x] == b[y]) { x++; y++; len++; } if (len > 0) { diags.push_back(Diag(x-len, y-len, len, diag)); fp[fp0 + k] = Pt(y, diags.size() - 1); } else { fp[fp0 + k] = Pt(y, diag); } }; std::vector diff(const char* a, const char* b) { int m = strlen(a); int n = strlen(b); bool reverse = false; if (m > n) { std::swap(a, b); std::swap(m, n); reverse = true; } std::vector fp; std::vector diags; const int delta = n - m; const int size = (m+1) + (n+1) + 1; const int fp0 = m+1; // zero offset for furthest point indexing, it can go negative. fp.resize(size); // All paths will lead to empty diagonal at zero. diags.push_back(Diag(0, 0, 0, -1)); std::fill(fp.begin(), fp.end(), Pt(-1,0)); // Calculate common diagonal sequences for (int p = 0; fp[fp0 + delta].y != n; p++) { for (int k = -p; k <= delta-1; k++) snake(k, a,b,m,n,fp,fp0,diags); for (int k = delta+p; k >= delta+1; k--) snake(k, a,b,m,n,fp,fp0,diags); snake(delta, a,b,m,n,fp,fp0,diags); } // Backtrace shortest edit script std::vector ses; Diag pdiag(m, n, 0, -1); for (int d = fp[fp0 + delta].diag; d != -1; d = diags[d].next) { const Diag& diag = diags[d]; // The path between the diagonals is handled with inserts and deletes. int ndel = pdiag.x - (diag.x+diag.len); int nins = pdiag.y - (diag.y+diag.len); int ncom = diag.len; int x = diag.x, y = diag.y; if (reverse) { std::swap(ndel, nins); std::swap(x, y); } if (nins > 0) ses.push_back(Edit(Edit::Insert, x+ncom+ndel, y+ncom, nins)); if (ndel > 0) ses.push_back(Edit(Edit::Delete, x+ncom, y+ncom, ndel)); if (ncom > 0) ses.push_back(Edit(Edit::Common, x, y, ncom)); pdiag = diag; } // Backtrace left the sequence in reverse, flip it. std::reverse(ses.begin(), ses.end()); return ses; } int main() { const char* b = "Lorem ipsum dolor sit amet"; const char* a = "A quick brown fox jumps over the lazy dog"; std::vector ses = diff(a, b); printf("A: "); for (const Edit& ed : ses) { switch (ed.type) { case Edit::Common: for (int k = 0; k < ed.n; k++) printf("%c", a[ed.a+k]); break; case Edit::Insert: for (int k = 0; k < ed.n; k++) printf(" "); break; case Edit::Delete: printf("\u001b[31m"); for (int k = 0; k < ed.n; k++) printf("%c", a[ed.a+k]); printf("\u001b[0m"); break; } } printf("\u001b[0m\n"); printf("B: "); for (const Edit& ed : ses) { switch (ed.type) { case Edit::Common: for (int k = 0; k < ed.n; k++) printf("%c", b[ed.b+k]); break; case Edit::Insert: printf("\u001b[32m"); for (int k = 0; k < ed.n; k++) printf("%c", b[ed.b+k]); printf("\u001b[0m"); break; case Edit::Delete: for (int k = 0; k < ed.n; k++) printf(" "); break; } } printf("\u001b[0m\n"); return 0; }

### memononen commented Nov 19, 2021

``````A: A quick br own fox ju  m  ps    ove  r   the lazy dog
B:           Lo         rem ipsum do  lor sit    a      met
``````