Instantly share code, notes, and snippets.

# memononen/diff3.cpp

Created December 7, 2021 19:33
Show Gist options
• Save memononen/2c83d183c2749e5f4a493ce7ddb73f4d to your computer and use it in GitHub Desktop.
3-way merge based on O(NP) Myers diff and diff3, merging per item
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode characters
 #include #include #include #include // Based on // "An O(NP) Sequence Comparison Algorithm" by Sun Wu, Udi Manber and Gene Myers // - https://publications.mpi-cbg.de/Wu_1990_6334.pdf // - Good article visualizing Myer's older algorithm: https://epxx.co/artigos/diff_en.html // // "A Formal Investigation of Diff3" // - https://www.cis.upenn.edu/~bcpierce/papers/diff3-short.pdf enum class EditType { Insert, Delete, Substitute, Match }; // Describes how one sequence is changed to another. struct Edit { Edit() = default; Edit(EditType _type, int _base, int _change) : type(_type), base(_base), change(_change) {} EditType type = EditType::Match; // type of edit int base = INT32_MAX; // index in string a int change = INT32_MAX; // index in string b }; struct Diag { Diag() = default; Diag(int _x, int _y, int _len, int _next) : x(_x), y(_y), length(_len), next(_next) {} int x = 0; // start position int y = 0; int length = 0; // diagonal length int next = -1; // next diagonal towards the start }; struct Point { Point() = default; Point(int _y, int _score, int _diag) : y(_y), score(_score), diag(_diag) {} int y = 0; // furthest y int score = 0; int diag = -1; // nearest diagonal }; enum class MergeType { Conflict, Base, Left, Right }; // Describes a change, type defines from which sequence should be used for merged. struct Merge { Merge() = default; Merge(MergeType _type, EditType _edit, const int _base, const int _left, const int _right) : type(_type), edit(_edit), base(_base), left(_left), right(_right) {} MergeType type = MergeType::Base; EditType edit = EditType::Match; int base = 0; int left = 0; int right = 0; }; static void snake(const int k, std::span left, std::span right, std::span fp, const int fp0, std::vector& diags) { constexpr int insRemScore = 1; constexpr int subsScore = 4; // Making substitute cheaper than match favors longer diagonals and reduces small edits (chaffs). constexpr int matchScore = 2; const Point& belowPt = fp[fp0 + k-1]; const Point& rightPt = fp[fp0 + k+1]; const Point& prevPt = fp[fp0 + k]; int prevDiag = prevPt.diag; int y = prevPt.y+1; int score = prevPt.score + subsScore; if ((rightPt.score + insRemScore) > score) { prevDiag = rightPt.diag; y = rightPt.y; score = rightPt.score + insRemScore; } if ((belowPt.score + insRemScore) > score) { prevDiag = belowPt.diag; y = belowPt.y+1; score = belowPt.score + insRemScore; } int x = y - k; int length = 0; const int N = left.size(); const int M = right.size(); while (x < N && y < M && left[x] == right[y]) { x++; y++; length++; score += matchScore; } if (length > 0) { diags.emplace_back(x - length, y - length, length, prevDiag); fp[fp0 + k] = Point(y, score, diags.size() - 1); } else { fp[fp0 + k] = Point(y, score, prevDiag); } } // Returns shortest(ish) edit script between left and right. std::vector diff(std::span left, std::span right) { bool reverse = false; if (left.size() > right.size()) { std::swap(left, right); reverse = true; } const int N = left.size(); const int M = right.size(); std::vector fp; std::vector diags; const int delta = M - N; const int fp0 = N + 1; // zero offset for furthest point indexing, indexing can go negative. fp.resize((N+1) + (M+1) + 1); // All paths will lead to empty diagonal at zero. diags.push_back(Diag(0, 0, 0, -1)); std::fill(fp.begin(), fp.end(), Point(-1,0,0)); // Calculate common diagonal sequences for (int p = 0; fp[fp0 + delta].y != M; p++) { for (int k = -p; k <= delta-1; k++) snake(k, left, right, fp, fp0, diags); for (int k = delta+p; k >= delta+1; k--) snake(k, left, right, fp, fp0, diags); snake(delta, left, right, fp, fp0, diags); } // Backtrace shortest edit script std::vector diff; Diag prevDiag(N, M, 0, -1); for (int i = fp[fp0 + delta].diag; i != -1; i = diags[i].next) { const Diag& diag = diags[i]; // The path between the diagonals is handled with substitutes, inserts, and deletes. int numDel = prevDiag.x - (diag.x + diag.length); int numIns = prevDiag.y - (diag.y + diag.length); int numSubs = std::min(numDel, numIns); numDel -= numSubs; numIns -= numSubs; int numMatch = diag.length; int x = prevDiag.x, y = prevDiag.y; if (reverse) { std::swap(numDel, numIns); std::swap(x, y); } // Store edit for each item in sequence, walking backwards. for (int j = 0; j < numIns; j++) { y--; diff.emplace_back(EditType::Insert, x, y); } for (int j = 0; j < numDel; j++) { x--; diff.emplace_back(EditType::Delete, x, y); } for (int j = 0; j < numSubs; j++) { x--; y--; diff.emplace_back(EditType::Substitute, x, y); } for (int j = 0; j < numMatch; j++) { x--; y--; diff.emplace_back(EditType::Match, x, y); } prevDiag = diag; } // Backtrace left the sequence in reverse, flip it. std::reverse(diff.begin(), diff.end()); return diff; } // Returns true if index is valid for specific diff. static bool isValidIndex(const int index, std::span diff) { return index < (int)diff.size(); } // Returns index in base seq, or "infinity" if the index is out of bounds (i.e. iterator has finished). static int getBase(const int index, std::span diff) { return index < (int)diff.size() ? diff[index].base : INT32_MAX; } // Merges diffs into operations that can be used to alter each of the sequences to get the combined result. std::vector merge3(std::span base, std::span left, std::span right, std::span leftDiff, std::span rightDiff) { std::vector res; int indexLeft = 0; int indexRight = 0; int lastLeftChange = 0; int lastRightChange = 0; while (isValidIndex(indexLeft, leftDiff) || isValidIndex(indexRight, rightDiff)) { const int leftBasePos = getBase(indexLeft, leftDiff); const int rightBasePos = getBase(indexRight, rightDiff); const int basePos = std::min(leftBasePos, rightBasePos); Edit leftEdit(EditType::Match, basePos, lastLeftChange); if (isValidIndex(indexLeft, leftDiff) && leftDiff[indexLeft].base == basePos) { leftEdit = leftDiff[indexLeft]; indexLeft++; } Edit rightEdit(EditType::Match, basePos, lastRightChange); if (isValidIndex(indexRight, rightDiff) && rightDiff[indexRight].base == basePos) { rightEdit = rightDiff[indexRight]; indexRight++; } // Merge thruth table. There likely is a simple logic that would handle it without the convoluted ifs. // // L / R | ins del subs match // ------+----------------------------- // ins | X SL LR IL // del | SR D IR DR // subs | RL IL X SL // match | IR DL SR MB if (leftEdit.type == EditType::Insert && rightEdit.type == EditType::Insert) { // X - Conflict res.emplace_back(MergeType::Conflict, EditType::Insert, basePos, leftEdit.change, rightEdit.change); } else if (leftEdit.type == EditType::Substitute && rightEdit.type == EditType::Substitute) { // X - Conflict res.emplace_back(MergeType::Conflict, EditType::Substitute, basePos, leftEdit.change, rightEdit.change); } else if ((leftEdit.type == EditType::Insert && rightEdit.type == EditType::Delete) || (leftEdit.type == EditType::Substitute && rightEdit.type == EditType::Match)) { // SL - substitute left res.emplace_back(MergeType::Left, EditType::Substitute, basePos, leftEdit.change, rightEdit.change); } else if ((leftEdit.type == EditType::Delete && rightEdit.type == EditType::Insert) || (leftEdit.type == EditType::Match && rightEdit.type == EditType::Substitute)) { // SR - substitute right res.emplace_back(MergeType::Right, EditType::Substitute, basePos, leftEdit.change, rightEdit.change); } else if ((leftEdit.type == EditType::Insert && rightEdit.type == EditType::Match) || (leftEdit.type == EditType::Substitute && rightEdit.type == EditType::Delete)) { // IL - insert left res.emplace_back(MergeType::Left, EditType::Insert, basePos, leftEdit.change, rightEdit.change); } else if ((leftEdit.type == EditType::Match && rightEdit.type == EditType::Insert) || (leftEdit.type == EditType::Delete && rightEdit.type == EditType::Substitute)) { // IR - insert right res.emplace_back(MergeType::Right, EditType::Insert, basePos, leftEdit.change, rightEdit.change); } else if (leftEdit.type == EditType::Match && rightEdit.type == EditType::Match) { // MB - match base res.emplace_back(MergeType::Base, EditType::Match, basePos, leftEdit.change, rightEdit.change); } else if (leftEdit.type == EditType::Match && rightEdit.type == EditType::Delete) { // DL - delete left res.emplace_back(MergeType::Left, EditType::Delete, basePos, leftEdit.change, rightEdit.change); } else if (leftEdit.type == EditType::Delete && rightEdit.type == EditType::Match) { // DR - delete right res.emplace_back(MergeType::Right, EditType::Delete, basePos, leftEdit.change, rightEdit.change); } else if (leftEdit.type == EditType::Insert && rightEdit.type == EditType::Substitute) { // LR - left and right res.emplace_back(MergeType::Left, EditType::Insert, basePos, leftEdit.change, rightEdit.change); res.emplace_back(MergeType::Right, EditType::Substitute, basePos, leftEdit.change, rightEdit.change); } else if (leftEdit.type == EditType::Substitute && rightEdit.type == EditType::Insert) { // RL - right and left res.emplace_back(MergeType::Right, EditType::Insert, basePos, leftEdit.change, rightEdit.change); res.emplace_back(MergeType::Left, EditType::Substitute, basePos, leftEdit.change, rightEdit.change); } else if (leftEdit.type == EditType::Delete && rightEdit.type == EditType::Delete) { // D - delete res.emplace_back(MergeType::Left, EditType::Delete, basePos, leftEdit.change, rightEdit.change); res.emplace_back(MergeType::Right, EditType::Delete, basePos, leftEdit.change, rightEdit.change); } lastLeftChange = leftEdit.change; lastRightChange = rightEdit.change; } return res; } // Merge changes to right in-situ void resolveIntoRight(std::span merge, std::span base, std::span left, std::vector& right) { int offset = 0; for (const Merge& m : merge) { if (m.type == MergeType::Conflict) { // Conflicting insert or replaces, arbitrarily keep right (could be any) } else if (m.type == MergeType::Left && m.edit == EditType::Substitute) { // Substitute from left right[m.right + offset] = left[m.left]; } else if (m.type == MergeType::Left && m.edit == EditType::Insert) { // Insert from left right.insert(right.begin() + m.right + offset, left[m.left]); offset++; } else if (m.type == MergeType::Right && m.edit == EditType::Delete) { // Right delete. right.erase(right.begin() + m.right + offset); offset--; } else if (m.type == MergeType::Right && (m.edit == EditType::Substitute || m.edit == EditType::Insert)) { // Right, keep. } else if (m.type == MergeType::Base && m.edit == EditType::Match) { // Match with base, keep } } } void printDiff(std::span left, std::span right, std::span diff) { /* for (const Edit& ed : diff) { switch (ed.type) { case EditType::Match: printf("\u001b[0m"); printf("%c %c", left[ed.base], right[ed.change]); printf("\u001b[0m %d, %d\n", ed.base, ed.change); break; case EditType::Substitute: printf("\u001b[46m%c %c", left[ed.base], right[ed.change]); printf("\u001b[0m %d, %d\n", ed.base, ed.change); break; case EditType::Insert: printf(" \u001b[42m%c", right[ed.change]); printf("\u001b[0m %d, %d\n", ed.base, ed.change); break; case EditType::Delete: printf("\u001b[0m%c", left[ed.base]); printf("\u001b[0m %d, %d\n", ed.base, ed.change); break; } } printf("\u001b[0m\n");*/ printf(" Base: "); for (const Edit& ed : diff) { switch (ed.type) { case EditType::Match: printf("\u001b[0m"); printf("%c", left[ed.base]); break; case EditType::Substitute: printf("\u001b[0m"); printf("%c", left[ed.base]); break; case EditType::Insert: printf("\u001b[0m"); printf(" "); break; case EditType::Delete: printf("\u001b[41m"); printf("%c", left[ed.base]); break; default: break; } } printf("\u001b[0m\n"); // Change printf(" Change: "); for (const Edit& ed : diff) { switch (ed.type) { case EditType::Match: printf("\u001b[0m"); printf("%c", right[ed.change]); break; case EditType::Substitute: printf("\u001b[43m"); printf("%c", right[ed.change]); break; case EditType::Insert: printf("\u001b[42m"); printf("%c", right[ed.change]); break; case EditType::Delete: printf("\u001b[0m"); printf(" "); break; default: break; } } printf("\u001b[0m\n"); } void printMerge(std::span base, std::span left, std::span right, std::span merge) { /* char mergeMarker[] = "!BLR"; char editMarker[] = "?+-x="; printf("Diff\n"); for (const Merge& m : merge) { printf("%c%c %d, %d, %d %c %c %c\n", mergeMarker[(int)m.type], editMarker[(int)m.edit], m.base, m.left, m.right, base[m.base], left[m.left], right[m.right]); } printf("-\n");*/ printf(" "); for (const Merge& m : merge) { if (m.type == MergeType::Conflict) { // Conflict, arbitrarily choose right printf("\u001b[41m%c", right[m.right]); } else if (m.type == MergeType::Left && (m.edit == EditType::Substitute || m.edit == EditType::Insert)) { // Left printf("\u001b[44m%c", left[m.left]); } else if (m.type == MergeType::Right && (m.edit == EditType::Substitute || m.edit == EditType::Insert)) { // Right printf("\u001b[46m%c", right[m.right]); } else if (m.type == MergeType::Base && m.edit == EditType::Match) { // Match printf("\u001b[0m%c", base[m.base]); } } printf("\u001b[0m\n"); } void printv(std::span arr) { printf(" "); for (const char c : arr) printf("%c", c); printf("\n"); } std::vector makev(const char* s) { std::vector str; while (*s) { str.push_back(*s); s++; } return str; } int main() { std::vector base = makev("the quick fox jumps ovre some lazy dog"); std::vector left = makev("the abcck brown fox jumped ovre a dog"); std::vector right = makev("the qzick brown fox jumps over some record dog"); std::vector leftDiff = diff(base, left); std::vector rightDiff = diff(base, right); std::vector merged = merge3(base, left, right, leftDiff, rightDiff); printf("\nDiff Base > \u001b[44mLeft\u001b[0m\n"); printDiff(base, left, leftDiff); printf("\nDiff Base > \u001b[46mRight\u001b[0m\n"); printDiff(base, right, rightDiff); printf("\nMerged\n"); printMerge(base, left, right, merged); resolveIntoRight(merged, base, left, right); printv(right); printf("\n"); return 0; }

### memononen commented Dec 7, 2021

``````mikko@diff % g++ -std=c++2a -g3 diff3.cpp -o diff3
mikko@diff % ./diff3

Diff Base > Left
Base:   the quick       fox jumps  ovre some lazy dog
Change: the abcck brown fox jumped ovre       a   dog

Diff Base > Right
Base:   the quick       fox jumps ovre some lazy   dog
Change: the qzick brown fox jumps over some record dog

Merged
the azcck brown fox jumped over record dog
the azcck brown fox jumped over record dog
``````