Last active
July 24, 2022 04:09
-
-
Save Calmarius/24bdf9b753eb0553c148 to your computer and use it in GitHub Desktop.
PHP diff algorithm with minimum gap removal
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/** | |
* Removes gaps between the diff block by fusing the lines into the diff. | |
* | |
* $diff: A diff created by computeDiff. | |
* $minGaps: a number specifying the smallest allowable gap between two diff blocks. | |
* | |
* Returns a new diff (see computeDiff for details.) | |
*/ | |
function removeDiffGaps($diff, $minGaps) | |
{ | |
if (!isset($diff['values'])) return FALSE; | |
if (!isset($diff['mask'])) return FALSE; | |
$values = $diff['values']; | |
$mask = $diff['mask']; | |
$n = count($mask); | |
$newValues = array(); | |
$newMask = array(); | |
$inUnchangedBlock = FALSE; | |
$unchangedLength = 0; | |
$unchangedStart = -1; | |
$pushValues = function($start, $end, $values, $mask) use (&$newValues, &$newMask) | |
{ | |
for ($j = $start; $j < $end; $j++) | |
{ | |
$newValues[] = $values[$j]; | |
$newMask[] = $mask; | |
} | |
}; | |
// Mark short unchanged blocks with '-' and '+' as if they are removed or added at the same time. | |
for ($i = 0; $i < $n; $i++) | |
{ | |
$maskval = $mask[$i]; | |
$value = $values[$i]; | |
if ($maskval == 0) | |
{ | |
if (!$inUnchangedBlock) | |
{ | |
$unchangedLength = 0; | |
$unchangedStart = $i; | |
$inUnchangedBlock = TRUE; | |
} | |
$unchangedLength++; | |
} | |
else | |
{ | |
if ($inUnchangedBlock) | |
{ | |
if ($unchangedLength < $minGaps) | |
{ | |
for ($j = $unchangedStart; $j < $i; $j++) | |
{ | |
$newValues[] = $values[$j]; | |
$newMask[] = -1; | |
$newValues[] = $values[$j]; | |
$newMask[] = 1; | |
} | |
} | |
else | |
{ | |
$pushValues($unchangedStart, $i, $values, 0); | |
} | |
$inUnchangedBlock = FALSE; | |
} | |
$newValues[] = $value; | |
$newMask[] = $maskval; | |
} | |
} | |
if ($inUnchangedBlock) | |
{ | |
$pushValues($unchangedStart, $i, $values, 0); | |
} | |
// Now sort the diff blocks so '-'s precede the '+'s, | |
$values = $newValues; | |
$mask = $newMask; | |
$n = count($mask); | |
$newValues = array(); | |
$newMask = array(); | |
$inUnchangedBlock = FALSE; | |
$minuses = array(); | |
$pluses = array(); | |
$pushMinusPlus = function($minuses, $pluses) use ($pushValues) | |
{ | |
$m = count($minuses); | |
$pushValues(0, $m, $minuses, -1); | |
$m = count($pluses); | |
$pushValues(0, $m, $pluses, 1); | |
}; | |
for ($i = 0; $i < $n; $i++) | |
{ | |
$maskval = $mask[$i]; | |
$value = $values[$i]; | |
if ($maskval == 0) | |
{ | |
if (!$inUnchangedBlock) | |
{ | |
$pushMinusPlus($minuses, $pluses); | |
$inUnchangedBlock = TRUE; | |
} | |
$newValues[] = $value; | |
$newMask[] = $maskval; | |
} | |
else | |
{ | |
if ($inUnchangedBlock) | |
{ | |
$inUnchangedBlock = FALSE; | |
$minuses = array(); | |
$pluses = array(); | |
} | |
if ($maskval == 1) | |
{ | |
$pluses[] = $value; | |
} | |
else | |
{ | |
$minuses[] = $value; | |
} | |
} | |
} | |
if (!$inUnchangedBlock) | |
{ | |
$pushMinusPlus($minuses, $pluses); | |
} | |
return array('values' => $newValues, 'mask' => $newMask); | |
} | |
/** | |
* Computes the diff of two arrays. | |
* | |
* $from, $to: The two arrays to compare. | |
* $lcsOnly: Set true if you are only interested in the longest common subsequence and down want the - and +-s. | |
* | |
* Returns an associative array containing two fields: | |
* | |
* values: Union of the two arrays containing all the common and different elements. | |
* mask: Contains numbers: 0: unchanged, -1: removed, 1: added. | |
* | |
* mask[i] corresponds to values[i] | |
* | |
*/ | |
function computeDiff($from, $to, $lcsOnly = FALSE) | |
{ | |
$diffValues = array(); | |
$diffMask = array(); | |
$dm = array(); | |
$n1 = count($from); | |
$n2 = count($to); | |
$n = min($n1, $n2); | |
// Extract unchanged head and remove unchanged elements from both arrays. | |
$headVal = array(); | |
$headMask = array(); | |
$firstChanged = $n; | |
for ($i = 0; $i < $n; $i++) | |
{ | |
if ($from[$i] != $to[$i]) | |
{ | |
$firstChanged = $i; | |
break; | |
} | |
$headVal[] = $from[$i]; | |
$headMask[] = 0; | |
} | |
$from = array_slice($from, $firstChanged); | |
$to = array_slice($to, $firstChanged); | |
$n1 -= $firstChanged; | |
$n2 -= $firstChanged; | |
$n -= $firstChanged; | |
// Extract unchanged tail and remove unchanged elements from both arrays. | |
$tailVal = array(); | |
$tailMask = array(); | |
$lastChanged = $n; | |
for ($i = 1; $i <= $n; $i++) | |
{ | |
if ($from[$n1 - $i] != $to[$n2 - $i]) | |
{ | |
$lastChanged = $i - 1; | |
break; | |
} | |
$tailVal[] = $from[$n1 - $i]; | |
$tailMask[] = 0; | |
} | |
$tailVal = array_reverse($tailVal); | |
$from = array_slice($from, 0, $n1 - $lastChanged); | |
$to = array_slice($to, 0, $n2 - $lastChanged); | |
$n1 -= $lastChanged; | |
$n2 -= $lastChanged; | |
$n -= $lastChanged; | |
// Generate LCS matrix. First is the column, second is the row. | |
for ($j = -1; $j < $n2; $j++) $dm[-1][$j] = 0; | |
for ($i = -1; $i < $n1; $i++) $dm[$i][-1] = 0; | |
ksort($dm[-1]); | |
ksort($dm); | |
for ($i = 0; $i < $n1; $i++) | |
{ | |
for ($j = 0; $j < $n2; $j++) | |
{ | |
if ($from[$i] == $to[$j]) | |
{ | |
$ad = $dm[$i - 1][$j - 1]; | |
$dm[$i][$j] = $ad + 1; | |
} | |
else | |
{ | |
$a1 = $dm[$i - 1][$j]; | |
$a2 = $dm[$i][$j - 1]; | |
$dm[$i][$j] = max($a1, $a2); | |
} | |
} | |
} | |
// Traverse the diff matrix. | |
$i = $n1 - 1; | |
$j = $n2 - 1; | |
while (($i > -1) || ($j > -1)) | |
{ | |
if ($j > -1) | |
{ | |
if ($dm[$i][$j - 1] == $dm[$i][$j]) | |
{ | |
if (!$lcsOnly) | |
{ | |
$diffValues[] = $to[$j]; | |
$diffMask[] = 1; | |
} | |
$j--; | |
continue; | |
} | |
} | |
if ($i > -1) | |
{ | |
if ($dm[$i - 1][$j] == $dm[$i][$j]) | |
{ | |
if (!$lcsOnly) | |
{ | |
$diffValues[] = $from[$i]; | |
$diffMask[] = -1; | |
} | |
$i--; | |
continue; | |
} | |
} | |
{ | |
$diffValues[] = $from[$i]; | |
$diffMask[] = 0; | |
$i--; | |
$j--; | |
} | |
} | |
$diffValues = array_reverse($diffValues); | |
$diffMask = array_reverse($diffMask); | |
$diffValues = array_merge($headVal, $diffValues, $tailVal); | |
$diffMask = array_merge($headMask, $diffMask, $tailMask); | |
if ($lcsOnly) return $diffValues; | |
return array('values' => $diffValues, 'mask' => $diffMask); | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Great tool! Thank you.