Levenshtein distance in PHP, with support for multibyte (UTF-8) characters
<?php
/**
 * Computes the Levenshtein (edit) distance between two UTF-8 strings.
 *
 * The distance is the minimum number of single-character insertions,
 * deletions and substitutions needed to turn one string into the other.
 * Characters are compared as whole UTF-8 characters, not bytes, so
 * multibyte scripts are measured correctly.
 *
 * Only two rows of the dynamic-programming table are kept in memory, and
 * the shorter string is always placed on the column axis to keep those
 * rows as small as possible.
 *
 * Requires the mbstring extension and PHP 7.4+ (mb_str_split).
 *
 * @param string $str1 First string (UTF-8).
 * @param string $str2 Second string (UTF-8).
 *
 * @return int Number of edits separating the two strings; 0 when equal.
 */
function levenshtein_php($str1, $str2)
{
    // Identical inputs need no edits; skip all further work.
    if ($str1 === $str2) {
        return 0;
    }

    // Split each string into characters exactly once. Calling mb_substr()
    // inside the nested loops would rescan the string on every iteration.
    $chars1 = mb_str_split($str1, 1, 'UTF-8');
    $chars2 = mb_str_split($str2, 1, 'UTF-8');
    $length1 = count($chars1);
    $length2 = count($chars2);

    // The distance is symmetric, so make $chars2 the shorter sequence.
    if ($length1 < $length2) {
        [$chars1, $chars2] = [$chars2, $chars1];
        [$length1, $length2] = [$length2, $length1];
    }

    // Against an empty string, every character of the other one is an edit.
    if ($length2 === 0) {
        return $length1;
    }

    // Row for the empty prefix of $chars1: distance to each prefix of $chars2.
    $prevRow = range(0, $length2);

    foreach ($chars1 as $i => $c1) {
        // First cell: dropping all $i + 1 characters seen so far.
        $currentRow = [$i + 1];

        foreach ($chars2 as $j => $c2) {
            $cost = ($c1 === $c2) ? 0 : 1;

            $currentRow[] = min(
                $prevRow[$j + 1] + 1,  // cell above: skip $c1
                $currentRow[$j] + 1,   // cell to the left: skip $c2
                $prevRow[$j] + $cost   // diagonal: match or substitute
            );
        }

        $prevRow = $currentRow;
    }

    return $prevRow[$length2];
}
// Demo: print the edit distance for a few word pairs, one result per line.
echo levenshtein_php( 'കട', 'കടല' )."\n";    // 1: one character appended
echo levenshtein_php( 'കട', 'കല' )."\n";     // 1: one character substituted
echo levenshtein_php( 'കട', 'കടി' )."\n";    // 1: one vowel sign appended
echo levenshtein_php( 'abce', 'abcdf' )."\n"; // 2: one insertion plus one substitution
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment