Skip to content

Instantly share code, notes, and snippets.

@zwjzxh520
Last active November 8, 2016 04:37
Show Gist options
  • Save zwjzxh520/7e62a850ab8a6fd40417e4be0f2a25d2 to your computer and use it in GitHub Desktop.
Save zwjzxh520/7e62a850ab8a6fd40417e4be0f2a25d2 to your computer and use it in GitHub Desktop.
最小编辑距离算法比较文本异同 Levenshtein Distance。引用自https://code.google.com/archive/p/clamdemo/downloads
<?php
/***************************************************************************
*
* Copyright (c) 2011 log4myself.com, Inc. All Rights Reserved
*
**************************************************************************/
/**
* @file MinumDistance.php
* @author caojiandong(neu.loner@gmail.com)
* @date 2011/11/24 16:03:59
* @brief 计算最小编辑距离类,返回输出过程
*
**/
class MinumDistance {
/**
* 计算编辑距离的核心算法
*
* return array(
* 'distance' => 最小编辑距离
* 'progress' => 编辑过程
* )
**/
const MAX = 100000;
private function execute($src,$tgt){
$progress = array();
$m = count($src);
$n = count($tgt);
$d[self::MAX][self::MAX] = array();
$type[self::MAX][self::MAX] = array();
$back[self::MAX][self::MAX][2] = array();
$i = 0;
$j = 0;
$backi = 0;
$backj = 0;
$backtype = 0;
for($i = 0;$i <= $m;$i ++)
$d[$i][0] = $i;
for($j = 0;$j <= $n;$j ++)
$d[0][$j] = $j;
for($j = 1;$j <= $n;$j ++){
for($i = 1;$i <= $m;$i ++){
if($src[$i-1] == $tgt[$j-1]){
$d[$i][$j] = $d[$i-1][$j-1];
$back[$i][$j][0] = $i-1;
$back[$i][$j][1] = $j-1;
$type[$i][$j] = 0;
}else{
$d[$i][$j] = $d[$i-1][$j]+1;
$back[$i][$j][0] = $i-1;
$back[$i][$j][1] = $j;
$type[$i][$j] = 2;
if($d[$i][$j]>$d[$i][$j-1]+1){
$d[$i][$j] = $d[$i][$j-1]+1;
$back[$i][$j][0] = $i;
$back[$i][$j][1] = $j-1;
$type[$i][$j] = 1;
}
if($d[$i][$j]>$d[$i-1][$j-1]+1){
$d[$i][$j] = $d[$i-1][$j-1]+1;
$back[$i][$j][0] = $i-1;
$back[$i][$j][1] = $j-1;
$type[$i][$j] = 3;
}
}
}
}
$backi = $m;
$backj = $n;
while($backi && $backj){
$backtype = $type[$backi][$backj];
switch($backtype){
case 0:
$progress['copy'][] = array($backi-1,$backj-1);
break;
case 1:
$progress['del'][] = $backj-1;
break;
case 2:
$progress['insert'][] = $backi-1;
break;
case 3:
$progress['replace'][] = array($backi-1,$backj-1);
break;
}
$i = $back[$backi][$backj][0];
$j = $back[$backi][$backj][1];
$backi = $i;
$backj = $j;
}
return array(
'distance' => $d[$m][$n],
'progress' => $progress,
);
}
public function getMinumDistance($arrSrc,$arrTgt){
//接受string参数,如果是string,则进行stringToarray的转化
if(is_string($arrSrc) && is_string($arrTgt)){
$arrSrc = $this->getArrayFromString($arrSrc);
$arrTgt = $this->getArrayFromString($arrTgt);
}
if((!is_string($arrSrc) || !is_string($arrTgt)) &&
(!is_array($arrSrc) || !is_array($arrTgt)))
return false;
return $this->execute($arrSrc,$arrTgt);
}
public function __construct(){
}
//将文本分字成数组
private function getArrayFromString($str){
$result = array();
$strLen = mb_strlen($str,"UTF8");
$i = 0;
for($i = 0; $i < $strLen ; $i ++){
$result[] = mb_substr($str,$i,1);
}
return $result;
}
}
//显示不同之处的示例代码
$src = 'hello world 打倒旧世界';
$tgt = 'heelo wolrd!开创新世界';
$md = new MinumDistance();
$result = $md->getMinumDistance($src, $tgt);
$p = $result['progress'];
$index = 0;
$srcProgress = $tgtProgress = [];
foreach ($p['copy'] as $value) {
if ($value[0] == $value[1]) {
$srcProgress[] = $value[0];
$tgtProgress[] = $value[1];
}
}
echo hightlight($src, $srcProgress). '<br />';
echo hightlight($tgt, $tgtProgress). '<br />';
function hightlight($str, $progress) {
$srcArr = preg_split('//u', $str, null, PREG_SPLIT_NO_EMPTY);
$j = count($srcArr);
foreach ($progress as $i) {
if ($i < $j) {
for ($k=$i+1; $k < $j; $k++) {
$srcArr[$k] = '<span style="color:red;">'.$srcArr[$k].'</span>';
}
}
$j = $i;
}
return implode('', $srcArr);
}
/* vim: set expandtab ts=4 sw=4 sts=4 tw=100: */
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment