Skip to content

Instantly share code, notes, and snippets.

@fruit
Created May 11, 2011 21:16
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 2 You must be signed in to fork a gist
  • Save fruit/967374 to your computer and use it in GitHub Desktop.
Save fruit/967374 to your computer and use it in GitHub Desktop.
String comparing algorithm
<?php
/**
 * String similarity based on shared adjacent-character pairs.
 *
 * Each input is split into words on the space character, every word is
 * broken into its overlapping two-character pairs, and the score is
 *
 *     2 * (number of pairs common to both inputs) / (total number of pairs)
 *
 * Pairs are counted as a multiset, so a pair that occurs twice in one input
 * can match at most two occurrences in the other. Matching is case-sensitive
 * and pairs are extracted as UTF-8 characters rather than bytes.
 *
 * @link http://www.catalysoft.com/articles/StrikeAMatch.html
 * @author Ilya Sabelnikov <fruit.dev@gmail.com>
 */
class SimilarityTool
{
    /**
     * Compares two strings based on letter pair matches.
     *
     * Two empty strings, or two byte-identical strings, score 1.0. If neither
     * string yields any pair (e.g. both consist only of one-character words)
     * the score is 0.0.
     *
     * @param string $str1
     * @param string $str2
     * @return float similarity in the range 0.0 (nothing shared) to 1.0 (all pairs shared)
     */
    public static function compareStrings($str1, $str2)
    {
        if (0 === strlen($str1) + strlen($str2) || 0 === strcmp($str1, $str2)) {
            return 1.0;
        }

        $pairs1 = self::wordLetterPairs($str1);
        $pairs2 = self::wordLetterPairs($str2);

        $union = count($pairs1) + count($pairs2);
        if (0 === $union) {
            return 0.0;
        }

        // Count occurrences of every pair so the multiset intersection can be
        // computed in a single pass. Array keys are matched exactly, so
        // numeric-looking pairs such as "01" and "+1" are never treated as
        // equal (a loose == comparison would consider them the same number).
        $counts1 = array_count_values($pairs1);
        $counts2 = array_count_values($pairs2);

        $intersection = 0;
        foreach ($counts1 as $pair => $count) {
            if (isset($counts2[$pair])) {
                $intersection += min($count, $counts2[$pair]);
            }
        }

        // Multiply by a float so the result is always a float; int / int
        // yields an int whenever the division is exact (e.g. 0 or 1).
        return (2.0 * $intersection) / $union;
    }

    /**
     * Gets all letter pairs for each individual word in the string.
     *
     * Words are separated by the space character only. Empty and
     * one-character words contribute no pairs.
     *
     * @param string $str
     * @return array list of two-character strings, in order of appearance
     */
    private static function wordLetterPairs($str)
    {
        $allPairs = array();

        foreach (explode(' ', $str) as $word) {
            // A word of n characters has n - 1 overlapping pairs; for empty
            // or single-character words this is <= 0 and the loop is skipped.
            $numPairs = mb_strlen($word, 'UTF-8') - 1;
            for ($i = 0; $i < $numPairs; $i++) {
                $allPairs[] = mb_substr($word, $i, 2, 'UTF-8');
            }
        }

        return $allPairs;
    }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment