Last active
October 1, 2015 20:58
-
-
Save vincenzo/2057608 to your computer and use it in GitHub Desktop.
Soundex - Correct and Train
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<?php | |
/**
 * Suggest a spelling correction for a word using a soundex-indexed dictionary.
 *
 * The dictionary is expected to be a two-level array of the form
 * soundex code => (word => frequency), i.e. the structure this function
 * indexes with soundex($word) and then iterates as word => frequency.
 *
 * Words that are already present in their soundex bucket are returned
 * unchanged. Otherwise every word in the bucket at Levenshtein distance 1 or 2
 * from $word is scored as frequency / distance and the highest scoring
 * candidate is returned.
 *
 * @param string $word The word to check.
 * @param array  $dic  Dictionary: soundex code => array(word => frequency).
 *
 * @return string|null The word itself if it is known, the best suggestion if
 *                     one exists, or null when there is no candidate.
 */
function correct($word, $dic) {
  $bucket_key = soundex($word);

  // No bucket for this phonetic code: nothing to compare against.
  if (!isset($dic[$bucket_key]) || !is_array($dic[$bucket_key])) {
    return null;
  }
  $bucket = $dic[$bucket_key];

  // Words live in the second level of the dictionary, so the "already known"
  // check has to look inside the bucket rather than at the soundex keys.
  if (isset($bucket[$word])) {
    return $word;
  }

  // Build a separate score table instead of mutating the bucket by reference,
  // which avoids a dangling reference and unsetting elements mid-iteration.
  $scores = array();
  foreach ($bucket as $candidate => $frequency) {
    // PHP turns all-digit string keys into integers; levenshtein() wants strings.
    $candidate = (string) $candidate;
    $dist = levenshtein($candidate, $word);
    // Only distances 1 (best) and 2 are considered; the frequency is weighted
    // down by the distance. Every other candidate is discarded.
    if ($dist === 1 || $dist === 2) {
      $scores[$candidate] = $frequency / $dist;
    }
  }

  if (!$scores) {
    return null;
  }

  // Sort by weighted frequency, highest first; the first key is the suggestion.
  arsort($scores);
  reset($scores);
  return (string) key($scores);
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<?php | |
/**
 * Build a soundex-indexed word frequency table from a text file.
 *
 * The file is read in full, split into runs of \w characters, and every
 * token is lowercased and counted under its soundex code.
 *
 * @param string $file Path of the text file to read (defaults to 'big.txt').
 *
 * @return array Nested array: soundex code => array(lowercased word => count).
 */
function train($file = 'big.txt') {
  $text = file_get_contents($file);

  // Collect every run of word characters; the full matches land in $tokens[0].
  preg_match_all('/\w+/', $text, $tokens);
  // The raw text can be large, so release it before building the index.
  unset($text);

  $index = array();
  foreach ($tokens[0] as $token) {
    $lower = strtolower($token);
    $code = soundex($lower);
    // Start at 1 on first sight, otherwise bump the existing counter.
    $index[$code][$lower] = isset($index[$code][$lower])
      ? $index[$code][$lower] + 1
      : 1;
  }
  unset($tokens);

  return $index;
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment