Skip to content

Instantly share code, notes, and snippets.

@vincenzo
Last active October 1, 2015 20:58
Show Gist options
  • Save vincenzo/2057608 to your computer and use it in GitHub Desktop.
Save vincenzo/2057608 to your computer and use it in GitHub Desktop.
Soundex - Correct and Train
<?php
/**
 * Suggests a spelling correction for a word using a Soundex-indexed dictionary.
 *
 * Candidates are the dictionary words sharing the Soundex code of $word.
 * Only candidates at Levenshtein distance 1 or 2 from $word are considered;
 * each is scored as frequency / distance and the highest score wins. When
 * several candidates share the top score, the one stored first in the
 * dictionary bucket is returned.
 *
 * @param string $word The word to check.
 * @param array  $dic  Dictionary shaped as array(soundex code => array(word => frequency)).
 *
 * @return string|null $word itself when it is already in the dictionary, the
 *                     best-scoring suggestion otherwise, or null when there is
 *                     no acceptable candidate.
 */
function correct($word, $dic) {
    $code = soundex($word);

    // An unseen Soundex code simply means there are no candidates at all.
    $candidates = (isset($dic[$code]) && is_array($dic[$code])) ? $dic[$code] : array();

    // The dictionary is keyed by Soundex code, so a known word lives one
    // level down, inside the bucket of its own code.
    if (array_key_exists($word, $candidates)) {
        return $word;
    }

    $best = null;
    $bestScore = null;
    foreach ($candidates as $candidate => $frequency) {
        // Purely numeric words are stored as integer keys; compare as strings.
        $distance = levenshtein((string) $candidate, $word);

        // Consider only distance 1 (the best) or 2; discard everything else.
        if ($distance !== 1 && $distance !== 2) {
            continue;
        }

        // Frequent words win, but a larger edit distance is penalised.
        $score = $frequency / $distance;

        // Strict comparison keeps the earliest candidate on ties.
        if ($bestScore === null || $score > $bestScore) {
            $best = $candidate;
            $bestScore = $score;
        }
    }

    return $best === null ? null : (string) $best;
}
<?php
/**
 * Builds a Soundex-indexed word-frequency dictionary from a text file.
 *
 * Every run of word characters (the regex class \w) is lower-cased and
 * counted, then stored under the Soundex code of the word.
 *
 * @param string $file Path of the training text; defaults to 'big.txt'.
 *
 * @return array Dictionary shaped as array(soundex code => array(word => count)).
 *               Empty when the file cannot be read or contains no words.
 */
function train($file = 'big.txt') {
    $contents = file_get_contents($file);

    // file_get_contents() signals failure with false; do not run the regex
    // on it, just report an empty dictionary.
    if ($contents === false) {
        return array();
    }

    // Get all strings of word characters.
    preg_match_all('/\w+/', $contents, $matches);
    unset($contents); // the raw text can be large; release it early

    $words = isset($matches[0]) ? $matches[0] : array();
    unset($matches);

    // Count occurrences case-insensitively; first-occurrence order is kept.
    $counts = array_count_values(array_map('strtolower', $words));
    unset($words);

    $dictionary = array();
    foreach ($counts as $word => $count) {
        // Purely numeric words come back as integer keys, hence the cast.
        $dictionary[soundex((string) $word)][$word] = $count;
    }

    return $dictionary;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment