Created
November 3, 2012 23:39
-
-
Save PerpetualBeta/4009338 to your computer and use it in GitHub Desktop.
Adds a "fuzzy" search suggestion to the i18n_search plug-in for the GetSimple CMS
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<?php
/**
 * Fuzzy Search Suggestion Snippet for i18n_search plug-in for the GetSimple CMS
 *
 * Sometimes a search query produces no matches. This is occasionally due to a
 * misspelling in the query terms. This routine will attempt to offer a search
 * suggestion to the user when a search results in no matches.
 *
 * For example:
 *
 * Sorry, I didn't find anything that matched your query: "typografy behavier"
 * Did you mean: typography behaviour
 *
 * The script will also link the suggested search terms back to the search
 * engine. In most cases the suggestion will resolve to a positive query.
 * However, it's not (yet) fool-proof and will occasionally link a query that
 * itself returns no results.
 *
 * I have only tested this with English words. As the script uses the metaphone
 * algorithm it's likely that it won't work too well for non-English languages.
 *
 * Still - it's better than nothing! :-)
 *
 * The script is running on my own website at http://darkblue.sdf.org/ - so you
 * can try it out there.
 *
 * @author Jonathan M. Hollin <darkblue@sdf.lonestar.org>
 * @copyright Copyleft: http://www.gnu.org/copyleft/copyleft.html
 */
/**
 * Truncates a value at its first colon, in place.
 *
 * Intended as an array_walk_recursive() callback: every element of the walked
 * array is replaced by the text that precedes its first ':' (or left as its
 * string form when it contains no colon).
 *
 * The key is taken by value. array_walk_recursive() supplies the key as a
 * plain value, so declaring it by reference makes PHP 7+ raise a
 * "must be passed by reference, value given" warning on every call.
 *
 * @param mixed $item Element to truncate; modified by reference. Non-string
 *                    scalars are converted to their string form.
 * @param mixed $key  Key of the element as supplied by the array walker; unused.
 * @return void
 */
function stripColons(&$item, $key = null) {
    $temp = explode(':', (string) $item);
    $item = $temp[0];
}
// Search Suggestions
//
// Builds a "Did you mean: ..." link for a query that produced no matches.
// Reads $words (the raw query string), which must already be defined by the
// code that includes this snippet, and echoes one HTML paragraph when at least
// one query word can be replaced by a similar-sounding word from the index.
$candidates = array();
// Drop the empty tokens that leading, trailing or doubled spaces would create:
// an empty token has an empty metaphone key and would match unrelated entries.
$individualWords = array_values(array_filter(explode(' ', (string) $words), 'strlen'));
// NOTE(review): $suggestions and $corpus are never read in this snippet; they
// are still initialised in case the including template relies on them.
$suggestions = array();
$corpus = array();

// Metaphone key of every query word, computed once instead of once per index line.
$phonetic = array();
foreach ($individualWords as $checkWord) {
    $phonetic[$checkWord] = metaphone($checkWord);
}

// Query words that have not yet been found verbatim in the index. Working on
// a copy keeps $individualWords intact for the rendering step further down.
$pendingWords = $individualWords;

// Process the word corpus
$indexPath = GSDATAOTHERPATH . I18N_WORD_INDEX;
if ( !file_exists($indexPath) ) create_i18n_search_index();
$f = fopen($indexPath, 'r');
if ($f !== false) {
    while ( ($line = fgets($f)) !== false ) {
        // Each line is split on spaces and its first field is treated as the
        // indexed word. The remaining fields are truncated at their first
        // colon below - presumably "page:score" pairs; verify against the
        // index writer. The line terminator is removed so it cannot stick to
        // the last field.
        $items = explode(' ', rtrim($line, "\r\n"));
        if ($items[0] === '') continue;
        $indexWord  = $items[0];
        $indexSound = null; // metaphone key of $indexWord, computed on demand
        $stripped   = null; // colon-stripped copy of $items, computed on demand

        foreach ($pendingWords as $position => $checkWord) {
            $isHit = ($indexWord === $checkWord);
            if (!$isHit) {
                // Looking for possible replacements: only words whose
                // metaphone keys are identical (distance below 1) qualify.
                if ($indexSound === null) $indexSound = metaphone($indexWord);
                if (levenshtein($phonetic[$checkWord], $indexSound) >= 1) continue;
            }

            if ($stripped === null) {
                $stripped = $items;
                array_walk_recursive($stripped, 'stripColons');
            }

            // Every candidate starts from a fresh copy, so the bookkeeping
            // keys of one query word never leak into the entry of another
            // query word that matches the same index line.
            $entry = $stripped;
            $entry['pattern'] = $checkWord;
            if ($isHit) {
                // Exact match! Stop looking for replacements for this word.
                $entry['hit'] = 1;
                unset($pendingWords[$position]);
            } else {
                $entry['replace'] = $checkWord . '|' . $stripped[0];
                similar_text($stripped[0], $checkWord, $percent);
                $entry['confidence'] = $percent;
            }
            $candidates[] = $entry;
        }
    }
    fclose($f);
}

// Clean up $candidates: a query word that exists verbatim in the index needs
// no replacement, so every other candidate recorded for it is discarded.
foreach ($candidates as $k => $v) {
    // Skip entries already removed by an earlier exact hit; otherwise two
    // hits for a repeated query word would delete each other.
    if ( !isset($candidates[$k]) || !array_key_exists('hit', $v) ) continue;
    $hitWord = $v[0];
    foreach ($candidates as $y => $z) {
        if ( ($y !== $k) && ($z['pattern'] === $hitWord) ) unset($candidates[$y]);
    }
}
foreach ($candidates as $k => $v) {
    unset($candidates[$k]['pattern']);
    unset($candidates[$k]['hit']);
}
$candidates = array_values($candidates);

// Find match combinations that actually resolve to a page
$result = array();
$candidateCount = count($candidates);
if ( ($candidateCount >= 2) && (count($individualWords) !== 1) ) {
    // Compare every unordered pair of candidates exactly once.
    for ($i = 0; $i < $candidateCount; $i ++) {
        for ($j = $i + 1; $j < $candidateCount; $j ++) {
            // Keep both candidates when they share at least one value.
            // NOTE(review): array_intersect() compares every value of the
            // entries, including the word itself and the 'replace' and
            // 'confidence' fields, not only the page fields.
            if (count(array_intersect($candidates[$i], $candidates[$j])) !== 0) {
                if (!in_array($candidates[$i], $result, true)) $result[] = $candidates[$i];
                if (!in_array($candidates[$j], $result, true)) $result[] = $candidates[$j];
            }
        }
    }
} elseif (count($individualWords) === 1) {
    // A single-word query has nothing to be combined with.
    $result = $candidates;
}

// Compute the best suggestion: for each misspelt query word keep the
// replacement with the highest similar_text() percentage.
$sentence = $replacements = array();
foreach ($result as $v) {
    if (!isset($v['replace'])) continue; // exact hits carry no replacement
    $replace = explode('|', $v['replace']);
    if (count($replace) !== 2) continue; // a '|' inside a word makes the pair ambiguous
    $from = $replace[0];
    $with = $replace[1];
    // Compare against this candidate's own confidence, not a running maximum
    // over all words, so a weaker candidate cannot displace a better one.
    if ( !isset($replacements[$from]) || ($v['confidence'] > $replacements[$from]['confidence']) ) {
        $replacements[$from] = array('confidence' => $v['confidence'], 'with' => $with);
    }
}

// Render the suggestion. Nothing is printed unless at least one word was
// replaced, because repeating the failed query verbatim would not help.
if (count($replacements)) {
    $queryTerms = array();
    foreach ($individualWords as $word) {
        if (isset($replacements[$word])) {
            $suggested    = $replacements[$word]['with'];
            $sentence[]   = '<span style="font-style: italic;">' . htmlspecialchars($suggested, ENT_QUOTES, 'UTF-8') . '</span>';
            $queryTerms[] = urlencode($suggested);
        } else {
            // Words come straight from the user's query: escape them for HTML
            // and URL-encode them for the link.
            $sentence[]   = htmlspecialchars($word, ENT_QUOTES, 'UTF-8');
            $queryTerms[] = urlencode($word);
        }
    }
    echo '<p class="search-no-results">Did you mean: <a href="/search?words=' . implode('+', $queryTerms) . '">' . implode(' ', $sentence) . '</a></p>';
    unset($queryTerms);
}

// Release the working data so it does not linger in the including scope.
unset($candidates, $result, $individualWords, $pendingWords, $phonetic, $replacements);
?>
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
See: http://get-simple.info/forums/showthread.php?tid=1256&pid=29712#pid29712 for more info.