Skip to content

Instantly share code, notes, and snippets.

@EmilHernvall
Created May 3, 2011 17:40
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 3 You must be signed in to fork a gist
  • Save EmilHernvall/953802 to your computer and use it in GitHub Desktop.
Save EmilHernvall/953802 to your computer and use it in GitHub Desktop.
Surpringly smart spellchecker in PHP
<?php
/**
* spellcheck.php
*
* @version 0.1
* @author Emil Hernvall <aderyn@gmail.com>
* @license Public Domain
*/
/**
* A simple spellchecker based on metaphone and similar_text,
* that uses simple binary indexes instead of a database.
*/
class SpellChecker {
protected $_fh = '';
protected $_indexLength = 0;
protected $_indexSize = 0;
protected $_index = '';
protected $_indexCache = array();
protected $_hitCache = array();
/**
* Build a search index from a plain text dictionary
*
* @param string $dictionary
* @param string $outfile
*/
public static function buildIndex($dictionary, $outfile) {
// Read the dictionary into memory
$dict = file($dictionary);
// Group all the words based on their metaphone
// key.
$dict2 = array();
foreach ($dict as $word) {
$m = metaphone($word);
$dict2[$m][] = trim($word);
}
// Sort the dictionary based on the metaphone key so
// that we can perform a binary search later on.
ksort($dict2);
// Build a binary dictionary
$lbuf = ''; // Output buffer
$pos = array(); // Entry positions used for the search index later on
foreach ($dict2 as $m => $words) {
// The format is:
// 2 bytes string len + metaphone key + 0x01 + all the matching words separated by 0x01
$buf = $m;
$buf .= pack('c', 0x01);
$buf .= implode(pack('c', 0x01), $words);
// We prepend the length here. It's hard to know the length of
// a string before it's been built. ;)
$buf = pack('n', strlen($buf)) . $buf;
// Store the position in the index
$pos[] = strlen($lbuf);
// Append the string to the output buffer
$lbuf .= $buf;
}
// Initialize an index buffer.
$index = '';
// Since we prepend the index we need to increment all the index positions by the length
// of the index:
// 4byte index length + 4byte index size + 4byte * count($pos)
// The length is the nr of index positions and the size is the bytecount for the index in this case
$increment = 4 + 4 + 4 * count($pos);
// Build the index
foreach ($pos as $p) {
$index .= pack('L', $increment + $p);
}
// Prepend the length and size to the index
$index = pack('LL', count($pos), strlen($index)) . $index;
// Prepend the index to the word buffer
$lbuf = $index . $lbuf;
// Store it to disk
file_put_contents($outfile, $lbuf);
}
/**
* Crate a new SpellChecker, open the data file
*
* @param string $dictionary
*/
public function __construct($dictionary) {
$this->_fh = fopen($dictionary, 'r');
list(,$this->_indexLength, $this->_indexSize) = unpack('L2', fread($this->_fh, 8));
}
/**
* Decode a word segment
*
* @param string $in
* @return array
*/
protected function _decode($in) {
$arr = explode(pack('c', 0x01), $in);
return $arr;
}
/**
* Retrieve a word segment
*
* @param integer $index
* @return array
*/
protected function _getIndex($index) {
// Check for this position in the cache
// Miss
if (!array_key_exists($index, $this->_indexCache)) {
// Retrieve the position from the index
fseek($this->_fh, 8 + $index * 4);
list(,$pos) = unpack('L', fread($this->_fh, 4));
$this->_indexCache[$index] = $pos;
}
// Hit
else {
$pos = $this->_indexCache[$index];
}
// Check for the actual decoded segment
// Miss
if (!array_key_exists($pos, $this->_hitCache)) {
// Move to the correct positon
fseek($this->_fh, $pos);
// Read the segment length
list(,$len) = unpack("n", fread($this->_fh, 2));
// Read and decode the segment
$hit = $this->_decode(fread($this->_fh, $len));
$this->_hitCache[$pos] = $hit;
}
// Hit
else {
$hit = $this->_hitCache[$pos];
}
return $hit;
}
/**
* Perform a binary search for the metaphone key.
* This function has worst case O(log N) performance,
* where N is the number of metaphone keys.
*
* @param string $m
* @param int $l
* @param int $h
* @return array
*/
protected function _search($m, $l = 0, $h = false) {
// The upper bound must be larger than the lower
if ($h < $l) {
return false;
}
// Set the upper bound if it isn't
if ($h === false) {
$h = $this->_indexLength - 1;
}
// Calculate the midpoint and retrieve the corresponding value
$mid = (int)floor(($l + $h)/2);
$hit = $this->_getIndex($mid);
// Hit, return it
if (strcmp($m, $hit[0]) === 0) {
return $hit;
}
// Check the lower interval
elseif (strcmp($m, $hit[0]) < 0) {
return $this->_search($m, $l, $mid - 1);
}
// Check the upper interval
else {
return $this->_search($m, $mid + 1, $h);
}
}
/**
* Spell check a word and return a list of suggestions
*
* @param string $word
* @return array
*/
public function getSuggestions($word) {
// Calculate the metaphone key and perform a search
$m = metaphone($word);
$candidates = $this->_search($m);
// Return false if the search fails
if (!$candidates) {
return false;
}
// Calculate the score for each hit
$suggestions = array();
foreach ($candidates as $candidate) {
$score = similar_text($candidate, $word);
$suggestions[] = array('score' => $score, 'word' => $candidate);
}
// Sort them with the most relevant hits first
usort($suggestions, create_function('$a, $b', 'return $a["score"] > $b["score"] ? -1 : 1;'));
return $suggestions;
}
/**
* Spell check an entire sentence
*
* @param string $sentence
* @return array
*/
public function checkSentence($sentence) {
// Dismantle the sentence
$words = explode(" ", $sentence);
// Retrieve the suggestions
foreach ($words as &$word) {
$suggestions = $this->getSuggestions($word);
if (count($suggestions)) {
$word = $suggestions[0]['word'];
}
}
// Put it together again
$sentence = implode(' ', $words);
return $sentence;
}
}
SpellChecker::buildIndex('cracklib-small', 'dict');
define('STARTTIME', microtime(1));
$dict = new SpellChecker('dict');
echo $dict->checkSentence("wuld you plise concidder using another datta strukture")."\n";
echo 'end: ' . (microtime(1) - STARTTIME)."\n";
@amitadvisome
Copy link

csccxzcdsfcswamdsf asfds sads ad as a sdasdsa sa d

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment