Created
January 7, 2013 21:42
-
-
Save bdw/4478770 to your computer and use it in GitHub Desktop.
PHP on-disk dictionary. It stores a key-value hash as a binary tree (depth-first), with a 'right offset pointer'. A long time ago, I had methods that manipulated this on-disk, but unfortunately, I don't anymore.
I will figure out how, though, and rewrite this into a more honest language.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<?php | |
class Dictionary { | |
var $file; | |
var $index; | |
var $current; | |
function Dictionary($file) { | |
$this->file = $file; | |
$this->index = array(); | |
} | |
function store() { | |
$fd = fopen($this->file, 'w'); | |
$k = array_keys($this->index); | |
sort($k); | |
$tree = $this->write_node(0, sizeof($k) , $k); | |
fwrite($fd, $tree); | |
fclose($fd); | |
$this->current = true; | |
} | |
function write_node($l, $u, $k) { | |
if($l < $u) { | |
$m = ($l + $u) >> 1; | |
$left = $this->write_node($l, $m, $k); | |
$right = $this->write_node($m + 1, $u, $k); | |
$record = $this->write_record($k[$m], strlen($left)); | |
return $record . $left . $right; | |
} | |
return ''; | |
} | |
function write_record($word, $bytes) { | |
return sprintf("%02X %02X %08X %s %s\n", strlen($word), | |
strlen($this->index[$word]), $bytes, | |
$word, $this->index[$word]); | |
} | |
function read_record($fd) { | |
$head = fread($fd, 15); | |
if(strlen($head) != 15) return null; | |
$vars = sscanf($head, '%02X %02X %08X '); | |
$data = fread($fd, $vars[0] + $vars[1] + 2); | |
return array(substr($data, 0, $vars[0]), | |
substr($data, $vars[0]+1, -1), $vars[2]); | |
} | |
function load() { | |
$fd = fopen($this->file, 'r'); | |
$this->index = array(); | |
while(!feof($fd)) { | |
$result = $this->read_record($fd); | |
if(empty($result)) break; | |
list($word, $data, $next) = $result; | |
$this->index[$word] = $data; | |
} | |
fclose($fd); | |
$this->current = true; | |
} | |
function unload() { | |
$this->index = array(); | |
$this->current = false; | |
} | |
function find($word) { | |
if($this->current) | |
return $this->index[$word]; | |
$fd = fopen($this->file,'r'); | |
$data = null; | |
while(!feof($fd)) { | |
$record = $this->read_record($fd); | |
if(empty($record)) break; | |
$c = strcmp($record[0], $word); | |
if($c == 0) { $data = $record[1]; break; } | |
else if($c > 0) continue; | |
else if($record[2] == 0) break; | |
else if($c < 0) fseek($fd, $record[2], SEEK_CUR); | |
} | |
fclose($fd); | |
return $data; | |
} | |
function add($word, $data) { | |
$this->index[$word] = $data; | |
} | |
function find_many($words) { | |
$fd = fopen($this->file, 'r'); | |
$data = $this->seek_from($fd, 0, $words); | |
fclose($fd); | |
return $data; | |
} | |
/* There is a bug in this function and | |
that is why I compensate by searching | |
with regular search afterwards. */ | |
function seek_from($fd, $pos, $words) { | |
if($pos != 0) | |
fseek($fd, $pos, SEEK_SET); | |
$data = array(); | |
if(empty($words)) return $data; | |
$rec = $this->read_record($fd); | |
$pos = ftell($fd); | |
list($left, $right, $found) = $this->divide_words($rec[0], $words); | |
if($found) | |
$data[$found] = $rec[1]; | |
if(!empty($left)) | |
$data = array_merge($data, $this->seek_from($fd, 0, $left)); | |
if(!empty($right) && $rec[2] != 0) | |
$data = array_merge($data, $this->seek_from($fd, $pos + $rec[2], $right)); | |
return $data; | |
} | |
function divide_words($c, $words) { | |
$a = array(); | |
$b = array(); | |
foreach($words as $i => $w) { | |
$t = strcmp($c, $w); | |
if($t == 0) $f = $w; | |
else if($t < 0) $b[] = $w; | |
else $a[] = $w; | |
} | |
return array($a, $b, $f); | |
} | |
} | |
?> |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment