The Nilsimsa hashing algorithm, adjusted to support variable length output. Original: https://github.com/opctim/php-nilsimsa/blob/master/src/Nilsimsa.php
<?php | |
/** | |
* PHP Library to calculate and compare Nilsimsa digests. | |
* | |
* The Nilsimsa hash is a locality senstive hash function. Generally similar | |
* documents will have similar Nilsimsa digests. The Hamming distance between | |
* the digests can be used to approximate the similarity between documents. For | |
* further information consult http://en.wikipedia.org/wiki/Nilsimsa_Hash and | |
* the references (particularly Damiani et al.) | |
* | |
* Implementation details: | |
* The Nilsimsa class takes in a data parameter which is the string of the | |
* document to digest Calling the methods hexdigest() and digest() give the | |
* nilsimsa digests in hex or array format. The helper function compare_digests | |
* takes in two digests and computes the Nilsimsa score. You can also use | |
* compare_files() and compare_strings() to compare files and strings directly. | |
* | |
* This code is a port of py-nilsimsa located at | |
* https://code.google.com/p/py-nilsimsa/ | |
*/ | |
/** | |
* The MIT License (MIT) | |
* | |
* Copyright (c) 2015 Bill Eager | |
* | |
* Permission is hereby granted, free of charge, to any person obtaining a copy | |
* of this software and associated documentation files (the 'Software'), to deal | |
* in the Software without restriction, including without limitation the rights | |
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell | |
* copies of the Software, and to permit persons to whom the Software is | |
* furnished to do so, subject to the following conditions: | |
* | |
* The above copyright notice and this permission notice shall be included in | |
* all copies or substantial portions of the Software. | |
* | |
* THE SOFTWARE IS PROVIDED 'AS IS', WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | |
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | |
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE | |
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER | |
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, | |
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN | |
* THE SOFTWARE. | |
*/ | |
class Nilsimsa | |
{ | |
/** | |
* Tran53 hash constant | |
* @var array | |
*/ | |
const TRAN = [ | |
0x02,0xd6,0x9e,0x6f,0xf9,0x1d,0x04,0xab,0xd0,0x22,0x16,0x1f,0xd8,0x73,0xa1,0xac, | |
0x3b,0x70,0x62,0x96,0x1e,0x6e,0x8f,0x39,0x9d,0x05,0x14,0x4a,0xa6,0xbe,0xae,0x0e, | |
0xcf,0xb9,0x9c,0x9a,0xc7,0x68,0x13,0xe1,0x2d,0xa4,0xeb,0x51,0x8d,0x64,0x6b,0x50, | |
0x23,0x80,0x03,0x41,0xec,0xbb,0x71,0xcc,0x7a,0x86,0x7f,0x98,0xf2,0x36,0x5e,0xee, | |
0x8e,0xce,0x4f,0xb8,0x32,0xb6,0x5f,0x59,0xdc,0x1b,0x31,0x4c,0x7b,0xf0,0x63,0x01, | |
0x6c,0xba,0x07,0xe8,0x12,0x77,0x49,0x3c,0xda,0x46,0xfe,0x2f,0x79,0x1c,0x9b,0x30, | |
0xe3,0x00,0x06,0x7e,0x2e,0x0f,0x38,0x33,0x21,0xad,0xa5,0x54,0xca,0xa7,0x29,0xfc, | |
0x5a,0x47,0x69,0x7d,0xc5,0x95,0xb5,0xf4,0x0b,0x90,0xa3,0x81,0x6d,0x25,0x55,0x35, | |
0xf5,0x75,0x74,0x0a,0x26,0xbf,0x19,0x5c,0x1a,0xc6,0xff,0x99,0x5d,0x84,0xaa,0x66, | |
0x3e,0xaf,0x78,0xb3,0x20,0x43,0xc1,0xed,0x24,0xea,0xe6,0x3f,0x18,0xf3,0xa0,0x42, | |
0x57,0x08,0x53,0x60,0xc3,0xc0,0x83,0x40,0x82,0xd7,0x09,0xbd,0x44,0x2a,0x67,0xa8, | |
0x93,0xe0,0xc2,0x56,0x9f,0xd9,0xdd,0x85,0x15,0xb4,0x8a,0x27,0x28,0x92,0x76,0xde, | |
0xef,0xf8,0xb2,0xb7,0xc9,0x3d,0x45,0x94,0x4b,0x11,0x0d,0x65,0xd5,0x34,0x8b,0x91, | |
0x0c,0xfa,0x87,0xe9,0x7c,0x5b,0xb1,0x4d,0xe5,0xd4,0xcb,0x10,0xa2,0x17,0x89,0xbc, | |
0xdb,0xb0,0xe2,0x97,0x88,0x52,0xf7,0x48,0xd3,0x61,0x2c,0x3a,0x2b,0xd1,0x8c,0xfb, | |
0xf1,0xcd,0xe4,0x6a,0xe7,0xa9,0xfd,0xc4,0x37,0xc8,0xd2,0xf6,0xdf,0x58,0x72,0x4e, | |
]; | |
/** | |
* Stores whether the digest is complete | |
* | |
* @var boolean | |
*/ | |
private $digestComputed; | |
/** | |
* Stores the number of characters in the string digested | |
* | |
* @var int | |
*/ | |
private $numChar; | |
/** | |
* Stores the accumulator as a 256-bit vector | |
* | |
* @var array | |
*/ | |
private $acc; | |
/** | |
* Stores the active window used in {process} for hashing | |
* | |
* @var array | |
*/ | |
private $window; | |
/** | |
* @var mixed | |
*/ | |
private $digest; | |
/** | |
* The target length of the hash. | |
* @var int | |
*/ | |
private $length; | |
/** | |
* Constructor | |
* | |
* @param string $data The data to process | |
*/ | |
public function __construct($data = null, $length = null) | |
{ | |
if($length !== null) $this->length = $length; | |
$this->digestComputed = false; | |
$this->numChar = 0; | |
$this->acc = array_fill(0, $this->length, 0); | |
$this->window = []; | |
if ($data) { | |
$this->process($data); | |
} | |
} | |
/** | |
* Computes the hash of all of the trigrams in the chunk using a window of | |
* length 5 | |
* | |
* @param string $chunk The chunk to process | |
*/ | |
public function process($chunk) | |
{ | |
foreach (str_split($chunk) as $char) { | |
$this->numChar++; | |
$c = ord($char); | |
$windowLength = count($this->window); | |
if ($windowLength > 1) { | |
// seen at least three characters | |
$this->acc[$this->tranHash( | |
$c, $this->window[0], $this->window[1], 0 | |
)] | |
+= 1; | |
} | |
if ($windowLength > 2) { | |
// seen at least four characters | |
$this->acc[$this->tranHash( | |
$c, $this->window[0], $this->window[2], 1 | |
)] | |
+= 1; | |
$this->acc[$this->tranHash( | |
$c, $this->window[1], $this->window[2], 2 | |
)] | |
+= 1; | |
} | |
if ($windowLength > 3) { | |
// have a full window | |
$this->acc[$this->tranHash( | |
$c, $this->window[0], $this->window[3], 3 | |
)] | |
+= 1; | |
$this->acc[$this->tranHash( | |
$c, $this->window[1], $this->window[3], 4 | |
)] | |
+= 1; | |
$this->acc[$this->tranHash( | |
$c, $this->window[2], $this->window[3], 5 | |
)] | |
+= 1; | |
// duplicate hashes, used to maintain 8 trigrams per character | |
$this->acc[$this->tranHash( | |
$this->window[3], $this->window[0], $c, 6 | |
)] | |
+= 1; | |
$this->acc[$this->tranHash( | |
$this->window[3], $this->window[2], $c, 7 | |
)] | |
+= 1; | |
} | |
// add current character to the window, remove the previous character | |
array_unshift($this->window, $c); | |
if ($windowLength >= 4) { | |
$this->window = array_slice($this->window, 0, 4); | |
} | |
} | |
} | |
/** | |
* Implementation of the Tran53 hash algorithm | |
* | |
* @param int $a Input A | |
* @param int $b Input B | |
* @param int $c Input C | |
* @param int $n Input N | |
* | |
* @return int | |
*/ | |
public function tranHash($a, $b, $c, $n) | |
{ | |
return (( | |
(self::TRAN[($a + $n) & 255] ^ self::TRAN[$b] * ($n + $n + 1)) + | |
self::TRAN[($c) ^ self::TRAN[$n]] | |
) & ($this->length-1)); // Was 255 | |
} | |
/** | |
* Returns the digest as a hex string. Computes it if it isn't computed | |
* already. | |
* | |
* @return string The digest | |
*/ | |
public function hexDigest() | |
{ | |
if ( ! $this->digestComputed) { | |
$this->computeDigest(); | |
} | |
$output = null; | |
foreach ($this->digest as $i) { | |
$output .= sprintf('%02x', $i); | |
} | |
return $output; | |
} | |
/** | |
* Returns the digest as an array. Computes it if it isn't computed already. | |
* | |
* @return array The digest | |
*/ | |
public function digest() | |
{ | |
if ( ! $this->digestComputed) { | |
$this->computeDigest(); | |
} | |
return $this->digest; | |
} | |
/** | |
* Using a threshold (mean of the accumulator), computes the nilsimsa | |
* digest after completion. Sets complete flag to true and stores result in | |
* $this->digest | |
*/ | |
public function computeDigest() | |
{ | |
$numTrigrams = 0; | |
if ($this->numChar == 3) { | |
// 3 chars -> 1 trigram | |
$numTrigrams = 1; | |
} | |
else if ($this->numChar == 4) { | |
// 4 chars -> 4 trigrams | |
$numTrigrams = 4; | |
} | |
else if ($this->numChar > 4) { | |
// > 4 chars -> 8 for each CHAR | |
$numTrigrams = 8 * $this->numChar - 28; | |
} | |
// threshhold is the mean of the acc buckets | |
$threshold = $numTrigrams / $this->length; | |
$digest = array_fill(0, $this->length/8, 0); | |
for ($i = 0; $i < ($this->length-2); $i++) { | |
if ($this->acc[$i] > $threshold) { | |
// equivalent to i/8, 2**(i mod 7) | |
$digest[$i >> 3] += 1 << ($i & 7); | |
} | |
} | |
// set flag to true | |
$this->digestComputed = true; | |
// store result in digest, reversed | |
$this->digest = array_reverse($digest); | |
} | |
static function hash($data, $length = 256) { | |
$hasher = new self($data, $length); | |
return $hasher->hexDigest(); | |
} | |
} | |
function lines_count($handle) { | |
fseek($handle, 0); | |
$count = 0; | |
while(fgets($handle) !== false) $count++; | |
return $count; | |
} | |
$mode = $argv[1] ?? "help"; | |
switch($mode) { | |
case "typos": | |
$handle = fopen("typos.csv", "r"); | |
$line_count = lines_count($handle); | |
echo("$line_count lines total\n"); | |
$sizes = [ 256, 128, 64, 32, 16, 8 ]; | |
foreach($sizes as $size) { | |
fseek($handle, 0);fgets($handle); // Skipt he first line since it's the header | |
$count = 0; $count_same = 0; $skipped = 0; | |
$same = []; $not_same = []; | |
while(($line = fgets($handle)) !== false) { | |
$parts = explode(",", trim($line), 2); | |
if(strlen($parts[1]) < 3) { | |
$skipped++; | |
continue; | |
} | |
$hash_a = Nilsimsa::hash($parts[0], $size); | |
$hash_b = Nilsimsa::hash($parts[1], $size); | |
$count++; | |
if($hash_a == $hash_b) { | |
$count_same++; | |
$same[] = $parts; | |
} | |
else $not_same[] = $parts; | |
echo("$count_same / $count ($skipped skipped)\r"); | |
} | |
file_put_contents("$size-same.csv", implode("\n", array_map(function ($el) { | |
return implode(",", $el); | |
}, $same))); | |
file_put_contents("$size-not-same.csv", implode("\n", array_map(function ($el) { | |
return implode(",", $el); | |
}, $not_same))); | |
echo(str_pad($size, 10)."→ $count_same / $count (".round(($count_same/$count)*100, 2)."%), $skipped skipped\n"); | |
} | |
break; | |
case "helloworld": | |
foreach([ 256, 128, 64, 32, 16, 8 ] as $size) { | |
echo(str_pad($size, 10).Nilsimsa::hash("pineapple", $size)); | |
echo(str_pad($size, 10).Nilsimsa::hash("pinnapple", $size)); | |
echo("\n"); | |
} | |
break; | |
case "help": | |
default: | |
echo("Mode $mode not recognised. Available modes:\n"); | |
echo(" helloworld Show different hash sizes\n"); | |
echo(" typos Compare typos in typos.csv and calculate statistics\n"); | |
break; | |
} | |
/* | |
* TODO: Explore BK-Trees. SymSpell might be orders of magnitudes faster, but that's compared to a regular BK-Tree - and it's *much* more complicated. | |
* If we instead use Nilsimsa + the hamming distance comparison funnction (which we removed & will need to reinstate), is it faster than doing lots of `levenshtein()` calls? | |
* Experimentation is needed. | |
* See also gmp_hamdist() (which requires the gmp PHP extension). | |
*/ |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment