Last active
August 29, 2015 13:57
-
-
Save tdebatty/9488748 to your computer and use it in GitHub Desktop.
A PHP implementation of the spamsum algorithm for computing context triggered piecewise hashes (CTPH), also called fuzzy hashes
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<?php | |
/** | |
* A PHP implementation of spamsum algorithm | |
* https://junkcode.samba.org/ftp/unpacked/junkcode/spamsum/ | |
* | |
* The spamsum algorithm is also used by ssdeep | |
* http://ssdeep.sourceforge.net/ | |
* | |
* Usage: | |
* $ss = new SpamSum(); | |
* $ss->HashString(file_get_contents("myfile.php")); | |
* echo $ss; | |
* | |
* OR | |
* echo SpamSum::Hash(file_get_contents("myfile.php")); | |
*/ | |
class SpamSum | |
{ | |
public static function Hash($string, $bsize = 0) { | |
$ss = new SpamSum(); | |
$ss->HashString($string, $bsize); | |
return $ss; | |
} | |
const SPAMSUM_LENGTH = 64; | |
const MIN_BLOCKSIZE = 3; | |
const HASH_PRIME = 0x01000193; | |
const HASH_INIT = 0x28021967; | |
const B64 = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"; | |
public $blocksize; | |
protected $left; | |
protected $right; | |
public function HashString($string, $bsize = 0) { | |
$b64 = self::B64; | |
$length = strlen($string); | |
$in = unpack('C*', $string); | |
// Reindex (to start from 0) | |
foreach ($in as $k => $v) { | |
$in[$k - 1] = $v; | |
} | |
unset($in[count($in)]); | |
if ($bsize == 0) { | |
/* guess a reasonable block size */ | |
$this->blocksize = self::MIN_BLOCKSIZE; | |
while ($this->blocksize * self::SPAMSUM_LENGTH < $length) { | |
$this->blocksize = $this->blocksize * 2; | |
} | |
} else { | |
$this->blocksize = $bsize; | |
} | |
again: | |
$this->left = array(); | |
$this->right = array(); | |
$k = $j = 0; | |
$h3 = $h2 = self::HASH_INIT; | |
$h = $this->rolling_hash_reset(); | |
for ($i = 0; $i < $length; $i++) { | |
/* at each character we update the rolling hash and the normal | |
* hash. When the rolling hash hits the reset value then we emit | |
* the normal hash as a element of the signature and reset both | |
* hashes | |
*/ | |
$h = $this->rolling_hash($in[$i]); | |
$h2 = self::sum_hash($in[$i], $h2); | |
$h3 = self::sum_hash($in[$i], $h3); | |
if ($h % $this->blocksize == ($this->blocksize - 1)) { | |
/* we have hit a reset point. We now emit a hash which is based | |
* on all chacaters in the piece of the string between the last | |
* reset point and this one | |
*/ | |
$this->left[$j] = $b64[$h2 % 64]; | |
if ($j < self::SPAMSUM_LENGTH - 1) { | |
/* we can have a problem with the tail overflowing. The easiest way | |
* to cope with this is to only reset the second hash if we have | |
* room for more characters in our signature. This has the effect of | |
* combining the last few pieces of the message into a single piece | |
*/ | |
$h2 = self::HASH_INIT; | |
$j++; | |
} | |
} | |
/* this produces a second signature with a block size of block_size*2. | |
* By producing dual signatures in this way the effect of small changes | |
* in the string near a block size boundary is greatly reduced. | |
*/ | |
if ($h % ($this->blocksize * 2) == (($this->blocksize * 2) - 1)) { | |
$this->right[$k] = $b64[$h3 % 64]; | |
if ($k < self::SPAMSUM_LENGTH / 2 - 1) { | |
$h3 = self::HASH_INIT; | |
$k++; | |
} | |
} | |
} | |
/* If we have anything left then add it to the end. This ensures that the | |
* last part of the string is always considered | |
*/ | |
if ($h != 0) { | |
$this->left[$j] = $b64[$h2 % 64]; | |
$this->right[$k] = $b64[$h3 % 64]; | |
} | |
/* Our blocksize guess may have been way off - repeat if necessary | |
*/ | |
if ($bsize == 0 | |
&& $this->blocksize > self::MIN_BLOCKSIZE | |
&& $j < self::SPAMSUM_LENGTH / 2) { | |
$this->blocksize = $this->blocksize / 2; | |
goto again; | |
} | |
return $this->__toString(); | |
} | |
public function __toString() { | |
return | |
$this->blocksize . ":" . | |
implode("", $this->left) . ":" . | |
implode("", $this->right); | |
} | |
public function BlockSize() { | |
return $this->blocksize; | |
} | |
public function Left() { | |
return $this->left; | |
} | |
public function Right() { | |
return $this->right; | |
} | |
/* A simple non-rolling hash, based on the FNV hash | |
*/ | |
protected static function sum_hash($c, $h) { | |
$h = ($h * self::HASH_PRIME) % pow(2, 32); | |
$h = ($h ^ $c) % pow(2, 32); | |
return $h; | |
} | |
/* A rolling hash, based on the Adler checksum. By using a rolling hash | |
* we can perform auto resynchronisation after inserts/deletes internally, | |
* h1 is the sum of the bytes in the window and h2 is the sum of the bytes | |
* times the index h3 is a shift/xor based rolling hash, and is mostly | |
* needed to ensure that we can cope with large blocksize values | |
*/ | |
const ROLLING_WINDOW = 7; | |
protected $rolling_window = array(); | |
protected $rolling_h1; | |
protected $rolling_h2; | |
protected $rolling_h3; | |
protected $rolling_n; | |
protected function rolling_hash($c) { | |
$this->rolling_h2 -= $this->rolling_h1; | |
$this->rolling_h2 += self::ROLLING_WINDOW * $c; | |
$this->rolling_h1 += $c; | |
$this->rolling_h1 -= $this->rolling_window[$this->rolling_n % self::ROLLING_WINDOW]; | |
$this->rolling_window[$this->rolling_n % self::ROLLING_WINDOW] = $c; | |
$this->rolling_n++; | |
$this->rolling_h3 = ($this->rolling_h3 << 5) & 0xFFFFFFFF; | |
$this->rolling_h3 ^= $c; | |
return $this->rolling_h1 + $this->rolling_h2 + $this->rolling_h3; | |
} | |
protected function rolling_hash_reset() { | |
for ($i = 0; $i < self::ROLLING_WINDOW; $i++) { | |
$this->rolling_window[$i] = 0; | |
} | |
$this->rolling_h1 = 0; | |
$this->rolling_h2 = 0; | |
$this->rolling_h3 = 0; | |
$this->rolling_n = 0; | |
return 0; | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment