Skip to content

Instantly share code, notes, and snippets.

@tdebatty
Last active August 29, 2015 13:57
Show Gist options
  • Save tdebatty/9488748 to your computer and use it in GitHub Desktop.
Save tdebatty/9488748 to your computer and use it in GitHub Desktop.
A PHP implementation of the spamsum algorithm for computing context triggered piecewise hashes (CTPH), also called fuzzy hashes
<?php
/**
* A PHP implementation of spamsum algorithm
* https://junkcode.samba.org/ftp/unpacked/junkcode/spamsum/
*
* The spamsum algorithm is also used by ssdeep
* http://ssdeep.sourceforge.net/
*
* Usage:
* $ss = new SpamSum();
* $ss->HashString(file_get_contents("myfile.php"));
* echo $ss;
*
* OR
* echo SpamSum::Hash(file_get_contents("myfile.php"));
*/
class SpamSum
{
public static function Hash($string, $bsize = 0) {
$ss = new SpamSum();
$ss->HashString($string, $bsize);
return $ss;
}
const SPAMSUM_LENGTH = 64;
const MIN_BLOCKSIZE = 3;
const HASH_PRIME = 0x01000193;
const HASH_INIT = 0x28021967;
const B64 = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
public $blocksize;
protected $left;
protected $right;
public function HashString($string, $bsize = 0) {
$b64 = self::B64;
$length = strlen($string);
$in = unpack('C*', $string);
// Reindex (to start from 0)
foreach ($in as $k => $v) {
$in[$k - 1] = $v;
}
unset($in[count($in)]);
if ($bsize == 0) {
/* guess a reasonable block size */
$this->blocksize = self::MIN_BLOCKSIZE;
while ($this->blocksize * self::SPAMSUM_LENGTH < $length) {
$this->blocksize = $this->blocksize * 2;
}
} else {
$this->blocksize = $bsize;
}
again:
$this->left = array();
$this->right = array();
$k = $j = 0;
$h3 = $h2 = self::HASH_INIT;
$h = $this->rolling_hash_reset();
for ($i = 0; $i < $length; $i++) {
/* at each character we update the rolling hash and the normal
* hash. When the rolling hash hits the reset value then we emit
* the normal hash as a element of the signature and reset both
* hashes
*/
$h = $this->rolling_hash($in[$i]);
$h2 = self::sum_hash($in[$i], $h2);
$h3 = self::sum_hash($in[$i], $h3);
if ($h % $this->blocksize == ($this->blocksize - 1)) {
/* we have hit a reset point. We now emit a hash which is based
* on all chacaters in the piece of the string between the last
* reset point and this one
*/
$this->left[$j] = $b64[$h2 % 64];
if ($j < self::SPAMSUM_LENGTH - 1) {
/* we can have a problem with the tail overflowing. The easiest way
* to cope with this is to only reset the second hash if we have
* room for more characters in our signature. This has the effect of
* combining the last few pieces of the message into a single piece
*/
$h2 = self::HASH_INIT;
$j++;
}
}
/* this produces a second signature with a block size of block_size*2.
* By producing dual signatures in this way the effect of small changes
* in the string near a block size boundary is greatly reduced.
*/
if ($h % ($this->blocksize * 2) == (($this->blocksize * 2) - 1)) {
$this->right[$k] = $b64[$h3 % 64];
if ($k < self::SPAMSUM_LENGTH / 2 - 1) {
$h3 = self::HASH_INIT;
$k++;
}
}
}
/* If we have anything left then add it to the end. This ensures that the
* last part of the string is always considered
*/
if ($h != 0) {
$this->left[$j] = $b64[$h2 % 64];
$this->right[$k] = $b64[$h3 % 64];
}
/* Our blocksize guess may have been way off - repeat if necessary
*/
if ($bsize == 0
&& $this->blocksize > self::MIN_BLOCKSIZE
&& $j < self::SPAMSUM_LENGTH / 2) {
$this->blocksize = $this->blocksize / 2;
goto again;
}
return $this->__toString();
}
public function __toString() {
return
$this->blocksize . ":" .
implode("", $this->left) . ":" .
implode("", $this->right);
}
public function BlockSize() {
return $this->blocksize;
}
public function Left() {
return $this->left;
}
public function Right() {
return $this->right;
}
/* A simple non-rolling hash, based on the FNV hash
*/
protected static function sum_hash($c, $h) {
$h = ($h * self::HASH_PRIME) % pow(2, 32);
$h = ($h ^ $c) % pow(2, 32);
return $h;
}
/* A rolling hash, based on the Adler checksum. By using a rolling hash
* we can perform auto resynchronisation after inserts/deletes internally,
* h1 is the sum of the bytes in the window and h2 is the sum of the bytes
* times the index h3 is a shift/xor based rolling hash, and is mostly
* needed to ensure that we can cope with large blocksize values
*/
const ROLLING_WINDOW = 7;
protected $rolling_window = array();
protected $rolling_h1;
protected $rolling_h2;
protected $rolling_h3;
protected $rolling_n;
protected function rolling_hash($c) {
$this->rolling_h2 -= $this->rolling_h1;
$this->rolling_h2 += self::ROLLING_WINDOW * $c;
$this->rolling_h1 += $c;
$this->rolling_h1 -= $this->rolling_window[$this->rolling_n % self::ROLLING_WINDOW];
$this->rolling_window[$this->rolling_n % self::ROLLING_WINDOW] = $c;
$this->rolling_n++;
$this->rolling_h3 = ($this->rolling_h3 << 5) & 0xFFFFFFFF;
$this->rolling_h3 ^= $c;
return $this->rolling_h1 + $this->rolling_h2 + $this->rolling_h3;
}
protected function rolling_hash_reset() {
for ($i = 0; $i < self::ROLLING_WINDOW; $i++) {
$this->rolling_window[$i] = 0;
}
$this->rolling_h1 = 0;
$this->rolling_h2 = 0;
$this->rolling_h3 = 0;
$this->rolling_n = 0;
return 0;
}
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment