Created
August 29, 2012 00:38
-
-
Save cowtowncoder/3505664 to your computer and use it in GitHub Desktop.
Incremental Murmur3/32bit, my first take (has unit tests, seems to work; caveat emptor)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
public final class IncrementalMurmur3Hasher | |
{ | |
protected final static int c1 = 0xcc9e2d51; | |
protected final static int c2 = 0x1b873593; | |
protected final static int c3 = 0xe6546b64; | |
private final int _seed; | |
/** | |
* Number of bytes for which checksum has been calculated | |
*/ | |
private long _totalBytes; | |
private int _partialBytes; | |
private int _partialByteCount; | |
/** | |
* Currently calculated partial hash value. Note that it will remain | |
* valid until {@link #reset} is called, so it is possible to calculate | |
* partial checksums with the same instance. | |
*/ | |
private int _currentHash; | |
public IncrementalMurmur3Hasher() { | |
this(0); | |
} | |
public IncrementalMurmur3Hasher(int seed) | |
{ | |
_seed = seed; | |
reset(); | |
} | |
/* | |
/********************************************************************** | |
/* Public API | |
/********************************************************************** | |
*/ | |
/** | |
* Method for checking number of bytes that have been sent to | |
* {@link #update} since creation or last call to {@link #reset} | |
*/ | |
@Override | |
public long getLength() { | |
return _totalBytes; | |
} | |
@Override | |
public int calculateHash() | |
{ | |
/* Since we only retain 'partial' value, we now need to perform | |
* finalizations, as expected. | |
*/ | |
int h1 = _currentHash; | |
// First: any partial data ("tail") we may have needs to be added | |
if (_partialByteCount > 0) { | |
int k1 = _partialBytes; | |
k1 *= c1; | |
k1 = Integer.rotateLeft(k1, 15); | |
k1 *= c2; | |
h1 ^= k1; | |
} | |
// And then finalize the calculation | |
h1 ^= (int) (_totalBytes); | |
// fmix(h1); | |
h1 ^= h1 >>> 16; | |
h1 *= 0x85ebca6b; | |
h1 ^= h1 >>> 13; | |
h1 *= 0xc2b2ae35; | |
h1 ^= h1 >>> 16; | |
return h1; | |
} | |
@Override | |
public void reset() { | |
_partialBytes = _partialByteCount = 0; | |
_totalBytes = 0L; | |
_currentHash = _seed; | |
} | |
@Override | |
public void update(byte[] data, int offset, int len) | |
{ | |
if (data == null || offset < 0 || len < 1 || (offset+len) > data.length) { | |
if (len == 0) { | |
return; | |
} | |
throw new IllegalArgumentException(); | |
} | |
// We will process all bytes, one way or another, so | |
_totalBytes += len; | |
// First things first: any partial data to use? | |
if (_partialByteCount > 0) { | |
int count = _resolvePartial(data, offset, len); | |
if ((len -= count) == 0) { | |
return; | |
} | |
offset += count; | |
} | |
// HotSpot seems to like optimizing 'simple' chunks; main loop offlined | |
_update(data, offset, len); | |
} | |
private final void _update(byte[] data, int offset, final int len) | |
{ | |
int h1 = _currentHash; | |
final int roundedEnd = offset + (len & 0xfffffffc); // round down to 4 byte block | |
for (; offset < roundedEnd; offset += 4) { | |
// int k1 = _gatherIntLE(data, offset); | |
int k1 = (data[offset] & 0xff) | ((data[offset+1] & 0xff) << 8) | ((data[offset+2] & 0xff) << 16) | (data[offset+3] << 24); | |
k1 *= c1; | |
k1 = Integer.rotateLeft(k1, 15); | |
k1 *= c2; | |
h1 ^= k1; | |
h1 = Integer.rotateLeft(h1, 13); | |
h1 = h1*5 + c3; | |
} | |
_currentHash = h1; | |
// and save the tail, if any | |
int remainder = len&3; | |
if (remainder > 0) { | |
_stashTail(data, offset, remainder); | |
} | |
} | |
/* | |
/********************************************************************** | |
/* Helper methods | |
/********************************************************************** | |
*/ | |
/** | |
* Helper method for resolving existing partial data, before the | |
* fast main loop. | |
*/ | |
private final int _resolvePartial(byte[] data, final int offset, final int len) | |
{ | |
int ptr = offset; | |
final int end = ptr+len; | |
// may have 1, 2 or 3 bytes, in LSB | |
switch (_partialByteCount) { // how many bytes have we stashed prior? | |
case 1: | |
if (ptr >= end) { | |
return 0; | |
} | |
_partialBytes |= ((data[ptr++] & 0xFF) << 8); | |
// fall through | |
case 2: | |
if (ptr >= end) { | |
_partialByteCount = 2; | |
return (ptr - offset); | |
} | |
_partialBytes |= ((data[ptr++] & 0xFF) << 16); | |
// fall through | |
case 3: | |
if (ptr >= end) { | |
_partialByteCount = 3; | |
return (ptr - offset); | |
} | |
_partialBytes |= ((data[ptr++] & 0xFF) << 24); | |
break; | |
default: | |
throw new IllegalStateException("Partial byte count = "+_partialByteCount+"; must be (1,3)"); | |
} | |
// good: got a full int32, process: | |
_partialByteCount = 0; | |
int k1 = _partialBytes; | |
k1 *= c1; | |
k1 = Integer.rotateLeft(k1, 15); | |
k1 *= c2; | |
int h1 = _currentHash; | |
h1 ^= k1; | |
h1 = Integer.rotateLeft(h1, 13); | |
h1 = h1*5 + c3; | |
_currentHash = h1; | |
return (ptr - offset); | |
} | |
/** | |
* Helper method for stashing partial data (1-3 bytes) after main | |
* loop. | |
*/ | |
private final void _stashTail(byte[] data, int offset, int remainder) | |
{ | |
_partialByteCount = remainder; | |
int k1 = data[offset] & 0xFF; | |
if (remainder > 1) { | |
k1 |= (data[++offset] & 0xff) << 8; | |
if (remainder > 2) { | |
k1 |= (data[++offset] & 0xff) << 16; | |
} | |
} | |
_partialBytes = k1; | |
} | |
protected final static int _gatherIntLE(byte[] data, int index) | |
{ | |
int i = data[index] & 0xFF; | |
i |= (data[++index] & 0xFF) << 8; | |
i |= (data[++index] & 0xFF) << 16; | |
i |= (data[++index] << 24); | |
return i; | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment