Reference:
- https://stackoverflow.com/questions/13305290/is-there-a-pure-python-implementation-of-murmurhash
- https://github.com/vesoft-inc/nebula/blob/master/src/common/base/MurmurHash2.h
- https://github.com/jievince/rdf-converter/blob/master/murmur2.go
def bytes_to_long(bytes):
assert len(bytes) == 8
return sum((b << (k * 8) for k, b in enumerate(bytes)))
def murmur64(data, seed = 0xc70f6907):
import ctypes
m = ctypes.c_uint64(0xc6a4a7935bd1e995).value
r = ctypes.c_uint32(47).value
MASK = ctypes.c_uint64(2 ** 64 - 1).value
data_as_bytes = bytearray(data)
seed = ctypes.c_uint64(seed).value
h = seed ^ ((m * len(data_as_bytes)) & MASK)
off = int(len(data_as_bytes)/8)*8
for ll in range(0, off, 8):
k = bytes_to_long(data_as_bytes[ll:ll + 8])
k = (k * m) & MASK
k = k ^ ((k >> r) & MASK)
k = (k * m) & MASK
h = (h ^ k)
h = (h * m) & MASK
l = len(data_as_bytes) & 7
if l >= 7:
h = (h ^ (data_as_bytes[off+6] << 48))
if l >= 6:
h = (h ^ (data_as_bytes[off+5] << 40))
if l >= 5:
h = (h ^ (data_as_bytes[off+4] << 32))
if l >= 4:
h = (h ^ (data_as_bytes[off+3] << 24))
if l >= 3:
h = (h ^ (data_as_bytes[off+2] << 16))
if l >= 2:
h = (h ^ (data_as_bytes[off+1] << 8))
if l >= 1:
h = (h ^ data_as_bytes[off])
h = (h * m) & MASK
h = h ^ ((h >> r) & MASK)
h = (h * m) & MASK
h = h ^ ((h >> r) & MASK)
return ctypes.c_long(h).value
print(str(murmur64(bytes("to_be_hashed", encoding = "utf8"), seed=0xc70f6907)))
❯ python test.py
-1098333533029391540
❯ nebula-console-3.0 -addr 10.1.1.168 -port 39669 -user root -p nebula -e 'YIELD hash("to_be_hashed");'
(root@nebula) [(none)]> YIELD hash("to_be_hashed");
+----------------------+
| hash("to_be_hashed") |
+----------------------+
| -1098333533029391540 |
+----------------------+
Got 1 rows (time spent 751/5528 us)
Tue, 17 May 2022 15:18:45 CST
note: it's reported this behaved differently in Python running on Windows.