Skip to content

Commit e95d982

Browse files
committed
feat: add MurmurHash2 hash() implementation (#355)
1 parent ba345a6 commit e95d982

File tree

2 files changed

+51
-0
lines changed

2 files changed

+51
-0
lines changed

nebula3/utils/__init__.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
from .hash import hash
2+

nebula3/utils/hash.py

Lines changed: 49 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,49 @@
1+
# nebula3/utils/hash.py
2+
from __future__ import annotations
3+
4+
# Constants used by the 64-bit MurmurHash2 implementation in hash().
_M: int = 0xC6A4A7935BD1E995  # multiplier applied in every mixing step
_R: int = 47  # right-shift distance of the xor-shift steps
_MASK64: int = (1 << 64) - 1  # keeps intermediate values within 64 bits
7+
8+
9+
def _read_u64_le(buf: bytes) -> int:
    """Decode *buf* as an unsigned little-endian integer.

    hash() passes either a full 8-byte word or a shorter trailing chunk;
    an empty buffer decodes to 0.
    """
    return int.from_bytes(buf, byteorder="little", signed=False)
12+
13+
14+
def hash(data: bytes | bytearray | str, seed: int = 0xC70F6907) -> int:
    """Return the 64-bit MurmurHash2 of *data* as a signed integer.

    :param data: value to hash; ``str`` is encoded as UTF-8 first, ``bytes``
        and ``bytearray`` are hashed as-is
    :param seed: hash seed, defaults to 0xC70F6907; only its low 64 bits
        influence the result
    :return: Python int in the signed 64-bit range
    :raises TypeError: if *data* is not ``str``, ``bytes`` or ``bytearray``
    """
    if isinstance(data, str):
        data_as_bytes = data.encode("utf-8")
    elif isinstance(data, (bytes, bytearray)):
        data_as_bytes = bytes(data)
    else:
        raise TypeError("Input must be str, bytes, or bytearray")

    length = len(data_as_bytes)

    # Fold the input length into the seed; every product is truncated to
    # 64 bits because Python ints do not wrap on their own.
    h = (seed ^ ((_M * length) & _MASK64)) & _MASK64

    # Mix all complete 8-byte little-endian words.
    off = length // 8 * 8
    for i in range(0, off, 8):
        k = _read_u64_le(data_as_bytes[i:i + 8])
        k = (k * _M) & _MASK64
        k ^= k >> _R
        k = (k * _M) & _MASK64
        h ^= k
        h = (h * _M) & _MASK64

    # Mix the remaining 1-7 bytes, if any, as one short little-endian word.
    tail = data_as_bytes[off:]
    if tail:
        h ^= _read_u64_le(tail)
        h = (h * _M) & _MASK64

    # Final avalanche.
    h ^= h >> _R
    h = (h * _M) & _MASK64
    h ^= h >> _R

    # Reinterpret the unsigned 64-bit value as two's-complement signed.
    if h & (1 << 63):
        h -= 1 << 64
    return h

0 commit comments

Comments
 (0)