|
| 1 | +# nebula3/hash.py |
| 2 | +from __future__ import annotations |
| 3 | + |
# Multiplier and shift distance shared by every mixing step of the 64-bit
# MurmurHash2 variant implemented below.
_M: int = 0xC6A4A7935BD1E995
_R: int = 47
# Python ints are unbounded; masking after each multiplication emulates the
# wrap-around of unsigned 64-bit arithmetic.
_MASK64: int = (1 << 64) - 1


def _read_u64_le(buf: bytes) -> int:
    """Decode ``buf`` as an unsigned little-endian integer.

    Callers in this module pass at most 8 bytes (one full word or the
    trailing remainder); the length is not checked here.
    """
    return int.from_bytes(buf, byteorder="little", signed=False)


def hash(data: bytes | bytearray | str, seed: int = 0xC70F6907) -> int:
    """Compute the 64-bit MurmurHash2 of ``data``.

    NOTE: the function name shadows the built-in ``hash()`` inside this
    module; it is kept because it is the public entry point.

    :param data: ``str`` (hashed as its UTF-8 encoding), ``bytes`` or
        ``bytearray``.
    :param seed: hash seed, defaults to ``0xC70F6907``; only its low 64 bits
        influence the result.
    :return: the hash reinterpreted as a signed 64-bit integer, i.e. a value
        in ``[-2**63, 2**63 - 1]``.
    :raises TypeError: if ``data`` is not ``str``, ``bytes`` or ``bytearray``.
    """
    if isinstance(data, str):
        data_as_bytes = data.encode("utf-8")
    elif isinstance(data, (bytes, bytearray)):
        data_as_bytes = bytes(data)
    else:
        raise TypeError("Input must be str, bytes, or bytearray")

    length = len(data_as_bytes)
    # The initial state mixes the seed with the input length.
    h = (seed ^ ((_M * length) & _MASK64)) & _MASK64

    # Mix every complete 8-byte little-endian word into the state.
    off = length // 8 * 8
    for i in range(0, off, 8):
        k = _read_u64_le(data_as_bytes[i:i + 8])
        k = (k * _M) & _MASK64
        k ^= k >> _R
        k = (k * _M) & _MASK64
        h ^= k
        h = (h * _M) & _MASK64

    # Fold in the 1-7 leftover bytes, if any, as one short little-endian word.
    tail = data_as_bytes[off:]
    if tail:
        h ^= _read_u64_le(tail)
        h = (h * _M) & _MASK64

    # Final avalanche so that every input bit can affect every output bit.
    h ^= h >> _R
    h = (h * _M) & _MASK64
    h ^= h >> _R

    # Reinterpret the unsigned 64-bit state as two's-complement signed.
    if h & (1 << 63):
        h -= 1 << 64
    return h
0 commit comments