From e95d982e1739a7f80c5dbff9785dfd66b29cff31 Mon Sep 17 00:00:00 2001 From: caifengze Date: Sun, 25 May 2025 17:01:48 +0800 Subject: [PATCH 1/4] feat: add MurmurHash2 hash() implementation (#355) --- nebula3/utils/__init__.py | 2 ++ nebula3/utils/hash.py | 49 +++++++++++++++++++++++++++++++++++++++ 2 files changed, 51 insertions(+) create mode 100644 nebula3/utils/__init__.py create mode 100644 nebula3/utils/hash.py diff --git a/nebula3/utils/__init__.py b/nebula3/utils/__init__.py new file mode 100644 index 00000000..0a71cf71 --- /dev/null +++ b/nebula3/utils/__init__.py @@ -0,0 +1,2 @@ +from .hash import hash + diff --git a/nebula3/utils/hash.py b/nebula3/utils/hash.py new file mode 100644 index 00000000..77ee0e21 --- /dev/null +++ b/nebula3/utils/hash.py @@ -0,0 +1,49 @@ +# nebula3/utils/hash.py +from __future__ import annotations + +_M: int = 0xC6A4A7935BD1E995 +_R: int = 47 +_MASK64: int = (1 << 64) - 1 + + +def _read_u64_le(buf: bytes) -> int: + """ Convert little-endian bytes of up to 8 bytes to an unsigned integer. 
""" + return int.from_bytes(buf, byteorder="little", signed=False) + + +def hash(data: bytes | str, seed: int = 0xC70F6907) -> int: + """MurmurHash2 64-bit variant: + :param data: supports str (utf-8 encoding), bytes, bytearray + :param seed: defaults to 0xC70F6907 + :return: Python int, in the range of signed 64-bit + """ + if isinstance(data, str): + data_as_bytes = data.encode("utf-8") + elif isinstance(data, (bytes, bytearray)): + data_as_bytes = bytes(data) + else: + raise TypeError("Input must be str, bytes, or bytearray") + + h = (seed ^ (_M * len(data_as_bytes) & _MASK64)) & _MASK64 + off = len(data_as_bytes) // 8 * 8 + for i in range(0, off, 8): + k = _read_u64_le(data_as_bytes[i: i + 8]) + k = (k * _M) & _MASK64 + k ^= (k >> _R) + k = (k * _M) & _MASK64 + h ^= k + h = (h * _M) & _MASK64 + + tail = data_as_bytes[off:] + if tail: + t = _read_u64_le(tail) + h ^= t + h = (h * _M) & _MASK64 + + h ^= (h >> _R) + h = (h * _M) & _MASK64 + h ^= (h >> _R) + + if h & (1 << 63): + h -= 1 << 64 + return h From ab387dc6c9ee87c155e1bb8bd88cc19be433bd95 Mon Sep 17 00:00:00 2001 From: caifengze Date: Sun, 25 May 2025 17:04:11 +0800 Subject: [PATCH 2/4] feat: add MurmurHash2 hash() implementation test (#355) --- tests/test_hash.py | 38 ++++++++++++++++++++++++++++++++++ tests/test_hash_integration.py | 32 ++++++++++++++++++++++++++++ 2 files changed, 70 insertions(+) create mode 100644 tests/test_hash.py create mode 100644 tests/test_hash_integration.py diff --git a/tests/test_hash.py b/tests/test_hash.py new file mode 100644 index 00000000..ca5ddfb8 --- /dev/null +++ b/tests/test_hash.py @@ -0,0 +1,38 @@ +#!/usr/bin/env python +# --coding:utf-8-- + +# Copyright (c) 2020 vesoft inc. All rights reserved. +# +# This source code is licensed under Apache 2.0 License. 
+ +import pytest +from nebula3.utils.hash import hash as murmur_hash + +TEST_VECTORS = [ + (b"", 6142509188972423790), + (b"a", 4993892634952068459), + (b"abcdefgh", 8664279048047335611), # length-8 cases + (b"abcdefghi", -5409788147785758033), + ("to_be_hashed", -1098333533029391540), + ("中文", -8591787916246384322), +] + +@pytest.mark.parametrize("data, expected", TEST_VECTORS) +def test_known_vectors(data, expected): + assert murmur_hash(data) == expected + + +def test_str_bytes_equiv(): + """ + Ensure str and bytes inputs produce the same hash. + """ + s = "pytest" + assert murmur_hash(s) == murmur_hash(s.encode("utf-8")) + + +def test_type_error(): + """ + TypeError + """ + with pytest.raises(TypeError): + murmur_hash(12345) diff --git a/tests/test_hash_integration.py b/tests/test_hash_integration.py new file mode 100644 index 00000000..8a8d4f49 --- /dev/null +++ b/tests/test_hash_integration.py @@ -0,0 +1,32 @@ +#!/usr/bin/env python +# --coding:utf-8-- + +# Copyright (c) 2020 vesoft inc. All rights reserved. +# +# This source code is licensed under Apache 2.0 License. 
+ +import pytest +from nebula3.Config import Config +from nebula3.gclient.net import ConnectionPool +from nebula3.utils.hash import hash as murmur_hash + +@pytest.fixture(scope="module") +def nebula_session(): + config = Config() + config.max_connection_pool_size = 10 + pool = ConnectionPool() + pool.init([("127.0.0.1", 9669)], config) + session = pool.get_session('root', 'nebula') + yield session + pool.close() + +@pytest.mark.parametrize("data", [ + "", "a", "abcdefgh", "abcdefghi", "to_be_hashed", "中文" +]) +def test_hash_against_server(nebula_session, data): + # Local Computing + expected = murmur_hash(data) + result = nebula_session.execute(f'YIELD hash("{data}")') + assert result.is_succeeded(), result.error_msg() + actual = result.row_values(0)[0].as_int() + assert actual == expected From 01d21cb461888d201cf58f5fdf7e4319a0d249c8 Mon Sep 17 00:00:00 2001 From: caifengze Date: Sun, 25 May 2025 18:10:30 +0800 Subject: [PATCH 3/4] =?UTF-8?q?feat:=20add=20seed=E2=80=90variation,=20ide?= =?UTF-8?q?mpotent=20and=20large=E2=80=90input=20tests=20(#355)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- tests/test_hash.py | 24 +++++++++++++++++++++++- 1 file changed, 23 insertions(+), 1 deletion(-) diff --git a/tests/test_hash.py b/tests/test_hash.py index ca5ddfb8..2cfd848c 100644 --- a/tests/test_hash.py +++ b/tests/test_hash.py @@ -11,12 +11,13 @@ TEST_VECTORS = [ (b"", 6142509188972423790), (b"a", 4993892634952068459), - (b"abcdefgh", 8664279048047335611), # length-8 cases + (b"abcdefgh", 8664279048047335611), # length-8 bytes cases (b"abcdefghi", -5409788147785758033), ("to_be_hashed", -1098333533029391540), ("中文", -8591787916246384322), ] + @pytest.mark.parametrize("data, expected", TEST_VECTORS) def test_known_vectors(data, expected): assert murmur_hash(data) == expected @@ -36,3 +37,24 @@ def test_type_error(): """ with pytest.raises(TypeError): murmur_hash(12345) + + +def test_seed_variation(): + """Different seed 
values should produce different hashes.""" + data = b"seed_test" + hash1 = murmur_hash(data, seed=0) + hash2 = murmur_hash(data, seed=1) + assert hash1 != hash2 + + +def test_idempotent(): + """Repeated calls with same input must yield the same result.""" + data = b"consistent" + assert murmur_hash(data) == murmur_hash(data) + + +def test_large_input_performance(): + """Large inputs should be processed without error and return an int.""" + data = b"x" * 10_000 + result = murmur_hash(data) + assert isinstance(result, int) From a0c7d64a16e218cc565d4fda07ed2b9b9a96a35f Mon Sep 17 00:00:00 2001 From: caifengze Date: Mon, 26 May 2025 12:56:34 +0800 Subject: [PATCH 4/4] style: format with black --- nebula3/utils/__init__.py | 1 - nebula3/utils/hash.py | 10 +++++----- tests/test_hash_integration.py | 10 ++++++---- 3 files changed, 11 insertions(+), 10 deletions(-) diff --git a/nebula3/utils/__init__.py b/nebula3/utils/__init__.py index 0a71cf71..427e116d 100644 --- a/nebula3/utils/__init__.py +++ b/nebula3/utils/__init__.py @@ -1,2 +1 @@ from .hash import hash - diff --git a/nebula3/utils/hash.py b/nebula3/utils/hash.py index 77ee0e21..e5ac3dd7 100644 --- a/nebula3/utils/hash.py +++ b/nebula3/utils/hash.py @@ -7,7 +7,7 @@ def _read_u64_le(buf: bytes) -> int: - """ Convert little-endian bytes of up to 8 bytes to an unsigned integer. 
""" + """Convert little-endian bytes of up to 8 bytes to an unsigned integer.""" return int.from_bytes(buf, byteorder="little", signed=False) @@ -27,9 +27,9 @@ def hash(data: bytes | str, seed: int = 0xC70F6907) -> int: h = (seed ^ (_M * len(data_as_bytes) & _MASK64)) & _MASK64 off = len(data_as_bytes) // 8 * 8 for i in range(0, off, 8): - k = _read_u64_le(data_as_bytes[i: i + 8]) + k = _read_u64_le(data_as_bytes[i : i + 8]) k = (k * _M) & _MASK64 - k ^= (k >> _R) + k ^= k >> _R k = (k * _M) & _MASK64 h ^= k h = (h * _M) & _MASK64 @@ -40,9 +40,9 @@ def hash(data: bytes | str, seed: int = 0xC70F6907) -> int: h ^= t h = (h * _M) & _MASK64 - h ^= (h >> _R) + h ^= h >> _R h = (h * _M) & _MASK64 - h ^= (h >> _R) + h ^= h >> _R if h & (1 << 63): h -= 1 << 64 diff --git a/tests/test_hash_integration.py b/tests/test_hash_integration.py index 8a8d4f49..819de512 100644 --- a/tests/test_hash_integration.py +++ b/tests/test_hash_integration.py @@ -10,19 +10,21 @@ from nebula3.gclient.net import ConnectionPool from nebula3.utils.hash import hash as murmur_hash + @pytest.fixture(scope="module") def nebula_session(): config = Config() config.max_connection_pool_size = 10 pool = ConnectionPool() pool.init([("127.0.0.1", 9669)], config) - session = pool.get_session('root', 'nebula') + session = pool.get_session("root", "nebula") yield session pool.close() -@pytest.mark.parametrize("data", [ - "", "a", "abcdefgh", "abcdefghi", "to_be_hashed", "中文" -]) + +@pytest.mark.parametrize( + "data", ["", "a", "abcdefgh", "abcdefghi", "to_be_hashed", "中文"] +) def test_hash_against_server(nebula_session, data): # Local Computing expected = murmur_hash(data)