From a83ab7ca0ffd4fba87dcca6a8c812d6a68f526ce Mon Sep 17 00:00:00 2001 From: wudidapaopao Date: Wed, 24 Sep 2025 15:57:19 +0800 Subject: [PATCH 1/7] feat: support executing queries without pyarrow when it's missing --- .../workflows/build_macos_arm64_wheels.yml | 2 +- .github/workflows/build_macos_x86_wheels.yml | 2 +- chdb/state/sqlitelike.py | 63 ++++++++++++------- 3 files changed, 43 insertions(+), 24 deletions(-) diff --git a/.github/workflows/build_macos_arm64_wheels.yml b/.github/workflows/build_macos_arm64_wheels.yml index 96ef0b988a6..5cd4cdc13ec 100644 --- a/.github/workflows/build_macos_arm64_wheels.yml +++ b/.github/workflows/build_macos_arm64_wheels.yml @@ -79,7 +79,7 @@ jobs: brew install ca-certificates lz4 mpdecimal openssl@3 readline sqlite xz z3 zstd brew install --ignore-dependencies llvm@19 brew install git ninja libtool gettext gcc binutils grep findutils nasm - brew install --build-from-source ccache + brew install --build-from-source ccache || echo "ccache installation failed, continuing without it" brew install go cd /usr/local/opt/ && sudo rm -f llvm && sudo ln -sf llvm@19 llvm export PATH=$(brew --prefix llvm@19)/bin:$PATH diff --git a/.github/workflows/build_macos_x86_wheels.yml b/.github/workflows/build_macos_x86_wheels.yml index 85ebe048c87..c3df844ae20 100644 --- a/.github/workflows/build_macos_x86_wheels.yml +++ b/.github/workflows/build_macos_x86_wheels.yml @@ -79,7 +79,7 @@ jobs: brew install ca-certificates lz4 mpdecimal openssl@3 readline sqlite xz z3 zstd brew install --ignore-dependencies llvm@19 brew install git ninja libtool gettext gcc binutils grep findutils nasm - brew install --build-from-source ccache + brew install --build-from-source ccache || echo "ccache installation failed, continuing without it" brew install go cd /usr/local/opt/ && sudo rm -f llvm && sudo ln -sf llvm@19 llvm export PATH=$(brew --prefix llvm@19)/bin:$PATH diff --git a/chdb/state/sqlitelike.py b/chdb/state/sqlitelike.py index 7694cb42ece..e0f6ad57dc9 100644 --- a/chdb/state/sqlitelike.py +++ b/chdb/state/sqlitelike.py @@ -1,13 +1,32 @@ -from typing import Optional, Any +from typing import Optional, Any, TYPE_CHECKING, List, Tuple from chdb import _chdb -# try import pyarrow if failed, raise ImportError with suggestion -try: - import pyarrow as pa # noqa -except ImportError as e: - print(f"ImportError: {e}") - print('Please install pyarrow via "pip install pyarrow"') - raise ImportError("Failed to import pyarrow") from None +if TYPE_CHECKING: + import pyarrow as pa + + +def _import_pyarrow(): + """Lazy import pyarrow when needed.""" + try: + import pyarrow as pa + return pa + except ImportError: + raise ImportError( + "PyArrow is required for this feature. " + "Install with: pip install pyarrow" + ) from None + + +def _import_pandas(): + """Lazy import pandas when needed.""" + try: + import pandas as pd + return pd + except ImportError: + raise ImportError( + "Pandas is required for DataFrame conversion. " + "Install with: pip install pandas" + ) from None _arrow_format = set({"dataframe", "arrowtable"}) @@ -32,11 +51,11 @@ def to_arrowTable(res): pyarrow.Table: PyArrow Table containing the query results Raises: - ImportError: If pyarrow or pandas packages are not installed + ImportError: If pyarrow package is not installed .. note:: - This function requires both pyarrow and pandas to be installed. - Install them with: ``pip install pyarrow pandas`` + This function requires pyarrow to be installed. + Install with: ``pip install pyarrow`` .. 
warning:: Empty results return an empty PyArrow Table with no schema. @@ -52,14 +71,7 @@ def to_arrowTable(res): num text 0 1 hello """ - # try import pyarrow and pandas, if failed, raise ImportError with suggestion - try: - import pyarrow as pa # noqa - import pandas as pd # noqa - except ImportError as e: - print(f"ImportError: {e}") - print('Please install pyarrow and pandas via "pip install pyarrow pandas"') - raise ImportError("Failed to import pyarrow or pandas") from None + pa = _import_pyarrow() if len(res) == 0: return pa.Table.from_batches([], schema=pa.schema([])) return pa.RecordBatchFileReader(res.bytes()).read_all() @@ -102,6 +114,7 @@ def to_df(r): text object dtype: object """ + _import_pandas() t = to_arrowTable(r) return t.to_pandas(use_threads=True) @@ -230,7 +243,7 @@ def cancel(self): except Exception as e: raise RuntimeError(f"Failed to cancel streaming query: {str(e)}") from e - def record_batch(self, rows_per_batch: int = 1000000) -> pa.RecordBatchReader: + def record_batch(self, rows_per_batch: int = 1000000) -> "pa.RecordBatchReader": """ Create a PyArrow RecordBatchReader from this StreamingResult. @@ -242,10 +255,11 @@ def record_batch(self, rows_per_batch: int = 1000000) -> pa.RecordBatchReader: rows_per_batch (int): Number of rows per batch. Defaults to 1000000. Returns: - pa.RecordBatchReader: PyArrow RecordBatchReader for efficient streaming + pyarrow.RecordBatchReader: PyArrow RecordBatchReader for efficient streaming Raises: ValueError: If the StreamingResult was not created with arrow format + ImportError: If PyArrow is not installed """ if not self._supports_record_batch: raise ValueError( @@ -253,6 +267,7 @@ def record_batch(self, rows_per_batch: int = 1000000) -> pa.RecordBatchReader: "Please use format='Arrow' when calling send_query." 
) + pa = _import_pyarrow() chdb_reader = ChdbRecordBatchReader(self, rows_per_batch) return pa.RecordBatchReader.from_batches(chdb_reader.schema(), chdb_reader) @@ -275,10 +290,12 @@ def __init__(self, chdb_stream_result, batch_size_rows): self._current_rows = 0 self._first_batch = None self._first_batch_consumed = True + self._pa = _import_pyarrow() self._schema = self.schema() def schema(self): if self._schema is None: + pa = self._pa # Get the first chunk to determine schema chunk = self._stream_result.fetch() if chunk is not None: @@ -304,6 +321,8 @@ def schema(self): return self._schema def read_next_batch(self): + pa = self._pa + if self._accumulator: result = self._accumulator.pop(0) return result @@ -600,7 +619,7 @@ class Cursor: def __init__(self, connection): self._conn = connection self._cursor = self._conn.cursor() - self._current_table: Optional[pa.Table] = None + self._current_table: Optional[List[Tuple]] = None self._current_row: int = 0 def execute(self, query: str) -> None: From 11cd637177ed4197f9cd6097f948a296ecc12134 Mon Sep 17 00:00:00 2001 From: wudidapaopao Date: Wed, 15 Oct 2025 18:07:22 +0800 Subject: [PATCH 2/7] test: add tests for cases where pandas and pyarrow are not present --- .../workflows/build_linux_arm64_wheels-gh.yml | 14 +- .github/workflows/build_linux_x86_wheels.yml | 12 +- .../workflows/build_macos_arm64_wheels.yml | 12 +- .github/workflows/build_macos_x86_wheels.yml | 12 +- chdb/__init__.py | 5 +- chdb/dataframe/query.py | 9 +- setup.py | 7 +- tests/run_all.py | 19 + tests/test_basic.py | 3 +- tests/test_optional_dependencies.py | 101 ++++++ tests/test_parallel.py | 1 + tests/test_query_json.py | 332 +----------------- tests/test_query_json_arrow.py | 72 ++++ tests/test_query_json_dataframe.py | 296 ++++++++++++++++ tests/test_query_py.py | 264 +------------- tests/test_query_py_arrow.py | 149 ++++++++ tests/test_query_py_dataframe.py | 137 ++++++++ tests/test_stateful.py | 31 +- tests/test_stateful_arrow.py | 28 ++ tests/test_stateful_dataframe.py | 22 ++ 20 files changed, 897 insertions(+), 629 deletions(-) create mode 100644 tests/test_optional_dependencies.py create mode 100644 tests/test_query_json_arrow.py create mode 100644 tests/test_query_json_dataframe.py create mode 100644 tests/test_query_py_arrow.py create mode 100644 tests/test_query_py_dataframe.py create mode 100644 tests/test_stateful_arrow.py create mode 100644 tests/test_stateful_dataframe.py diff --git a/.github/workflows/build_linux_arm64_wheels-gh.yml b/.github/workflows/build_linux_arm64_wheels-gh.yml index fe9e78277a5..5c6f5ddec2f 100644 --- a/.github/workflows/build_linux_arm64_wheels-gh.yml +++ b/.github/workflows/build_linux_arm64_wheels-gh.yml @@ -103,7 +103,7 @@ jobs: echo "Installing dependencies for Python $version" pyenv shell $version python -m pip install --upgrade pip - python -m pip install setuptools tox pandas pyarrow twine psutil deltalake wheel + python -m pip install setuptools tox twine psutil deltalake wheel pyenv shell --unset done - name: Upgrade Rust toolchain @@ -123,7 +123,7 @@ jobs: which clang++-19 clang++-19 --version sudo apt-get install -y make cmake ccache ninja-build yasm gawk wget - # Install WebAssembly linker (wasm-ld) + # Install WebAssembly linker (wasm-ld) sudo apt-get install -y lld-19 # Create symlink for wasm-ld if ! 
command -v wasm-ld &> /dev/null; then @@ -262,7 +262,17 @@ jobs: pyenv shell $version python -m pip install dist/*.whl --force-reinstall python -c "import chdb; res = chdb.query('select 1112222222,555', 'CSV'); print(f'Python $version: {res}')" + + # First test: without optional dependencies + echo "Testing without pandas and pyarrow..." + python -m pip uninstall -y pandas pyarrow || true + make test + + # Second test: with optional dependencies + echo "Testing with pandas and pyarrow..." + python -m pip install pandas pyarrow make test + pyenv shell --unset done continue-on-error: false diff --git a/.github/workflows/build_linux_x86_wheels.yml b/.github/workflows/build_linux_x86_wheels.yml index 5a35668ead9..cb22bfe7823 100644 --- a/.github/workflows/build_linux_x86_wheels.yml +++ b/.github/workflows/build_linux_x86_wheels.yml @@ -103,7 +103,7 @@ jobs: echo "Installing dependencies for Python $version" pyenv shell $version python -m pip install --upgrade pip - python -m pip install setuptools tox pandas pyarrow twine psutil deltalake wheel + python -m pip install setuptools tox twine psutil deltalake wheel pyenv shell --unset done - name: Upgrade Rust toolchain @@ -261,7 +261,17 @@ jobs: pyenv shell $version python -m pip install dist/*.whl --force-reinstall python -c "import chdb; res = chdb.query('select 1112222222,555', 'CSV'); print(f'Python $version: {res}')" + + # First test: without optional dependencies + echo "Testing without pandas and pyarrow..." + python -m pip uninstall -y pandas pyarrow || true + make test + + # Second test: with optional dependencies + echo "Testing with pandas and pyarrow..." + python -m pip install pandas pyarrow make test + pyenv shell --unset done continue-on-error: false diff --git a/.github/workflows/build_macos_arm64_wheels.yml b/.github/workflows/build_macos_arm64_wheels.yml index 61c895f7601..14cad93afc8 100644 --- a/.github/workflows/build_macos_arm64_wheels.yml +++ b/.github/workflows/build_macos_arm64_wheels.yml @@ -64,7 +64,7 @@ jobs: echo "Installing dependencies for Python $version" pyenv shell $version python -m pip install --upgrade pip - python -m pip install setuptools wheel tox pandas pyarrow twine psutil deltalake wheel>=0.40.0 + python -m pip install setuptools wheel tox twine psutil deltalake wheel>=0.40.0 pyenv shell --unset done - name: Remove /usr/local/bin/python3 @@ -202,7 +202,17 @@ jobs: pyenv shell $version python -m pip install dist/*.whl --force-reinstall python -c "import chdb; res = chdb.query('select 1112222222,555', 'CSV'); print(f'Python $version: {res}')" + + # First test: without optional dependencies + echo "Testing without pandas and pyarrow..." + python -m pip uninstall -y pandas pyarrow || true + make test + + # Second test: with optional dependencies + echo "Testing with pandas and pyarrow..." 
+ python -m pip install pandas pyarrow make test + pyenv shell --unset done continue-on-error: false diff --git a/.github/workflows/build_macos_x86_wheels.yml b/.github/workflows/build_macos_x86_wheels.yml index 8217b1ad033..d11aec3febc 100644 --- a/.github/workflows/build_macos_x86_wheels.yml +++ b/.github/workflows/build_macos_x86_wheels.yml @@ -64,7 +64,7 @@ jobs: echo "Installing dependencies for Python $version" pyenv shell $version python -m pip install --upgrade pip - python -m pip install setuptools tox pandas pyarrow twine psutil deltalake wheel>=0.40.0 + python -m pip install setuptools tox twine psutil deltalake wheel>=0.40.0 pyenv shell --unset done - name: Remove /usr/local/bin/python3 @@ -224,7 +224,17 @@ jobs: pyenv shell $version python -m pip install dist/*.whl --force-reinstall python -c "import chdb; res = chdb.query('select 1112222222,555', 'CSV'); print(f'Python $version: {res}')" + + # First test: without optional dependencies + echo "Testing without pandas and pyarrow..." + python -m pip uninstall -y pandas pyarrow || true + make test + + # Second test: with optional dependencies + echo "Testing with pandas and pyarrow..." + python -m pip install pandas pyarrow make test + pyenv shell --unset done continue-on-error: false diff --git a/chdb/__init__.py b/chdb/__init__.py index 0674a46927c..0343be3a4d0 100644 --- a/chdb/__init__.py +++ b/chdb/__init__.py @@ -96,11 +96,10 @@ def to_arrowTable(res): # try import pyarrow and pandas, if failed, raise ImportError with suggestion try: import pyarrow as pa # noqa - import pandas as pd # noqa except ImportError as e: print(f"ImportError: {e}") - print('Please install pyarrow and pandas via "pip install pyarrow pandas"') - raise ImportError("Failed to import pyarrow or pandas") from None + print('Please install pyarrow via "pip install pyarrow"') + raise ImportError("Failed to import pyarrow") from None if len(res) == 0: return pa.Table.from_batches([], schema=pa.schema([])) return pa.RecordBatchFileReader(res.bytes()).read_all() diff --git a/chdb/dataframe/query.py b/chdb/dataframe/query.py index cc7e21ac2d2..98bf670621a 100644 --- a/chdb/dataframe/query.py +++ b/chdb/dataframe/query.py @@ -2,8 +2,13 @@ import tempfile from io import BytesIO import re -import pandas as pd -import pyarrow as pa +try: + import pandas as pd + import pyarrow as pa +except ImportError as e: + print(f'ImportError: {e}') + print('Please install pyarrow and pandas via "pip install pyarrow pandas"') + raise ImportError('Failed to import pyarrow or pandas') from None from chdb import query as chdb_query diff --git a/setup.py b/setup.py index 9b3d601f389..f182bcd883d 100644 --- a/setup.py +++ b/setup.py @@ -175,9 +175,12 @@ def build_extensions(self): ext_modules=ext_modules, python_requires=">=3.8", install_requires=[ - "pyarrow>=13.0.0", - "pandas>=2.0.0", ], + extras_require={ + "arrow": ["pandas>=2.0.0", "pyarrow>=13.0.0"], + "pandas": ["pandas>=2.0.0", "pyarrow>=13.0.0"], + "all": ["pandas>=2.0.0", "pyarrow>=13.0.0"], + }, cmdclass={"build_ext": BuildExt}, test_suite="tests", zip_safe=False, diff --git a/tests/run_all.py b/tests/run_all.py index 077f92e4ae6..6e6cc671553 100755 --- a/tests/run_all.py +++ b/tests/run_all.py @@ -13,6 +13,25 @@ class Colors: test_loader = unittest.TestLoader() test_suite = test_loader.discover('./') +# Print all test files that will be executed +print(f"\n{Colors.BOLD}Discovered Test Files:{Colors.END}") +test_files = set() +def extract_test_files(suite): + for test in suite: + if hasattr(test, '_tests'): + 
extract_test_files(test)
+        elif hasattr(test, '__module__'):
+            test_files.add(test.__module__)
+
+extract_test_files(test_suite)
+
+# Filter out system modules, only show actual test files
+filtered_test_files = {f for f in test_files if f != "unittest.loader"}
+
+for test_file in sorted(filtered_test_files):
+    print(f"  • {test_file}")
+print(f"\nTotal test files: {len(filtered_test_files)}\n")
+
 test_runner = unittest.TextTestRunner(verbosity=2)
 ret = test_runner.run(test_suite)
diff --git a/tests/test_basic.py b/tests/test_basic.py
index 661b5db9948..2ac873649c9 100644
--- a/tests/test_basic.py
+++ b/tests/test_basic.py
@@ -14,6 +14,7 @@ def test_basic(self):
         self.assertTrue(len(res.error_message()) == 0)
         with self.assertRaises(Exception):
             res = chdb.query("SELECT 1", "unknown_format")
+
 class TestOutput(unittest.TestCase):
     def test_output(self):
         for format, output in format_output.items():
@@ -30,7 +31,7 @@ def test_output(self):
                 continue
             if "Vertical" in format:
                 continue
-            
+
             if format in ("JSONEachRowWithProgress", "JSONStringsEachRowWithProgress"):
                 data_str = str(data)
                 lines = data_str.split('\n')
diff --git a/tests/test_optional_dependencies.py b/tests/test_optional_dependencies.py
new file mode 100644
index 00000000000..95af35452bd
--- /dev/null
+++ b/tests/test_optional_dependencies.py
@@ -0,0 +1,101 @@
+#!/usr/bin/env python3
+
+import unittest
+import sys
+from chdb import session
+
+
+def check_pyarrow_available():
+    """Check if pyarrow is available for import."""
+    try:
+        import pyarrow
+        return True
+    except ImportError:
+        return False
+
+
+def check_pandas_available():
+    """Check if pandas is available for import."""
+    try:
+        import pandas
+        return True
+    except ImportError:
+        return False
+
+
+class TestOptionalDependencies(unittest.TestCase):
+    def setUp(self) -> None:
+        self.sess = session.Session()
+        return super().setUp()
+
+    def tearDown(self) -> None:
+        self.sess.close()
+        return super().tearDown()
+
+    def test_arrowtable_output_format(self):
+        """Test ArrowTable output format with/without pyarrow dependency."""
+        pyarrow_available = check_pyarrow_available()
+
+        if pyarrow_available:
+            # If pyarrow is available, should work normally
+            try:
+                ret = self.sess.query("SELECT 1 AS x, 'hello' AS y", "ArrowTable")
+                self.assertIsNotNone(ret)
+                # Verify it's actually an ArrowTable
+                import pyarrow as pa
+                self.assertIsInstance(ret, pa.Table)
+                self.assertEqual(ret.column('x').to_pylist(), [1])
+                self.assertEqual(ret.column('y').to_pylist(), ['hello'])
+            except Exception as e:
+                self.fail(f"ArrowTable format should work when pyarrow is available, but got: {e}")
+        else:
+            # If pyarrow is not available, should raise ImportError
+            with self.assertRaises(ImportError) as context:
+                self.sess.query("SELECT 1 AS x, 'hello' AS y", "ArrowTable")
+
+            # Verify the error message mentions pyarrow
+            error_msg = str(context.exception).lower()
+            self.assertIn('pyarrow', error_msg)
+
+    def test_dataframe_output_format(self):
+        """Test DataFrame output format with/without pandas dependency."""
+        pandas_available = check_pandas_available()
+
+        # DataFrame format requires pandas
+        if pandas_available:
+            # If pandas is available, should work normally
+            try:
+                ret = self.sess.query("SELECT 1 AS x, 'hello' AS y", "DataFrame")
+                self.assertIsNotNone(ret)
+                # Verify it's actually a DataFrame
+                import pandas as pd
+                self.assertIsInstance(ret, pd.DataFrame)
+                self.assertEqual(ret['x'].iloc[0], 1)
+                self.assertEqual(ret['y'].iloc[0], 'hello')
+            except Exception as e:
+                self.fail(f"DataFrame format should 
work when pandas is available, but got: {e}") + else: + # If pandas is missing, should raise ImportError + with self.assertRaises(ImportError) as context: + self.sess.query("SELECT 1 AS x, 'hello' AS y", "DataFrame") + + # Verify the error message mentions the missing dependency + error_msg = str(context.exception).lower() + self.assertIn('pandas', error_msg) + + def test_dependency_status_logging(self): + """Log the current dependency status for debugging.""" + pyarrow_status = "available" if check_pyarrow_available() else "not available" + pandas_status = "available" if check_pandas_available() else "not available" + + print(f"\nDependency Status:") + print(f" PyArrow: {pyarrow_status}") + print(f" Pandas: {pandas_status}") + print(f" Python version: {sys.version}") + + # This test always passes, it's just for logging + self.assertTrue(True) + + +if __name__ == "__main__": + unittest.main(verbosity=2) diff --git a/tests/test_parallel.py b/tests/test_parallel.py index c409ee5e5a5..725833509c4 100755 --- a/tests/test_parallel.py +++ b/tests/test_parallel.py @@ -4,6 +4,7 @@ import sys import unittest import chdb +import pyarrow # noqa from utils import data_file # run query parallel in n thread and benchmark diff --git a/tests/test_query_json.py b/tests/test_query_json.py index c52a24f589c..284343dfb14 100644 --- a/tests/test_query_json.py +++ b/tests/test_query_json.py @@ -1,26 +1,14 @@ #!python3 -import io -import json -import math -import random -import shutil -import uuid import unittest -import numpy as np -import pandas as pd -import pyarrow as pa -import chdb import chdb.session as chs -from datetime import date, datetime, time, timedelta -from decimal import Decimal +import chdb +from datetime import date -test_json_query_dir = ".tmp_test_json_query" EXPECTED1 = """"['urgent','important']",100.3,"[]" \\N,\\N,"[1,666]" """ -EXPECTED2 = '"apple1",3,\\N\n\\N,4,2\n' dict1 = { "c1": [1, 2, 3, 4, 5, 6, 7, 8], @@ -47,104 +35,13 @@ ] } -complex_dict = { - "date_col": [ - {"date": date(2023, 5, 15)}, - {"date": date(2024, 1, 1)} - ], - 'datetime_col': [ - {"datetime": datetime(2024, 5, 30, 14, 30)}, - {"datetime": datetime(2023, 12, 31, 23, 59, 59)} - ], - 'time_col': [ - {"time": time(12, 30)}, - {"time": time(0, 0)} - ], - 'timedelta_col': [ - {"timedelta": timedelta(days=1, hours=12, minutes=30)}, - {"timedelta": timedelta(days=2, hours=3, minutes=20)} - ], - 'bytes_col': [ - {"bytes": b'\xe8\x8b\xb9\xe6\x9e\x9c'}, - {"bytes": bytes([0x6d, 0x65, 0x6d, 0x6f, 0x72, 0x79])} - ], - 'bytearray_col': [ - {"bytearray": bytearray(b"bytearray1")}, - {"bytearray": bytearray('苹果'.encode('utf-8'))} - ], - 'memoryview_col': [ - {"memoryview": memoryview(b"memory1")}, - {"memoryview": memoryview('苹果'.encode('utf-8'))} - ], - 'uuid_col': [ - {"uuid": uuid.UUID('00000000-0000-0000-0000-000000000001')}, - {"uuid": uuid.UUID('00000000-0000-0000-0000-000000000000')} - ], - "string_col": [ - {"str": "hello"}, - {"str": "苹果"} - ], - 'special_values_col': [ - {"null_val": None, "bool_val": True, "empty_list": [], "empty_dict": {}, "empty_array": np.array([])}, - ], - 'special_num_col': [ - {"special_num_val": [0.0, -1.1, float('nan'), float('inf'), float('-inf'), math.inf, math.nan]} - ], - 'numpy_num_col': [ - {"numpy_val": np.array([np.int32(42), np.int64(-1), np.float32(3.14), np.float64(2.718)])}, - ], - 'numpy_bool_col': [ - {"numpy_val": np.array([np.bool_(True), np.bool_(False)])}, - ], - 'numpy_datetime_col': [ - {"numpy_val": np.datetime64('2025-05-30T20:08:08.123')} - ], - 'decimal_col': [ - 
{"decimal_val": Decimal('123456789.1234567890')}, - {"decimal_val": Decimal('-987654321.9876543210')}, - {"decimal_val": Decimal('0.0000000001')} - ] -} - -df1 = pd.DataFrame({ - "c1": dict1["c1"], - "c2": dict1["c2"], - "c3": dict1["c3"], - "c4": dict1["c4"] -}) - dict2 = { - "c1": df1['c1'], - "c2": df1['c2'], - "c3": df1['c3'], - "c4": df1['c4'] -} - -dict3 = { - "c1": [1, 2, 3, 4], - "c2": ["banana", "water", "apple", "water"], - "c3": [ - {"deep": {"level2": {"level3": 100.3}}}, - {"mixed_list": [{"a": 1}, {"a": 666}]}, - {"nested_int": 1, "mixed": "text", "float_val": 3.14}, - {"list_val": [1,2,3], "tuple_val": (4,5)}, - ], - "c4": [ - {"coordinates": [1.1, 2.2], "tags": ("urgent", "important")}, - {"metadata": {"created_at": "2024-01-01", "active": True}}, - {"scores": [85.5, 92.3, 77.8], "status": "pass"}, - {"nested_list": [[1,2], [3,4], [5,6]]}, - ] + "c1": dict1['c1'], + "c2": dict1['c2'], + "c3": dict1['c3'], + "c4": dict1['c4'] } -df2 = pd.DataFrame({ - "c1": dict3["c1"], - "c2": dict3["c2"], - "c3": dict3["c3"], - "c4": dict3["c4"] -}) - -arrow_table1 = pa.Table.from_pandas(df2) class MyReader(chdb.PyReader): def __init__(self, data): @@ -170,13 +67,11 @@ def get_schema(self): class TestQueryJSON(unittest.TestCase): def setUp(self) -> None: - shutil.rmtree(test_json_query_dir, ignore_errors=True) - self.sess = chs.Session(test_json_query_dir) + self.sess = chs.Session() return super().setUp() def tearDown(self) -> None: self.sess.close() - shutil.rmtree(test_json_query_dir, ignore_errors=True) return super().tearDown() def test_query_py_reader1(self): @@ -198,220 +93,11 @@ def test_query_dict1(self): self.assertEqual(str(ret), EXPECTED1) - def test_query_df1(self): - data = { - 'dict_col1': [ - {'id1': 1, 'name1': 'apple1' }, - {'id2': 2, 'name2': 'apple2' } - ], - 'dict_col2': [ - {'id': 3, 'name': 'apple3' }, - {'id': 4, 'name': 'apple4' } - ], - } - - df_object = pd.DataFrame(data) - - ret = self.sess.query("SELECT dict_col1.name1, dict_col2.id, dict_col1.id2 FROM Python(df_object)") - self.assertEqual(str(ret), EXPECTED2) - - def test_query_df2(self): + def test_query_dict2(self): self.sess.query("SET pandas_analyze_sample = 1") ret = self.sess.query("SELECT c4.tags, c3.deep.level2.level3, c3.mixed_list[].a FROM Python(dict1) WHERE c1 <= 2 ORDER BY c1") self.assertEqual(str(ret), EXPECTED1) - def test_pandas_analyze_sample(self): - data = { - 'int_col': [i for i in range(1, 20001)], - 'obj_col': [ - {'id': i, 'value': f'value_{i}', 'flag': random.choice([True, False])} - if i != 2 else 100 - for i in range(1, 20001) - ] - } - df = pd.DataFrame(data) - - self.sess.query("SET pandas_analyze_sample = 20000") - with self.assertRaises(Exception): - self.sess.query("SELECT obj_col.id FROM Python(df) ORDER BY int_col LIMIT 1") - - self.sess.query("SET pandas_analyze_sample = 10000") - ret = self.sess.query("SELECT obj_col.id FROM Python(df) ORDER BY int_col LIMIT 1") - self.assertEqual(str(ret), '1\n') - - self.sess.query("SET pandas_analyze_sample = 0") - with self.assertRaises(Exception): - self.sess.query("SELECT obj_col.id FROM Python(df) ORDER BY int_col LIMIT 1") - - self.sess.query("SET pandas_analyze_sample = -1") - ret = self.sess.query("SELECT obj_col.id FROM Python(df) ORDER BY int_col LIMIT 1") - self.assertEqual(str(ret), '1\n') - - def test_date_data_types(self): - df = pd.DataFrame({ - 'datetime': complex_dict['datetime_col'], - 'time': complex_dict['time_col'], - 'date': complex_dict['date_col'], - 'timedelta': complex_dict['timedelta_col'] - }) - - ret = 
self.sess.query(""" - SELECT date.date, time.time, datetime.datetime, timedelta.timedelta - FROM Python(df) - """) - self.assertEqual(str(ret), '"2023-05-15","12:30:00","2024-05-30 14:30:00","1 day, 12:30:00"\n"2024-01-01","00:00:00","2023-12-31 23:59:59","2 days, 3:20:00"\n') - - def test_binary_data_types(self): - df = pd.DataFrame({ - 'bytes': complex_dict['bytes_col'], - 'bytearray': complex_dict['bytearray_col'], - 'memoryview': complex_dict['memoryview_col'], - 'uuid': complex_dict['uuid_col'] - }) - - ret = self.sess.query(""" - SELECT bytes.bytes, bytearray.bytearray, memoryview.memoryview, uuid.uuid - FROM Python(df) - """) - self.assertEqual(str(ret), '"苹果","bytearray1","memory1","00000000-0000-0000-0000-000000000001"\n"memory","苹果","苹果","00000000-0000-0000-0000-000000000000"\n') - - def test_none_data_types(self): - df = pd.DataFrame({ - 'special_values': complex_dict['special_values_col'] - }) - - ret = self.sess.query(""" - SELECT special_values.null_val - FROM Python(df) - """) - self.assertEqual(str(ret), '\\N\n') - - def test_bool_data_types(self): - df = pd.DataFrame({ - 'special_values': complex_dict['special_values_col'] - }) - - ret = self.sess.query(""" - SELECT special_values.bool_val - FROM Python(df) - """) - self.assertEqual(str(ret), 'true\n') - - def test_empty_list_data_types(self): - df = pd.DataFrame({ - 'special_values': complex_dict['special_values_col'] - }) - - ret = self.sess.query(""" - SELECT special_values.empty_list - FROM Python(df) - """) - self.assertEqual(str(ret), '"[]"\n') - - ret = self.sess.query(""" - SELECT special_values.empty_dict - FROM Python(df) - """) - self.assertEqual(str(ret), '\\N\n') - - ret = self.sess.query(""" - SELECT special_values.empty_array - FROM Python(df) - """) - self.assertEqual(str(ret), '"[]"\n') - - def test_special_num_data_types(self): - df = pd.DataFrame({ - 'special_num': complex_dict['special_num_col'] - }) - - ret = self.sess.query(""" - SELECT special_num.special_num_val - FROM Python(df) - """) - self.assertEqual(str(ret), '"[0,-1.1,NULL,NULL,NULL,NULL,NULL]"\n') - - def test_decimal_data_types(self): - df = pd.DataFrame({ - 'decimals': complex_dict['decimal_col'] - }) - - self.sess.query("SET allow_suspicious_types_in_order_by = 1") - ret = self.sess.query(""" - SELECT decimals.decimal_val - FROM Python(df) - ORDER BY decimals.decimal_val DESC - """) - self.assertEqual(str(ret), '"1E-10"\n"123456789.1234567890"\n"-987654321.9876543210"\n') - - def test_string_data_types(self): - df = pd.DataFrame({ - 'string_values': complex_dict['string_col'] - }) - - self.sess.query("SET allow_suspicious_types_in_order_by = 1") - ret = self.sess.query(""" - SELECT string_values.str - FROM Python(df) - ORDER BY string_values.str - """) - self.assertEqual(str(ret), '"hello"\n"苹果"\n') - - def test_special_numpy_types(self): - df = pd.DataFrame({ - 'numpy': complex_dict['numpy_num_col'] - }) - - ret = self.sess.query(""" - SELECT numpy.numpy_val - FROM Python(df) - """) - self.assertEqual(str(ret), '"[42,-1,3.140000104904175,2.718]"\n') - - df = pd.DataFrame({ - 'numpy': complex_dict['numpy_bool_col'] - }) - - ret = self.sess.query(""" - SELECT numpy.numpy_val - FROM Python(df) - """) - self.assertEqual(str(ret), '"[true,false]"\n') - - df = pd.DataFrame({ - 'numpy': complex_dict['numpy_datetime_col'] - }) - - ret = self.sess.query(""" - SELECT numpy.numpy_val - FROM Python(df) - """) - self.assertEqual(str(ret), '"2025-05-30 20:08:08.123000000"\n') - - def test_query_pyarrow_table1(self): - ret = self.sess.query("SELECT 
c4.tags, c3.deep.level2.level3, c3.mixed_list[].a FROM Python(arrow_table1) WHERE c1 <= 2 ORDER BY c1") - - self.assertEqual(str(ret), EXPECTED1) - - def test_pyarrow_complex_types(self): - struct_type = pa.struct([ - pa.field('level1', pa.struct([ - pa.field('level2', pa.string()) - ])), - pa.field('array_col', pa.list_(pa.int32())) - ]) - - data = [ - {'level1': {'level2': 'value1'}, 'array_col': [1,2]}, - {'level1': {'level2': None}, 'array_col': []} - ] - - arrow_table = pa.Table.from_arrays([ - pa.array(data, type=struct_type) - ], names=["struct_col"]) - - ret = self.sess.query("SELECT struct_col.level1.level2 FROM Python(arrow_table)") - self.assertEqual(str(ret), '"value1"\n\\N\n') if __name__ == "__main__": - unittest.main(verbosity=3) + unittest.main(verbosity=2) diff --git a/tests/test_query_json_arrow.py b/tests/test_query_json_arrow.py new file mode 100644 index 00000000000..304646e7104 --- /dev/null +++ b/tests/test_query_json_arrow.py @@ -0,0 +1,72 @@ +#!python3 + +import unittest +import pyarrow as pa +import chdb.session as chs + +EXPECTED1 = """"['urgent','important']",100.3,"[]" +\\N,\\N,"[1,666]" +""" + +dict3 = { + "c1": [1, 2, 3, 4], + "c2": ["banana", "water", "apple", "water"], + "c3": [ + {"deep": {"level2": {"level3": 100.3}}}, + {"mixed_list": [{"a": 1}, {"a": 666}]}, + {"nested_int": 1, "mixed": "text", "float_val": 3.14}, + {"list_val": [1,2,3], "tuple_val": (4,5)}, + ], + "c4": [ + {"coordinates": [1.1, 2.2], "tags": ("urgent", "important")}, + {"metadata": {"created_at": "2024-01-01", "active": True}}, + {"scores": [85.5, 92.3, 77.8], "status": "pass"}, + {"nested_list": [[1,2], [3,4], [5,6]]}, + ] +} + +arrow_table1 = pa.table({ + "c1": dict3["c1"], + "c2": dict3["c2"], + "c3": dict3["c3"], + "c4": dict3["c4"] +}) + + +class TestQueryJSONArrow(unittest.TestCase): + def setUp(self) -> None: + self.sess = chs.Session() + return super().setUp() + + def tearDown(self) -> None: + self.sess.close() + return super().tearDown() + + def test_query_pyarrow_table1(self): + ret = self.sess.query("SELECT c4.tags, c3.deep.level2.level3, c3.mixed_list[].a FROM Python(arrow_table1) WHERE c1 <= 2 ORDER BY c1") + + self.assertEqual(str(ret), EXPECTED1) + + def test_pyarrow_complex_types(self): + struct_type = pa.struct([ + pa.field('level1', pa.struct([ + pa.field('level2', pa.string()) + ])), + pa.field('array_col', pa.list_(pa.int32())) + ]) + + data = [ + {'level1': {'level2': 'value1'}, 'array_col': [1,2]}, + {'level1': {'level2': None}, 'array_col': []} + ] + + arrow_table = pa.Table.from_arrays([ + pa.array(data, type=struct_type) + ], names=["struct_col"]) + + ret = self.sess.query("SELECT struct_col.level1.level2 FROM Python(arrow_table)") + self.assertEqual(str(ret), '"value1"\n\\N\n') + + +if __name__ == "__main__": + unittest.main(verbosity=2) diff --git a/tests/test_query_json_dataframe.py b/tests/test_query_json_dataframe.py new file mode 100644 index 00000000000..98b04a2cb0b --- /dev/null +++ b/tests/test_query_json_dataframe.py @@ -0,0 +1,296 @@ +#!python3 + +import math +import random +import uuid +import unittest +import numpy as np +import pandas as pd +import chdb.session as chs +from datetime import date, datetime, time, timedelta +from decimal import Decimal + + +EXPECTED2 = '"apple1",3,\\N\n\\N,4,2\n' + +complex_dict = { + "date_col": [ + {"date": date(2023, 5, 15)}, + {"date": date(2024, 1, 1)} + ], + 'datetime_col': [ + {"datetime": datetime(2024, 5, 30, 14, 30)}, + {"datetime": datetime(2023, 12, 31, 23, 59, 59)} + ], + 'time_col': [ + {"time": 
time(12, 30)}, + {"time": time(0, 0)} + ], + 'timedelta_col': [ + {"timedelta": timedelta(days=1, hours=12, minutes=30)}, + {"timedelta": timedelta(days=2, hours=3, minutes=20)} + ], + 'bytes_col': [ + {"bytes": b'\xe8\x8b\xb9\xe6\x9e\x9c'}, + {"bytes": bytes([0x6d, 0x65, 0x6d, 0x6f, 0x72, 0x79])} + ], + 'bytearray_col': [ + {"bytearray": bytearray(b"bytearray1")}, + {"bytearray": bytearray('苹果'.encode('utf-8'))} + ], + 'memoryview_col': [ + {"memoryview": memoryview(b"memory1")}, + {"memoryview": memoryview('苹果'.encode('utf-8'))} + ], + 'uuid_col': [ + {"uuid": uuid.UUID('00000000-0000-0000-0000-000000000001')}, + {"uuid": uuid.UUID('00000000-0000-0000-0000-000000000000')} + ], + "string_col": [ + {"str": "hello"}, + {"str": "苹果"} + ], + 'special_values_col': [ + {"null_val": None, "bool_val": True, "empty_list": [], "empty_dict": {}, "empty_array": np.array([])}, + ], + 'special_num_col': [ + {"special_num_val": [0.0, -1.1, float('nan'), float('inf'), float('-inf'), math.inf, math.nan]} + ], + 'numpy_num_col': [ + {"numpy_val": np.array([np.int32(42), np.int64(-1), np.float32(3.14), np.float64(2.718)])}, + ], + 'numpy_bool_col': [ + {"numpy_val": np.array([np.bool_(True), np.bool_(False)])}, + ], + 'numpy_datetime_col': [ + {"numpy_val": np.datetime64('2025-05-30T20:08:08.123')} + ], + 'decimal_col': [ + {"decimal_val": Decimal('123456789.1234567890')}, + {"decimal_val": Decimal('-987654321.9876543210')}, + {"decimal_val": Decimal('0.0000000001')} + ] +} + +dict3 = { + "c1": [1, 2, 3, 4], + "c2": ["banana", "water", "apple", "water"], + "c3": [ + {"deep": {"level2": {"level3": 100.3}}}, + {"mixed_list": [{"a": 1}, {"a": 666}]}, + {"nested_int": 1, "mixed": "text", "float_val": 3.14}, + {"list_val": [1,2,3], "tuple_val": (4,5)}, + ], + "c4": [ + {"coordinates": [1.1, 2.2], "tags": ("urgent", "important")}, + {"metadata": {"created_at": "2024-01-01", "active": True}}, + {"scores": [85.5, 92.3, 77.8], "status": "pass"}, + {"nested_list": [[1,2], [3,4], [5,6]]}, + ] +} + +df2 = pd.DataFrame({ + "c1": dict3["c1"], + "c2": dict3["c2"], + "c3": dict3["c3"], + "c4": dict3["c4"] +}) + + +class TestQueryJSONDataFrame(unittest.TestCase): + def setUp(self) -> None: + self.sess = chs.Session() + return super().setUp() + + def tearDown(self) -> None: + self.sess.close() + return super().tearDown() + + def test_query_df1(self): + data = { + 'dict_col1': [ + {'id1': 1, 'name1': 'apple1' }, + {'id2': 2, 'name2': 'apple2' } + ], + 'dict_col2': [ + {'id': 3, 'name': 'apple3' }, + {'id': 4, 'name': 'apple4' } + ], + } + + df_object = pd.DataFrame(data) + + ret = self.sess.query("SELECT dict_col1.name1, dict_col2.id, dict_col1.id2 FROM Python(df_object)") + self.assertEqual(str(ret), EXPECTED2) + + def test_pandas_analyze_sample(self): + data = { + 'int_col': [i for i in range(1, 20001)], + 'obj_col': [ + {'id': i, 'value': f'value_{i}', 'flag': random.choice([True, False])} + if i != 2 else 100 + for i in range(1, 20001) + ] + } + df = pd.DataFrame(data) + + self.sess.query("SET pandas_analyze_sample = 20000") + with self.assertRaises(Exception): + self.sess.query("SELECT obj_col.id FROM Python(df) ORDER BY int_col LIMIT 1") + + self.sess.query("SET pandas_analyze_sample = 10000") + ret = self.sess.query("SELECT obj_col.id FROM Python(df) ORDER BY int_col LIMIT 1") + self.assertEqual(str(ret), '1\n') + + self.sess.query("SET pandas_analyze_sample = 0") + with self.assertRaises(Exception): + self.sess.query("SELECT obj_col.id FROM Python(df) ORDER BY int_col LIMIT 1") + + self.sess.query("SET 
pandas_analyze_sample = -1") + ret = self.sess.query("SELECT obj_col.id FROM Python(df) ORDER BY int_col LIMIT 1") + self.assertEqual(str(ret), '1\n') + + def test_date_data_types(self): + df = pd.DataFrame({ + 'datetime': complex_dict['datetime_col'], + 'time': complex_dict['time_col'], + 'date': complex_dict['date_col'], + 'timedelta': complex_dict['timedelta_col'] + }) + + ret = self.sess.query(""" + SELECT date.date, time.time, datetime.datetime, timedelta.timedelta + FROM Python(df) + """) + self.assertEqual(str(ret), '"2023-05-15","12:30:00","2024-05-30 14:30:00","1 day, 12:30:00"\n"2024-01-01","00:00:00","2023-12-31 23:59:59","2 days, 3:20:00"\n') + + def test_binary_data_types(self): + df = pd.DataFrame({ + 'bytes': complex_dict['bytes_col'], + 'bytearray': complex_dict['bytearray_col'], + 'memoryview': complex_dict['memoryview_col'], + 'uuid': complex_dict['uuid_col'] + }) + + ret = self.sess.query(""" + SELECT bytes.bytes, bytearray.bytearray, memoryview.memoryview, uuid.uuid + FROM Python(df) + """) + self.assertEqual(str(ret), '"苹果","bytearray1","memory1","00000000-0000-0000-0000-000000000001"\n"memory","苹果","苹果","00000000-0000-0000-0000-000000000000"\n') + + def test_none_data_types(self): + df = pd.DataFrame({ + 'special_values': complex_dict['special_values_col'] + }) + + ret = self.sess.query(""" + SELECT special_values.null_val + FROM Python(df) + """) + self.assertEqual(str(ret), '\\N\n') + + def test_bool_data_types(self): + df = pd.DataFrame({ + 'special_values': complex_dict['special_values_col'] + }) + + ret = self.sess.query(""" + SELECT special_values.bool_val + FROM Python(df) + """) + self.assertEqual(str(ret), 'true\n') + + def test_empty_list_data_types(self): + df = pd.DataFrame({ + 'special_values': complex_dict['special_values_col'] + }) + + ret = self.sess.query(""" + SELECT special_values.empty_list + FROM Python(df) + """) + self.assertEqual(str(ret), '"[]"\n') + + ret = self.sess.query(""" + SELECT special_values.empty_dict + FROM Python(df) + """) + self.assertEqual(str(ret), '\\N\n') + + ret = self.sess.query(""" + SELECT special_values.empty_array + FROM Python(df) + """) + self.assertEqual(str(ret), '"[]"\n') + + def test_special_num_data_types(self): + df = pd.DataFrame({ + 'special_num': complex_dict['special_num_col'] + }) + + ret = self.sess.query(""" + SELECT special_num.special_num_val + FROM Python(df) + """) + self.assertEqual(str(ret), '"[0,-1.1,NULL,NULL,NULL,NULL,NULL]"\n') + + def test_decimal_data_types(self): + df = pd.DataFrame({ + 'decimals': complex_dict['decimal_col'] + }) + + self.sess.query("SET allow_suspicious_types_in_order_by = 1") + ret = self.sess.query(""" + SELECT decimals.decimal_val + FROM Python(df) + ORDER BY decimals.decimal_val DESC + """) + self.assertEqual(str(ret), '"1E-10"\n"123456789.1234567890"\n"-987654321.9876543210"\n') + + def test_string_data_types(self): + df = pd.DataFrame({ + 'string_values': complex_dict['string_col'] + }) + + self.sess.query("SET allow_suspicious_types_in_order_by = 1") + ret = self.sess.query(""" + SELECT string_values.str + FROM Python(df) + ORDER BY string_values.str + """) + self.assertEqual(str(ret), '"hello"\n"苹果"\n') + + def test_special_numpy_types(self): + df = pd.DataFrame({ + 'numpy': complex_dict['numpy_num_col'] + }) + + ret = self.sess.query(""" + SELECT numpy.numpy_val + FROM Python(df) + """) + self.assertEqual(str(ret), '"[42,-1,3.140000104904175,2.718]"\n') + + df = pd.DataFrame({ + 'numpy': complex_dict['numpy_bool_col'] + }) + + ret = self.sess.query(""" + SELECT 
numpy.numpy_val + FROM Python(df) + """) + self.assertEqual(str(ret), '"[true,false]"\n') + + df = pd.DataFrame({ + 'numpy': complex_dict['numpy_datetime_col'] + }) + + ret = self.sess.query(""" + SELECT numpy.numpy_val + FROM Python(df) + """) + self.assertEqual(str(ret), '"2025-05-30 20:08:08.123000000"\n') + + +if __name__ == "__main__": + unittest.main(verbosity=2) diff --git a/tests/test_query_py.py b/tests/test_query_py.py index ea6a2074665..0b4641d02c7 100644 --- a/tests/test_query_py.py +++ b/tests/test_query_py.py @@ -1,15 +1,7 @@ #!python3 -import io -import json import random import unittest -import numpy as np -import pandas as pd -import pyarrow as pa -from pyarrow import csv -import pyarrow.json -import pyarrow.parquet import chdb @@ -18,40 +10,6 @@ "tom",5 """ -EXPECTED_MULTILPE_TABLES = """1,"tom" -""" - - -SMALL_CSV = """score1,score2,score3 -70906,0.9166144356547409,draw -580525,0.9944755780981678,lose -254703,0.5290208413632235,lose -522924,0.9837867058675329,lose -382278,0.4781036385988161,lose -380893,0.48907718034312386,draw -221497,0.32603538643678,draw -446521,0.1386178708257899,win -522220,0.6633602572635723,draw -717410,0.6095994785374601,draw -""" - -SCORES_CSV = """score,result,dateOfBirth -758270,lose,1983-07-24 -355079,win,2000-11-27 -451231,lose,1980-03-11 -854953,lose,1996-08-10 -294257,lose,1966-12-12 -756327,lose,1997-08-29 -379755,lose,1981-10-24 -916108,lose,1950-08-30 -467033,win,2007-09-15 -639860,win,1989-06-30 -""" - -ARROW_DATA_JSONL = """{"match_id": 3943077, "match_date": "2024-07-15", "kick_off": "04:15:00.000", "competition": {"competition_id": 223, "country_name": "South America", "competition_name": "Copa America"}, "season": {"season_id": 282, "season_name": "2024"}, "home_team": {"home_team_id": 779, "home_team_name": "Argentina", "home_team_gender": "male", "home_team_group": null, "country": {"id": 11, "name": "Argentina"}, "managers": [{"id": 5677, "name": "Lionel Sebasti\u00e1n Scaloni", "nickname": null, "dob": "1978-05-16", "country": {"id": 11, "name": "Argentina"}}]}, "away_team": {"away_team_id": 769, "away_team_name": "Colombia", "away_team_gender": "male", "away_team_group": null, "country": {"id": 49, "name": "Colombia"}, "managers": [{"id": 5905, "name": "N\u00e9stor Gabriel Lorenzo", "nickname": null, "dob": "1966-02-28", "country": {"id": 11, "name": "Argentina"}}]}, "home_score": 1, "away_score": 0, "match_status": "available", "match_status_360": "unscheduled", "last_updated": "2024-07-15T15:50:08.671355", "last_updated_360": null, "metadata": {"data_version": "1.1.0", "shot_fidelity_version": "2", "xy_fidelity_version": "2"}, "match_week": 6, "competition_stage": {"id": 26, "name": "Final"}, "stadium": {"id": 5337, "name": "Hard Rock Stadium", "country": {"id": 241, "name": "United States of America"}}, "referee": {"id": 2638, "name": "Raphael Claus", "country": {"id": 31, "name": "Brazil"}}} -{"match_id": 3943076, "match_date": "2024-07-14", "kick_off": "03:00:00.000", "competition": {"competition_id": 223, "country_name": "South America", "competition_name": "Copa America"}, "season": {"season_id": 282, "season_name": "2024"}, "home_team": {"home_team_id": 1833, "home_team_name": "Canada", "home_team_gender": "male", "home_team_group": null, "country": {"id": 40, "name": "Canada"}, "managers": [{"id": 165, "name": "Jesse Marsch", "nickname": null, "dob": "1973-11-08", "country": {"id": 241, "name": "United States of America"}}]}, "away_team": {"away_team_id": 783, "away_team_name": "Uruguay", "away_team_gender": "male", 
"away_team_group": null, "country": {"id": 242, "name": "Uruguay"}, "managers": [{"id": 269, "name": "Marcelo Alberto Bielsa Caldera", "nickname": "Marcelo Bielsa", "dob": "1955-07-21", "country": {"id": 11, "name": "Argentina"}}]}, "home_score": 2, "away_score": 2, "match_status": "available", "match_status_360": "unscheduled", "last_updated": "2024-07-15T07:57:02.660641", "last_updated_360": null, "metadata": {"data_version": "1.1.0", "shot_fidelity_version": "2", "xy_fidelity_version": "2"}, "match_week": 6, "competition_stage": {"id": 25, "name": "3rd Place Final"}, "stadium": {"id": 52985, "name": "Bank of America Stadium", "country": {"id": 241, "name": "United States of America"}}, "referee": {"id": 1849, "name": "Alexis Herrera", "country": {"id": 246, "name": "Venezuela\u00a0(Bolivarian Republic)"}}} -""" - class myReader(chdb.PyReader): def __init__(self, data): @@ -60,7 +18,6 @@ def __init__(self, data): super().__init__(data) def read(self, col_names, count): - print("Python func read", col_names, count, self.cursor) if self.cursor >= len(self.data["a"]): return [] block = [self.data[col] for col in col_names] @@ -69,18 +26,6 @@ def read(self, col_names, count): class TestQueryPy(unittest.TestCase): - - # def test_query_np(self): - # t3 = { - # "a": np.array([1, 2, 3, 4, 5, 6]), - # "b": np.array(["tom", "jerry", "auxten", "tom", "jerry", "auxten"]), - # } - - # ret = chdb.query( - # "SELECT b, sum(a) FROM Python(t3) GROUP BY b ORDER BY b", "debug" - # ) - # self.assertEqual(str(ret), EXPECTED) - def test_query_py(self): reader = myReader( { @@ -99,161 +44,6 @@ def test_string_with_null_character(self): result_data = res.bytes().decode('utf-8') self.assertIn('hello\0world', result_data) - def test_query_df(self): - df = pd.DataFrame( - { - "a": [1, 2, 3, 4, 5, 6], - "b": ["tom", "jerry", "auxten", "tom", "jerry", "auxten"], - } - ) - - ret = chdb.query("SELECT b, sum(a) FROM Python(df) GROUP BY b ORDER BY b") - self.assertEqual(str(ret), EXPECTED) - - def test_query_df_with_index(self): - df = pd.DataFrame( - { - "a": [1, 2, 3, 4, 5, 6], - "b": ["tom", "jerry", "auxten", "tom", "jerry", "auxten"], - }, - index=[3, 1, 2, 4, 5, 6], - ) - - ret = chdb.query("SELECT * FROM Python(df)") - self.assertIn("tom", str(ret)) - - df = pd.DataFrame( - { - "a": [1, 2, 3, 4, 5, 6], - "b": ["tom", "jerry", "auxten", "tom", "jerry", "auxten"], - }, - index=[0, 1, 2, 4, 5, 6], - ) - - ret = chdb.query("SELECT * FROM Python(df)") - self.assertIn("tom", str(ret)) - - df = pd.DataFrame( - { - "a": [1, 2, 3, 4, 5, 6], - "b": ["tom", "jerry", "auxten", "tom", "jerry", "auxten"], - }, - index=['a', 1, 2, 4, 5, 6], - ) - - ret = chdb.query("SELECT * FROM Python(df)") - self.assertIn("tom", str(ret)) - - def test_query_arrow(self): - table = pa.table( - { - "a": pa.array([1, 2, 3, 4, 5, 6]), - "b": pa.array(["tom", "jerry", "auxten", "tom", "jerry", "auxten"]), - } - ) - - ret = chdb.query( - "SELECT b, sum(a) FROM Python(table) GROUP BY b ORDER BY b" - ) - self.assertEqual(str(ret), EXPECTED) - - def test_query_arrow2(self): - t2 = pa.table( - { - "a": [1, 2, 3, 4, 5, 6], - "b": ["tom", "jerry", "auxten", "tom", "jerry", "auxten"], - } - ) - - ret = chdb.query( - "SELECT b, sum(a) FROM Python(t2) GROUP BY b ORDER BY b" - ) - self.assertEqual(str(ret), EXPECTED) - - def test_query_arrow3(self): - table = csv.read_csv(io.BytesIO(SCORES_CSV.encode())) - ret = chdb.query( - """ - SELECT sum(score), avg(score), median(score), - avgIf(score, dateOfBirth > '1980-01-01') as avgIf, - countIf(result = 'win') AS 
wins, - countIf(result = 'draw') AS draws, - countIf(result = 'lose') AS losses, - count() - FROM Python(table) - """, - ) - self.assertEqual( - str(ret), - "5872873,587287.3,553446.5,470878.25,3,0,7,10\n", - ) - - def test_query_arrow4(self): - arrow_table = pa.json.read_json(io.BytesIO(ARROW_DATA_JSONL.encode())) - # print(arrow_table.schema) - ret = chdb.query("SELECT * FROM Python(arrow_table) LIMIT 10", "JSONEachRow") - # print(ret) - self.assertEqual("", ret.error_message()) - - def test_query_arrow5(self): - arrow_table = pa.parquet.read_table( - "data/sample_2021-04-01_performance_mobile_tiles.parquet" - ) - # print("Arrow Schema:\n", arrow_table.schema) - ret = chdb.query("SELECT * FROM Python(arrow_table) LIMIT 1", "JSONCompact") - # print("JSON:\n", ret) - schema = json.loads(str(ret)).get("meta") - # shema is array like: - # [{"name":"quadkey","type":"String"},{"name":"tile","type":"String"}] - schema_dict = {x["name"]: x["type"] for x in schema} - self.assertDictEqual( - schema_dict, - { - "quadkey": "String", - "tile": "String", - "tile_x": "Float64", - "tile_y": "Float64", - "avg_d_kbps": "Int64", - "avg_u_kbps": "Int64", - "avg_lat_ms": "Int64", - "avg_lat_down_ms": "Float64", - "avg_lat_up_ms": "Float64", - "tests": "Int64", - "devices": "Int64", - }, - ) - ret = chdb.query( - """ - WITH numericColumns AS ( - SELECT * EXCEPT ('tile.*') EXCEPT(quadkey) - FROM Python(arrow_table) - ) - SELECT * APPLY(max), * APPLY(median) APPLY(x -> round(x, 2)) - FROM numericColumns - """, - "JSONCompact", - ) - # print("JSONCompact:\n", ret) - self.assertDictEqual( - {x["name"]: x["type"] for x in json.loads(str(ret)).get("meta")}, - { - "max(avg_d_kbps)": "Int64", - "max(avg_lat_down_ms)": "Float64", - "max(avg_lat_ms)": "Int64", - "max(avg_lat_up_ms)": "Float64", - "max(avg_u_kbps)": "Int64", - "max(devices)": "Int64", - "max(tests)": "Int64", - "round(median(avg_d_kbps), 2)": "Float64", - "round(median(avg_lat_down_ms), 2)": "Float64", - "round(median(avg_lat_ms), 2)": "Float64", - "round(median(avg_lat_up_ms), 2)": "Float64", - "round(median(avg_u_kbps), 2)": "Float64", - "round(median(devices), 2)": "Float64", - "round(median(tests), 2)": "Float64", - }, - ) - def test_random_float(self): x = {"col1": [random.uniform(0, 1) for _ in range(0, 100000)]} ret = chdb.sql( @@ -262,7 +52,6 @@ def test_random_float(self): FROM Python(x) """ ) - print(ret.bytes()) self.assertAlmostEqual(float(ret.bytes()), 0.5, delta=0.01) def test_query_dict(self): @@ -293,57 +82,6 @@ def test_query_dict_int(self): """, ) - def test_query_pd_csv(self): - csv_data = pd.read_csv(io.StringIO(SMALL_CSV)) - ret = chdb.query( - """ - SELECT sum(score1), avg(score1), median(score1), - sum(toFloat32(score2)), avg(toFloat32(score2)), median(toFloat32(score2)), - countIf(score3 = 'win') AS wins, - countIf(score3 = 'draw') AS draws, - countIf(score3 = 'lose') AS losses, - count() - FROM Python(csv_data) - """, - ) - self.assertEqual( - str(ret), - "4099877,409987.7,414399.5,6.128691345453262,0.6128691345453262,0.5693101584911346,1,5,4,10\n", - ) - - def test_query_multiple_df(self): - df1 = pd.DataFrame( - { - "a": [1, 2, 3, 4, 5, 6], - "b": ["tom", "jerry", "auxten", "tom", "jerry", "auxten"], - } - ) - - df2 = pd.DataFrame( - { - "a": [7, 8, 9, 10, 11, 12], - "b": ["tom", "jerry", "auxten", "tom", "jerry", "auxten"], - } - ) - - df3 = pd.DataFrame( - { - "a": [13, 14, 15, 16, 17, 18], - "b": ["tom", "jerry", "auxten", "tom", "jerry", "auxten"], - } - ) - - ret = chdb.query( - """ - SELECT * FROM python(df1) WHERE a = 1 
- UNION ALL - SELECT * FROM python(df2) WHERE a = 98 - UNION ALL - SELECT * FROM python(df3) WHERE a = 198 - """) - - self.assertEqual(str(ret), EXPECTED_MULTILPE_TABLES) - if __name__ == "__main__": - unittest.main(verbosity=3) + unittest.main(verbosity=2) diff --git a/tests/test_query_py_arrow.py b/tests/test_query_py_arrow.py new file mode 100644 index 00000000000..ae5216644ef --- /dev/null +++ b/tests/test_query_py_arrow.py @@ -0,0 +1,149 @@ +#!python3 + +import io +import json +import unittest +import pyarrow as pa +from pyarrow import csv +import pyarrow.json +import pyarrow.parquet +import chdb + + +EXPECTED = """"auxten",9 +"jerry",7 +"tom",5 +""" + +SCORES_CSV = """score,result,dateOfBirth +758270,lose,1983-07-24 +355079,win,2000-11-27 +451231,lose,1980-03-11 +854953,lose,1996-08-10 +294257,lose,1966-12-12 +756327,lose,1997-08-29 +379755,lose,1981-10-24 +916108,lose,1950-08-30 +467033,win,2007-09-15 +639860,win,1989-06-30 +""" + +ARROW_DATA_JSONL = """{"match_id": 3943077, "match_date": "2024-07-15", "kick_off": "04:15:00.000", "competition": {"competition_id": 223, "country_name": "South America", "competition_name": "Copa America"}, "season": {"season_id": 282, "season_name": "2024"}, "home_team": {"home_team_id": 779, "home_team_name": "Argentina", "home_team_gender": "male", "home_team_group": null, "country": {"id": 11, "name": "Argentina"}, "managers": [{"id": 5677, "name": "Lionel Sebasti\u00e1n Scaloni", "nickname": null, "dob": "1978-05-16", "country": {"id": 11, "name": "Argentina"}}]}, "away_team": {"away_team_id": 769, "away_team_name": "Colombia", "away_team_gender": "male", "away_team_group": null, "country": {"id": 49, "name": "Colombia"}, "managers": [{"id": 5905, "name": "N\u00e9stor Gabriel Lorenzo", "nickname": null, "dob": "1966-02-28", "country": {"id": 11, "name": "Argentina"}}]}, "home_score": 1, "away_score": 0, "match_status": "available", "match_status_360": "unscheduled", "last_updated": "2024-07-15T15:50:08.671355", "last_updated_360": null, "metadata": {"data_version": "1.1.0", "shot_fidelity_version": "2", "xy_fidelity_version": "2"}, "match_week": 6, "competition_stage": {"id": 26, "name": "Final"}, "stadium": {"id": 5337, "name": "Hard Rock Stadium", "country": {"id": 241, "name": "United States of America"}}, "referee": {"id": 2638, "name": "Raphael Claus", "country": {"id": 31, "name": "Brazil"}}} +{"match_id": 3943076, "match_date": "2024-07-14", "kick_off": "03:00:00.000", "competition": {"competition_id": 223, "country_name": "South America", "competition_name": "Copa America"}, "season": {"season_id": 282, "season_name": "2024"}, "home_team": {"home_team_id": 1833, "home_team_name": "Canada", "home_team_gender": "male", "home_team_group": null, "country": {"id": 40, "name": "Canada"}, "managers": [{"id": 165, "name": "Jesse Marsch", "nickname": null, "dob": "1973-11-08", "country": {"id": 241, "name": "United States of America"}}]}, "away_team": {"away_team_id": 783, "away_team_name": "Uruguay", "away_team_gender": "male", "away_team_group": null, "country": {"id": 242, "name": "Uruguay"}, "managers": [{"id": 269, "name": "Marcelo Alberto Bielsa Caldera", "nickname": "Marcelo Bielsa", "dob": "1955-07-21", "country": {"id": 11, "name": "Argentina"}}]}, "home_score": 2, "away_score": 2, "match_status": "available", "match_status_360": "unscheduled", "last_updated": "2024-07-15T07:57:02.660641", "last_updated_360": null, "metadata": {"data_version": "1.1.0", "shot_fidelity_version": "2", "xy_fidelity_version": "2"}, "match_week": 6, 
"competition_stage": {"id": 25, "name": "3rd Place Final"}, "stadium": {"id": 52985, "name": "Bank of America Stadium", "country": {"id": 241, "name": "United States of America"}}, "referee": {"id": 1849, "name": "Alexis Herrera", "country": {"id": 246, "name": "Venezuela\u00a0(Bolivarian Republic)"}}} +""" + + +class TestQueryPyArrow(unittest.TestCase): + def test_query_arrow1(self): + table = pa.table( + { + "a": pa.array([1, 2, 3, 4, 5, 6]), + "b": pa.array(["tom", "jerry", "auxten", "tom", "jerry", "auxten"]), + } + ) + + ret = chdb.query( + "SELECT b, sum(a) FROM Python(table) GROUP BY b ORDER BY b" + ) + self.assertEqual(str(ret), EXPECTED) + + def test_query_arrow2(self): + t2 = pa.table( + { + "a": [1, 2, 3, 4, 5, 6], + "b": ["tom", "jerry", "auxten", "tom", "jerry", "auxten"], + } + ) + + ret = chdb.query( + "SELECT b, sum(a) FROM Python(t2) GROUP BY b ORDER BY b" + ) + self.assertEqual(str(ret), EXPECTED) + + def test_query_arrow3(self): + table = csv.read_csv(io.BytesIO(SCORES_CSV.encode())) + ret = chdb.query( + """ + SELECT sum(score), avg(score), median(score), + avgIf(score, dateOfBirth > '1980-01-01') as avgIf, + countIf(result = 'win') AS wins, + countIf(result = 'draw') AS draws, + countIf(result = 'lose') AS losses, + count() + FROM Python(table) + """, + ) + self.assertEqual( + str(ret), + "5872873,587287.3,553446.5,470878.25,3,0,7,10\n", + ) + + def test_query_arrow4(self): + arrow_table = pa.json.read_json(io.BytesIO(ARROW_DATA_JSONL.encode())) + # print(arrow_table.schema) + ret = chdb.query("SELECT * FROM Python(arrow_table) LIMIT 10", "JSONEachRow") + # print(ret) + self.assertEqual("", ret.error_message()) + + def test_query_arrow5(self): + arrow_table = pa.parquet.read_table( + "data/sample_2021-04-01_performance_mobile_tiles.parquet" + ) + # print("Arrow Schema:\n", arrow_table.schema) + ret = chdb.query("SELECT * FROM Python(arrow_table) LIMIT 1", "JSONCompact") + # print("JSON:\n", ret) + schema = json.loads(str(ret)).get("meta") + # shema is array like: + # [{"name":"quadkey","type":"String"},{"name":"tile","type":"String"}] + schema_dict = {x["name"]: x["type"] for x in schema} + self.assertDictEqual( + schema_dict, + { + "quadkey": "String", + "tile": "String", + "tile_x": "Float64", + "tile_y": "Float64", + "avg_d_kbps": "Int64", + "avg_u_kbps": "Int64", + "avg_lat_ms": "Int64", + "avg_lat_down_ms": "Float64", + "avg_lat_up_ms": "Float64", + "tests": "Int64", + "devices": "Int64", + }, + ) + ret = chdb.query( + """ + WITH numericColumns AS ( + SELECT * EXCEPT ('tile.*') EXCEPT(quadkey) + FROM Python(arrow_table) + ) + SELECT * APPLY(max), * APPLY(median) APPLY(x -> round(x, 2)) + FROM numericColumns + """, + "JSONCompact", + ) + # print("JSONCompact:\n", ret) + self.assertDictEqual( + {x["name"]: x["type"] for x in json.loads(str(ret)).get("meta")}, + { + "max(avg_d_kbps)": "Int64", + "max(avg_lat_down_ms)": "Float64", + "max(avg_lat_ms)": "Int64", + "max(avg_lat_up_ms)": "Float64", + "max(avg_u_kbps)": "Int64", + "max(devices)": "Int64", + "max(tests)": "Int64", + "round(median(avg_d_kbps), 2)": "Float64", + "round(median(avg_lat_down_ms), 2)": "Float64", + "round(median(avg_lat_ms), 2)": "Float64", + "round(median(avg_lat_up_ms), 2)": "Float64", + "round(median(avg_u_kbps), 2)": "Float64", + "round(median(devices), 2)": "Float64", + "round(median(tests), 2)": "Float64", + }, + ) + + +if __name__ == "__main__": + unittest.main(verbosity=2) diff --git a/tests/test_query_py_dataframe.py b/tests/test_query_py_dataframe.py new file mode 100644 index 
00000000000..6dd6cb7bec5 --- /dev/null +++ b/tests/test_query_py_dataframe.py @@ -0,0 +1,137 @@ +#!python3 + +import io +import json +import random +import unittest +import numpy as np +import pandas as pd +import pyarrow as pa +from pyarrow import csv +import pyarrow.json +import pyarrow.parquet +import chdb + + +EXPECTED = """"auxten",9 +"jerry",7 +"tom",5 +""" + +EXPECTED_MULTIPLE_TABLES = """1,"tom" +""" + +SMALL_CSV = """score1,score2,score3 +70906,0.9166144356547409,draw +580525,0.9944755780981678,lose +254703,0.5290208413632235,lose +522924,0.9837867058675329,lose +382278,0.4781036385988161,lose +380893,0.48907718034312386,draw +221497,0.32603538643678,draw +446521,0.1386178708257899,win +522220,0.6633602572635723,draw +717410,0.6095994785374601,draw +""" + + +class TestQueryPyDataFrame(unittest.TestCase): + def test_query_df(self): + df = pd.DataFrame( + { + "a": [1, 2, 3, 4, 5, 6], + "b": ["tom", "jerry", "auxten", "tom", "jerry", "auxten"], + } + ) + + ret = chdb.query("SELECT b, sum(a) FROM Python(df) GROUP BY b ORDER BY b") + self.assertEqual(str(ret), EXPECTED) + + def test_query_df_with_index(self): + df = pd.DataFrame( + { + "a": [1, 2, 3, 4, 5, 6], + "b": ["tom", "jerry", "auxten", "tom", "jerry", "auxten"], + }, + index=[3, 1, 2, 4, 5, 6], + ) + + ret = chdb.query("SELECT * FROM Python(df)") + self.assertIn("tom", str(ret)) + + df = pd.DataFrame( + { + "a": [1, 2, 3, 4, 5, 6], + "b": ["tom", "jerry", "auxten", "tom", "jerry", "auxten"], + }, + index=[0, 1, 2, 4, 5, 6], + ) + + ret = chdb.query("SELECT * FROM Python(df)") + self.assertIn("tom", str(ret)) + + df = pd.DataFrame( + { + "a": [1, 2, 3, 4, 5, 6], + "b": ["tom", "jerry", "auxten", "tom", "jerry", "auxten"], + }, + index=['a', 1, 2, 4, 5, 6], + ) + + ret = chdb.query("SELECT * FROM Python(df)") + self.assertIn("tom", str(ret)) + + def test_query_pd_csv(self): + csv_data = pd.read_csv(io.StringIO(SMALL_CSV)) + ret = chdb.query( + """ + SELECT sum(score1), avg(score1), median(score1), + sum(toFloat32(score2)), avg(toFloat32(score2)), median(toFloat32(score2)), + countIf(score3 = 'win') AS wins, + countIf(score3 = 'draw') AS draws, + countIf(score3 = 'lose') AS losses, + count() + FROM Python(csv_data) + """, + ) + self.assertEqual( + str(ret), + "4099877,409987.7,414399.5,6.128691345453262,0.6128691345453262,0.5693101584911346,1,5,4,10\n", + ) + + def test_query_multiple_df(self): + df1 = pd.DataFrame( + { + "a": [1, 2, 3, 4, 5, 6], + "b": ["tom", "jerry", "auxten", "tom", "jerry", "auxten"], + } + ) + + df2 = pd.DataFrame( + { + "a": [7, 8, 9, 10, 11, 12], + "b": ["tom", "jerry", "auxten", "tom", "jerry", "auxten"], + } + ) + + df3 = pd.DataFrame( + { + "a": [13, 14, 15, 16, 17, 18], + "b": ["tom", "jerry", "auxten", "tom", "jerry", "auxten"], + } + ) + + ret = chdb.query( + """ + SELECT * FROM python(df1) WHERE a = 1 + UNION ALL + SELECT * FROM python(df2) WHERE a = 98 + UNION ALL + SELECT * FROM python(df3) WHERE a = 198 + """) + + self.assertEqual(str(ret), EXPECTED_MULTIPLE_TABLES) + + +if __name__ == "__main__": + unittest.main(verbosity=2) diff --git a/tests/test_stateful.py b/tests/test_stateful.py index 6ea670ca5f2..6ab14fb100b 100644 --- a/tests/test_stateful.py +++ b/tests/test_stateful.py @@ -1,16 +1,11 @@ #!python3 -import time import shutil -import psutil import unittest from chdb import session -import chdb test_state_dir = ".state_tmp_auxten_test_stateful" -current_process = psutil.Process() -check_thread_count = False class TestStateful(unittest.TestCase): @@ -54,7 +49,6 @@ def test_path(self): ret =
sess.query("SELECT * FROM db_xxx.log_table_xxx", "CSV") self.assertEqual(str(ret), "1\n2\n3\n4\n") - ret.show() sess.close() # reuse session @@ -115,30 +109,7 @@ def test_context_mgr(self): with self.assertRaises(Exception): ret = sess.query("SELECT chdb_xxx_notexist()", "CSV") - def test_query_fmt(self): - with session.Session() as sess: - # Dataframe result - ret = sess.query("SELECT 1 AS x", "DataFrame") - self.assertEqual(ret.x[0], 1) - # ArrowTable - ret = sess.query("SELECT 1 AS x", "ArrowTable") - self.assertEqual( - str(ret), - """pyarrow.Table -x: uint8 not null ----- -x: [[1]]""", - ) - - # def test_zfree_thread_count(self): - # time.sleep(3) - # thread_count = current_process.num_threads() - # print("Number of threads using psutil library: ", thread_count) - # if check_thread_count: - # self.assertEqual(thread_count, 1) - if __name__ == "__main__": shutil.rmtree(test_state_dir, ignore_errors=True) - check_thread_count = True - unittest.main() + unittest.main(verbosity=2) diff --git a/tests/test_stateful_arrow.py b/tests/test_stateful_arrow.py new file mode 100644 index 00000000000..90eafe22939 --- /dev/null +++ b/tests/test_stateful_arrow.py @@ -0,0 +1,28 @@ +#!python3 + +import unittest +import pyarrow # noqa +from chdb import session + + +class TestStatefulArrow(unittest.TestCase): + def setUp(self) -> None: + return super().setUp() + + def tearDown(self) -> None: + return super().tearDown() + + def test_query_fmt(self): + with session.Session() as sess: + ret = sess.query("SELECT 1 AS x", "ArrowTable") + self.assertEqual( + str(ret), + """pyarrow.Table +x: uint8 not null +---- +x: [[1]]""", + ) + + +if __name__ == "__main__": + unittest.main(verbosity=2) diff --git a/tests/test_stateful_dataframe.py b/tests/test_stateful_dataframe.py new file mode 100644 index 00000000000..864589520cb --- /dev/null +++ b/tests/test_stateful_dataframe.py @@ -0,0 +1,22 @@ +#!python3 + +import unittest +import pandas # noqa +from chdb import session + + +class TestStatefulDataFrame(unittest.TestCase): + def setUp(self) -> None: + return super().setUp() + + def tearDown(self) -> None: + return super().tearDown() + + def test_query_fmt(self): + with session.Session() as sess: + ret = sess.query("SELECT 1 AS x", "DataFrame") + self.assertEqual(ret.x[0], 1) + + +if __name__ == "__main__": + unittest.main(verbosity=2) From a49538ecefcc6f187feff78956cf2a98bde19358 Mon Sep 17 00:00:00 2001 From: wudidapaopao Date: Wed, 15 Oct 2025 18:48:16 +0800 Subject: [PATCH 3/7] test: fix tests --- .github/workflows/build_linux_arm64_wheels-gh.yml | 2 +- .github/workflows/build_linux_x86_wheels.yml | 2 +- .github/workflows/build_macos_arm64_wheels.yml | 2 +- .github/workflows/build_macos_x86_wheels.yml | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/workflows/build_linux_arm64_wheels-gh.yml b/.github/workflows/build_linux_arm64_wheels-gh.yml index 5c6f5ddec2f..e6cfab93cc5 100644 --- a/.github/workflows/build_linux_arm64_wheels-gh.yml +++ b/.github/workflows/build_linux_arm64_wheels-gh.yml @@ -103,7 +103,7 @@ jobs: echo "Installing dependencies for Python $version" pyenv shell $version python -m pip install --upgrade pip - python -m pip install setuptools tox twine psutil deltalake wheel + python -m pip install setuptools tox pandas pyarrow twine psutil deltalake wheel pyenv shell --unset done - name: Upgrade Rust toolchain diff --git a/.github/workflows/build_linux_x86_wheels.yml b/.github/workflows/build_linux_x86_wheels.yml index cb22bfe7823..e1e98d62b5b 100644 --- 
a/.github/workflows/build_linux_x86_wheels.yml +++ b/.github/workflows/build_linux_x86_wheels.yml @@ -103,7 +103,7 @@ jobs: echo "Installing dependencies for Python $version" pyenv shell $version python -m pip install --upgrade pip - python -m pip install setuptools tox twine psutil deltalake wheel + python -m pip install setuptools tox pandas pyarrow twine psutil deltalake wheel pyenv shell --unset done - name: Upgrade Rust toolchain diff --git a/.github/workflows/build_macos_arm64_wheels.yml b/.github/workflows/build_macos_arm64_wheels.yml index 14cad93afc8..6efa3a70c82 100644 --- a/.github/workflows/build_macos_arm64_wheels.yml +++ b/.github/workflows/build_macos_arm64_wheels.yml @@ -64,7 +64,7 @@ jobs: echo "Installing dependencies for Python $version" pyenv shell $version python -m pip install --upgrade pip - python -m pip install setuptools wheel tox twine psutil deltalake wheel>=0.40.0 + python -m pip install setuptools wheel tox pandas pyarrow twine psutil deltalake "wheel>=0.40.0" pyenv shell --unset done - name: Remove /usr/local/bin/python3 diff --git a/.github/workflows/build_macos_x86_wheels.yml b/.github/workflows/build_macos_x86_wheels.yml index d11aec3febc..725bdf1b727 100644 --- a/.github/workflows/build_macos_x86_wheels.yml +++ b/.github/workflows/build_macos_x86_wheels.yml @@ -64,7 +64,7 @@ jobs: echo "Installing dependencies for Python $version" pyenv shell $version python -m pip install --upgrade pip - python -m pip install setuptools tox twine psutil deltalake wheel>=0.40.0 + python -m pip install setuptools tox pandas pyarrow twine psutil deltalake "wheel>=0.40.0" pyenv shell --unset done - name: Remove /usr/local/bin/python3 From b8acacd5f7b04fd709fa453d3104448b3d996567 Mon Sep 17 00:00:00 2001 From: wudidapaopao Date: Wed, 15 Oct 2025 20:32:26 +0800 Subject: [PATCH 4/7] test: add tests --- tests/run_all.py | 174 ++++++++++++++++++++++++++------------ tests/test_basic.py | 9 +- tests/test_basic_arrow.py | 32 +++ 3 files changed, 155 insertions(+), 60 deletions(-) create mode 100644 tests/test_basic_arrow.py diff --git a/tests/run_all.py b/tests/run_all.py index 6e6cc671553..a15a7b615f5 100755 --- a/tests/run_all.py +++ b/tests/run_all.py @@ -1,7 +1,8 @@ #!python3 -import sys import unittest +import os +import glob class Colors: GREEN = '\033[92m' @@ -10,57 +11,120 @@ class Colors: BOLD = '\033[1m' END = '\033[0m' -test_loader = unittest.TestLoader() -test_suite = test_loader.discover('./') - -# Print all test files that will be executed -print(f"\n{Colors.BOLD}Discovered Test Files:{Colors.END}") -test_files = set() -def extract_test_files(suite): - for test in suite: - if hasattr(test, '_tests'): - extract_test_files(test) - elif hasattr(test, '__module__'): - test_files.add(test.__module__) - -extract_test_files(test_suite) - -# Filter out system modules, only show actual test files -filtered_test_files = {f for f in test_files if f != "unittest.loader"} - -for test_file in sorted(filtered_test_files): - print(f" • {test_file}") -print(f"\nTotal test files: {len(filtered_test_files)}\n") - -test_runner = unittest.TextTestRunner(verbosity=2) -ret = test_runner.run(test_suite) - -total = ret.testsRun -failures = len(ret.failures) -errors = len(ret.errors) -success = total - failures - errors - -if failures + errors == 0: - print(f"\n{Colors.GREEN}{Colors.BOLD}✓ ALL TESTS PASSED{Colors.END}") - print(f"{Colors.GREEN}Success: {success}, Total: {total}{Colors.END}") -else: - print(f"\n{Colors.RED}{Colors.BOLD}✖ TEST FAILURES{Colors.END}") - print(f"{Colors.RED}Failed:
{failures}, Errors: {errors}, Success: {success}, Total: {total}{Colors.END}") - - if failures > 0: - print(f"\n{Colors.YELLOW}Failed Tests:{Colors.END}") - for failure in ret.failures: - test_case, traceback = failure - print(f"{Colors.RED}• {test_case.id()}{Colors.END}") - - if errors > 0: - print(f"\n{Colors.YELLOW}Errored Tests:{Colors.END}") - for error in ret.errors: - test_case, traceback = error - print(f"{Colors.RED}• {test_case.id()}{Colors.END}") - -# if any test fails, exit with non-zero code -if len(ret.failures) > 0 or len(ret.errors) > 0: - exit(1) -else: - exit(0) +def check_optional_dependencies(): + has_pyarrow = False + has_pandas = False + + try: + import pyarrow # type: ignore + has_pyarrow = True + print(f"{Colors.GREEN}PyArrow {pyarrow.__version__} is available{Colors.END}") + except ImportError: + print(f"{Colors.YELLOW}PyArrow not installed{Colors.END}") + + try: + import pandas # type: ignore + has_pandas = True + print(f"{Colors.GREEN}Pandas {pandas.__version__} is available{Colors.END}") + except ImportError: + print(f"{Colors.YELLOW}Pandas not installed{Colors.END}") + + return has_pyarrow and has_pandas + +def main(): + has_pyarrow_and_pandas = check_optional_dependencies() + + BASIC_TEST_FILES = [ + 'test_basic.py', + 'test_command_line.py', + 'test_conn_cursor.py', + 'test_dbapi_persistence.py', + 'test_dbapi.py', + 'test_delta_lake.py', + 'test_drop_table.py', + 'test_early_gc.py', + 'test_final_join.py', + 'test_gc.py', + 'test_insert_error_handling.py', + 'test_insert_vector.py', + 'test_issue104.py', + 'test_issue135.py', + 'test_issue229.py', + 'test_issue31.py', + 'test_issue60.py', + 'test_materialize.py', + 'test_multiple_query.py', + 'test_open_session_after_failure.py', + 'test_signal_handler.py', + 'test_statistics.py', + 'test_streaming_query.py', + 'test_udf.py', + 'test_usedb.py', + 'test_query_json.py', + ] + + test_loader = unittest.TestLoader() + test_suite = unittest.TestSuite() + + if has_pyarrow_and_pandas: + print(f"\n{Colors.GREEN}{Colors.BOLD}All dependencies available - running all tests{Colors.END}") + all_test_files = glob.glob('test_*.py') + test_files_to_run = [f for f in all_test_files] + else: + print(f"\n{Colors.YELLOW}{Colors.BOLD}Some dependencies missing - running basic tests only{Colors.END}") + test_files_to_run = BASIC_TEST_FILES.copy() + + print(f"\n{Colors.GREEN}Running test files: {', '.join(test_files_to_run)}{Colors.END}\n") + + for test_file in test_files_to_run: + if not test_file.endswith('.py'): + test_file += '.py' + if os.path.exists(test_file): + module_name = test_file[:-3].replace('/', '.') + try: + suite = test_loader.loadTestsFromName(module_name) + test_suite.addTest(suite) + print(f"{Colors.GREEN}Loaded {test_file}{Colors.END}") + except Exception as e: + print(f"{Colors.YELLOW}Warning: Could not load {test_file}: {e}{Colors.END}") + else: + print(f"{Colors.RED}Error: Test file {test_file} not found{Colors.END}") + + run_tests(test_suite) + +def run_tests(test_suite): + test_runner = unittest.TextTestRunner(verbosity=2) + ret = test_runner.run(test_suite) + + total = ret.testsRun + failures = len(ret.failures) + errors = len(ret.errors) + success = total - failures - errors + + if failures + errors == 0: + print(f"\n{Colors.GREEN}{Colors.BOLD}✓ ALL TESTS PASSED{Colors.END}") + print(f"{Colors.GREEN}Success: {success}, Total: {total}{Colors.END}") + else: + print(f"\n{Colors.RED}{Colors.BOLD}✖ TEST FAILURES{Colors.END}") + print(f"{Colors.RED}Failed: {failures}, Errors: {errors}, Success: {success}, 
Total: {total}{Colors.END}") + + if failures > 0: + print(f"\n{Colors.YELLOW}Failed Tests:{Colors.END}") + for failure in ret.failures: + test_case, traceback = failure + print(f"{Colors.RED}• {test_case.id()}{Colors.END}") + + if errors > 0: + print(f"\n{Colors.YELLOW}Errored Tests:{Colors.END}") + for error in ret.errors: + test_case, traceback = error + print(f"{Colors.RED}• {test_case.id()}{Colors.END}") + + # if any test fails, exit with non-zero code + if len(ret.failures) > 0 or len(ret.errors) > 0: + exit(1) + else: + exit(0) + +if __name__ == '__main__': + main() diff --git a/tests/test_basic.py b/tests/test_basic.py index 2ac873649c9..1c260642d96 100644 --- a/tests/test_basic.py +++ b/tests/test_basic.py @@ -18,11 +18,10 @@ def test_basic(self): class TestOutput(unittest.TestCase): def test_output(self): for format, output in format_output.items(): - res = chdb.query("SELECT * FROM file('" + data_file + "', Parquet) limit 10", format) if format == "ArrowTable": - data = reset_elapsed(f"{res}") - else: - data = reset_elapsed(res.bytes()) + continue + res = chdb.query("SELECT * FROM file('" + data_file + "', Parquet) limit 10", format) + data = reset_elapsed(res.bytes()) # Arrow format output is not deterministic if format in ("Arrow", "ArrowStream"): continue @@ -46,4 +45,4 @@ def test_output(self): if __name__ == '__main__': - unittest.main() + unittest.main(verbosity=2) diff --git a/tests/test_basic_arrow.py b/tests/test_basic_arrow.py new file mode 100644 index 00000000000..95434f6f4ca --- /dev/null +++ b/tests/test_basic_arrow.py @@ -0,0 +1,32 @@ +#!python3 + +import os +import unittest +import pyarrow # type: ignore +import chdb +from format_output import format_output +from utils import data_file, reset_elapsed + + +class TestBasic(unittest.TestCase): + def test_basic(self): + res = chdb.query("SELECT 1", "CSV") + self.assertEqual(len(res), 2) # "1\n" + self.assertFalse(res.has_error()) + self.assertTrue(len(res.error_message()) == 0) + with self.assertRaises(Exception): + res = chdb.query("SELECT 1", "unknown_format") + + +class TestOutput(unittest.TestCase): + def test_output(self): + for format, output in format_output.items(): + if format != "ArrowTable": + continue + res = chdb.query("SELECT * FROM file('" + data_file + "', Parquet) limit 10", format) + data = reset_elapsed(f"{res}") + self.assertEqual(data, output["data"]) + + +if __name__ == '__main__': + unittest.main(verbosity=2) From 21ac77b595e5f5c45fd0777e2a85805d2e080c7d Mon Sep 17 00:00:00 2001 From: wudidapaopao Date: Thu, 16 Oct 2025 02:25:09 +0800 Subject: [PATCH 5/7] fix: fix tests --- programs/local/PybindWrapper.h | 1 + programs/local/PythonConversion.cpp | 13 ++++++---- programs/local/PythonTableCache.cpp | 3 ++- programs/local/PythonUtils.h | 37 +++++++---------------------- 4 files changed, 20 insertions(+), 34 deletions(-) diff --git a/programs/local/PybindWrapper.h b/programs/local/PybindWrapper.h index d653ab1ea73..93aff6e14a8 100644 --- a/programs/local/PybindWrapper.h +++ b/programs/local/PybindWrapper.h @@ -6,6 +6,7 @@ namespace pybind11 { +bool gil_check(); void gil_assert(); } diff --git a/programs/local/PythonConversion.cpp b/programs/local/PythonConversion.cpp index 0a44e66dc2d..91151a0813f 100644 --- a/programs/local/PythonConversion.cpp +++ b/programs/local/PythonConversion.cpp @@ -285,12 +285,15 @@ void convert_to_json_str(const py::handle & obj, String & ret) d.SetObject(); rapidjson::Document::AllocatorType & allocator = d.GetAllocator(); + auto sys_modules = 
py::module_::import("sys").attr("modules"); + bool has_numpy = sys_modules.contains(py::str("numpy")); + std::function convert; convert = [&](const py::handle & obj, rapidjson::Value & json_value) { if (py::isinstance(obj)) { json_value.SetObject(); - for (auto & item : py::cast(obj)) + for (const auto & item : py::cast(obj)) { rapidjson::Value key; auto key_str = py::str(item.first).cast(); @@ -306,7 +309,7 @@ void convert_to_json_str(const py::handle & obj, String & ret) { json_value.SetArray(); auto tmp_list = py::cast(obj); - for (auto & item : tmp_list) + for (const auto & item : tmp_list) { rapidjson::Value element; convert(item, element); @@ -317,14 +320,14 @@ void convert_to_json_str(const py::handle & obj, String & ret) { json_value.SetArray(); auto tmp_tuple = py::cast(obj); - for (auto & item : tmp_tuple) + for (const auto & item : tmp_tuple) { rapidjson::Value element; convert(item, element); json_value.PushBack(element, allocator); } } - else if (py::isinstance(obj)) + else if (has_numpy && py::isinstance(obj)) { auto arr = py::cast(obj); json_value.SetArray(); @@ -337,7 +340,7 @@ void convert_to_json_str(const py::handle & obj, String & ret) auto item = my_list.attr("__getitem__")(i); convert(item, element); json_value.PushBack(element, allocator); - } + } } else { diff --git a/programs/local/PythonTableCache.cpp b/programs/local/PythonTableCache.cpp index acb32bcfd78..b2c8fbb7ea6 100644 --- a/programs/local/PythonTableCache.cpp +++ b/programs/local/PythonTableCache.cpp @@ -1,4 +1,5 @@ #include "PythonTableCache.h" +#include "PandasDataFrame.h" #include "PybindWrapper.h" #include "PythonUtils.h" @@ -32,7 +33,7 @@ static py::object findQueryableObj(const String & var_name) { // Get the object using Python's indexing syntax obj = namespace_obj[py::cast(var_name)]; - if (DB::isInheritsFromPyReader(obj) || DB::isPandasDf(obj) || DB::isPyarrowTable(obj) || DB::hasGetItem(obj)) + if (DB::isInheritsFromPyReader(obj) || PandasDataFrame::isPandasDataframe(obj) || DB::isPyarrowTable(obj) || DB::hasGetItem(obj)) { return obj; } diff --git a/programs/local/PythonUtils.h b/programs/local/PythonUtils.h index 7d9384c5633..3accd24a845 100644 --- a/programs/local/PythonUtils.h +++ b/programs/local/PythonUtils.h @@ -1,8 +1,8 @@ #pragma once #include "config.h" +#include "PybindWrapper.h" -#include #include #include #include @@ -68,35 +68,21 @@ inline bool isInheritsFromPyReader(const py::object & obj) return execWithGIL([&]() { return _isInheritsFromPyReader(obj); }); } -// Helper function to check if object is a pandas DataFrame -inline bool isPandasDf(const py::object & obj) -{ - return execWithGIL( - [&]() - { - auto pd_data_frame_type = py::module_::import("pandas").attr("DataFrame"); - return py::isinstance(obj, pd_data_frame_type); - }); -} - // Helper function to check if object is a PyArrow Table inline bool isPyarrowTable(const py::object & obj) { - return execWithGIL( - [&]() - { - auto table_type = py::module_::import("pyarrow").attr("Table"); - return py::isinstance(obj, table_type); - }); + chassert(py::gil_check()); + auto dict = py::module_::import("sys").attr("modules"); + if (!dict.contains(py::str("pyarrow"))) + return false; + + return py::isinstance(obj, py::module_::import("pyarrow").attr("Table")); } inline bool hasGetItem(const py::object & obj) { - return execWithGIL( - [&]() - { - return py::hasattr(obj, "__getitem__"); - }); + chassert(py::gil_check()); + return py::hasattr(obj, "__getitem__"); } // Specific wrappers for common use cases @@ -105,11 +91,6 @@ inline 
auto castToPyList(const py::object & obj) { return execWithGIL([&]() { return obj.cast<py::list>(); }); } -inline auto castToPyArray(const py::object & obj) -{ - return execWithGIL([&]() { return obj.cast<py::array>(); }); -} - inline std::string castToStr(const py::object & obj) { return execWithGIL([&]() { return py::str(obj).cast<std::string>(); }); } From 07f8ecce4b438205060d5caadbc52b846a9f3c1b Mon Sep 17 00:00:00 2001 From: wudidapaopao Date: Thu, 16 Oct 2025 03:04:03 +0800 Subject: [PATCH 6/7] fix: fix tests --- tests/run_all.py | 1 + tests/test_insert_error_handling.py | 1 + 2 files changed, 2 insertions(+) diff --git a/tests/run_all.py b/tests/run_all.py index a15a7b615f5..8331a0cbd71 100755 --- a/tests/run_all.py +++ b/tests/run_all.py @@ -55,6 +55,7 @@ def main(): 'test_materialize.py', 'test_multiple_query.py', 'test_open_session_after_failure.py', + 'test_optional_dependencies.py', 'test_signal_handler.py', 'test_statistics.py', 'test_streaming_query.py', diff --git a/tests/test_insert_error_handling.py b/tests/test_insert_error_handling.py index 8bdb7deec12..228dc8e56f0 100644 --- a/tests/test_insert_error_handling.py +++ b/tests/test_insert_error_handling.py @@ -18,6 +18,7 @@ def setUp(self) -> None: def tearDown(self) -> None: """Clean up test environment.""" shutil.rmtree(test_dir, ignore_errors=True) + self.sess.close() return super().tearDown() def test_incomplete_insert_values_throws_error(self): From 6c3ce66d8a58ac42f401f24226a5f94ce984bb8f Mon Sep 17 00:00:00 2001 From: wudidapaopao Date: Thu, 23 Oct 2025 09:50:05 +0200 Subject: [PATCH 7/7] test: fix tests --- tests/test_arrow_table_queries.py | 1 - tests/test_query_json_arrow.py | 4 +- tests/test_query_py_arrow.py | 63 ++++++++++++++++++------------- 3 files changed, 39 insertions(+), 29 deletions(-) diff --git a/tests/test_arrow_table_queries.py b/tests/test_arrow_table_queries.py index 96452a3a243..9fb5e5e81d0 100644 --- a/tests/test_arrow_table_queries.py +++ b/tests/test_arrow_table_queries.py @@ -10,7 +10,6 @@ from chdb import session from urllib.request import urlretrieve -# Clean up and create session in the test methods instead of globally class TestChDBArrowTable(unittest.TestCase): @classmethod diff --git a/tests/test_query_json_arrow.py b/tests/test_query_json_arrow.py index 304646e7104..df65dbfa770 100644 --- a/tests/test_query_json_arrow.py +++ b/tests/test_query_json_arrow.py @@ -5,7 +5,7 @@ import chdb.session as chs EXPECTED1 = """"['urgent','important']",100.3,"[]" -\\N,\\N,"[1,666]" +"[]",0,"[1,666]" """ dict3 = { @@ -43,7 +43,7 @@ def tearDown(self) -> None: return super().tearDown() def test_query_pyarrow_table1(self): - ret = self.sess.query("SELECT c4.tags, c3.deep.level2.level3, c3.mixed_list[].a FROM Python(arrow_table1) WHERE c1 <= 2 ORDER BY c1") + ret = self.sess.query("SELECT c4.tags, c3.deep.level2.level3, c3.mixed_list.a FROM Python(arrow_table1) WHERE c1 <= 2 ORDER BY c1") self.assertEqual(str(ret), EXPECTED1) diff --git a/tests/test_query_py_arrow.py b/tests/test_query_py_arrow.py index ae5216644ef..f9c03d45e23 100644 --- a/tests/test_query_py_arrow.py +++ b/tests/test_query_py_arrow.py @@ -75,7 +75,7 @@ def test_query_arrow3(self): ) self.assertEqual( str(ret), - "5872873,587287.3,553446.5,470878.25,3,0,7,10\n", + "5872873,587287.3,553446.5,582813.5,3,0,7,10\n", ) def test_query_arrow4(self): @@ -99,17 +99,17 @@ def test_query_arrow5(self): self.assertDictEqual( schema_dict, { - "quadkey": "String", - "tile": "String", - "tile_x": "Float64", - "tile_y": "Float64", - "avg_d_kbps": "Int64", - "avg_u_kbps": "Int64", -
"avg_lat_ms": "Int64", - "avg_lat_down_ms": "Float64", - "avg_lat_up_ms": "Float64", - "tests": "Int64", - "devices": "Int64", + "quadkey": "Nullable(String)", + "tile": "Nullable(String)", + "tile_x": "Nullable(Float64)", + "tile_y": "Nullable(Float64)", + "avg_d_kbps": "Nullable(Int64)", + "avg_u_kbps": "Nullable(Int64)", + "avg_lat_ms": "Nullable(Int64)", + "avg_lat_down_ms": "Nullable(Float64)", + "avg_lat_up_ms": "Nullable(Float64)", + "tests": "Nullable(Int64)", + "devices": "Nullable(Int64)", }, ) ret = chdb.query( @@ -127,23 +127,34 @@ def test_query_arrow5(self): self.assertDictEqual( {x["name"]: x["type"] for x in json.loads(str(ret)).get("meta")}, { - "max(avg_d_kbps)": "Int64", - "max(avg_lat_down_ms)": "Float64", - "max(avg_lat_ms)": "Int64", - "max(avg_lat_up_ms)": "Float64", - "max(avg_u_kbps)": "Int64", - "max(devices)": "Int64", - "max(tests)": "Int64", - "round(median(avg_d_kbps), 2)": "Float64", - "round(median(avg_lat_down_ms), 2)": "Float64", - "round(median(avg_lat_ms), 2)": "Float64", - "round(median(avg_lat_up_ms), 2)": "Float64", - "round(median(avg_u_kbps), 2)": "Float64", - "round(median(devices), 2)": "Float64", - "round(median(tests), 2)": "Float64", + "max(avg_d_kbps)": "Nullable(Int64)", + "max(avg_lat_down_ms)": "Nullable(Float64)", + "max(avg_lat_ms)": "Nullable(Int64)", + "max(avg_lat_up_ms)": "Nullable(Float64)", + "max(avg_u_kbps)": "Nullable(Int64)", + "max(devices)": "Nullable(Int64)", + "max(tests)": "Nullable(Int64)", + "round(median(avg_d_kbps), 2)": "Nullable(Float64)", + "round(median(avg_lat_down_ms), 2)": "Nullable(Float64)", + "round(median(avg_lat_ms), 2)": "Nullable(Float64)", + "round(median(avg_lat_up_ms), 2)": "Nullable(Float64)", + "round(median(avg_u_kbps), 2)": "Nullable(Float64)", + "round(median(devices), 2)": "Nullable(Float64)", + "round(median(tests), 2)": "Nullable(Float64)", }, ) + def test_query_arrow_null_type(self): + null_array = pa.array([None, None, None]) + table = pa.table([null_array], names=["null_col"]) + ret = chdb.query("SELECT * FROM Python(table)") + self.assertEqual(str(ret), "\\N\n\\N\n\\N\n") + + null_array = pa.array([None, 1, None]) + table = pa.table([null_array], names=["null_col"]) + ret = chdb.query("SELECT * FROM Python(table)") + self.assertEqual(str(ret), "\\N\n1\n\\N\n") + if __name__ == "__main__": unittest.main(verbosity=2)