diff --git a/tests/integration/test_trino.py b/tests/integration/test_trino.py
index 695d6d7..11b2778 100644
--- a/tests/integration/test_trino.py
+++ b/tests/integration/test_trino.py
@@ -225,3 +225,134 @@ def test_execute_sql_with_autodetection(self, trino_credentials):
         assert len(result) == 1
         assert "detected" in result.columns
         assert result["detected"].iloc[0] == test_value
+
+    def test_execute_sql_with_struct_types(self, trino_toolkit_connection):
+        """
+        Test Trino STRUCT/ROW types don't break rendering (BLU-5140 regression).
+
+        Verifies both analyze_columns() for stats and to_records() for cell values.
+        """
+        from deepnote_toolkit.ocelots import DataFrame
+        from deepnote_toolkit.ocelots.pandas.analyze import analyze_columns
+
+        query = """
+        SELECT id, simple_struct FROM (
+            SELECT
+                t.id,
+                CAST(
+                    ROW(
+                        'item_' || CAST(t.id AS VARCHAR),
+                        'value_' || CAST(t.id * 10 AS VARCHAR)
+                    )
+                    AS ROW(a VARCHAR, b VARCHAR)
+                ) AS simple_struct
+            FROM
+                UNNEST(SEQUENCE(1, 100)) AS t (id)
+        )
+        """
+
+        result = execute_sql(
+            template=query,
+            sql_alchemy_json_env_var=trino_toolkit_connection,
+        )
+
+        assert isinstance(result, pd.DataFrame)
+        assert len(result) == 100
+        assert "id" in result.columns
+        assert "simple_struct" in result.columns
+
+        # Verify NamedRowTuple structure
+        first_struct = result["simple_struct"].iloc[0]
+        assert isinstance(first_struct, tuple)
+        assert len(first_struct) == 2
+        assert first_struct[0] == "item_1"
+        assert first_struct[1] == "value_10"
+        assert first_struct.a == "item_1"
+        assert first_struct.b == "value_10"
+
+        # Verify analyze_columns() works without crashing
+        analysis_result = analyze_columns(result)
+        assert len(analysis_result) == 2
+
+        struct_col = next(col for col in analysis_result if col.name == "simple_struct")
+        assert struct_col.stats is not None
+        assert struct_col.stats.categories is not None
+        assert len(struct_col.stats.categories) > 0
+
+        # Verify to_records() produces stringified values
+        oc_df = DataFrame.from_native(result)
+        records = oc_df.to_records(mode="json")
+
+        assert len(records) == 100
+        cell_value = records[0]["simple_struct"]
+        assert isinstance(cell_value, str)
+        assert "item_1" in cell_value
+        assert "value_10" in cell_value
+
+    def test_execute_sql_with_array_types(self, trino_toolkit_connection):
+        """
+        Test Trino ARRAY types don't break rendering (BLU-5140 regression).
+
+        Verifies both analyze_columns() for stats and to_records() for cell values.
+        """
+        from deepnote_toolkit.ocelots import DataFrame
+        from deepnote_toolkit.ocelots.pandas.analyze import analyze_columns
+
+        query = """
+        SELECT
+            id,
+            tags,
+            nested_array
+        FROM (
+            SELECT
+                t.id,
+                ARRAY['tag_' || CAST(t.id AS VARCHAR), 'item', 'test'] AS tags,
+                ARRAY[ARRAY[t.id, t.id * 2], ARRAY[t.id * 3, t.id * 4]] AS nested_array
+            FROM
+                UNNEST(SEQUENCE(1, 50)) AS t (id)
+        )
+        """
+
+        result = execute_sql(
+            template=query,
+            sql_alchemy_json_env_var=trino_toolkit_connection,
+        )
+
+        assert isinstance(result, pd.DataFrame)
+        assert len(result) == 50
+        assert "id" in result.columns
+        assert "tags" in result.columns
+        assert "nested_array" in result.columns
+
+        # Verify array data
+        first_tags = result["tags"].iloc[0]
+        assert isinstance(first_tags, list)
+        assert len(first_tags) == 3
+        assert first_tags == ["tag_1", "item", "test"]
+
+        first_nested = result["nested_array"].iloc[0]
+        assert isinstance(first_nested, list)
+        assert len(first_nested) == 2
+        assert first_nested == [[1, 2], [3, 4]]
+
+        # Verify analyze_columns() works without crashing
+        analysis_result = analyze_columns(result)
+        assert len(analysis_result) == 3
+
+        for col_name in ["tags", "nested_array"]:
+            col = next(c for c in analysis_result if c.name == col_name)
+            assert col.stats is not None
+            assert col.stats.categories is not None
+
+        # Verify to_records() produces stringified values
+        oc_df = DataFrame.from_native(result)
+        records = oc_df.to_records(mode="json")
+
+        assert len(records) == 50
+        tags_value = records[0]["tags"]
+        nested_value = records[0]["nested_array"]
+
+        assert isinstance(tags_value, str)
+        assert isinstance(nested_value, str)
+        assert "tag_1" in tags_value
+        assert "item" in tags_value
diff --git a/tests/unit/test_dataframe_rendering.py b/tests/unit/test_dataframe_rendering.py
new file mode 100644
index 0000000..75c29f2
--- /dev/null
+++ b/tests/unit/test_dataframe_rendering.py
@@ -0,0 +1,210 @@
+"""
+Unit tests for DataFrame rendering with structured types.
+
+These tests simulate the complete rendering flow that happens when the frontend
+displays a DataFrame, ensuring both column analysis and data serialization work correctly.
+
+This is a regression test suite for BLU-5140 where Trino STRUCT/ROW types caused
+analyze_columns() to crash, resulting in fallback to plain DataFrame view instead of
+the Deepnote native DataFrame view.
+"""
+
+import numpy as np
+import pandas as pd
+from trino.types import NamedRowTuple
+
+from deepnote_toolkit.ocelots import DataFrame
+from deepnote_toolkit.ocelots.pandas.analyze import analyze_columns
+
+
+def _test_rendering_flow(df, expected_columns):
+    """
+    Simulate the complete rendering flow:
+    1. analyze_columns() - for native view with stats
+    2. to_records(mode="json") - for cell values
+
+    Both paths must work for the Deepnote native DataFrame view to display correctly.
+    """
+    # 1. column stats (native view)
+    analysis_result = analyze_columns(df)
+
+    assert len(analysis_result) == len(expected_columns)
+
+    for col_name in expected_columns:
+        col = next(c for c in analysis_result if c.name == col_name)
+        assert col.stats is not None, f"analyze_columns() failed for {col_name}"
+        # Object columns should have categories for display
+        if df[col_name].dtype == object:
+            assert (
+                col.stats.categories is not None
+            ), f"No categories for object column {col_name}"
+
+    # 2. cell values
+    oc_df = DataFrame.from_native(df)
+    records = oc_df.to_records(mode="json")
+
+    assert len(records) == len(df)
+    # all values are JSON-serializable (strings, numbers, None)
+    for record in records:
+        for col_name in expected_columns:
+            value = record[col_name]
+            assert isinstance(
+                value, (str, int, float, type(None))
+            ), f"Value for {col_name} is not JSON-serializable: {type(value)}"
+
+
+def test_rendering_with_dict_objects():
+    """Test rendering DataFrame with dict objects (simulates database ROW types)."""
+    df = pd.DataFrame(
+        {
+            "id": [1, 2, 3],
+            "struct_col": [
+                {"a": "item_1", "b": "value_10"},
+                {"a": "item_2", "b": "value_20"},
+                {"a": "item_3", "b": "value_30"},
+            ],
+        }
+    )
+
+    _test_rendering_flow(df, ["id", "struct_col"])
+
+
+def test_rendering_with_list_objects():
+    """Test rendering DataFrame with list objects (simulates database ARRAY types)."""
+    df = pd.DataFrame(
+        {
+            "id": [1, 2, 3],
+            "array_col": [
+                ["tag_1", "item", "test"],
+                ["tag_2", "item", "test"],
+                ["tag_3", "item", "test"],
+            ],
+        }
+    )
+
+    _test_rendering_flow(df, ["id", "array_col"])
+
+
+def test_rendering_with_tuple_objects():
+    """Test rendering DataFrame with tuple objects."""
+    df = pd.DataFrame(
+        {
+            "id": [1, 2, 3],
+            "tuple_col": [
+                ("item_1", "value_10"),
+                ("item_2", "value_20"),
+                ("item_3", "value_30"),
+            ],
+        }
+    )
+
+    _test_rendering_flow(df, ["id", "tuple_col"])
+
+
+def test_rendering_with_trino_namedrowtuple():
+    """
+    Test rendering DataFrame with Trino NamedRowTuple objects.
+
+    This is the exact scenario from BLU-5140 that caused the crash.
+    Before the fix, pd.Series(np_array.tolist()) would fail because
+    NamedRowTuple has a broken __array_struct__ attribute.
+    """
+    # Create NamedRowTuple array using np.empty + assignment pattern.
+    # This avoids pandas conversion issues during DataFrame creation.
+    # Using [NamedRowTuple(...), ...] would trigger __array_struct__ bug.
+    np_array = np.empty(3, dtype=object)
+    np_array[0] = NamedRowTuple(["item_1", "value_10"], ["a", "b"], [None, None])
+    np_array[1] = NamedRowTuple(["item_2", "value_20"], ["a", "b"], [None, None])
+    np_array[2] = NamedRowTuple(["item_3", "value_30"], ["a", "b"], [None, None])
+
+    df = pd.DataFrame({"id": [1, 2, 3], "struct_col": np_array})
+
+    _test_rendering_flow(df, ["id", "struct_col"])
+
+    # stringified values should preserve structure
+    oc_df = DataFrame.from_native(df)
+    records = oc_df.to_records(mode="json")
+
+    struct_value = records[0]["struct_col"]
+    assert isinstance(struct_value, str)
+    assert "item_1" in struct_value
+    assert "value_10" in struct_value
+
+
+def test_rendering_with_nested_structures():
+    """Test rendering DataFrame with nested dicts/lists."""
+    df = pd.DataFrame(
+        {
+            "id": [1, 2, 3],
+            "nested_col": [
+                {"outer": ["inner_1", "inner_2"]},
+                {"outer": ["inner_3", "inner_4"]},
+                {"outer": ["inner_5", "inner_6"]},
+            ],
+        }
+    )
+
+    _test_rendering_flow(df, ["id", "nested_col"])
+
+
+def test_rendering_with_mixed_types():
+    """Test rendering DataFrame with multiple structured type columns."""
+    df = pd.DataFrame(
+        {
+            "id": [1, 2, 3],
+            "dict_col": [{"a": 1}, {"b": 2}, {"c": 3}],
+            "list_col": [[1, 2], [3, 4], [5, 6]],
+            "tuple_col": [(1, 2), (3, 4), (5, 6)],
+        }
+    )
+
+    _test_rendering_flow(df, ["id", "dict_col", "list_col", "tuple_col"])
+
+
+def test_rendering_with_namedrowtuple_and_missing_values():
+    """Test rendering with NamedRowTuple including None values."""
+    # Create NamedRowTuple array using np.empty + assignment pattern.
+    # Using [NamedRowTuple(...), ...] would trigger __array_struct__ bug.
+    np_array = np.empty(4, dtype=object)
+    np_array[0] = NamedRowTuple(["item_1", "value_10"], ["a", "b"], [None, None])
+    np_array[1] = None
+    np_array[2] = NamedRowTuple(["item_2", "value_20"], ["a", "b"], [None, None])
+    np_array[3] = NamedRowTuple(["item_1", "value_10"], ["a", "b"], [None, None])
+
+    df = pd.DataFrame({"id": [1, 2, 3, 4], "struct_col": np_array})
+
+    # Should not crash with None values
+    analysis_result = analyze_columns(df)
+
+    struct_col = next(col for col in analysis_result if col.name == "struct_col")
+    assert struct_col.stats is not None
+    assert struct_col.stats.categories is not None
+
+    # Should have "Missing" category
+    category_names = [cat["name"] for cat in struct_col.stats.categories]
+    assert "Missing" in category_names
+
+
+def test_rendering_preserves_field_names_in_str_representation():
+    """
+    Test that NamedRowTuple field names are preserved in stringification.
+    """
+    # Create NamedRowTuple array using np.empty + assignment pattern.
+    # Using [NamedRowTuple(...), ...] would trigger __array_struct__ bug.
+    np_array = np.empty(1, dtype=object)
+    np_array[0] = NamedRowTuple(
+        ["value_a", "value_b"], ["field_a", "field_b"], [None, None]
+    )
+
+    df = pd.DataFrame({"struct_col": np_array})
+
+    # Get the stringified representation
+    oc_df = DataFrame.from_native(df)
+    records = oc_df.to_records(mode="json")
+
+    stringified = records[0]["struct_col"]
+
+    # str(NamedRowTuple) produces something like: (field_a: 'value_a', field_b: 'value_b')
+    # This preserves field name information for better display
+    assert "field_a: 'value_a'" in stringified
+    assert "field_b: 'value_b'" in stringified