131 changes: 131 additions & 0 deletions tests/integration/test_trino.py
@@ -225,3 +225,134 @@ def test_execute_sql_with_autodetection(self, trino_credentials):
        assert len(result) == 1
        assert "detected" in result.columns
        assert result["detected"].iloc[0] == test_value

    def test_execute_sql_with_struct_types(self, trino_toolkit_connection):
        """
        Test Trino STRUCT/ROW types don't break rendering (BLU-5140 regression).

        Verifies both analyze_columns() for stats and to_records() for cell values.
        """
        from deepnote_toolkit.ocelots import DataFrame
        from deepnote_toolkit.ocelots.pandas.analyze import analyze_columns

        query = """
            SELECT id, simple_struct FROM (
                SELECT
                    t.id,
                    CAST(
                        ROW(
                            'item_' || CAST(t.id AS VARCHAR),
                            'value_' || CAST(t.id * 10 AS VARCHAR)
                        )
                        AS ROW(a VARCHAR, b VARCHAR)
                    ) AS simple_struct
                FROM
                    UNNEST(SEQUENCE(1, 100)) AS t (id)
            )
        """

        result = execute_sql(
            template=query,
            sql_alchemy_json_env_var=trino_toolkit_connection,
        )

        assert isinstance(result, pd.DataFrame)
        assert len(result) == 100
        assert "id" in result.columns
        assert "simple_struct" in result.columns

        # Verify NamedRowTuple structure
        first_struct = result["simple_struct"].iloc[0]
        assert isinstance(first_struct, tuple)
        assert len(first_struct) == 2
        assert first_struct[0] == "item_1"
        assert first_struct[1] == "value_10"
        assert first_struct.a == "item_1"
        assert first_struct.b == "value_10"

        # Verify analyze_columns() works without crashing
        analysis_result = analyze_columns(result)
        assert len(analysis_result) == 2

        struct_col = next(col for col in analysis_result if col.name == "simple_struct")
        assert struct_col.stats is not None
        assert struct_col.stats.categories is not None
        assert len(struct_col.stats.categories) > 0

        # Verify to_records() produces stringified values
        oc_df = DataFrame.from_native(result)
        records = oc_df.to_records(mode="json")

        assert len(records) == 100
        cell_value = records[0]["simple_struct"]
        assert isinstance(cell_value, str)
        assert "item_1" in cell_value
        assert "value_10" in cell_value

    def test_execute_sql_with_array_types(self, trino_toolkit_connection):
        """
        Test Trino ARRAY types don't break rendering (BLU-5140 regression).

        Verifies both analyze_columns() for stats and to_records() for cell values.
        """
        from deepnote_toolkit.ocelots import DataFrame
        from deepnote_toolkit.ocelots.pandas.analyze import analyze_columns

        query = """
            SELECT
                id,
                tags,
                nested_array
            FROM (
                SELECT
                    t.id,
                    ARRAY['tag_' || CAST(t.id AS VARCHAR), 'item', 'test'] AS tags,
                    ARRAY[ARRAY[t.id, t.id * 2], ARRAY[t.id * 3, t.id * 4]] AS nested_array
                FROM
                    UNNEST(SEQUENCE(1, 50)) AS t (id)
            )
        """

        result = execute_sql(
            template=query,
            sql_alchemy_json_env_var=trino_toolkit_connection,
        )

        assert isinstance(result, pd.DataFrame)
        assert len(result) == 50
        assert "id" in result.columns
        assert "tags" in result.columns
        assert "nested_array" in result.columns

        # Verify array data
        first_tags = result["tags"].iloc[0]
        assert isinstance(first_tags, list)
        assert len(first_tags) == 3
        assert first_tags == ["tag_1", "item", "test"]

        first_nested = result["nested_array"].iloc[0]
        assert isinstance(first_nested, list)
        assert len(first_nested) == 2
        assert first_nested == [[1, 2], [3, 4]]

        # Verify analyze_columns() works without crashing
        analysis_result = analyze_columns(result)
        assert len(analysis_result) == 3

        for col_name in ["tags", "nested_array"]:
            col = next(c for c in analysis_result if c.name == col_name)
            assert col.stats is not None
            assert col.stats.categories is not None

        # Verify to_records() produces stringified values
        oc_df = DataFrame.from_native(result)
        records = oc_df.to_records(mode="json")

        assert len(records) == 50
        tags_value = records[0]["tags"]
        nested_value = records[0]["nested_array"]

        assert isinstance(tags_value, str)
        assert isinstance(nested_value, str)
        assert "tag_1" in tags_value
        assert "item" in tags_value
210 changes: 210 additions & 0 deletions tests/unit/test_dataframe_rendering.py
@@ -0,0 +1,210 @@
"""
Unit tests for DataFrame rendering with structured types.
These tests simulate the complete rendering flow that happens when the frontend
displays a DataFrame, ensuring both column analysis and data serialization work correctly.
This is a regression test suite for BLU-5140 where Trino STRUCT/ROW types caused
analyze_columns() to crash, resulting in fallback to plain DataFrame view instead of
the Deepnote native DataFrame view.
"""

import numpy as np
import pandas as pd
from trino.types import NamedRowTuple

from deepnote_toolkit.ocelots import DataFrame
from deepnote_toolkit.ocelots.pandas.analyze import analyze_columns


def _test_rendering_flow(df, expected_columns):
    """
    Simulate the complete rendering flow:
    1. analyze_columns() - for native view with stats
    2. to_records(mode="json") - for cell values
    Both paths must work for the Deepnote native DataFrame view to display correctly.
    """
    # 1. column stats (native view)
    analysis_result = analyze_columns(df)

    assert len(analysis_result) == len(expected_columns)

    for col_name in expected_columns:
        col = next(c for c in analysis_result if c.name == col_name)
        assert col.stats is not None, f"analyze_columns() failed for {col_name}"
        # Object columns should have categories for display
        if df[col_name].dtype == object:
            assert (
                col.stats.categories is not None
            ), f"No categories for object column {col_name}"

    # 2. cell values
    oc_df = DataFrame.from_native(df)
    records = oc_df.to_records(mode="json")

    assert len(records) == len(df)
    # all values are JSON-serializable (strings, numbers, None)
    for record in records:
        for col_name in expected_columns:
            value = record[col_name]
            assert isinstance(
                value, (str, int, float, type(None))
            ), f"Value for {col_name} is not JSON-serializable: {type(value)}"


def test_rendering_with_dict_objects():
    """Test rendering DataFrame with dict objects (simulates database ROW types)."""
    df = pd.DataFrame(
        {
            "id": [1, 2, 3],
            "struct_col": [
                {"a": "item_1", "b": "value_10"},
                {"a": "item_2", "b": "value_20"},
                {"a": "item_3", "b": "value_30"},
            ],
        }
    )

    _test_rendering_flow(df, ["id", "struct_col"])


def test_rendering_with_list_objects():
    """Test rendering DataFrame with list objects (simulates database ARRAY types)."""
    df = pd.DataFrame(
        {
            "id": [1, 2, 3],
            "array_col": [
                ["tag_1", "item", "test"],
                ["tag_2", "item", "test"],
                ["tag_3", "item", "test"],
            ],
        }
    )

    _test_rendering_flow(df, ["id", "array_col"])


def test_rendering_with_tuple_objects():
    """Test rendering DataFrame with tuple objects."""
    df = pd.DataFrame(
        {
            "id": [1, 2, 3],
            "tuple_col": [
                ("item_1", "value_10"),
                ("item_2", "value_20"),
                ("item_3", "value_30"),
            ],
        }
    )

    _test_rendering_flow(df, ["id", "tuple_col"])


def test_rendering_with_trino_namedrowtuple():
    """
    Test rendering DataFrame with Trino NamedRowTuple objects.
    This is the exact scenario from BLU-5140 that caused the crash.
    Before the fix, pd.Series(np_array.tolist()) would fail because
    NamedRowTuple has a broken __array_struct__ attribute.
    """
    # Create NamedRowTuple array using np.empty + assignment pattern.
    # This avoids pandas conversion issues during DataFrame creation.
    # Using [NamedRowTuple(...), ...] would trigger __array_struct__ bug.
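    # Illustrative sketch only (not executed here): building the column straight
    # from a Python list, e.g.
    #     pd.Series([NamedRowTuple(["item_1", "value_10"], ["a", "b"], [None, None])])
    # lets NumPy/pandas consult NamedRowTuple's __array_struct__ during conversion,
    # which is the failure mode described in the docstring above.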
    np_array = np.empty(3, dtype=object)
    np_array[0] = NamedRowTuple(["item_1", "value_10"], ["a", "b"], [None, None])
    np_array[1] = NamedRowTuple(["item_2", "value_20"], ["a", "b"], [None, None])
    np_array[2] = NamedRowTuple(["item_3", "value_30"], ["a", "b"], [None, None])

    df = pd.DataFrame({"id": [1, 2, 3], "struct_col": np_array})

    _test_rendering_flow(df, ["id", "struct_col"])

    # stringified values should preserve structure
    oc_df = DataFrame.from_native(df)
    records = oc_df.to_records(mode="json")

    struct_value = records[0]["struct_col"]
    assert isinstance(struct_value, str)
    assert "item_1" in struct_value
    assert "value_10" in struct_value


def test_rendering_with_nested_structures():
    """Test rendering DataFrame with nested dicts/lists."""
    df = pd.DataFrame(
        {
            "id": [1, 2, 3],
            "nested_col": [
                {"outer": ["inner_1", "inner_2"]},
                {"outer": ["inner_3", "inner_4"]},
                {"outer": ["inner_5", "inner_6"]},
            ],
        }
    )

    _test_rendering_flow(df, ["id", "nested_col"])


def test_rendering_with_mixed_types():
    """Test rendering DataFrame with multiple structured type columns."""
    df = pd.DataFrame(
        {
            "id": [1, 2, 3],
            "dict_col": [{"a": 1}, {"b": 2}, {"c": 3}],
            "list_col": [[1, 2], [3, 4], [5, 6]],
            "tuple_col": [(1, 2), (3, 4), (5, 6)],
        }
    )

    _test_rendering_flow(df, ["id", "dict_col", "list_col", "tuple_col"])


def test_rendering_with_namedrowtuple_and_missing_values():
    """Test rendering with NamedRowTuple including None values."""
    # Create NamedRowTuple array using np.empty + assignment pattern.
    # Using [NamedRowTuple(...), ...] would trigger __array_struct__ bug.
    np_array = np.empty(4, dtype=object)
    np_array[0] = NamedRowTuple(["item_1", "value_10"], ["a", "b"], [None, None])
    np_array[1] = None
    np_array[2] = NamedRowTuple(["item_2", "value_20"], ["a", "b"], [None, None])
    np_array[3] = NamedRowTuple(["item_1", "value_10"], ["a", "b"], [None, None])

    df = pd.DataFrame({"id": [1, 2, 3, 4], "struct_col": np_array})

    # Should not crash with None values
    analysis_result = analyze_columns(df)

    struct_col = next(col for col in analysis_result if col.name == "struct_col")
    assert struct_col.stats is not None
    assert struct_col.stats.categories is not None

    # Should have "Missing" category
    category_names = [cat["name"] for cat in struct_col.stats.categories]
    assert "Missing" in category_names


def test_rendering_preserves_field_names_in_str_representation():
    """
    Test that NamedRowTuple field names are preserved in stringification.
    """
    # Create NamedRowTuple array using np.empty + assignment pattern.
    # Using [NamedRowTuple(...), ...] would trigger __array_struct__ bug.
    np_array = np.empty(1, dtype=object)
    np_array[0] = NamedRowTuple(
        ["value_a", "value_b"], ["field_a", "field_b"], [None, None]
    )

    df = pd.DataFrame({"struct_col": np_array})

    # Get the stringified representation
    oc_df = DataFrame.from_native(df)
    records = oc_df.to_records(mode="json")

    stringified = records[0]["struct_col"]

    # str(NamedRowTuple) produces something like: (field_a: 'value_a', field_b: 'value_b')
    # This preserves field name information for better display
    assert "field_a: 'value_a'" in stringified
    assert "field_b: 'value_b'" in stringified