Commit 75cf1bd

Merge branch 'main' into codeflash/optimize-find_query_preview_references-mhl9wno5
2 parents 4fd4a2e + 9994c8f commit 75cf1bd

19 files changed: +1809 -320 lines changed

.cursorrules

Lines changed: 6 additions & 0 deletions
@@ -94,6 +94,12 @@ Additional for integration tests:
 # Run local tests
 ./bin/test-local
 
+# Run a specific test file
+./bin/test-local tests/unit/test_file.py
+
+# ... or specific test from file
+./bin/test-local tests/unit/test_file.py::TestClass::test_method
+
 # Run specific test type
 export TEST_TYPE="unit|integration"
 export TOOLKIT_VERSION="local-build"
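For readers unfamiliar with pytest's node-ID syntax, the `::TestClass::test_method` selector above addresses one test inside one file. A minimal sketch of the layout it implies (all names here are placeholders taken from the example command, not real files in this repo):

# tests/unit/test_file.py -- hypothetical file matching the selector above


class TestClass:
    def test_method(self):
        # ./bin/test-local tests/unit/test_file.py::TestClass::test_method
        # would run exactly this test and nothing else
        assert 1 + 1 == 2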

.github/workflows/cd.yml

Lines changed: 5 additions & 3 deletions
@@ -20,7 +20,7 @@ env:
   POETRY_VIRTUALENVS_CREATE: true
   POETRY_VIRTUALENVS_IN_PROJECT: true
   POETRY_INSTALLER_PARALLEL: true
-  JUPYTER_FOR_LOCAL_PYTHON_VERSION: "3.9"
+  JUPYTER_FOR_LOCAL_PYTHON_VERSION: "3.11"
 
 concurrency:
   group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
@@ -395,13 +395,15 @@ jobs:
           aws s3 cp "s3://${STAGING_BUCKET}/deepnote-toolkit/${VERSION}/installer.zip" "/tmp/dist${PYTHON_VER}/installer.zip"
 
       - name: Build jupyter-for-local docker image
+        env:
+          PYTHON_VERSION: ${{ env.JUPYTER_FOR_LOCAL_PYTHON_VERSION }}
         run: |
           docker build \
             --progress plain \
-            --build-arg "FROM_PYTHON_TAG=3.9" \
+            --build-arg "FROM_PYTHON_TAG=${PYTHON_VERSION}" \
             --build-arg "BUNDLE_PATH=./" \
             --tag deepnote/jupyter-for-local:${VERSION} \
-            -f dockerfiles/jupyter-for-local/Dockerfile /tmp/dist3.9/
+            -f dockerfiles/jupyter-for-local/Dockerfile /tmp/dist${PYTHON_VERSION}/
 
       - name: Push jupyter-for-local image
         run: |

.github/workflows/ci.yml

Lines changed: 68 additions & 31 deletions
@@ -139,7 +139,7 @@ jobs:
 
           # Only allow licenses compatible with Apache 2.0 (allowlist approach)
           # Ignored packages either have UNKNOWN licenses or not distributed
-          poetry run pip-licenses --allow-only "Apache;MIT;BSD;ISC;Unlicense;CC0;Public Domain;Python Software Foundation;Mozilla Public License 2.0;GNU Library or Lesser General Public License (LGPL)" --partial-match --ignore-packages arro3-core click dependency-groups Flask jeepney jupyter_core MarkupSafe more-itertools pymssql PyMySQL SecretStorage sqlalchemy-spanner typing-extensions typing-inspection urllib3
+          poetry run pip-licenses --allow-only "Apache;MIT;BSD;ISC;Unlicense;CC0;Public Domain;Python Software Foundation;Mozilla Public License 2.0;GNU Library or Lesser General Public License (LGPL)" --partial-match --ignore-packages arro3-core click dependency-groups Flask jeepney jupyter_core matplotlib-inline MarkupSafe more-itertools pymssql PyMySQL SecretStorage sqlalchemy-spanner typing-extensions typing-inspection urllib3
 
           echo "✅ All licenses are compatible with Apache 2.0"
 
@@ -334,62 +334,99 @@ jobs:
         run: poetry install --no-interaction --only-root
 
       - name: Run unit tests
-        id: test-unit
         env:
           TOOLKIT_VERSION: ${{ steps.version.outputs.VERSION }}
-          PYTHON_VERSION: ${{ matrix.python-version }}
-          COVERAGE_FILE: coverage/.coverage.${{ matrix.python-version }}
         run: |
-          set -euo pipefail
-
-          # Create coverage directory
-          mkdir -p coverage
-
-          # Run tests with coverage
           poetry run pytest tests/unit \
             --cov=deepnote_toolkit \
             --cov=installer \
             --cov=deepnote_core \
             --cov-branch \
             --cov-report=term-missing:skip-covered \
-            --cov-report=xml:coverage/coverage-${PYTHON_VERSION}.xml \
-            --cov-report=json:coverage/coverage-${PYTHON_VERSION}.json \
             --junitxml=junit.xml \
             -o junit_family=legacy
 
-          # Check if coverage file was generated
-          if [ -f "coverage/.coverage.${PYTHON_VERSION}" ]; then
-            echo "coverage_generated=true" >> $GITHUB_OUTPUT
-            echo "Coverage files found:"
-            ls -la coverage/
-          else
-            echo "coverage_generated=false" >> $GITHUB_OUTPUT
-            echo "Warning: No coverage file generated"
-          fi
-
       - name: Per-version coverage summary
-        if: steps.test-unit.outputs.coverage_generated == 'true'
-        env:
-          PYTHON_VERSION: ${{ matrix.python-version }}
         run: |
-          echo "## Python ${PYTHON_VERSION} Coverage" >> $GITHUB_STEP_SUMMARY
-          poetry run coverage report --data-file=coverage/.coverage.${PYTHON_VERSION} --format=markdown >> $GITHUB_STEP_SUMMARY
+          echo "## Python ${{ matrix.python-version }} Coverage" >> $GITHUB_STEP_SUMMARY
+          poetry run coverage report --format=markdown >> $GITHUB_STEP_SUMMARY
 
-      - name: Upload test results to Codecov (these are results not coverage reports)
+      - name: Upload test results to Codecov
         if: ${{ !cancelled() }}
         uses: codecov/test-results-action@47f89e9acb64b76debcd5ea40642d25a4adced9f # v1
         with:
           token: ${{ secrets.CODECOV_TOKEN }}
           flags: python-${{ matrix.python-version }}
 
-      - name: Upload coverage to Codecov
+      - name: Upload coverage artifacts
+        uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4
+        with:
+          name: coverage-${{ matrix.python-version }}
+          path: .coverage
+          retention-days: 1
+          include-hidden-files: true
+          if-no-files-found: error
+
+  coverage-combine:
+    name: Combine and Upload Coverage
+    runs-on: ubuntu-latest
+    needs: tests-unit
+    if: always()
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@08eba0b27e820071cde6df949e0beb9ba4906955 # v4
+        with:
+          persist-credentials: false
+
+      - name: Set up Python
+        uses: actions/setup-python@a26af69be951a213d495a4c3e4e4022e16d87065 # v5
+        with:
+          python-version: '3.11'
+
+      - name: Install coverage
+        run: pip install coverage[toml]
+
+      - name: Download all coverage artifacts
+        uses: actions/download-artifact@fa0a91b85d4f404e444e00e005971372dc801d16 # v4
+        with:
+          pattern: coverage-*
+          path: coverage-artifacts/
+
+      - name: Combine coverage files
+        run: |
+          shopt -s nullglob
+          mkdir -p coverage-data
+
+          i=0
+          for file in coverage-artifacts/*/.coverage; do
+            cp "$file" "coverage-data/.coverage.$i"
+            i=$((i + 1))
+          done
+
+          coverage combine coverage-data/
+          coverage xml -o coverage-data/coverage.xml
+          coverage report
+
+          echo "## Combined Coverage Report" >> $GITHUB_STEP_SUMMARY
+          coverage report --format=markdown >> $GITHUB_STEP_SUMMARY
+
+      - name: Upload combined coverage to Codecov
         uses: codecov/codecov-action@5a1091511ad55cbe89839c7260b706298ca349f7 # v5
         with:
           token: ${{ secrets.CODECOV_TOKEN }}
-          slug: deepnote/deepnote-toolkit
-          files: ./coverage/coverage-${{ matrix.python-version }}.xml
+          slug: ${{ github.repository }}
+          files: ./coverage-data/coverage.xml
+          flags: combined
+          disable_search: true
           fail_ci_if_error: ${{ github.event.pull_request.head.repo.full_name == github.repository || github.event_name == 'push' }}
 
+      - name: Upload combined coverage report
+        uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4
+        with:
+          name: coverage-combined-report
+          path: coverage-data/coverage.xml
+          retention-days: 30
+
   audit-prod:
     name: Audit - Production
     runs-on: ubuntu-latest
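The combine step drives the coverage CLI from bash. For clarity, the same merge-and-report flow can be expressed through coverage.py's Python API; this is a sketch only, assuming the per-version `.coverage.*` fragments already sit in `coverage-data/` as in the workflow above:

# Sketch: what "coverage combine" and "coverage xml" do, via the coverage.py API.
from coverage import Coverage

# Merge every .coverage.* fragment found in coverage-data/ into a single
# data file at coverage-data/.coverage.
cov = Coverage(data_file="coverage-data/.coverage")
cov.combine(data_paths=["coverage-data/"])
cov.save()

# Reload the merged data and emit the XML report that Codecov consumes.
cov.load()
cov.xml_report(outfile="coverage-data/coverage.xml")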

.gitignore

Lines changed: 1 addition & 0 deletions
@@ -133,6 +133,7 @@ celerybeat.pid
 
 # Environments
 .env
+.env.*
 .venv
 env/
 venv/

deepnote_toolkit/ocelots/pandas/analyze.py

Lines changed: 19 additions & 31 deletions
@@ -6,6 +6,11 @@
 import pandas as pd
 
 from deepnote_toolkit.ocelots.constants import DEEPNOTE_INDEX_COLUMN
+from deepnote_toolkit.ocelots.pandas.utils import (
+    is_numeric_or_temporal,
+    is_type_datetime_or_timedelta,
+    safe_convert_to_string,
+)
 from deepnote_toolkit.ocelots.types import ColumnsStatsRecord, ColumnStats
 
 
@@ -19,12 +24,15 @@ def _count_unique(column):
 
 
 def _get_categories(np_array):
-    pandas_series = pd.Series(np_array.tolist())
+    pandas_series = pd.Series(np_array)
 
     # special treatment for empty values
     num_nans = pandas_series.isna().sum().item()
 
-    counter = Counter(pandas_series.dropna().astype(str))
+    try:
+        counter = Counter(pandas_series.dropna().astype(str))
+    except (TypeError, UnicodeDecodeError, AttributeError):
+        counter = Counter(pandas_series.dropna().apply(safe_convert_to_string))
 
     max_items = 3
     if num_nans > 0:
@@ -46,33 +54,9 @@ def _get_categories(np_array):
     return [{"name": name, "count": count} for name, count in categories]
 
 
-def _is_type_numeric(dtype):
-    """
-    Returns True if dtype is numeric, False otherwise
-
-    Numeric means either a number (int, float, complex) or a datetime or timedelta.
-    It means e.g. that a range of these values can be plotted on a histogram.
-    """
-
-    # datetime doesn't play nice with np.issubdtype, so we need to check explicitly
-    if pd.api.types.is_datetime64_any_dtype(dtype) or pd.api.types.is_timedelta64_dtype(
-        dtype
-    ):
-        return True
-
-    try:
-        return np.issubdtype(dtype, np.number)
-    except TypeError:
-        # np.issubdtype crashes on categorical column dtype, and also on others, e.g. geopandas types
-        return False
-
-
 def _get_histogram(pd_series):
     try:
-        if pd.api.types.is_datetime64_any_dtype(
-            pd_series
-        ) or pd.api.types.is_timedelta64_dtype(pd_series):
-            # convert datetime or timedelta to an integer so that a histogram can be created
+        if is_type_datetime_or_timedelta(pd_series):
             np_array = np.array(pd_series.dropna().astype(int))
         else:
             # let's drop infinite values because they break histograms
@@ -104,11 +88,15 @@ def _calculate_min_max(column):
     """
     Calculate min and max values for a given column.
     """
-    if _is_type_numeric(column.dtype):
+    if not is_numeric_or_temporal(column.dtype):
+        return None, None
+
+    try:
         min_value = str(min(column.dropna())) if len(column.dropna()) > 0 else None
         max_value = str(max(column.dropna())) if len(column.dropna()) > 0 else None
         return min_value, max_value
-    return None, None
+    except (TypeError, ValueError):
+        return None, None
 
 
 def analyze_columns(
@@ -167,7 +155,7 @@ def analyze_columns(
             unique_count=_count_unique(column), nan_count=column.isnull().sum().item()
         )
 
-        if _is_type_numeric(column.dtype):
+        if is_numeric_or_temporal(column.dtype):
             min_value, max_value = _calculate_min_max(column)
             columns[i].stats.min = min_value
             columns[i].stats.max = max_value
@@ -187,7 +175,7 @@ def analyze_columns(
     for i in range(max_columns_to_analyze, len(df.columns)):
         # Ignore columns that are not numeric
        column = df.iloc[:, i]
-        if not _is_type_numeric(column.dtype):
+        if not is_numeric_or_temporal(column.dtype):
            continue
 
        column_name = columns[i].name
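To see why `_get_categories` gained its try/except: `.astype(str)` converts a whole object column at once and propagates any exception raised by an element's `str()`, while the fallback converts element by element with `safe_convert_to_string` (defined in the utils.py diff below). A contrived, self-contained sketch; `Unprintable` is a hypothetical stand-in for misbehaving objects such as some geometry or extension types:

# Reproduction sketch of the failure mode the new try/except handles.
from collections import Counter

import pandas as pd


def safe_convert_to_string(value):
    # Copied from the utils.py diff below.
    try:
        return str(value)
    except Exception:
        return "<unconvertible>"


class Unprintable:
    """Hypothetical object whose str() always raises."""

    def __str__(self):
        raise TypeError("cannot stringify")


series = pd.Series([Unprintable(), "ok"])

try:
    counter = Counter(series.dropna().astype(str))  # raises TypeError
except (TypeError, UnicodeDecodeError, AttributeError):
    # Per-element fallback: bad values become "<unconvertible>".
    counter = Counter(series.dropna().apply(safe_convert_to_string))

print(counter)  # Counter({'<unconvertible>': 1, 'ok': 1})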

deepnote_toolkit/ocelots/pandas/utils.py

Lines changed: 49 additions & 9 deletions
@@ -5,6 +5,19 @@
 from deepnote_toolkit.ocelots.constants import MAX_STRING_CELL_LENGTH
 
 
+def safe_convert_to_string(value):
+    """
+    Safely convert a value to string, handling cases where str() might fail.
+
+    Note: For bytes, this returns Python's standard string representation (e.g., b'hello')
+    rather than base64 encoding, which is more human-readable.
+    """
+    try:
+        return str(value)
+    except Exception:
+        return "<unconvertible>"
+
+
 # like fillna, but only fills NaT (not a time) values in datetime columns with the specified value
 def fill_nat(df, value):
     df_datetime_columns = df.select_dtypes(
@@ -76,36 +89,63 @@ def deduplicate_columns(df):
 # Cast dataframe contents to strings and trim them to avoid sending too much data
 def cast_objects_to_string(df):
     def to_string_truncated(elem):
-        elem_string = str(elem)
+        elem_string = safe_convert_to_string(elem)
         return (
             (elem_string[: MAX_STRING_CELL_LENGTH - 1] + "…")
             if len(elem_string) > MAX_STRING_CELL_LENGTH
             else elem_string
         )
 
     for column in df:
-        if not _is_type_number(df[column].dtype):
+        if not is_pure_numeric(df[column].dtype):
             # if the dtype is not a number, we want to convert it to string and truncate
             df[column] = df[column].apply(to_string_truncated)
 
     return df
 
 
-def _is_type_number(dtype):
+def is_type_datetime_or_timedelta(series_or_dtype):
     """
-    Returns True if dtype is a number, False otherwise. Datetime and timedelta will return False.
+    Returns True if the series or dtype is datetime or timedelta, False otherwise.
+    """
+    return pd.api.types.is_datetime64_any_dtype(
+        series_or_dtype
+    ) or pd.api.types.is_timedelta64_dtype(series_or_dtype)
+
 
-    The primary intent of this is to recognize a value that will converted to a JSON number during serialization.
+def is_numeric_or_temporal(dtype):
     """
+    Returns True if dtype is numeric or temporal (datetime/timedelta), False otherwise.
 
-    if pd.api.types.is_datetime64_any_dtype(dtype) or pd.api.types.is_timedelta64_dtype(
-        dtype
-    ):
+    This includes numbers (int, float), datetime, and timedelta types.
+    Use this to determine if values can be plotted on a histogram or have min/max calculated.
+    """
+    if is_type_datetime_or_timedelta(dtype):
+        return True
+
+    try:
+        return np.issubdtype(dtype, np.number) and not np.issubdtype(
+            dtype, np.complexfloating
+        )
+    except TypeError:
+        # np.issubdtype crashes on categorical column dtype, and also on others, e.g. geopandas types
+        return False
+
+
+def is_pure_numeric(dtype):
+    """
+    Returns True if dtype is a pure number (int, float), False otherwise.
+
+    Use this to determine if a value will be serialized as a JSON number.
+    """
+    if is_type_datetime_or_timedelta(dtype):
         # np.issubdtype(dtype, np.number) returns True for timedelta, which we don't want
         return False
 
     try:
-        return np.issubdtype(dtype, np.number)
+        return np.issubdtype(dtype, np.number) and not np.issubdtype(
+            dtype, np.complexfloating
+        )
     except TypeError:
         # np.issubdtype crashes on categorical column dtype, and also on others, e.g. geopandas types
         return False
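To make the three predicates concrete, the sketch below redefines them locally (copied from the diff above, docstrings omitted, so it runs without the package installed) and spot-checks a few common dtypes:

# Standalone copies of the predicates above, plus a few spot checks.
import numpy as np
import pandas as pd


def is_type_datetime_or_timedelta(series_or_dtype):
    return pd.api.types.is_datetime64_any_dtype(
        series_or_dtype
    ) or pd.api.types.is_timedelta64_dtype(series_or_dtype)


def is_numeric_or_temporal(dtype):
    if is_type_datetime_or_timedelta(dtype):
        return True
    try:
        return np.issubdtype(dtype, np.number) and not np.issubdtype(
            dtype, np.complexfloating
        )
    except TypeError:
        return False


def is_pure_numeric(dtype):
    if is_type_datetime_or_timedelta(dtype):
        return False
    try:
        return np.issubdtype(dtype, np.number) and not np.issubdtype(
            dtype, np.complexfloating
        )
    except TypeError:
        return False


df = pd.DataFrame(
    {
        "ints": [1, 2],
        "when": pd.to_datetime(["2024-01-01", "2024-01-02"]),
        "cplx": np.array([1 + 2j, 3 + 4j]),
        "cat": pd.Categorical(["a", "b"]),
    }
)

for name in df.columns:
    dtype = df[name].dtype
    print(name, is_numeric_or_temporal(dtype), is_pure_numeric(dtype))
# ints  True  True   -> gets a histogram and serializes as a JSON number
# when  True  False  -> gets a histogram, but serializes as a string
# cplx  False False  -> complex is now excluded by both predicates
# cat   False False  -> np.issubdtype raises TypeError; caught, returns False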
