Commit 75cf1bd

Merge branch 'main' into codeflash/optimize-find_query_preview_references-mhl9wno5
2 parents 4fd4a2e + 9994c8f commit 75cf1bd

19 files changed: +1809 -320 lines changed

.cursorrules

Lines changed: 6 additions & 0 deletions
@@ -94,6 +94,12 @@ Additional for integration tests:
 # Run local tests
 ./bin/test-local
 
+# Run a specific test file
+./bin/test-local tests/unit/test_file.py
+
+# ... or specific test from file
+./bin/test-local tests/unit/test_file.py::TestClass::test_method
+
 # Run specific test type
 export TEST_TYPE="unit|integration"
 export TOOLKIT_VERSION="local-build"
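For readers unfamiliar with pytest's node-ID syntax, the `::TestClass::test_method` selector above addresses one test inside one file. A minimal sketch of the layout it implies (all names here are placeholders taken from the example command, not real files in this repo):

# tests/unit/test_file.py -- hypothetical file matching the selector above


class TestClass:
    def test_method(self):
        # ./bin/test-local tests/unit/test_file.py::TestClass::test_method
        # would run exactly this test and nothing else
        assert 1 + 1 == 2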

.github/workflows/cd.yml

Lines changed: 5 additions & 3 deletions
@@ -20,7 +20,7 @@ env:
   POETRY_VIRTUALENVS_CREATE: true
   POETRY_VIRTUALENVS_IN_PROJECT: true
   POETRY_INSTALLER_PARALLEL: true
-  JUPYTER_FOR_LOCAL_PYTHON_VERSION: "3.9"
+  JUPYTER_FOR_LOCAL_PYTHON_VERSION: "3.11"
 
 concurrency:
   group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
@@ -395,13 +395,15 @@ jobs:
           aws s3 cp "s3://${STAGING_BUCKET}/deepnote-toolkit/${VERSION}/installer.zip" "/tmp/dist${PYTHON_VER}/installer.zip"
 
       - name: Build jupyter-for-local docker image
+        env:
+          PYTHON_VERSION: ${{ env.JUPYTER_FOR_LOCAL_PYTHON_VERSION }}
         run: |
           docker build \
             --progress plain \
-            --build-arg "FROM_PYTHON_TAG=3.9" \
+            --build-arg "FROM_PYTHON_TAG=${PYTHON_VERSION}" \
             --build-arg "BUNDLE_PATH=./" \
             --tag deepnote/jupyter-for-local:${VERSION} \
-            -f dockerfiles/jupyter-for-local/Dockerfile /tmp/dist3.9/
+            -f dockerfiles/jupyter-for-local/Dockerfile /tmp/dist${PYTHON_VERSION}/
 
       - name: Push jupyter-for-local image
         run: |

.github/workflows/ci.yml

Lines changed: 68 additions & 31 deletions
@@ -139,7 +139,7 @@ jobs:
 
           # Only allow licenses compatible with Apache 2.0 (allowlist approach)
           # Ignored packages either have UNKNOWN licenses or not distributed
-          poetry run pip-licenses --allow-only "Apache;MIT;BSD;ISC;Unlicense;CC0;Public Domain;Python Software Foundation;Mozilla Public License 2.0;GNU Library or Lesser General Public License (LGPL)" --partial-match --ignore-packages arro3-core click dependency-groups Flask jeepney jupyter_core MarkupSafe more-itertools pymssql PyMySQL SecretStorage sqlalchemy-spanner typing-extensions typing-inspection urllib3
+          poetry run pip-licenses --allow-only "Apache;MIT;BSD;ISC;Unlicense;CC0;Public Domain;Python Software Foundation;Mozilla Public License 2.0;GNU Library or Lesser General Public License (LGPL)" --partial-match --ignore-packages arro3-core click dependency-groups Flask jeepney jupyter_core matplotlib-inline MarkupSafe more-itertools pymssql PyMySQL SecretStorage sqlalchemy-spanner typing-extensions typing-inspection urllib3
 
           echo "✅ All licenses are compatible with Apache 2.0"
 
@@ -334,62 +334,99 @@ jobs:
         run: poetry install --no-interaction --only-root
 
       - name: Run unit tests
-        id: test-unit
         env:
           TOOLKIT_VERSION: ${{ steps.version.outputs.VERSION }}
-          PYTHON_VERSION: ${{ matrix.python-version }}
-          COVERAGE_FILE: coverage/.coverage.${{ matrix.python-version }}
         run: |
-          set -euo pipefail
-
-          # Create coverage directory
-          mkdir -p coverage
-
-          # Run tests with coverage
           poetry run pytest tests/unit \
             --cov=deepnote_toolkit \
             --cov=installer \
             --cov=deepnote_core \
             --cov-branch \
             --cov-report=term-missing:skip-covered \
-            --cov-report=xml:coverage/coverage-${PYTHON_VERSION}.xml \
-            --cov-report=json:coverage/coverage-${PYTHON_VERSION}.json \
             --junitxml=junit.xml \
             -o junit_family=legacy
 
-          # Check if coverage file was generated
-          if [ -f "coverage/.coverage.${PYTHON_VERSION}" ]; then
-            echo "coverage_generated=true" >> $GITHUB_OUTPUT
-            echo "Coverage files found:"
-            ls -la coverage/
-          else
-            echo "coverage_generated=false" >> $GITHUB_OUTPUT
-            echo "Warning: No coverage file generated"
-          fi
-
       - name: Per-version coverage summary
-        if: steps.test-unit.outputs.coverage_generated == 'true'
-        env:
-          PYTHON_VERSION: ${{ matrix.python-version }}
         run: |
-          echo "## Python ${PYTHON_VERSION} Coverage" >> $GITHUB_STEP_SUMMARY
-          poetry run coverage report --data-file=coverage/.coverage.${PYTHON_VERSION} --format=markdown >> $GITHUB_STEP_SUMMARY
+          echo "## Python ${{ matrix.python-version }} Coverage" >> $GITHUB_STEP_SUMMARY
+          poetry run coverage report --format=markdown >> $GITHUB_STEP_SUMMARY
 
-      - name: Upload test results to Codecov (these are results not coverage reports)
+      - name: Upload test results to Codecov
         if: ${{ !cancelled() }}
         uses: codecov/test-results-action@47f89e9acb64b76debcd5ea40642d25a4adced9f # v1
         with:
           token: ${{ secrets.CODECOV_TOKEN }}
           flags: python-${{ matrix.python-version }}
 
-      - name: Upload coverage to Codecov
+      - name: Upload coverage artifacts
+        uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4
+        with:
+          name: coverage-${{ matrix.python-version }}
+          path: .coverage
+          retention-days: 1
+          include-hidden-files: true
+          if-no-files-found: error
+
+  coverage-combine:
+    name: Combine and Upload Coverage
+    runs-on: ubuntu-latest
+    needs: tests-unit
+    if: always()
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@08eba0b27e820071cde6df949e0beb9ba4906955 # v4
+        with:
+          persist-credentials: false
+
+      - name: Set up Python
+        uses: actions/setup-python@a26af69be951a213d495a4c3e4e4022e16d87065 # v5
+        with:
+          python-version: '3.11'
+
+      - name: Install coverage
+        run: pip install coverage[toml]
+
+      - name: Download all coverage artifacts
+        uses: actions/download-artifact@fa0a91b85d4f404e444e00e005971372dc801d16 # v4
+        with:
+          pattern: coverage-*
+          path: coverage-artifacts/
+
+      - name: Combine coverage files
+        run: |
+          shopt -s nullglob
+          mkdir -p coverage-data
+
+          i=0
+          for file in coverage-artifacts/*/.coverage; do
+            cp "$file" "coverage-data/.coverage.$i"
+            i=$((i + 1))
+          done
+
+          coverage combine coverage-data/
+          coverage xml -o coverage-data/coverage.xml
+          coverage report
+
+          echo "## Combined Coverage Report" >> $GITHUB_STEP_SUMMARY
+          coverage report --format=markdown >> $GITHUB_STEP_SUMMARY
+
+      - name: Upload combined coverage to Codecov
         uses: codecov/codecov-action@5a1091511ad55cbe89839c7260b706298ca349f7 # v5
         with:
           token: ${{ secrets.CODECOV_TOKEN }}
-          slug: deepnote/deepnote-toolkit
-          files: ./coverage/coverage-${{ matrix.python-version }}.xml
+          slug: ${{ github.repository }}
+          files: ./coverage-data/coverage.xml
+          flags: combined
+          disable_search: true
           fail_ci_if_error: ${{ github.event.pull_request.head.repo.full_name == github.repository || github.event_name == 'push' }}
 
+      - name: Upload combined coverage report
+        uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4
+        with:
+          name: coverage-combined-report
+          path: coverage-data/coverage.xml
+          retention-days: 30
+
   audit-prod:
     name: Audit - Production
     runs-on: ubuntu-latest
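The combine step drives the coverage CLI from bash. For clarity, the same merge-and-report flow can be expressed through coverage.py's Python API; this is a sketch only, assuming the per-version `.coverage.*` fragments already sit in `coverage-data/` as in the workflow above:

# Sketch: what "coverage combine" and "coverage xml" do, via the coverage.py API.
from coverage import Coverage

# Merge every .coverage.* fragment found in coverage-data/ into a single
# data file at coverage-data/.coverage.
cov = Coverage(data_file="coverage-data/.coverage")
cov.combine(data_paths=["coverage-data/"])
cov.save()

# Reload the merged data and emit the XML report that Codecov consumes.
cov.load()
cov.xml_report(outfile="coverage-data/coverage.xml")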

.gitignore

Lines changed: 1 addition & 0 deletions
@@ -133,6 +133,7 @@ celerybeat.pid
 
 # Environments
 .env
+.env.*
 .venv
 env/
 venv/

deepnote_toolkit/ocelots/pandas/analyze.py

Lines changed: 19 additions & 31 deletions
@@ -6,6 +6,11 @@
 import pandas as pd
 
 from deepnote_toolkit.ocelots.constants import DEEPNOTE_INDEX_COLUMN
+from deepnote_toolkit.ocelots.pandas.utils import (
+    is_numeric_or_temporal,
+    is_type_datetime_or_timedelta,
+    safe_convert_to_string,
+)
 from deepnote_toolkit.ocelots.types import ColumnsStatsRecord, ColumnStats
 
 
@@ -19,12 +24,15 @@ def _count_unique(column):
 
 
 def _get_categories(np_array):
-    pandas_series = pd.Series(np_array.tolist())
+    pandas_series = pd.Series(np_array)
 
     # special treatment for empty values
     num_nans = pandas_series.isna().sum().item()
 
-    counter = Counter(pandas_series.dropna().astype(str))
+    try:
+        counter = Counter(pandas_series.dropna().astype(str))
+    except (TypeError, UnicodeDecodeError, AttributeError):
+        counter = Counter(pandas_series.dropna().apply(safe_convert_to_string))
 
     max_items = 3
     if num_nans > 0:
@@ -46,33 +54,9 @@ def _get_categories(np_array):
     return [{"name": name, "count": count} for name, count in categories]
 
 
-def _is_type_numeric(dtype):
-    """
-    Returns True if dtype is numeric, False otherwise
-
-    Numeric means either a number (int, float, complex) or a datetime or timedelta.
-    It means e.g. that a range of these values can be plotted on a histogram.
-    """
-
-    # datetime doesn't play nice with np.issubdtype, so we need to check explicitly
-    if pd.api.types.is_datetime64_any_dtype(dtype) or pd.api.types.is_timedelta64_dtype(
-        dtype
-    ):
-        return True
-
-    try:
-        return np.issubdtype(dtype, np.number)
-    except TypeError:
-        # np.issubdtype crashes on categorical column dtype, and also on others, e.g. geopandas types
-        return False
-
-
 def _get_histogram(pd_series):
     try:
-        if pd.api.types.is_datetime64_any_dtype(
-            pd_series
-        ) or pd.api.types.is_timedelta64_dtype(pd_series):
-            # convert datetime or timedelta to an integer so that a histogram can be created
+        if is_type_datetime_or_timedelta(pd_series):
             np_array = np.array(pd_series.dropna().astype(int))
         else:
             # let's drop infinite values because they break histograms
@@ -104,11 +88,15 @@ def _calculate_min_max(column):
     """
     Calculate min and max values for a given column.
     """
-    if _is_type_numeric(column.dtype):
+    if not is_numeric_or_temporal(column.dtype):
+        return None, None
+
+    try:
         min_value = str(min(column.dropna())) if len(column.dropna()) > 0 else None
         max_value = str(max(column.dropna())) if len(column.dropna()) > 0 else None
         return min_value, max_value
-    return None, None
+    except (TypeError, ValueError):
+        return None, None
 
 
 def analyze_columns(
@@ -167,7 +155,7 @@ def analyze_columns(
             unique_count=_count_unique(column), nan_count=column.isnull().sum().item()
         )
 
-        if _is_type_numeric(column.dtype):
+        if is_numeric_or_temporal(column.dtype):
             min_value, max_value = _calculate_min_max(column)
             columns[i].stats.min = min_value
             columns[i].stats.max = max_value
@@ -187,7 +175,7 @@ def analyze_columns(
     for i in range(max_columns_to_analyze, len(df.columns)):
         # Ignore columns that are not numeric
        column = df.iloc[:, i]
-        if not _is_type_numeric(column.dtype):
+        if not is_numeric_or_temporal(column.dtype):
            continue
 
        column_name = columns[i].name
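To see why `_get_categories` gained its try/except: `.astype(str)` converts a whole object column at once and propagates any exception raised by an element's `str()`, while the fallback converts element by element with `safe_convert_to_string` (defined in the utils.py diff below). A contrived, self-contained sketch; `Unprintable` is a hypothetical stand-in for misbehaving objects such as some geometry or extension types:

# Reproduction sketch of the failure mode the new try/except handles.
from collections import Counter

import pandas as pd


def safe_convert_to_string(value):
    # Copied from the utils.py diff below.
    try:
        return str(value)
    except Exception:
        return "<unconvertible>"


class Unprintable:
    """Hypothetical object whose str() always raises."""

    def __str__(self):
        raise TypeError("cannot stringify")


series = pd.Series([Unprintable(), "ok"])

try:
    counter = Counter(series.dropna().astype(str))  # raises TypeError
except (TypeError, UnicodeDecodeError, AttributeError):
    # Per-element fallback: bad values become "<unconvertible>".
    counter = Counter(series.dropna().apply(safe_convert_to_string))

print(counter)  # Counter({'<unconvertible>': 1, 'ok': 1})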

deepnote_toolkit/ocelots/pandas/utils.py

Lines changed: 49 additions & 9 deletions
@@ -5,6 +5,19 @@
 from deepnote_toolkit.ocelots.constants import MAX_STRING_CELL_LENGTH
 
 
+def safe_convert_to_string(value):
+    """
+    Safely convert a value to string, handling cases where str() might fail.
+
+    Note: For bytes, this returns Python's standard string representation (e.g., b'hello')
+    rather than base64 encoding, which is more human-readable.
+    """
+    try:
+        return str(value)
+    except Exception:
+        return "<unconvertible>"
+
+
 # like fillna, but only fills NaT (not a time) values in datetime columns with the specified value
 def fill_nat(df, value):
     df_datetime_columns = df.select_dtypes(
@@ -76,36 +89,63 @@ def deduplicate_columns(df):
 # Cast dataframe contents to strings and trim them to avoid sending too much data
 def cast_objects_to_string(df):
     def to_string_truncated(elem):
-        elem_string = str(elem)
+        elem_string = safe_convert_to_string(elem)
         return (
             (elem_string[: MAX_STRING_CELL_LENGTH - 1] + "…")
             if len(elem_string) > MAX_STRING_CELL_LENGTH
             else elem_string
         )
 
     for column in df:
-        if not _is_type_number(df[column].dtype):
+        if not is_pure_numeric(df[column].dtype):
             # if the dtype is not a number, we want to convert it to string and truncate
             df[column] = df[column].apply(to_string_truncated)
 
     return df
 
 
-def _is_type_number(dtype):
+def is_type_datetime_or_timedelta(series_or_dtype):
     """
-    Returns True if dtype is a number, False otherwise. Datetime and timedelta will return False.
+    Returns True if the series or dtype is datetime or timedelta, False otherwise.
+    """
+    return pd.api.types.is_datetime64_any_dtype(
+        series_or_dtype
+    ) or pd.api.types.is_timedelta64_dtype(series_or_dtype)
+
 
-    The primary intent of this is to recognize a value that will converted to a JSON number during serialization.
+def is_numeric_or_temporal(dtype):
     """
+    Returns True if dtype is numeric or temporal (datetime/timedelta), False otherwise.
 
-    if pd.api.types.is_datetime64_any_dtype(dtype) or pd.api.types.is_timedelta64_dtype(
-        dtype
-    ):
+    This includes numbers (int, float), datetime, and timedelta types.
+    Use this to determine if values can be plotted on a histogram or have min/max calculated.
+    """
+    if is_type_datetime_or_timedelta(dtype):
+        return True
+
+    try:
+        return np.issubdtype(dtype, np.number) and not np.issubdtype(
+            dtype, np.complexfloating
+        )
+    except TypeError:
+        # np.issubdtype crashes on categorical column dtype, and also on others, e.g. geopandas types
+        return False
+
+
+def is_pure_numeric(dtype):
+    """
+    Returns True if dtype is a pure number (int, float), False otherwise.
+
+    Use this to determine if a value will be serialized as a JSON number.
+    """
+    if is_type_datetime_or_timedelta(dtype):
         # np.issubdtype(dtype, np.number) returns True for timedelta, which we don't want
         return False
 
     try:
-        return np.issubdtype(dtype, np.number)
+        return np.issubdtype(dtype, np.number) and not np.issubdtype(
+            dtype, np.complexfloating
+        )
     except TypeError:
         # np.issubdtype crashes on categorical column dtype, and also on others, e.g. geopandas types
         return False
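To make the three predicates concrete, the sketch below redefines them locally (copied from the diff above, docstrings omitted, so it runs without the package installed) and spot-checks a few common dtypes:

# Standalone copies of the predicates above, plus a few spot checks.
import numpy as np
import pandas as pd


def is_type_datetime_or_timedelta(series_or_dtype):
    return pd.api.types.is_datetime64_any_dtype(
        series_or_dtype
    ) or pd.api.types.is_timedelta64_dtype(series_or_dtype)


def is_numeric_or_temporal(dtype):
    if is_type_datetime_or_timedelta(dtype):
        return True
    try:
        return np.issubdtype(dtype, np.number) and not np.issubdtype(
            dtype, np.complexfloating
        )
    except TypeError:
        return False


def is_pure_numeric(dtype):
    if is_type_datetime_or_timedelta(dtype):
        return False
    try:
        return np.issubdtype(dtype, np.number) and not np.issubdtype(
            dtype, np.complexfloating
        )
    except TypeError:
        return False


df = pd.DataFrame(
    {
        "ints": [1, 2],
        "when": pd.to_datetime(["2024-01-01", "2024-01-02"]),
        "cplx": np.array([1 + 2j, 3 + 4j]),
        "cat": pd.Categorical(["a", "b"]),
    }
)

for name in df.columns:
    dtype = df[name].dtype
    print(name, is_numeric_or_temporal(dtype), is_pure_numeric(dtype))
# ints  True  True   -> gets a histogram and serializes as a JSON number
# when  True  False  -> gets a histogram, but serializes as a string
# cplx  False False  -> complex is now excluded by both predicates
# cat   False False  -> np.issubdtype raises TypeError; caught, returns False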
