From 391e799677e8e9b5104ba2c6139e02770ded4a60 Mon Sep 17 00:00:00 2001
From: Tyler Riccio
Date: Mon, 26 May 2025 13:33:23 -0400
Subject: [PATCH 01/11] add ty experimental

---
 .gitignore      |  3 +++
 CONTRIBUTING.md | 13 +++++++++----
 Makefile        |  5 +++++
 3 files changed, 17 insertions(+), 4 deletions(-)

diff --git a/.gitignore b/.gitignore
index 7b9ba3edb..affec794f 100644
--- a/.gitignore
+++ b/.gitignore
@@ -133,3 +133,6 @@ benchmark/data/*.json
 .swp

 uv.lock
+
+# While typing is experimental, don't mark the entire package as typed
+pointblank/py.typed

diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
index b1e9083fc..b18ce60e0 100644
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -25,11 +25,12 @@ Once there is consensus that a PR based on the issue would be helpful, adhering

 ### Setting Up Your Development Environment

-To set up your development environment, you can follow these steps:
+To set up your development environment, first clone the posit-dev/pointblank repository.

-- Clone the posit-dev/pointblank repository
-- Create a virtual environment for the folder
-- Install the package in editable mode with `pip install -e .` from the root of the project folder
+If you're using uv, you can run `uv sync` and your environment is set up. If you're using pip or another package manager, follow these steps:
+
+- Create a virtual environment for the folder.
+- Install the package in editable mode with `pip install -e .` from the root of the project folder.
 - Install the development dependencies with `pip install '.[dev]'` (have a look at the `pyproject.toml` file for the list of development dependencies)

 Our documentation uses `quartodoc` which in turn requires a local install of the Quarto CLI. To install Quarto, go to to get the latest build for your platform.
@@ -43,3 +44,7 @@ Building the documentation can be done with `make docs-build` from the root of t
 The tests are located in the `tests` folder and we use `pytest` for running them. To run all of the tests, use `make test`. If you want to run a specific test file, you can use `pytest tests/test_file.py`.

 If you create new tests involving snapshots, please ensure that the resulting snapshots are relatively small. After adding snapshots, use `make test-update` (this runs `pytest --snapshot-update`). A subsequent use of `make test` should pass without any issues.
+
+### Linting and Type Checking
+
+We use `ruff` for linting; the settings are fairly loose and objective. Linting runs via pre-commit in CI, and you can run it locally with `make lint`. Type checking is not currently enforced, but we intend to gradually type the codebase. You can run `make type` to use Astral's new experimental type checker, `ty`. Feel free to use type hints and occasional type checking, but it's not obligatory at this time.

diff --git a/Makefile b/Makefile
index 20c27d7fd..2007aa201 100644
--- a/Makefile
+++ b/Makefile
@@ -18,6 +18,11 @@ lint: ## Run ruff formatter and linter
 	@uv run ruff format
 	@uv run ruff check --fix
+
+type: ## Run experimental(!)
type checking + @uvx ty check pointblank + + check: pyright --pythonversion 3.8 pointblank pyright --pythonversion 3.9 pointblank From 2e9317875842ae2fed5033b8f204e1be48478210 Mon Sep 17 00:00:00 2001 From: Tyler Riccio Date: Mon, 26 May 2025 13:33:35 -0400 Subject: [PATCH 02/11] remove duplicate pytest xdist --- pyproject.toml | 1 - 1 file changed, 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 7ea91b0e8..ceead60f7 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -92,7 +92,6 @@ dev = [ "pytest-rerunfailures>=15.0", "pytest-snapshot", "pytest-xdist>=3.6.1", - "pytest-xdist>=3.6.1", "quartodoc>=0.8.1; python_version >= '3.9'", "ruff>=0.9.9", "shiny>=1.4.0", From 4594d7cc198aa9aedec5717366d2789d880bc1aa Mon Sep 17 00:00:00 2001 From: Tyler Riccio Date: Mon, 26 May 2025 14:00:29 -0400 Subject: [PATCH 03/11] gradually type any to Any, asserts --- pointblank/_interrogation.py | 4 +-- pointblank/schema.py | 10 +++++-- pointblank/validate.py | 56 +++++++++++++++++++----------------- 3 files changed, 38 insertions(+), 32 deletions(-) diff --git a/pointblank/_interrogation.py b/pointblank/_interrogation.py index 637939486..be2e3c900 100644 --- a/pointblank/_interrogation.py +++ b/pointblank/_interrogation.py @@ -1911,7 +1911,7 @@ class ColSchemaMatch: """ data_tbl: FrameT | Any - schema: any + schema: Any complete: bool in_order: bool case_sensitive_colnames: bool @@ -2425,7 +2425,7 @@ def _check_nulls_across_columns_nw(table, columns_subset): return result -def _modify_datetime_compare_val(tgt_column: any, compare_val: any) -> any: +def _modify_datetime_compare_val(tgt_column: Any, compare_val: Any) -> Any: tgt_col_dtype_str = str(tgt_column.dtype).lower() if compare_val is isinstance(compare_val, Column): # pragma: no cover diff --git a/pointblank/schema.py b/pointblank/schema.py index 5db200732..3911f27e2 100644 --- a/pointblank/schema.py +++ b/pointblank/schema.py @@ -2,10 +2,14 @@ import copy from dataclasses import dataclass +from typing import TYPE_CHECKING from pointblank._constants import IBIS_BACKENDS from pointblank._utils import _get_tbl_type, _is_lazy_frame, _is_lib_present, _is_narwhals_table +if TYPE_CHECKING: + from typing import Any + __all__ = ["Schema"] @@ -265,14 +269,14 @@ class Schema: columns: str | list[str] | list[tuple[str, str]] | list[tuple[str]] | dict[str, str] | None = ( None ) - tbl: any | None = None + tbl: Any | None = None def __init__( self, columns: ( str | list[str] | list[tuple[str, str]] | list[tuple[str]] | dict[str, str] | None ) = None, - tbl: any | None = None, + tbl: Any | None = None, **kwargs, ): if tbl is None and columns is None and not kwargs: @@ -872,7 +876,7 @@ def _schema_info_generate_params_dict( def _get_schema_validation_info( - data_tbl: any, + data_tbl: Any, schema: Schema, passed: bool, complete: bool, diff --git a/pointblank/validate.py b/pointblank/validate.py index f4b930681..bdcdbdb16 100644 --- a/pointblank/validate.py +++ b/pointblank/validate.py @@ -97,6 +97,7 @@ if TYPE_CHECKING: from collections.abc import Collection + from typing import Any from pointblank._typing import AbsoluteBounds, Tolerance @@ -2417,12 +2418,12 @@ class _ValidationInfo: step_id: str | None = None sha1: str | None = None assertion_type: str | None = None - column: any | None = None - values: any | list[any] | tuple | None = None + column: Any | None = None + values: Any | list[any] | tuple | None = None inclusive: tuple[bool, bool] | None = None na_pass: bool | None = None pre: Callable | None = None - segments: any | None = None + 
segments: Any | None = None thresholds: Thresholds | None = None actions: Actions | None = None label: str | None = None @@ -6937,7 +6938,7 @@ def col_vals_regex( def col_vals_expr( self, - expr: any, + expr: Any, pre: Callable | None = None, segments: SegmentSpec | None = None, thresholds: int | float | bool | tuple | dict | Thresholds = None, @@ -12992,7 +12993,7 @@ def _convert_string_to_datetime(value: str) -> datetime.datetime: return datetime.datetime.strptime(value, "%Y-%m-%d %H:%M:%S") -def _string_date_dttm_conversion(value: any) -> any: +def _string_date_dttm_conversion(value: Any) -> Any: """ Convert a string to a date or datetime object if it is in the correct format. If the value is not a string, it is returned as is. @@ -13030,9 +13031,9 @@ def _process_brief( brief: str | None, step: int, col: str | list[str] | None, - values: any | None, - thresholds: any | None, - segment: any | None, + values: Any | None, + thresholds: Any | None, + segment: Any | None, ) -> str: # If there is no brief, return `None` if brief is None: @@ -13098,7 +13099,7 @@ def _process_action_str( action_str: str, step: int, col: str | None, - value: any, + value: Any, type: str, level: str, time: str, @@ -13545,7 +13546,7 @@ def _prep_values_text( return values_str -def _seg_expr_from_string(data_tbl: any, segments_expr: str) -> tuple[str, str]: +def _seg_expr_from_string(data_tbl: Any, segments_expr: str) -> tuple[str, str]: """ Obtain the segmentation categories from a table column. @@ -13637,7 +13638,7 @@ def _seg_expr_from_tuple(segments_expr: tuple) -> list[tuple[str, str]]: return seg_tuples -def _apply_segments(data_tbl: any, segments_expr: tuple[str, str]) -> any: +def _apply_segments(data_tbl: Any, segments_expr: tuple[str, str]) -> Any: """ Apply the segments expression to the data table. 
@@ -13821,7 +13822,7 @@ def _process_title_text(title: str | None, tbl_name: str | None, lang: str) -> s return title_text -def _transform_tbl_preprocessed(pre: any, seg: any, interrogation_performed: bool) -> list[str]: +def _transform_tbl_preprocessed(pre: Any, seg: Any, interrogation_performed: bool) -> list[str]: # If no interrogation was performed, return a list of empty strings if not interrogation_performed: return ["" for _ in range(len(pre))] @@ -14141,16 +14142,14 @@ def _pre_processing_funcs_to_str(pre: Callable) -> str | list[str]: def _get_callable_source(fn: Callable) -> str: - if isinstance(fn, Callable): - try: - source_lines, _ = inspect.getsourcelines(fn) - source = "".join(source_lines).strip() - # Extract the `pre` argument from the source code - pre_arg = _extract_pre_argument(source) - return pre_arg - except (OSError, TypeError): # pragma: no cover - return fn.__name__ - return fn + try: + source_lines, _ = inspect.getsourcelines(fn) + source = "".join(source_lines).strip() + # Extract the `pre` argument from the source code + pre_arg = _extract_pre_argument(source) + return pre_arg + except (OSError, TypeError): # pragma: no cover + return fn.__name__ def _extract_pre_argument(source: str) -> str: @@ -14176,6 +14175,7 @@ def _create_table_time_html( if time_start is None: return "" + assert time_end is not None # typing # Get the time duration (difference between `time_end` and `time_start`) in seconds time_duration = (time_end - time_start).total_seconds() @@ -14393,12 +14393,12 @@ def _step_report_row_based( column: str, column_position: int, columns_subset: list[str] | None, - values: any, + values: Any, inclusive: tuple[bool, bool] | None, n: int, n_failed: int, all_passed: bool, - extract: any, + extract: Any, tbl_preview: GT, header: str, limit: int | None, @@ -14425,10 +14425,12 @@ def _step_report_row_based( elif assertion_type == "col_vals_le": text = f"{column} ≤ {values}" elif assertion_type == "col_vals_between": + assert inclusive is not None symbol_left = "≤" if inclusive[0] else "<" symbol_right = "≤" if inclusive[1] else "<" text = f"{values[0]} {symbol_left} {column} {symbol_right} {values[1]}" elif assertion_type == "col_vals_outside": + assert inclusive is not None symbol_left = "<" if inclusive[0] else "≤" symbol_right = ">" if inclusive[1] else "≥" text = f"{column} {symbol_left} {values[0]}, {column} {symbol_right} {values[1]}" @@ -14633,7 +14635,7 @@ def _step_report_rows_distinct( n: int, n_failed: int, all_passed: bool, - extract: any, + extract: Any, tbl_preview: GT, header: str, limit: int | None, @@ -14761,7 +14763,7 @@ def _step_report_rows_distinct( def _step_report_schema_in_order( step: int, schema_info: dict, header: str, lang: str, debug_return_df: bool = False -) -> GT | any: +) -> GT | Any: """ This is the case for schema validation where the schema is supposed to have the same column order as the target table. @@ -15100,7 +15102,7 @@ def _step_report_schema_in_order( def _step_report_schema_any_order( step: int, schema_info: dict, header: str, lang: str, debug_return_df: bool = False -) -> GT | any: +) -> GT | Any: """ This is the case for schema validation where the schema is permitted to not have to be in the same column order as the target table. 
From 45f7a4b6279ac08882faa3ab50f6295a72594d0e Mon Sep 17 00:00:00 2001
From: Tyler Riccio
Date: Mon, 26 May 2025 14:05:08 -0400
Subject: [PATCH 04/11] remove unnecessary None type in _create_text_col_exists

---
 pointblank/validate.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pointblank/validate.py b/pointblank/validate.py
index bdcdbdb16..aab8d28d7 100644
--- a/pointblank/validate.py
+++ b/pointblank/validate.py
@@ -13405,7 +13405,7 @@ def _create_text_expr(lang: str, for_failure: bool) -> str:
     return EXPECT_FAIL_TEXT[f"col_vals_expr_{type_}_text"][lang]


-def _create_text_col_exists(lang: str, column: str | None, for_failure: bool = False) -> str:
+def _create_text_col_exists(lang: str, column: str, for_failure: bool = False) -> str:
     type_ = _expect_failure_type(for_failure=for_failure)

     column_text = _prep_column_text(column=column)

From 0e7dba4a69be6bd7be37bde3472050b300e0c8ce Mon Sep 17 00:00:00 2001
From: Tyler Riccio
Date: Mon, 26 May 2025 14:07:44 -0400
Subject: [PATCH 05/11] explicit none return pre processing funcs to str

---
 pointblank/validate.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/pointblank/validate.py b/pointblank/validate.py
index aab8d28d7..4c389b430 100644
--- a/pointblank/validate.py
+++ b/pointblank/validate.py
@@ -14136,9 +14136,10 @@ def _transform_assertion_str(
     return type_upd


-def _pre_processing_funcs_to_str(pre: Callable) -> str | list[str]:
+def _pre_processing_funcs_to_str(pre: Callable) -> str | list[str] | None:
     if isinstance(pre, Callable):
         return _get_callable_source(fn=pre)
+    return None


 def _get_callable_source(fn: Callable) -> str:

From ac3c97443d63a132e4b6e7e19fd3f122862f1d1b Mon Sep 17 00:00:00 2001
From: Tyler Riccio
Date: Mon, 26 May 2025 14:09:22 -0400
Subject: [PATCH 06/11] fix replace_svg_dims

---
 pointblank/validate.py | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/pointblank/validate.py b/pointblank/validate.py
index 4c389b430..b6aefff69 100644
--- a/pointblank/validate.py
+++ b/pointblank/validate.py
@@ -13754,11 +13754,9 @@ def _get_assertion_icon(icon: list[str], length_val: int = 30) -> list[str]:
     return icon_svg


-def _replace_svg_dimensions(svg: list[str], height_width: int | float) -> list[str]:
+def _replace_svg_dimensions(svg: str, height_width: int | float) -> str:
     svg = re.sub(r'width="[0-9]*?px', f'width="{height_width}px', svg)
-    svg = re.sub(r'height="[0-9]*?px', f'height="{height_width}px', svg)
-
-    return svg
+    return re.sub(r'height="[0-9]*?px', f'height="{height_width}px', svg)


 def _get_title_text(

From e86c21fa7b415b1bfb71deb38c2a23c203b9b09c Mon Sep 17 00:00:00 2001
From: Tyler Riccio
Date: Mon, 26 May 2025 14:28:58 -0400
Subject: [PATCH 07/11] type alias generic compliant values

---
 pointblank/_typing.py | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/pointblank/_typing.py b/pointblank/_typing.py
index c8cbd56c2..18ba19c7e 100644
--- a/pointblank/_typing.py
+++ b/pointblank/_typing.py
@@ -1,6 +1,8 @@
 from __future__ import annotations

+import datetime
 import sys
+from collections.abc import Container
 from typing import List, Tuple, Union

 # Check Python version for TypeAlias support
@@ -15,6 +17,12 @@
     SegmentTuple: TypeAlias = Tuple[str, SegmentValue]
     SegmentItem: TypeAlias = Union[str, SegmentTuple]
     SegmentSpec: TypeAlias = Union[str, SegmentTuple, List[SegmentItem]]
+
+    _CompliantValue: TypeAlias = Union[str, int, float, datetime.datetime, datetime.date]
+    """A compliant value that pointblank can use in a validation step"""
_CompliantValues: TypeAlias = Container[_CompliantValue] + """A collection of compliant values that pointblank can use in a validation step""" + else: # Python 3.8 and 3.9 compatible type aliases AbsoluteBounds = Tuple[int, int] @@ -24,6 +32,10 @@ SegmentTuple = Tuple[str, SegmentValue] SegmentItem = Union[str, SegmentTuple] SegmentSpec = Union[str, SegmentTuple, List[SegmentItem]] + _CompliantValue = Union[str, int, float, datetime.datetime, datetime.date] + """A compliant value that pointblank can use in a validation step""" + _CompliantValues = Container[_CompliantValue] + """A collection of compliant values that pointblank can use in a validation step""" # Add docstrings for better IDE support AbsoluteBounds.__doc__ = "Absolute bounds (i.e., plus or minus)" From acd0fc241172c9b23b075c3e2277e792c392bbb6 Mon Sep 17 00:00:00 2001 From: Tyler Riccio Date: Mon, 26 May 2025 14:29:33 -0400 Subject: [PATCH 08/11] change _prep_column_text to only throw if it gets None --- pointblank/validate.py | 34 +-- tests/test_validate.py | 625 +---------------------------------------- 2 files changed, 14 insertions(+), 645 deletions(-) diff --git a/pointblank/validate.py b/pointblank/validate.py index b6aefff69..44c1df6aa 100644 --- a/pointblank/validate.py +++ b/pointblank/validate.py @@ -99,7 +99,7 @@ from collections.abc import Collection from typing import Any - from pointblank._typing import AbsoluteBounds, Tolerance + from pointblank._typing import AbsoluteBounds, Tolerance, _CompliantValue, _CompliantValues __all__ = [ "Validate", @@ -13149,7 +13149,7 @@ def _process_action_str( def _create_autobrief_or_failure_text( - assertion_type: str, lang: str, column: str | None, values: str | None, for_failure: bool + assertion_type: str, lang: str, column: str, values: str | None, for_failure: bool ) -> str: if assertion_type in [ "col_vals_gt", @@ -13289,7 +13289,7 @@ def _expect_failure_type(for_failure: bool) -> str: def _create_text_comparison( assertion_type: str, lang: str, - column: str | list[str] | None, + column: str | list[str], values: str | None, for_failure: bool = False, ) -> str: @@ -13315,7 +13315,7 @@ def _create_text_comparison( def _create_text_between( lang: str, - column: str | None, + column: str, value_1: str, value_2: str, not_: bool = False, @@ -13345,7 +13345,7 @@ def _create_text_between( def _create_text_set( - lang: str, column: str | None, values: list[any], not_: bool = False, for_failure: bool = False + lang: str, column: str, values: list[any], not_: bool = False, for_failure: bool = False ) -> str: type_ = _expect_failure_type(for_failure=for_failure) @@ -13367,9 +13367,7 @@ def _create_text_set( return text -def _create_text_null( - lang: str, column: str | None, not_: bool = False, for_failure: bool = False -) -> str: +def _create_text_null(lang: str, column: str, not_: bool = False, for_failure: bool = False) -> str: type_ = _expect_failure_type(for_failure=for_failure) column_text = _prep_column_text(column=column) @@ -13386,9 +13384,7 @@ def _create_text_null( return text -def _create_text_regex( - lang: str, column: str | None, pattern: str, for_failure: bool = False -) -> str: +def _create_text_regex(lang: str, column: str, pattern: str, for_failure: bool = False) -> str: type_ = _expect_failure_type(for_failure=for_failure) column_text = _prep_column_text(column=column) @@ -13455,7 +13451,7 @@ def _create_text_rows_complete( return text -def _create_text_row_count_match(lang: str, value: int, for_failure: bool = False) -> str: +def 
_create_text_row_count_match(lang: str, value: dict, for_failure: bool = False) -> str: type_ = _expect_failure_type(for_failure=for_failure) values_text = _prep_values_text(value["count"], lang=lang) @@ -13463,7 +13459,7 @@ def _create_text_row_count_match(lang: str, value: int, for_failure: bool = Fals return EXPECT_FAIL_TEXT[f"row_count_match_n_{type_}_text"][lang].format(values_text=values_text) -def _create_text_col_count_match(lang: str, value: int, for_failure: bool = False) -> str: +def _create_text_col_count_match(lang: str, value: dict, for_failure: bool = False) -> str: type_ = _expect_failure_type(for_failure=for_failure) values_text = _prep_values_text(value["count"], lang=lang) @@ -13486,19 +13482,13 @@ def _create_text_specially(lang: str, for_failure: bool = False) -> str: def _prep_column_text(column: str | list[str]) -> str: if isinstance(column, list): return "`" + str(column[0]) + "`" - elif isinstance(column, str): + if isinstance(column, str): return "`" + column + "`" - else: - return "" + raise AssertionError def _prep_values_text( - values: str - | int - | float - | datetime.datetime - | datetime.date - | list[str | int | float | datetime.datetime | datetime.date], + values: _CompliantValue | _CompliantValues, lang: str, limit: int = 3, ) -> str: diff --git a/tests/test_validate.py b/tests/test_validate.py index f5e323317..91aed6a82 100644 --- a/tests/test_validate.py +++ b/tests/test_validate.py @@ -12650,627 +12650,6 @@ def test_above_threshold_no_interrogation(): def test_prep_column_text(): assert _prep_column_text(column="column") == "`column`" assert _prep_column_text(column=["column_a", "column_b"]) == "`column_a`" - assert _prep_column_text(column=3) == "" - -def test_validate_csv_string_path_input(): - csv_path = "data_raw/small_table.csv" - validator = Validate(data=csv_path) - - # Verify data was loaded correctly - assert hasattr(validator.data, "shape") - assert validator.data.shape[0] > 0 # Has rows - assert validator.data.shape[1] > 0 # Has columns - - # Verify it's a DataFrame-like object - assert hasattr(validator.data, "columns") - - # Test that validation methods still work - result = validator.col_exists(["date", "a"]) - assert isinstance(result, Validate) - - -def test_validate_csv_path_object_input(): - csv_path = Path("data_raw/small_table.csv") - validator = Validate(data=csv_path) - - # Verify data was loaded correctly - assert hasattr(validator.data, "shape") - assert validator.data.shape[0] > 0 - assert validator.data.shape[1] > 0 - - -def test_validate_non_csv_string_passthrough(): - test_data = "not_a_csv_file" - validator = Validate(data=test_data) - - assert validator.data == test_data - assert isinstance(validator.data, str) - - -def test_validate_non_csv_path_passthrough(): - test_path = Path("data_raw/small_table.txt") # Different extension - validator = Validate(data=test_path) - - assert validator.data == test_path - assert isinstance(validator.data, Path) - - -def test_validate_non_existent_csv_file_error(): - with pytest.raises(FileNotFoundError, match="CSV file not found"): - Validate(data="nonexistent_file.csv") - - -def test_validate_dataframe_passthrough(): - # Try to import and create a DataFrame - try: - import polars as pl - - df = pl.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}) - except ImportError: - try: - import pandas as pd - - df = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}) - except ImportError: - pytest.skip("No DataFrame library available") - - validator = Validate(data=df) - - # Should be the same object 
(identity check) - assert validator.data is df - - -def test_validate_csv_integration_with_validations(): - csv_path = "data_raw/small_table.csv" - validator = Validate(data=csv_path) - - # Chain multiple validation methods - result = validator.col_exists(["date", "a"]).col_vals_not_null(["a"]) - - # Should return the same Validate object - assert result is validator - - # Should have validation steps added - assert len(validator.validation_info) > 0 - - -def test_validate_csv_different_files(): - csv_files = [ - "data_raw/small_table.csv", - "data_raw/game_revenue.csv", - ] - - for csv_file in csv_files: - try: - validator = Validate(data=csv_file) - assert hasattr(validator.data, "shape") - assert validator.data.shape[0] > 0 - assert validator.data.shape[1] > 0 - except FileNotFoundError: - # Skip if file doesn't exist - continue - - -def test_validate_csv_case_insensitive_extension(): - # Test the internal logic by using a CSV file we know exists - csv_path = "data_raw/small_table.csv" - validator = Validate(data=csv_path) - assert hasattr(validator.data, "shape") - - # The case insensitivity is handled by Path.suffix.lower() == '.csv' - - -def test_validate_csv_library_preference(): - csv_path = "data_raw/small_table.csv" - validator = Validate(data=csv_path) - - # Check which library was used based on the data type - data_type = type(validator.data).__name__ - - # If Polars is available, it should be used - try: - import polars as pl - - assert "polars" in data_type.lower() or "dataframe" in data_type.lower() - except ImportError: - # If only Pandas is available - try: - import pandas as pd - - assert "pandas" in data_type.lower() or "dataframe" in data_type.lower() - except ImportError: - pytest.fail("No DataFrame library available for CSV reading") - - -def test_validate_csv_with_interrogation(): - csv_path = "data_raw/small_table.csv" - validator = Validate(data=csv_path) - - # Add validation steps and interrogate - result = validator.col_exists(["date", "a"]).col_vals_not_null(["a"]).interrogate() - - # Should have completed interrogation - assert len(result.validation_info) > 0 - - # Check that we can get reports - report = result.get_tabular_report() - assert report is not None - - -def test_validate_parquet_single_file(): - parquet_path = TEST_DATA_DIR / "taxi_sample.parquet" - validator = Validate(data=str(parquet_path)) - - # Verify data was loaded correctly - assert hasattr(validator.data, "shape") - assert validator.data.shape[0] == 1000 # Expected sample size - assert validator.data.shape[1] == 18 # NYC taxi data columns - - # Verify it's a DataFrame-like object - assert hasattr(validator.data, "columns") - - # Test that validation methods still work - result = validator.col_exists(["vendor_name", "Trip_Distance"]) - assert isinstance(result, Validate) - - -def test_validate_parquet_glob_pattern(): - pattern = str(TEST_DATA_DIR / "taxi_part_*.parquet") - validator = Validate(data=pattern) - - # Should have 333 + 333 + 334 = 1000 rows (all three parts combined) - assert validator.data.shape[0] == 1000 - assert validator.data.shape[1] == 18 - - -def test_validate_parquet_bracket_pattern(): - pattern = str(TEST_DATA_DIR / "taxi_part_0[1-2].parquet") - validator = Validate(data=pattern) - - # Should have 333 + 333 = 666 rows (first two parts only) - assert validator.data.shape[0] == 666 - assert validator.data.shape[1] == 18 - - -def test_validate_parquet_directory(): - parquet_dir = TEST_DATA_DIR / "parquet_data" - validator = Validate(data=str(parquet_dir)) - - # Check that 
we have a reasonable quantity of data and that it's - # greater than individual file sizes - assert validator.data.shape[0] > 600 # Should have multiple files worth of data - assert validator.data.shape[1] > 0 # Should have columns - - -def test_validate_parquet_list_of_files(): - file_list = [ - str(TEST_DATA_DIR / "taxi_part_01.parquet"), - str(TEST_DATA_DIR / "taxi_part_02.parquet"), - ] - validator = Validate(data=file_list) - - # Should have 333 + 333 = 666 rows - assert validator.data.shape[0] == 666 - assert validator.data.shape[1] == 18 - - -def test_validate_parquet_with_interrogation(): - parquet_path = TEST_DATA_DIR / "taxi_sample.parquet" - validator = Validate(data=str(parquet_path)) - - # Add validation steps and interrogate - result = ( - validator.col_exists(["vendor_name", "Trip_Distance"]) - .col_vals_not_null(["vendor_name"]) - .interrogate() - ) - - # Should have completed interrogation - assert ( - len(result.validation_info) == 3 - ) # col_exists + col_vals_not_null (2 steps total, but col_exists creates 2) - - -def test_validate_non_parquet_passthrough(): - test_data = {"a": [1, 2, 3], "b": [4, 5, 6]} - validator = Validate(data=test_data) - - # Should be the original dict - assert validator.data is test_data - assert isinstance(validator.data, dict) - - -def test_validate_parquet_file_not_found(): - with pytest.raises(FileNotFoundError): - Validate(data=str(TEST_DATA_DIR / "nonexistent.parquet")) - - -def test_validate_parquet_pattern_not_found(): - with pytest.raises(FileNotFoundError): - Validate(data=str(TEST_DATA_DIR / "nonexistent_*.parquet")) - - -def test_validate_parquet_directory_not_found(): - import tempfile - - # Create a temporary empty directory for this test - with tempfile.TemporaryDirectory() as temp_dir: - empty_dir = Path(temp_dir) / "empty_subdir" - empty_dir.mkdir() - - with pytest.raises(FileNotFoundError): - Validate(data=str(empty_dir)) - - -def test_validate_parquet_mixed_list(): - mixed_list = [ - str(TEST_DATA_DIR / "taxi_part_01.parquet"), - "some_regular_file.txt", # Not a parquet file - ] - validator = Validate(data=mixed_list) - - # Should return the original list unchanged - assert validator.data == mixed_list - - -def test_validate_parquet_partitioned_small_table(): - partitioned_path = TEST_DATA_DIR / "partitioned_small_table" - validator = Validate(data=str(partitioned_path)) - - # Should have 13 rows from all partitions and 8 columns including the partition column - assert validator.data.shape[0] == 13 - assert validator.data.shape[1] == 8 # All original columns including f - - # Should have the f column with partition values - assert "f" in validator.data.columns - - # Check that we have the expected f values - if hasattr(validator.data, "group_by"): # Polars - f_values = set(validator.data["f"].unique().to_list()) - else: # Pandas - f_values = set(validator.data["f"].unique()) - - expected_f_values = {"high", "low", "mid"} - assert f_values == expected_f_values - - # Test validation functionality works - result = validator.col_exists(["a", "b", "f"]).interrogate() - assert len(result.validation_info) == 3 # `col_exists()` creates one step per column - - -def test_validate_parquet_permanent_partitioned_sales(): - partitioned_path = TEST_DATA_DIR / "partitioned_sales" - validator = Validate(data=str(partitioned_path)) - - # Should have data from all partitions (100 rows total) - assert validator.data.shape[0] == 100 - assert validator.data.shape[1] == 9 # All original columns including status - - # Should have the status column 
with partition values - assert "status" in validator.data.columns - - # Check that we have the expected status values - if hasattr(validator.data, "group_by"): # Polars - status_counts = validator.data.group_by("status").len().sort("len", descending=True) - status_values = set(status_counts["status"].to_list()) - else: # Pandas - status_values = set(validator.data["status"].unique()) - - expected_statuses = {"pending", "shipped", "delivered", "returned", "cancelled"} - assert status_values == expected_statuses - - # Test validation functionality works - result = validator.col_exists(["product_id", "status", "revenue"]).interrogate() - assert len(result.validation_info) == 3 # `col_exists()` creates one step per column - - -def test_pandas_only_environment_scenario(): - from unittest.mock import patch - - # Mock polars as unavailable by making _is_lib_present return False for polars - with patch("pointblank.validate._is_lib_present") as mock_is_lib: - - def side_effect(lib_name): - return lib_name == "pandas" # Only pandas is available - - mock_is_lib.side_effect = side_effect - - import pandas as pd - import pointblank as pb - - # Create test data using Pandas with large numbers to trigger formatting - data = pd.DataFrame( - { - "transaction_amounts": [1000, 15000, 25000, 30000, 45000, 50000, 75000], - "customer_scores": [85.5, 92.3, 78.1, 88.7, 95.2, 82.4, 90.1], - "status": [ - "active", - "pending", - "active", - "completed", - "active", - "pending", - "completed", - ], - } - ) - - # Create validation with large threshold values that will trigger formatting - thresholds = pb.Thresholds(warning=5000, error=10000, critical=15000) - - validation = ( - pb.Validate(data=data, tbl_name="pandas_only_scenario", thresholds=thresholds) - .col_vals_gt(columns="transaction_amounts", value=500) # Large numbers - .col_vals_between(columns="customer_scores", left=70.0, right=100.0) - .col_vals_in_set(columns="status", set=["active", "pending", "completed"]) - .interrogate() - ) - - # Generate tabular report - should use Pandas-based GT formatting - report = validation.get_tabular_report() - assert report is not None - assert hasattr(report, "_body") - - # Verify formatting worked by checking report content using proper HTML rendering - report_html = report.as_raw_html() - assert len(report_html) > 1000 # Should have substantial content - assert "transaction_amounts" in report_html - - -def test_polars_only_environment_scenario(): - from unittest.mock import patch - - # Mock pandas as unavailable by making `_is_lib_present()` return False for pandas - with patch("pointblank.validate._is_lib_present") as mock_is_lib: - - def side_effect(lib_name): - return lib_name == "polars" # Only polars is available - - mock_is_lib.side_effect = side_effect - - import polars as pl - import pointblank as pb - - # Create test data using Polars with large numbers to trigger formatting - data = pl.DataFrame( - { - "transaction_amounts": [1000, 15000, 25000, 30000, 45000, 50000, 75000], - "customer_scores": [85.5, 92.3, 78.1, 88.7, 95.2, 82.4, 90.1], - "status": [ - "active", - "pending", - "active", - "completed", - "active", - "pending", - "completed", - ], - } - ) - - # Create validation with large threshold values that will trigger formatting - thresholds = pb.Thresholds(warning=5000, error=10000, critical=15000) - - validation = ( - pb.Validate(data=data, tbl_name="polars_only_scenario", thresholds=thresholds) - .col_vals_gt(columns="transaction_amounts", value=500) # Large numbers - 
.col_vals_between(columns="customer_scores", left=70.0, right=100.0) - .col_vals_in_set(columns="status", set=["active", "pending", "completed"]) - .interrogate() - ) - - # Generate tabular report - should use Polars-based GT formatting - report = validation.get_tabular_report() - assert report is not None - assert hasattr(report, "_body") - - # Verify formatting worked by checking report content using proper HTML rendering - report_html = report.as_raw_html() - assert len(report_html) > 1000 # Should have substantial content - assert "transaction_amounts" in report_html - - -def test_both_libraries_environment_scenario(): - import pandas as pd - import polars as pl - import pointblank as pb - - # Test data for both DataFrame types - test_values = { - "revenue": [10000, 25000, 30000, 45000, 60000, 75000, 90000], - "profit_margin": [0.15, 0.22, 0.18, 0.25, 0.20, 0.28, 0.32], - "region": ["North", "South", "East", "West", "North", "South", "East"], - } - - # Create test data using both Polars and Pandas - polars_data = pl.DataFrame(test_values) - pandas_data = pd.DataFrame(test_values) - - # Large threshold values that will trigger formatting - thresholds = pb.Thresholds(warning=8000, error=12000, critical=20000) - - # Test with Polars DataFrame (should use Polars-based formatting) - polars_validation = ( - pb.Validate(data=polars_data, tbl_name="polars_mixed_env", thresholds=thresholds) - .col_vals_gt(columns="revenue", value=5000) - .col_vals_between(columns="profit_margin", left=0.1, right=0.4) - .col_vals_in_set(columns="region", set=["North", "South", "East", "West"]) - .interrogate() - ) - - polars_report = polars_validation.get_tabular_report() - assert polars_report is not None - assert hasattr(polars_report, "_body") - - # Test with Pandas DataFrame (should use Pandas-based formatting) - pandas_validation = ( - pb.Validate(data=pandas_data, tbl_name="pandas_mixed_env", thresholds=thresholds) - .col_vals_gt(columns="revenue", value=5000) - .col_vals_between(columns="profit_margin", left=0.1, right=0.4) - .col_vals_in_set(columns="region", set=["North", "South", "East", "West"]) - .interrogate() - ) - - pandas_report = pandas_validation.get_tabular_report() - assert pandas_report is not None - assert hasattr(pandas_report, "_body") - - # Both reports should be generated successfully - polars_html = polars_report.as_raw_html() - pandas_html = pandas_report.as_raw_html() - - assert len(polars_html) > 1000 # Should have substantial content - assert len(pandas_html) > 1000 # Should have substantial content - assert "revenue" in polars_html - assert "revenue" in pandas_html - - -def test_dataframe_library_formatting_consistency_across_scenarios(): - import pandas as pd - import polars as pl - from pointblank.validate import ( - _format_single_number_with_gt, - _format_single_float_with_gt, - _format_single_integer_with_gt, - ) - - # Test values that would commonly trigger formatting - test_numbers = [1000, 12345, 999999, 1000000] - test_floats = [1234.56, 99999.99, 0.000123] - test_integers = [1500, 25000, 100000] - - # Test number formatting consistency - for value in test_numbers: - polars_result = _format_single_number_with_gt( - value, n_sigfig=3, compact=True, locale="en", df_lib=pl - ) - pandas_result = _format_single_number_with_gt( - value, n_sigfig=3, compact=True, locale="en", df_lib=pd - ) - - assert polars_result == pandas_result - - # Test float formatting consistency - for value in test_floats: - polars_result = _format_single_float_with_gt(value, decimals=2, locale="en", 
df_lib=pl) - pandas_result = _format_single_float_with_gt(value, decimals=2, locale="en", df_lib=pd) - - assert polars_result == pandas_result - - # Test integer formatting consistency - for value in test_integers: - polars_result = _format_single_integer_with_gt(value, locale="en", df_lib=pl) - pandas_result = _format_single_integer_with_gt(value, locale="en", df_lib=pd) - - assert polars_result == pandas_result - - -def test_scenario_integration_with_large_datasets(): - import polars as pl - import pandas as pd - import pointblank as pb - - # Create large dataset that will trigger number formatting in various functions - large_size = 2000 # Reduced size for faster testing - - # Polars version - polars_large_data = pl.DataFrame( - { - "transaction_id": range(1, large_size + 1), - "amount": [i * 1000 for i in range(1, large_size + 1)], # Large monetary values - "customer_tier": ["premium" if i % 3 == 0 else "standard" for i in range(large_size)], - "processing_fee": [round(i * 0.025, 2) for i in range(1, large_size + 1)], - } - ) - - # Pandas version - pandas_large_data = pd.DataFrame( - { - "transaction_id": range(1, large_size + 1), - "amount": [i * 1000 for i in range(1, large_size + 1)], # Large monetary values - "customer_tier": ["premium" if i % 3 == 0 else "standard" for i in range(large_size)], - "processing_fee": [round(i * 0.025, 2) for i in range(1, large_size + 1)], - } - ) - - # High threshold values that will trigger threshold formatting - thresholds = pb.Thresholds(warning=1000, error=2500, critical=4000) - - datasets = [ - ("Polars Large Dataset", polars_large_data), - ("Pandas Large Dataset", pandas_large_data), - ] - - for dataset_name, data in datasets: - # Complex validation with multiple steps - validation = ( - pb.Validate(data=data, tbl_name=f"large_{dataset_name.lower()}", thresholds=thresholds) - .col_vals_gt(columns="amount", value=500) # Large numbers formatting - .col_vals_between(columns="processing_fee", left=0.0, right=200.0) # Float formatting - .col_vals_in_set(columns="customer_tier", set=["premium", "standard"]) - .col_vals_not_null(columns="transaction_id") # Integer formatting - .interrogate() - ) - - # Generate report - should handle large numbers correctly - report = validation.get_tabular_report() - assert report is not None - assert hasattr(report, "_body") - - # Verify report quality - report_html = str(report) - assert len(report_html) > 2000 # Should have substantial formatted content - - # Check that validation worked correctly - assert len(validation.validation_info) == 4 # Four validation steps - assert all(step.all_passed for step in validation.validation_info) # All should pass - - -def test_scenario_edge_cases_and_error_handling(): - import polars as pl - import pandas as pd - import pointblank as pb - from pointblank.validate import _format_single_number_with_gt - - # Test with some edge case values - edge_cases = [ - 0, # Zero - 1, # Small positive - -1000, # Negative number - 999999999, # Very large number - ] - - # Test that edge cases work with both libraries - for value in edge_cases: - try: - polars_result = _format_single_number_with_gt(value, n_sigfig=3, df_lib=pl) - pandas_result = _format_single_number_with_gt(value, n_sigfig=3, df_lib=pd) - - # Both should return strings and be identical - assert isinstance(polars_result, str) - assert isinstance(pandas_result, str) - assert polars_result == pandas_result - - except Exception as e: - pytest.fail(f"Edge case {value} failed: {e}") - - # Test with None df_lib (backward 
compatibility) - try: - none_result = _format_single_number_with_gt(12345, n_sigfig=3, df_lib=None) - assert isinstance(none_result, str) - except Exception as e: - pytest.fail(f"df_lib=None case failed: {e}") - - # Test empty datasets don't cause formatting issues - empty_polars = pl.DataFrame({"values": pl.Series([], dtype=pl.Int64)}) - empty_pandas = pd.DataFrame({"values": pd.Series([], dtype="int64")}) - - for name, empty_data in [("Polars", empty_polars), ("Pandas", empty_pandas)]: - validation = pb.Validate(data=empty_data, tbl_name=f"empty_{name.lower()}") - # Should be able to create validation object even with empty data - assert validation is not None - - # Adding validation steps to empty data should work - validation = validation.col_vals_gt(columns="values", value=0) - assert len(validation.validation_info) == 1 + with pytest.raises(AssertionError): + _prep_column_text(column=3) From e1a997042397199d7652f1b1806734b501229488 Mon Sep 17 00:00:00 2001 From: Tyler Riccio Date: Mon, 26 May 2025 14:31:08 -0400 Subject: [PATCH 09/11] autobrief return not implemented instead of None --- pointblank/validate.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pointblank/validate.py b/pointblank/validate.py index 44c1df6aa..3dbe94476 100644 --- a/pointblank/validate.py +++ b/pointblank/validate.py @@ -13279,7 +13279,7 @@ def _create_autobrief_or_failure_text( if assertion_type == "specially": return _create_text_specially(lang=lang, for_failure=for_failure) - return None # pragma: no cover + raise NotImplementedError # pragma: no cover def _expect_failure_type(for_failure: bool) -> str: @@ -13345,7 +13345,7 @@ def _create_text_between( def _create_text_set( - lang: str, column: str, values: list[any], not_: bool = False, for_failure: bool = False + lang: str, column: str, values: list[Any], not_: bool = False, for_failure: bool = False ) -> str: type_ = _expect_failure_type(for_failure=for_failure) From 167dfdf209b87097e831b55e3840eef3ff579311 Mon Sep 17 00:00:00 2001 From: Tyler Riccio <83321774+tylerriccio33@users.noreply.github.com> Date: Wed, 18 Jun 2025 22:01:58 -0400 Subject: [PATCH 10/11] Manual revert test_validate.py I missed a merge marker resulting in a huge diff. 
--- tests/test_validate.py | 625 ++++++++++++++++++++++++++++++++++++++++- 1 file changed, 623 insertions(+), 2 deletions(-) diff --git a/tests/test_validate.py b/tests/test_validate.py index 91aed6a82..f5e323317 100644 --- a/tests/test_validate.py +++ b/tests/test_validate.py @@ -12650,6 +12650,627 @@ def test_above_threshold_no_interrogation(): def test_prep_column_text(): assert _prep_column_text(column="column") == "`column`" assert _prep_column_text(column=["column_a", "column_b"]) == "`column_a`" + assert _prep_column_text(column=3) == "" - with pytest.raises(AssertionError): - _prep_column_text(column=3) + +def test_validate_csv_string_path_input(): + csv_path = "data_raw/small_table.csv" + validator = Validate(data=csv_path) + + # Verify data was loaded correctly + assert hasattr(validator.data, "shape") + assert validator.data.shape[0] > 0 # Has rows + assert validator.data.shape[1] > 0 # Has columns + + # Verify it's a DataFrame-like object + assert hasattr(validator.data, "columns") + + # Test that validation methods still work + result = validator.col_exists(["date", "a"]) + assert isinstance(result, Validate) + + +def test_validate_csv_path_object_input(): + csv_path = Path("data_raw/small_table.csv") + validator = Validate(data=csv_path) + + # Verify data was loaded correctly + assert hasattr(validator.data, "shape") + assert validator.data.shape[0] > 0 + assert validator.data.shape[1] > 0 + + +def test_validate_non_csv_string_passthrough(): + test_data = "not_a_csv_file" + validator = Validate(data=test_data) + + assert validator.data == test_data + assert isinstance(validator.data, str) + + +def test_validate_non_csv_path_passthrough(): + test_path = Path("data_raw/small_table.txt") # Different extension + validator = Validate(data=test_path) + + assert validator.data == test_path + assert isinstance(validator.data, Path) + + +def test_validate_non_existent_csv_file_error(): + with pytest.raises(FileNotFoundError, match="CSV file not found"): + Validate(data="nonexistent_file.csv") + + +def test_validate_dataframe_passthrough(): + # Try to import and create a DataFrame + try: + import polars as pl + + df = pl.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}) + except ImportError: + try: + import pandas as pd + + df = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}) + except ImportError: + pytest.skip("No DataFrame library available") + + validator = Validate(data=df) + + # Should be the same object (identity check) + assert validator.data is df + + +def test_validate_csv_integration_with_validations(): + csv_path = "data_raw/small_table.csv" + validator = Validate(data=csv_path) + + # Chain multiple validation methods + result = validator.col_exists(["date", "a"]).col_vals_not_null(["a"]) + + # Should return the same Validate object + assert result is validator + + # Should have validation steps added + assert len(validator.validation_info) > 0 + + +def test_validate_csv_different_files(): + csv_files = [ + "data_raw/small_table.csv", + "data_raw/game_revenue.csv", + ] + + for csv_file in csv_files: + try: + validator = Validate(data=csv_file) + assert hasattr(validator.data, "shape") + assert validator.data.shape[0] > 0 + assert validator.data.shape[1] > 0 + except FileNotFoundError: + # Skip if file doesn't exist + continue + + +def test_validate_csv_case_insensitive_extension(): + # Test the internal logic by using a CSV file we know exists + csv_path = "data_raw/small_table.csv" + validator = Validate(data=csv_path) + assert hasattr(validator.data, "shape") + + # The case 
insensitivity is handled by Path.suffix.lower() == '.csv' + + +def test_validate_csv_library_preference(): + csv_path = "data_raw/small_table.csv" + validator = Validate(data=csv_path) + + # Check which library was used based on the data type + data_type = type(validator.data).__name__ + + # If Polars is available, it should be used + try: + import polars as pl + + assert "polars" in data_type.lower() or "dataframe" in data_type.lower() + except ImportError: + # If only Pandas is available + try: + import pandas as pd + + assert "pandas" in data_type.lower() or "dataframe" in data_type.lower() + except ImportError: + pytest.fail("No DataFrame library available for CSV reading") + + +def test_validate_csv_with_interrogation(): + csv_path = "data_raw/small_table.csv" + validator = Validate(data=csv_path) + + # Add validation steps and interrogate + result = validator.col_exists(["date", "a"]).col_vals_not_null(["a"]).interrogate() + + # Should have completed interrogation + assert len(result.validation_info) > 0 + + # Check that we can get reports + report = result.get_tabular_report() + assert report is not None + + +def test_validate_parquet_single_file(): + parquet_path = TEST_DATA_DIR / "taxi_sample.parquet" + validator = Validate(data=str(parquet_path)) + + # Verify data was loaded correctly + assert hasattr(validator.data, "shape") + assert validator.data.shape[0] == 1000 # Expected sample size + assert validator.data.shape[1] == 18 # NYC taxi data columns + + # Verify it's a DataFrame-like object + assert hasattr(validator.data, "columns") + + # Test that validation methods still work + result = validator.col_exists(["vendor_name", "Trip_Distance"]) + assert isinstance(result, Validate) + + +def test_validate_parquet_glob_pattern(): + pattern = str(TEST_DATA_DIR / "taxi_part_*.parquet") + validator = Validate(data=pattern) + + # Should have 333 + 333 + 334 = 1000 rows (all three parts combined) + assert validator.data.shape[0] == 1000 + assert validator.data.shape[1] == 18 + + +def test_validate_parquet_bracket_pattern(): + pattern = str(TEST_DATA_DIR / "taxi_part_0[1-2].parquet") + validator = Validate(data=pattern) + + # Should have 333 + 333 = 666 rows (first two parts only) + assert validator.data.shape[0] == 666 + assert validator.data.shape[1] == 18 + + +def test_validate_parquet_directory(): + parquet_dir = TEST_DATA_DIR / "parquet_data" + validator = Validate(data=str(parquet_dir)) + + # Check that we have a reasonable quantity of data and that it's + # greater than individual file sizes + assert validator.data.shape[0] > 600 # Should have multiple files worth of data + assert validator.data.shape[1] > 0 # Should have columns + + +def test_validate_parquet_list_of_files(): + file_list = [ + str(TEST_DATA_DIR / "taxi_part_01.parquet"), + str(TEST_DATA_DIR / "taxi_part_02.parquet"), + ] + validator = Validate(data=file_list) + + # Should have 333 + 333 = 666 rows + assert validator.data.shape[0] == 666 + assert validator.data.shape[1] == 18 + + +def test_validate_parquet_with_interrogation(): + parquet_path = TEST_DATA_DIR / "taxi_sample.parquet" + validator = Validate(data=str(parquet_path)) + + # Add validation steps and interrogate + result = ( + validator.col_exists(["vendor_name", "Trip_Distance"]) + .col_vals_not_null(["vendor_name"]) + .interrogate() + ) + + # Should have completed interrogation + assert ( + len(result.validation_info) == 3 + ) # col_exists + col_vals_not_null (2 steps total, but col_exists creates 2) + + +def test_validate_non_parquet_passthrough(): + 
test_data = {"a": [1, 2, 3], "b": [4, 5, 6]} + validator = Validate(data=test_data) + + # Should be the original dict + assert validator.data is test_data + assert isinstance(validator.data, dict) + + +def test_validate_parquet_file_not_found(): + with pytest.raises(FileNotFoundError): + Validate(data=str(TEST_DATA_DIR / "nonexistent.parquet")) + + +def test_validate_parquet_pattern_not_found(): + with pytest.raises(FileNotFoundError): + Validate(data=str(TEST_DATA_DIR / "nonexistent_*.parquet")) + + +def test_validate_parquet_directory_not_found(): + import tempfile + + # Create a temporary empty directory for this test + with tempfile.TemporaryDirectory() as temp_dir: + empty_dir = Path(temp_dir) / "empty_subdir" + empty_dir.mkdir() + + with pytest.raises(FileNotFoundError): + Validate(data=str(empty_dir)) + + +def test_validate_parquet_mixed_list(): + mixed_list = [ + str(TEST_DATA_DIR / "taxi_part_01.parquet"), + "some_regular_file.txt", # Not a parquet file + ] + validator = Validate(data=mixed_list) + + # Should return the original list unchanged + assert validator.data == mixed_list + + +def test_validate_parquet_partitioned_small_table(): + partitioned_path = TEST_DATA_DIR / "partitioned_small_table" + validator = Validate(data=str(partitioned_path)) + + # Should have 13 rows from all partitions and 8 columns including the partition column + assert validator.data.shape[0] == 13 + assert validator.data.shape[1] == 8 # All original columns including f + + # Should have the f column with partition values + assert "f" in validator.data.columns + + # Check that we have the expected f values + if hasattr(validator.data, "group_by"): # Polars + f_values = set(validator.data["f"].unique().to_list()) + else: # Pandas + f_values = set(validator.data["f"].unique()) + + expected_f_values = {"high", "low", "mid"} + assert f_values == expected_f_values + + # Test validation functionality works + result = validator.col_exists(["a", "b", "f"]).interrogate() + assert len(result.validation_info) == 3 # `col_exists()` creates one step per column + + +def test_validate_parquet_permanent_partitioned_sales(): + partitioned_path = TEST_DATA_DIR / "partitioned_sales" + validator = Validate(data=str(partitioned_path)) + + # Should have data from all partitions (100 rows total) + assert validator.data.shape[0] == 100 + assert validator.data.shape[1] == 9 # All original columns including status + + # Should have the status column with partition values + assert "status" in validator.data.columns + + # Check that we have the expected status values + if hasattr(validator.data, "group_by"): # Polars + status_counts = validator.data.group_by("status").len().sort("len", descending=True) + status_values = set(status_counts["status"].to_list()) + else: # Pandas + status_values = set(validator.data["status"].unique()) + + expected_statuses = {"pending", "shipped", "delivered", "returned", "cancelled"} + assert status_values == expected_statuses + + # Test validation functionality works + result = validator.col_exists(["product_id", "status", "revenue"]).interrogate() + assert len(result.validation_info) == 3 # `col_exists()` creates one step per column + + +def test_pandas_only_environment_scenario(): + from unittest.mock import patch + + # Mock polars as unavailable by making _is_lib_present return False for polars + with patch("pointblank.validate._is_lib_present") as mock_is_lib: + + def side_effect(lib_name): + return lib_name == "pandas" # Only pandas is available + + mock_is_lib.side_effect = side_effect + + 
import pandas as pd + import pointblank as pb + + # Create test data using Pandas with large numbers to trigger formatting + data = pd.DataFrame( + { + "transaction_amounts": [1000, 15000, 25000, 30000, 45000, 50000, 75000], + "customer_scores": [85.5, 92.3, 78.1, 88.7, 95.2, 82.4, 90.1], + "status": [ + "active", + "pending", + "active", + "completed", + "active", + "pending", + "completed", + ], + } + ) + + # Create validation with large threshold values that will trigger formatting + thresholds = pb.Thresholds(warning=5000, error=10000, critical=15000) + + validation = ( + pb.Validate(data=data, tbl_name="pandas_only_scenario", thresholds=thresholds) + .col_vals_gt(columns="transaction_amounts", value=500) # Large numbers + .col_vals_between(columns="customer_scores", left=70.0, right=100.0) + .col_vals_in_set(columns="status", set=["active", "pending", "completed"]) + .interrogate() + ) + + # Generate tabular report - should use Pandas-based GT formatting + report = validation.get_tabular_report() + assert report is not None + assert hasattr(report, "_body") + + # Verify formatting worked by checking report content using proper HTML rendering + report_html = report.as_raw_html() + assert len(report_html) > 1000 # Should have substantial content + assert "transaction_amounts" in report_html + + +def test_polars_only_environment_scenario(): + from unittest.mock import patch + + # Mock pandas as unavailable by making `_is_lib_present()` return False for pandas + with patch("pointblank.validate._is_lib_present") as mock_is_lib: + + def side_effect(lib_name): + return lib_name == "polars" # Only polars is available + + mock_is_lib.side_effect = side_effect + + import polars as pl + import pointblank as pb + + # Create test data using Polars with large numbers to trigger formatting + data = pl.DataFrame( + { + "transaction_amounts": [1000, 15000, 25000, 30000, 45000, 50000, 75000], + "customer_scores": [85.5, 92.3, 78.1, 88.7, 95.2, 82.4, 90.1], + "status": [ + "active", + "pending", + "active", + "completed", + "active", + "pending", + "completed", + ], + } + ) + + # Create validation with large threshold values that will trigger formatting + thresholds = pb.Thresholds(warning=5000, error=10000, critical=15000) + + validation = ( + pb.Validate(data=data, tbl_name="polars_only_scenario", thresholds=thresholds) + .col_vals_gt(columns="transaction_amounts", value=500) # Large numbers + .col_vals_between(columns="customer_scores", left=70.0, right=100.0) + .col_vals_in_set(columns="status", set=["active", "pending", "completed"]) + .interrogate() + ) + + # Generate tabular report - should use Polars-based GT formatting + report = validation.get_tabular_report() + assert report is not None + assert hasattr(report, "_body") + + # Verify formatting worked by checking report content using proper HTML rendering + report_html = report.as_raw_html() + assert len(report_html) > 1000 # Should have substantial content + assert "transaction_amounts" in report_html + + +def test_both_libraries_environment_scenario(): + import pandas as pd + import polars as pl + import pointblank as pb + + # Test data for both DataFrame types + test_values = { + "revenue": [10000, 25000, 30000, 45000, 60000, 75000, 90000], + "profit_margin": [0.15, 0.22, 0.18, 0.25, 0.20, 0.28, 0.32], + "region": ["North", "South", "East", "West", "North", "South", "East"], + } + + # Create test data using both Polars and Pandas + polars_data = pl.DataFrame(test_values) + pandas_data = pd.DataFrame(test_values) + + # Large threshold values 
that will trigger formatting + thresholds = pb.Thresholds(warning=8000, error=12000, critical=20000) + + # Test with Polars DataFrame (should use Polars-based formatting) + polars_validation = ( + pb.Validate(data=polars_data, tbl_name="polars_mixed_env", thresholds=thresholds) + .col_vals_gt(columns="revenue", value=5000) + .col_vals_between(columns="profit_margin", left=0.1, right=0.4) + .col_vals_in_set(columns="region", set=["North", "South", "East", "West"]) + .interrogate() + ) + + polars_report = polars_validation.get_tabular_report() + assert polars_report is not None + assert hasattr(polars_report, "_body") + + # Test with Pandas DataFrame (should use Pandas-based formatting) + pandas_validation = ( + pb.Validate(data=pandas_data, tbl_name="pandas_mixed_env", thresholds=thresholds) + .col_vals_gt(columns="revenue", value=5000) + .col_vals_between(columns="profit_margin", left=0.1, right=0.4) + .col_vals_in_set(columns="region", set=["North", "South", "East", "West"]) + .interrogate() + ) + + pandas_report = pandas_validation.get_tabular_report() + assert pandas_report is not None + assert hasattr(pandas_report, "_body") + + # Both reports should be generated successfully + polars_html = polars_report.as_raw_html() + pandas_html = pandas_report.as_raw_html() + + assert len(polars_html) > 1000 # Should have substantial content + assert len(pandas_html) > 1000 # Should have substantial content + assert "revenue" in polars_html + assert "revenue" in pandas_html + + +def test_dataframe_library_formatting_consistency_across_scenarios(): + import pandas as pd + import polars as pl + from pointblank.validate import ( + _format_single_number_with_gt, + _format_single_float_with_gt, + _format_single_integer_with_gt, + ) + + # Test values that would commonly trigger formatting + test_numbers = [1000, 12345, 999999, 1000000] + test_floats = [1234.56, 99999.99, 0.000123] + test_integers = [1500, 25000, 100000] + + # Test number formatting consistency + for value in test_numbers: + polars_result = _format_single_number_with_gt( + value, n_sigfig=3, compact=True, locale="en", df_lib=pl + ) + pandas_result = _format_single_number_with_gt( + value, n_sigfig=3, compact=True, locale="en", df_lib=pd + ) + + assert polars_result == pandas_result + + # Test float formatting consistency + for value in test_floats: + polars_result = _format_single_float_with_gt(value, decimals=2, locale="en", df_lib=pl) + pandas_result = _format_single_float_with_gt(value, decimals=2, locale="en", df_lib=pd) + + assert polars_result == pandas_result + + # Test integer formatting consistency + for value in test_integers: + polars_result = _format_single_integer_with_gt(value, locale="en", df_lib=pl) + pandas_result = _format_single_integer_with_gt(value, locale="en", df_lib=pd) + + assert polars_result == pandas_result + + +def test_scenario_integration_with_large_datasets(): + import polars as pl + import pandas as pd + import pointblank as pb + + # Create large dataset that will trigger number formatting in various functions + large_size = 2000 # Reduced size for faster testing + + # Polars version + polars_large_data = pl.DataFrame( + { + "transaction_id": range(1, large_size + 1), + "amount": [i * 1000 for i in range(1, large_size + 1)], # Large monetary values + "customer_tier": ["premium" if i % 3 == 0 else "standard" for i in range(large_size)], + "processing_fee": [round(i * 0.025, 2) for i in range(1, large_size + 1)], + } + ) + + # Pandas version + pandas_large_data = pd.DataFrame( + { + "transaction_id": 
range(1, large_size + 1),
+            "amount": [i * 1000 for i in range(1, large_size + 1)],  # Large monetary values
+            "customer_tier": ["premium" if i % 3 == 0 else "standard" for i in range(large_size)],
+            "processing_fee": [round(i * 0.025, 2) for i in range(1, large_size + 1)],
+        }
+    )
+
+    # High threshold values that will trigger threshold formatting
+    thresholds = pb.Thresholds(warning=1000, error=2500, critical=4000)
+
+    datasets = [
+        ("Polars Large Dataset", polars_large_data),
+        ("Pandas Large Dataset", pandas_large_data),
+    ]
+
+    for dataset_name, data in datasets:
+        # Complex validation with multiple steps
+        validation = (
+            pb.Validate(data=data, tbl_name=f"large_{dataset_name.lower()}", thresholds=thresholds)
+            .col_vals_gt(columns="amount", value=500)  # Large numbers formatting
+            .col_vals_between(columns="processing_fee", left=0.0, right=200.0)  # Float formatting
+            .col_vals_in_set(columns="customer_tier", set=["premium", "standard"])
+            .col_vals_not_null(columns="transaction_id")  # Integer formatting
+            .interrogate()
+        )
+
+        # Generate report - should handle large numbers correctly
+        report = validation.get_tabular_report()
+        assert report is not None
+        assert hasattr(report, "_body")
+
+        # Verify report quality using proper HTML rendering
+        report_html = report.as_raw_html()
+        assert len(report_html) > 2000  # Should have substantial formatted content
+
+        # Check that validation worked correctly
+        assert len(validation.validation_info) == 4  # Four validation steps
+        assert all(step.all_passed for step in validation.validation_info)  # All should pass
+
+
+def test_scenario_edge_cases_and_error_handling():
+    import polars as pl
+    import pandas as pd
+    import pointblank as pb
+    from pointblank.validate import _format_single_number_with_gt
+
+    # Test with some edge case values
+    edge_cases = [
+        0,  # Zero
+        1,  # Small positive
+        -1000,  # Negative number
+        999999999,  # Very large number
+    ]
+
+    # Test that edge cases work with both libraries
+    for value in edge_cases:
+        try:
+            polars_result = _format_single_number_with_gt(value, n_sigfig=3, df_lib=pl)
+            pandas_result = _format_single_number_with_gt(value, n_sigfig=3, df_lib=pd)
+
+            # Both should return strings and be identical
+            assert isinstance(polars_result, str)
+            assert isinstance(pandas_result, str)
+            assert polars_result == pandas_result
+
+        except Exception as e:
+            pytest.fail(f"Edge case {value} failed: {e}")
+
+    # Test with None df_lib (backward compatibility)
+    try:
+        none_result = _format_single_number_with_gt(12345, n_sigfig=3, df_lib=None)
+        assert isinstance(none_result, str)
+    except Exception as e:
+        pytest.fail(f"df_lib=None case failed: {e}")
+
+    # Test empty datasets don't cause formatting issues
+    empty_polars = pl.DataFrame({"values": pl.Series([], dtype=pl.Int64)})
+    empty_pandas = pd.DataFrame({"values": pd.Series([], dtype="int64")})
+
+    for name, empty_data in [("Polars", empty_polars), ("Pandas", empty_pandas)]:
+        validation = pb.Validate(data=empty_data, tbl_name=f"empty_{name.lower()}")
+        # Should be able to create validation object even with empty data
+        assert validation is not None
+
+        # Adding validation steps to empty data should work
+        validation = validation.col_vals_gt(columns="values", value=0)
+        assert len(validation.validation_info) == 1

From 4d4dc407a200bb9f68bed4005865f7d33ca0626d Mon Sep 17 00:00:00 2001
From: Tyler Riccio <83321774+tylerriccio33@users.noreply.github.com>
Date: Wed, 18 Jun 2025 22:07:04 -0400
Subject: [PATCH 11/11] re add test for bad column type test_validate.py

---
 tests/test_validate.py | 3 ++-
 1 file changed, 
2 insertions(+), 1 deletion(-) diff --git a/tests/test_validate.py b/tests/test_validate.py index f5e323317..74b7bcbc6 100644 --- a/tests/test_validate.py +++ b/tests/test_validate.py @@ -12650,7 +12650,8 @@ def test_above_threshold_no_interrogation(): def test_prep_column_text(): assert _prep_column_text(column="column") == "`column`" assert _prep_column_text(column=["column_a", "column_b"]) == "`column_a`" - assert _prep_column_text(column=3) == "" + with pytest.raises(AssertionError): + _prep_column_text(column=3) def test_validate_csv_string_path_input():
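
Note: the hunk above re-adds coverage for a bad `column` type but only shows the test side of the change; the `_prep_column_text` implementation itself is not part of this patch. A minimal sketch of the assert-based guard the new expectation implies (an illustrative assumption, not the actual pointblank code) might look like:

def _prep_column_text(column) -> str:
    # Hypothetical guard: accept only a column name or a list of column names,
    # so a bad type such as column=3 raises AssertionError instead of returning "".
    assert isinstance(column, (str, list)), "`column` must be a string or a list of strings"
    # Only the first column is surfaced in the step's report text.
    if isinstance(column, list):
        return f"`{column[0]}`"
    return f"`{column}`"

With a guard like this, `_prep_column_text(column=3)` raises AssertionError, which is exactly what the updated `pytest.raises(AssertionError)` block asserts.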