diff --git a/CHANGELOG.md b/CHANGELOG.md index 6862fd8e6..7d33fed2d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -9,6 +9,10 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0. ### Added +- Environment variable `VALIDATE_QUERYABLES` to enable/disable validation of queryables in search/filter requests. When set to `true`, search requests will be validated against the defined queryables, returning a `400` error for any unsupported fields. Defaults to `false` for backward compatibility. [#532](https://github.com/stac-utils/stac-fastapi-elasticsearch-opensearch/pull/532) + +- Environment variable `QUERYABLES_CACHE_TTL` to configure the TTL (in seconds) for caching queryables. Default is `3600` seconds (1 hour) to balance performance and freshness of queryables data. [#532](https://github.com/stac-utils/stac-fastapi-elasticsearch-opensearch/pull/532) + ### Changed ### Fixed diff --git a/README.md b/README.md index f2a7f498e..36ca2b3df 100644 --- a/README.md +++ b/README.md @@ -367,8 +367,10 @@ You can customize additional settings in your `.env` file: | `STAC_INDEX_ASSETS` | Controls if Assets are indexed when added to Elasticsearch/Opensearch. This allows asset fields to be included in search queries. | `false` | Optional | | `USE_DATETIME` | Configures the datetime search behavior in SFEOS. When enabled, searches both datetime field and falls back to start_datetime/end_datetime range for items with null datetime. When disabled, searches only by start_datetime/end_datetime range. | `true` | Optional | | `USE_DATETIME_NANOS` | Enables nanosecond precision handling for `datetime` field searches as per the `date_nanos` type. When `False`, it uses 3 millisecond precision as per the type `date`. | `true` | Optional | -| `EXCLUDED_FROM_QUERYABLES` | Comma-separated list of fully qualified field names to exclude from the queryables endpoint and filtering. Use full paths like `properties.auth:schemes,properties.storage:schemes`. 
Excluded fields and their nested children will not be exposed in queryables. | None | Optional | +| `EXCLUDED_FROM_QUERYABLES` | Comma-separated list of fully qualified field names to exclude from the queryables endpoint and filtering. Use full paths like `properties.auth:schemes,properties.storage:schemes`. Excluded fields and their nested children will not be exposed in queryables. If `VALIDATE_QUERYABLES` is enabled, these fields will also be considered invalid for filtering. | None | Optional | | `EXCLUDED_FROM_ITEMS` | Specifies fields to exclude from STAC item responses. Supports comma-separated field names and dot notation for nested fields (e.g., `private_data,properties.confidential,assets.internal`). | `None` | Optional | +| `VALIDATE_QUERYABLES` | Enable validation of the fields used in `query` and `filter` search parameters against the queryables aggregated across all collections. If set to `true`, the API rejects (with `400 Bad Request`) any request that references a field not present in those queryables. | `false` | Optional | +| `QUERYABLES_CACHE_TTL` | Time-to-live (in seconds) for the queryables cache. Used when `VALIDATE_QUERYABLES` is enabled. | `3600` | Optional | > [!NOTE] @@ -424,6 +426,29 @@ EXCLUDED_FROM_QUERYABLES="properties.auth:schemes,properties.storage:schemes,pro - Excluded fields and their nested children will be skipped during field traversal - Both the field itself and any nested properties will be excluded +## Queryables Validation + +SFEOS supports validating the fields referenced in search requests (`query` and `filter`) against the defined queryables. This ensures that users only query fields that are explicitly exposed and indexed. + +**Configuration:** + +To enable queryables validation, set the following environment variables: + +```bash +VALIDATE_QUERYABLES=true +QUERYABLES_CACHE_TTL=3600 # Optional, defaults to 3600 seconds (1 hour) +``` + +**Behavior:** + +- When enabled, the API maintains a cache of all queryable fields across all collections. +- Search requests (both GET and POST) are checked against this cache. 
+- If a request contains a query parameter or filter field that is not in the list of allowed queryables, the API returns a `400 Bad Request` error with a message indicating the invalid field(s). +- The cache is automatically refreshed based on the `QUERYABLES_CACHE_TTL` setting. +- **Interaction with `EXCLUDED_FROM_QUERYABLES`**: If `VALIDATE_QUERYABLES` is enabled, fields listed in `EXCLUDED_FROM_QUERYABLES` will also be considered invalid for filtering. This effectively enforces the exclusion of these fields from search queries. + +This feature helps prevent queries on non-queryable fields which could lead to unnecessary load on the database. + ## Datetime-Based Index Management ### Overview diff --git a/stac_fastapi/core/stac_fastapi/core/base_database_logic.py b/stac_fastapi/core/stac_fastapi/core/base_database_logic.py index 105fdf925..1ed5265b7 100644 --- a/stac_fastapi/core/stac_fastapi/core/base_database_logic.py +++ b/stac_fastapi/core/stac_fastapi/core/base_database_logic.py @@ -138,3 +138,8 @@ async def delete_collection( ) -> None: """Delete a collection from the database.""" pass + + @abc.abstractmethod + async def get_queryables_mapping(self, collection_id: str = "*") -> Dict[str, Any]: + """Retrieve mapping of Queryables for search.""" + pass diff --git a/stac_fastapi/core/stac_fastapi/core/core.py b/stac_fastapi/core/stac_fastapi/core/core.py index 3334a4db3..fad0a979f 100644 --- a/stac_fastapi/core/stac_fastapi/core/core.py +++ b/stac_fastapi/core/stac_fastapi/core/core.py @@ -24,6 +24,10 @@ from stac_fastapi.core.base_settings import ApiBaseSettings from stac_fastapi.core.datetime_utils import format_datetime_range from stac_fastapi.core.models.links import PagingLinks +from stac_fastapi.core.queryables import ( + QueryablesCache, + get_properties_from_cql2_filter, +) from stac_fastapi.core.redis_utils import redis_pagination_links from stac_fastapi.core.serializers import CollectionSerializer, ItemSerializer from stac_fastapi.core.session import 
Session @@ -88,6 +92,10 @@ class CoreClient(AsyncBaseCoreClient): title: str = attr.ib(default="stac-fastapi") description: str = attr.ib(default="stac-fastapi") + def __attrs_post_init__(self): + """Initialize the queryables cache.""" + self.queryables_cache = QueryablesCache(self.database) + def _landing_page( self, base_url: str, @@ -816,6 +824,8 @@ async def post_search( ) if hasattr(search_request, "query") and getattr(search_request, "query"): + query_fields = set(getattr(search_request, "query").keys()) + await self.queryables_cache.validate(query_fields) for field_name, expr in getattr(search_request, "query").items(): field = "properties__" + field_name for op, value in expr.items(): @@ -834,7 +844,11 @@ async def post_search( if cql2_filter is not None: try: + query_fields = get_properties_from_cql2_filter(cql2_filter) + await self.queryables_cache.validate(query_fields) search = await self.database.apply_cql2_filter(search, cql2_filter) + except HTTPException: + raise except Exception as e: raise HTTPException( status_code=400, detail=f"Error with cql2 filter: {e}" diff --git a/stac_fastapi/core/stac_fastapi/core/queryables.py b/stac_fastapi/core/stac_fastapi/core/queryables.py new file mode 100644 index 000000000..63c63ba8f --- /dev/null +++ b/stac_fastapi/core/stac_fastapi/core/queryables.py @@ -0,0 +1,110 @@ +"""A module for managing queryable attributes.""" + +import asyncio +import os +import time +from typing import Any, Dict, List, Set + +from fastapi import HTTPException + + +class QueryablesCache: + """A thread-safe, time-based cache for queryable properties.""" + + def __init__(self, database_logic: Any): + """ + Initialize the QueryablesCache. + + Args: + database_logic: An instance of a class with a `get_queryables_mapping` method. 
+ """ + self._db_logic = database_logic + self._cache: Dict[str, List[str]] = {} + self._all_queryables: Set[str] = set() + self._last_updated: float = 0 + self._lock = asyncio.Lock() + self.validation_enabled: bool = False + self.cache_ttl: int = 3600 # How often to refresh cache (in seconds) + self.excluded_queryables: Set[str] = set() + self.reload_settings() + + def reload_settings(self): + """Reload settings from environment variables.""" + self.validation_enabled = ( + os.getenv("VALIDATE_QUERYABLES", "false").lower() == "true" + ) + self.cache_ttl = int(os.getenv("QUERYABLES_CACHE_TTL", "3600")) + + excluded = os.getenv("EXCLUDED_FROM_QUERYABLES", "") + self.excluded_queryables = set() + if excluded: + for field in excluded.split(","): + field = field.strip() + if field: + # Remove 'properties.' prefix if present + if field.startswith("properties."): + field = field[11:] + self.excluded_queryables.add(field) + + async def _update_cache(self): + """Update the cache with the latest queryables from the database.""" + if not self.validation_enabled: + return + + async with self._lock: + if (time.time() - self._last_updated < self.cache_ttl) and self._cache: + return + + queryables_mapping = await self._db_logic.get_queryables_mapping() + all_queryables_set = set(queryables_mapping.keys()) + + if self.excluded_queryables: + all_queryables_set = all_queryables_set - self.excluded_queryables + + self._all_queryables = all_queryables_set + + self._cache = {"*": list(all_queryables_set)} + self._last_updated = time.time() + + async def get_all_queryables(self) -> Set[str]: + """ + Return a set of all queryable attributes across all collections. + + This method will update the cache if it's stale or has been cleared. 
+ """ + if not self.validation_enabled: + return set() + + if (time.time() - self._last_updated >= self.cache_ttl) or not self._cache: + await self._update_cache() + return self._all_queryables + + async def validate(self, fields: Set[str]) -> None: + """ + Validate if the provided fields are queryable. + + Raises HTTPException if invalid fields are found. + """ + if not self.validation_enabled: + return + + allowed_fields = await self.get_all_queryables() + invalid_fields = fields - allowed_fields + if invalid_fields: + raise HTTPException( + status_code=400, + detail=f"Invalid query fields: {', '.join(invalid_fields)}.", + ) + + +def get_properties_from_cql2_filter(cql2_filter: Dict[str, Any]) -> Set[str]: + """Recursively extract property names from a CQL2 filter.""" + props: Set[str] = set() + if "op" in cql2_filter and "args" in cql2_filter: + for arg in cql2_filter["args"]: + if isinstance(arg, dict): + if "op" in arg: + props.update(get_properties_from_cql2_filter(arg)) + elif "property" in arg: + props.add(arg["property"]) + return props diff --git a/stac_fastapi/tests/api/test_api_query_validation.py b/stac_fastapi/tests/api/test_api_query_validation.py new file mode 100644 index 000000000..a071a0294 --- /dev/null +++ b/stac_fastapi/tests/api/test_api_query_validation.py @@ -0,0 +1,116 @@ +import json +import os +from unittest import mock + +import pytest + +if os.getenv("BACKEND", "elasticsearch").lower() == "opensearch": + from stac_fastapi.opensearch.app import app_config +else: + from stac_fastapi.elasticsearch.app import app_config + + +def get_core_client(): + if os.getenv("BACKEND", "elasticsearch").lower() == "opensearch": + from stac_fastapi.opensearch.app import app_config + else: + from stac_fastapi.elasticsearch.app import app_config + return app_config["client"] + + +def reload_queryables_settings(): + client = get_core_client() + if hasattr(client, "queryables_cache"): + client.queryables_cache.reload_settings() + + 
+@pytest.fixture(autouse=True) +def enable_validation(): + + client = app_config["client"] + with mock.patch.dict(os.environ, {"VALIDATE_QUERYABLES": "true"}): + client.queryables_cache.reload_settings() + yield + client.queryables_cache.reload_settings() + + +@pytest.mark.asyncio +async def test_search_post_query_valid_param(app_client, ctx): + """Test POST /search with a valid query parameter""" + query = {"query": {"eo:cloud_cover": {"lt": 10}}} + resp = await app_client.post("/search", json=query) + assert resp.status_code == 200 + + +@pytest.mark.asyncio +async def test_search_post_query_invalid_param(app_client, ctx): + """Test POST /search with an invalid query parameter""" + query = {"query": {"invalid_param": {"eq": "test"}}} + resp = await app_client.post("/search", json=query) + assert resp.status_code == 400 + resp_json = resp.json() + assert "Invalid query fields: invalid_param" in resp_json["detail"] + + +@pytest.mark.asyncio +async def test_item_collection_get_filter_valid_param(app_client, ctx): + """Test GET /collections/{collection_id}/items with a valid filter parameter""" + collection_id = ctx.item["collection"] + filter_body = { + "op": "<", + "args": [{"property": "eo:cloud_cover"}, 10], + } + params = { + "filter-lang": "cql2-json", + "filter": json.dumps(filter_body), + } + resp = await app_client.get(f"/collections/{collection_id}/items", params=params) + assert resp.status_code == 200 + + +@pytest.mark.asyncio +async def test_item_collection_get_filter_invalid_param(app_client, ctx): + """Test GET /collections/{collection_id}/items with an invalid filter parameter""" + collection_id = ctx.item["collection"] + filter_body = { + "op": "=", + "args": [{"property": "invalid_param"}, "test"], + } + params = { + "filter-lang": "cql2-json", + "filter": json.dumps(filter_body), + } + resp = await app_client.get(f"/collections/{collection_id}/items", params=params) + assert resp.status_code == 400 + resp_json = resp.json() + assert "Invalid query 
fields: invalid_param" in resp_json["detail"] + + +@pytest.mark.asyncio +async def test_validate_queryables_excluded(app_client, ctx): + """Test that excluded queryables are rejected when validation is enabled.""" + + excluded_field = "eo:cloud_cover" + client = app_config["client"] + + with mock.patch.dict( + os.environ, + { + "VALIDATE_QUERYABLES": "true", + "EXCLUDED_FROM_QUERYABLES": excluded_field, + "QUERYABLES_CACHE_TTL": "0", + }, + ): + client.queryables_cache.reload_settings() + + query = {"query": {excluded_field: {"lt": 10}}} + resp = await app_client.post("/search", json=query) + assert resp.status_code == 400 + assert "Invalid query fields" in resp.json()["detail"] + assert excluded_field in resp.json()["detail"] + + query = {"query": {"id": {"eq": "test-item"}}} + resp = await app_client.post("/search", json=query) + assert resp.status_code == 200 + + client.queryables_cache.reload_settings() diff --git a/stac_fastapi/tests/core/test_queryables.py b/stac_fastapi/tests/core/test_queryables.py new file mode 100644 index 000000000..10a742049 --- /dev/null +++ b/stac_fastapi/tests/core/test_queryables.py @@ -0,0 +1,118 @@ +import os +import time +from unittest.mock import AsyncMock, MagicMock, patch + +import pytest +from fastapi import HTTPException + +from stac_fastapi.core.queryables import ( + QueryablesCache, + get_properties_from_cql2_filter, +) + + +class TestQueryablesCache: + @pytest.fixture + def mock_db_logic(self): + db_logic = MagicMock() + db_logic.get_queryables_mapping = AsyncMock( + return_value={"prop1": "type1", "prop2": "type2"} + ) + return db_logic + + @pytest.fixture + def queryables_cache(self, mock_db_logic): + with patch.dict( + os.environ, {"VALIDATE_QUERYABLES": "true", "QUERYABLES_CACHE_TTL": "60"} + ): + cache = QueryablesCache(mock_db_logic) + return cache + + def test_init(self, mock_db_logic): + with patch.dict( + os.environ, {"VALIDATE_QUERYABLES": "true", "QUERYABLES_CACHE_TTL": "120"} + ): + cache = 
QueryablesCache(mock_db_logic) + assert cache.validation_enabled is True + assert cache.cache_ttl == 120 + + def test_reload_settings(self, queryables_cache): + with patch.dict( + os.environ, {"VALIDATE_QUERYABLES": "false", "QUERYABLES_CACHE_TTL": "300"} + ): + queryables_cache.reload_settings() + assert queryables_cache.validation_enabled is False + assert queryables_cache.cache_ttl == 300 + + @pytest.mark.asyncio + async def test_get_all_queryables_updates_cache( + self, queryables_cache, mock_db_logic + ): + queryables = await queryables_cache.get_all_queryables() + assert queryables == {"prop1", "prop2"} + mock_db_logic.get_queryables_mapping.assert_called_once() + + @pytest.mark.asyncio + async def test_get_all_queryables_uses_cache(self, queryables_cache, mock_db_logic): + await queryables_cache.get_all_queryables() + mock_db_logic.get_queryables_mapping.assert_called_once() + + # Should use cache now + await queryables_cache.get_all_queryables() + mock_db_logic.get_queryables_mapping.assert_called_once() + + @pytest.mark.asyncio + async def test_get_all_queryables_refresh_after_ttl( + self, queryables_cache, mock_db_logic + ): + await queryables_cache.get_all_queryables() + mock_db_logic.get_queryables_mapping.assert_called_once() + + # Simulate time passing + queryables_cache._last_updated = time.time() - queryables_cache.cache_ttl - 1 + + await queryables_cache.get_all_queryables() + assert mock_db_logic.get_queryables_mapping.call_count == 2 + + @pytest.mark.asyncio + async def test_get_all_queryables_disabled(self, queryables_cache): + queryables_cache.validation_enabled = False + queryables = await queryables_cache.get_all_queryables() + assert queryables == set() + + @pytest.mark.asyncio + async def test_validate_valid_fields(self, queryables_cache): + await queryables_cache.validate({"prop1"}) + + @pytest.mark.asyncio + async def test_validate_invalid_fields(self, queryables_cache): + with pytest.raises(HTTPException) as excinfo: + await 
queryables_cache.validate({"invalid_prop"}) + assert excinfo.value.status_code == 400 + assert "Invalid query fields: invalid_prop" in str(excinfo.value.detail) + + @pytest.mark.asyncio + async def test_validate_disabled(self, queryables_cache): + queryables_cache.validation_enabled = False + await queryables_cache.validate({"invalid_prop"}) + + +def test_get_properties_from_cql2_filter(): + # Simple prop + cql2 = {"op": "=", "args": [{"property": "prop1"}, "value"]} + props = get_properties_from_cql2_filter(cql2) + assert props == {"prop1"} + + # Nested props + cql2_nested = { + "op": "and", + "args": [ + {"op": "=", "args": [{"property": "prop1"}, "v1"]}, + {"op": "<", "args": [{"property": "prop2"}, 10]}, + ], + } + props = get_properties_from_cql2_filter(cql2_nested) + assert props == {"prop1", "prop2"} + + # Empty/invalid + assert get_properties_from_cql2_filter({}) == set() diff --git a/stac_fastapi/tests/data/test_collection.json b/stac_fastapi/tests/data/test_collection.json index 32a7d254b..dda5b8a0e 100644 --- a/stac_fastapi/tests/data/test_collection.json +++ b/stac_fastapi/tests/data/test_collection.json @@ -6,6 +6,11 @@ "type": "Collection", "description": "Landat 8 imagery radiometrically calibrated and orthorectified using gound points and Digital Elevation Model (DEM) data to correct relief displacement.", "stac_version": "1.0.0", + "queryables": { + "eo:cloud_cover": { + "$ref": "https://stac-extensions.github.io/eo/v1.0.0/schema.json#/definitions/fields/properties/eo:cloud_cover" + } + }, "license": "PDDL-1.0", "summaries": { "platform": [