Change union for best-match in deserialization (#13)

PrzeG · web-flow · commit 32a7ac9b6039 · 2025-03-04T10:04:07.000+01:00
diff --git a/packages/catalystwan-core/src/catalystwan/core/models/deserialize.py b/packages/catalystwan-core/src/catalystwan/core/models/deserialize.py
@@ -1,14 +1,15 @@
 from collections import deque
 from copy import deepcopy
-from dataclasses import fields, is_dataclass
+from dataclasses import dataclass, fields, is_dataclass
 from functools import reduce
 from inspect import isclass, unwrap
-from typing import Any, Dict, List, Literal, Protocol, Tuple, Type, TypeVar, Union
+from typing import Any, Dict, List, Literal, Optional, Protocol, Tuple, Type, TypeVar, Union, cast
 
 from catalystwan.core.exceptions import (
     CatalystwanModelInputException,
     CatalystwanModelValidationError,
 )
+from catalystwan.core.models.utils import count_matching_keys
 from catalystwan.core.types import MODEL_TYPES, AliasPath, DataclassInstance
 from typing_extensions import Annotated, get_args, get_origin, get_type_hints
 
@@ -19,6 +20,13 @@ class ValueExtractorCallable(Protocol):
     def __call__(self, field_value: Any) -> Any: ...
 
 
+@dataclass
+class ExtractedValue:  # result of a single type-extraction attempt
+    value: Any  # the extracted (possibly cast) value
+    exact_match: bool  # True when the input fit the target type without casting
+    matched_keys: Optional[int] = None  # key-match score; only set for dataclass values
+
+
 class ModelDeserializer:
     def __init__(self, model: Type[T]) -> None:
         self.model = model
@@ -57,67 +65,91 @@ def __check_errors(self):
                 message += f"{exc}\n"
             raise CatalystwanModelValidationError(message)
 
-    def __is_optional(self, t: Any) -> bool:
-        if get_origin(t) is Union and type(None) in get_args(t):
-            return True
-        return False
-
-    def __extract_type(self, field_type: Any, field_value: Any, field_name: str) -> Any:
+    def __extract_type(self, field_type: Any, field_value: Any, field_name: str) -> ExtractedValue:
         origin = get_origin(field_type)
         # check for simple types and classes
         if origin is None:
-            if field_type is Any:
-                return field_value
-            if isinstance(field_value, field_type):
-                return field_value
+            if field_type is Any or isinstance(field_value, field_type):
+                return ExtractedValue(value=field_value, exact_match=True)
+            # Do not cast bool values
+            elif field_type is bool:
+                ...
+            # False/Empty values (like empty string or list) can match to None
+            elif field_type is type(None):
+                if not field_value:
+                    return ExtractedValue(value=None, exact_match=False)
             elif is_dataclass(field_type):
-                assert isinstance(field_type, type)
-                return deserialize(field_type, **field_value)
+                model_instance = deserialize(
+                    cast(Type[DataclassInstance], field_type), **field_value
+                )
+                return ExtractedValue(
+                    value=model_instance,
+                    exact_match=False,
+                    matched_keys=count_matching_keys(model_instance, field_value),
+                )
             elif isclass(unwrap(field_type)):
                 if isinstance(field_value, dict):
-                    return field_type(**field_value)
+                    return ExtractedValue(value=field_type(**field_value), exact_match=False)
                 else:
                     try:
-                        return field_type(field_value)
+                        return ExtractedValue(value=field_type(field_value), exact_match=False)
                     except ValueError:
                         raise CatalystwanModelInputException(
                             f"Unable to match or cast input value for {field_name} [expected_type={unwrap(field_type)}, input={field_value}, input_type={type(field_value)}]"
                         )
+        # List is an exact match only if all of its elements are
         elif origin is list:
             if isinstance(field_value, list):
-                return [
-                    self.__extract_type(get_args(field_type)[0], value, field_name)
-                    for value in field_value
-                ]
-        elif self.__is_optional(field_type):
-            if field_value is None:
-                return None
-            else:
-                try:
-                    return self.__extract_type(get_args(field_type)[0], field_value, field_name)
-                except CatalystwanModelInputException as e:
-                    if not field_value:
-                        return None
-                    raise e
+                values = []
+                exact_match = True
+                for value in field_value:
+                    extracted_value = self.__extract_type(
+                        get_args(field_type)[0], value, field_name
+                    )
+                    values.append(extracted_value.value)
+                    if not extracted_value.exact_match:
+                        exact_match = False
+                return ExtractedValue(value=values, exact_match=exact_match)
         elif origin is Literal:
             for arg in get_args(field_type):
                 try:
                     if type(arg)(field_value) == arg:
-                        return type(arg)(field_value)
+                        return ExtractedValue(
+                            value=type(arg)(field_value), exact_match=type(arg) is type(field_value)
+                        )
                 except Exception:
                     continue
         elif origin is Annotated:
             validator, caster = field_type.__metadata__
             if validator(field_value):
-                return field_value
-            return caster(field_value)
-        # TODO: Currently, casting is done left-to-right. Searching deeper for a better match may be the way to go.
+                return ExtractedValue(value=field_value, exact_match=True)
+            return ExtractedValue(value=caster(field_value), exact_match=False)
+        # When parsing Unions, try to find the best match. Currently, it involves:
+        # 1. Returning the first exact match (no casting needed), if there is one
+        # 2. Otherwise favoring dataclasses (most matched keys first), then a None match
+        # 3. Breaking any remaining tie by returning the leftmost matched argument
         elif origin is Union:
+            matches: List[ExtractedValue] = []
             for arg in get_args(field_type):
                 try:
-                    return self.__extract_type(arg, field_value, field_name)
+                    extracted_value = self.__extract_type(arg, field_value, field_name)
+                    # exact match, return
+                    if extracted_value.exact_match:
+                        return extracted_value
+                    else:
+                        matches.append(extracted_value)
                 except Exception:
                     continue
+            # Only one element matched, return
+            if len(matches) == 1:
+                return matches[0]
+            # Only non-exact matches left, sort and return first element
+            elif len(matches) > 1:
+                matches.sort(
+                    key=lambda x: (x.matched_keys is not None, x.matched_keys, x.value is None),
+                    reverse=True,
+                )
+                return matches[0]
         # Correct type not found, add exception
         raise CatalystwanModelInputException(
             f"Unable to match or cast input value for {field_name} [expected_type={unwrap(field_type)}, input={field_value}, input_type={type(field_value)}]"
@@ -130,7 +162,7 @@ def __transform_model_input(
         kwargs_copy = deepcopy(kwargs)
         new_args = []
         new_kwargs = {}
-        field_types = get_type_hints(cls)
+        field_types = get_type_hints(cls, include_extras=True)
         for field in fields(cls):
             if not field.init:
                 continue
@@ -140,7 +172,9 @@ def __transform_model_input(
                 field_value = args_copy.popleft()
                 try:
                     new_args.append(
-                        self.__extract_type(field_type, value_extractor(field_value), field.name)
+                        self.__extract_type(
+                            field_type, value_extractor(field_value), field.name
+                        ).value
                     )
                 except (
                     CatalystwanModelInputException,
@@ -164,7 +198,7 @@ def __transform_model_input(
             try:
                 new_kwargs[field.name] = self.__extract_type(
                     field_type, value_extractor(field_value), field.name
-                )
+                ).value
             except (
                 CatalystwanModelInputException,
                 CatalystwanModelValidationError,
diff --git a/packages/catalystwan-core/src/catalystwan/core/models/utils.py b/packages/catalystwan-core/src/catalystwan/core/models/utils.py
@@ -0,0 +1,27 @@
+from dataclasses import is_dataclass
+from typing import TypeVar, cast
+
+from catalystwan.core.types import DataclassInstance
+
+DataclassType = TypeVar("DataclassType", bound=DataclassInstance)
+
+
+def count_matching_keys(model: DataclassType, model_payload: dict) -> int:
+    """Count payload keys, recursing into nested dataclasses, that are attributes of model."""
+    matched_keys = 0
+    for key, value in model_payload.items():
+        try:
+            model_value = getattr(model, key)
+            matched_keys += 1
+            if is_dataclass(model_value) and isinstance(value, dict):
+                matched_keys += count_matching_keys(cast(DataclassType, model_value), value)
+            elif (
+                isinstance(model_value, list)
+                and all(is_dataclass(element) for element in model_value)
+                and isinstance(value, list)
+            ):
+                for model_v, input_v in zip(model_value, value):
+                    matched_keys += count_matching_keys(model_v, input_v)
+        except AttributeError:
+            continue  # key is not a model attribute (also swallows AttributeError from nested calls)
+    return matched_keys
diff --git a/packages/catalystwan-core/src/catalystwan/core/request_adapter.py b/packages/catalystwan-core/src/catalystwan/core/request_adapter.py
@@ -14,6 +14,7 @@
 )
 from catalystwan.core.models.deserialize import deserialize
 from catalystwan.core.models.serialize import serialize
+from catalystwan.core.models.utils import count_matching_keys
 from catalystwan.core.types import DataclassInstance
 from typing_extensions import get_args, get_origin
 
@@ -213,34 +214,12 @@ class ModelReturn:
         # return model that matches best with the input
         valid_models.sort(
             key=lambda x: (
-                self.__count_matching_keys(x.model, cast(dict, x.payload.data)),
+                count_matching_keys(x.model, cast(dict, x.payload.data)),
                 x.payload.priority,
             ),
             reverse=True,
         )
         return valid_models[0].model
 
-    def __count_matching_keys(self, model: DataclassType, model_payload: dict):
-        matched_keys = 0
-        for key, value in model_payload.items():
-            try:
-                model_value = getattr(model, key)
-                matched_keys += 1
-                if is_dataclass(model_value) and isinstance(value, dict):
-                    matched_keys += self.__count_matching_keys(
-                        cast(DataclassType, model_value), value
-                    )
-                elif (
-                    isinstance(model_value, list)
-                    and all([is_dataclass(element) for element in model_value])
-                    and isinstance(value, list)
-                ):
-                    for model_v, input_v in zip(model_value, value):
-                        matched_keys += self.__count_matching_keys(model_v, input_v)
-            except AttributeError:
-                continue
-
-        return matched_keys
-
     def __copy__(self) -> RequestAdapter:
         return RequestAdapter(session=copy(self.session), logger=self.logger)
diff --git a/packages/catalystwan-core/tests/test_model_deserialize.py b/packages/catalystwan-core/tests/test_model_deserialize.py
@@ -1,8 +1,11 @@
 from dataclasses import dataclass
 from ipaddress import IPv4Address, IPv6Address
 from typing import List, Literal, Optional, Union
+from uuid import UUID
 
+import pytest
 from catalystwan.core.models.deserialize import deserialize
+from catalystwan.core.types import Variable
 
 
 def test_simple_deserialize():
@@ -131,3 +134,102 @@ class Model:
     assert m.union_field == IPv4Address("10.0.0.1")
     assert m.submodel_field.int_field == 1
     assert isinstance(m.submodel_field, Submodel)
+
+
+@pytest.mark.parametrize(
+    "value",
+    [
+        ("1"),
+        (1),
+        (1.2),
+        ("True"),
+        ("3a56601d-6132-4aea-98d0-605fa966ad48"),
+        (UUID("3a56601d-6132-4aea-98d0-605fa966ad48")),
+    ],
+)
+def test_union_match_identity(value):
+    @dataclass
+    class Model:
+        union_field: Union[str, int, bool, float, UUID]
+
+    m = deserialize(Model, union_field=value)
+    assert m.union_field == value and type(m.union_field) is type(value)  # 1 == True == 1.0
+
+
+def test_union_match_optional():
+    @dataclass
+    class Model:
+        union_field: Optional[Union[str, int, bool, float, UUID]] = None
+
+    m1 = deserialize(Model)  # field omitted entirely
+    m2 = deserialize(Model, union_field=None)  # explicit None
+    m3 = deserialize(Model, union_field=[])  # empty (falsy) list is expected to become None
+
+    assert m1.union_field is None
+    assert m2.union_field is None
+    assert m3.union_field is None
+
+
+@pytest.mark.parametrize(
+    "value",
+    [
+        ("1"),
+        (1),
+        ("True"),
+        ("3a56601d-6132-4aea-98d0-605fa966ad48"),
+        (UUID("3a56601d-6132-4aea-98d0-605fa966ad48")),
+        ([1, "2", 3]),  # every element already fits List[Union[str, int]]
+        ([1.2, True, 1.3]),  # every element already fits List[Union[float, bool]]
+    ],
+)
+def test_union_match_nested_identity(value):
+    @dataclass
+    class Model:
+        union_field: Union[
+            str, int, Union[UUID, Union[List[Union[str, int]], List[Union[float, bool]]]]
+        ]
+
+    m = deserialize(Model, union_field=value)
+
+    assert m.union_field == value  # input must round-trip unchanged
+
+
+def test_union_match_models():
+    @dataclass
+    class Submodel1:
+        f1: int
+
+    @dataclass
+    class Submodel2:
+        f1: int
+        f2: int
+
+    @dataclass
+    class Model:
+        union_field: Union[str, Submodel1, Submodel2]
+
+    m1 = deserialize(Model, **{"union_field": {"f1": 1}})  # payload has no f2
+    m2 = deserialize(Model, **{"union_field": {"f1": 1, "f2": 2}})  # both Submodel2 fields
+    m3 = deserialize(Model, **{"union_field": {"f1": 1, "f2": 2, "irrelevant_key": 0}})
+
+    assert m1.union_field == Submodel1(1)
+    assert m2.union_field == Submodel2(1, 2)  # model matching more payload keys is expected
+    assert m3.union_field == Submodel2(1, 2)  # unknown key must not change the choice
+
+
+@pytest.mark.parametrize(
+    "model_input,expected_value",
+    [
+        ("1", 1),  # string input expected to be cast to int
+        ("3a56601d-6132-4aea-98d0-605fa966ad48", UUID("3a56601d-6132-4aea-98d0-605fa966ad48")),
+        ("some_string", "{{some_string}}"),  # presumably wrapped by Variable's caster; verify
+    ],
+)
+def test_match_union_cast(model_input, expected_value):
+    @dataclass
+    class Model:
+        union_field: Optional[Union[int, bool, UUID, Variable]]
+
+    m = deserialize(Model, union_field=model_input)
+
+    assert m.union_field == expected_value  # no member matches exactly, so a cast is expected