
Commit 65df37b (parent: db7ddd3)

Warn about error handling in modelutility, show warnings by default, some doc fixes

4 files changed: +109 -46 lines

docs/value/data-banzhaf.md
Lines changed: 4 additions & 2 deletions

@@ -40,7 +40,8 @@ from pydvl.valuation.stopping import MinUpdates
 
 train, test = Dataset.from_arrays(...)
 model = ...
-utility = ModelUtility(model, SupervisedScorer(model, test, default=0.0))
+scorer = SupervisedScorer(model, test, default=0.0)
+utility = ModelUtility(model, scorer)
 sampler = PermutationSampler()
 valuation = BanzhafValuation(utility, sampler, MinUpdates(1000))
 with parallel_config(n_jobs=16):
@@ -84,7 +85,8 @@ more on this subject see [[semi-values-sampling]].
 
 train, test = Dataset.from_arrays(...)
 model = ...
-utility = ModelUtility(model, SupervisedScorer(model, test, default=0.0))
+scorer = SupervisedScorer(model, test, default=0.0)
+utility = ModelUtility(model, scorer)
 valuation = MSRBanzhafValuation(utility, MaxSamples(1000), batch_size=64)
 with parallel_config(n_jobs=16):
     valuation.fit(train)

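Both hunks make the same change: the scorer is now constructed separately and then passed to `ModelUtility`, instead of being built inline. For illustration, here is a minimal runnable sketch of the first snippet with the elided parts filled in; the synthetic data, the `LogisticRegression` model, and the `train_size`/`random_state` arguments to `from_arrays` are assumptions, not part of the commit:

```python
# Hypothetical end-to-end version of the data-banzhaf.md snippet above.
# Assumed (not in the commit): make_classification data, LogisticRegression,
# and the train_size/random_state arguments to Dataset.from_arrays.
from joblib import parallel_config
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression

from pydvl.valuation import (
    BanzhafValuation,
    Dataset,
    MinUpdates,
    ModelUtility,
    PermutationSampler,
    SupervisedScorer,
)

X, y = make_classification(n_samples=200, random_state=42)
train, test = Dataset.from_arrays(X, y, train_size=0.7, random_state=42)
model = LogisticRegression()
scorer = SupervisedScorer(model, test, default=0.0)  # the pattern from this commit
utility = ModelUtility(model, scorer)
sampler = PermutationSampler()
valuation = BanzhafValuation(utility, sampler, MinUpdates(1000))
with parallel_config(n_jobs=16):
    valuation.fit(train)
```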
docs/value/index.md
Lines changed: 26 additions & 8 deletions

@@ -229,6 +229,16 @@ objects for different datasets. You can read more about [setting up the
 cache][getting-started-cache] in the installation guide, and in the
 documentation of the [caching][pydvl.utils.caching] module.
 
+!!! danger "Errors are hidden by default"
+    During semi-value computations, the utility can be evaluated on subsets that
+    break the fitting process. For instance, a classifier might require at least two
+    classes to fit, but the utility is sometimes evaluated on subsets with only one
+    class. This will raise an error with most classifiers. To avoid this, we set
+    `catch_errors=True` by default upon instantiation, which will catch the error and
+    return the scorer's default value instead. While we show a warning to signal that
+    something went wrong, this suppression can lead to unexpected results, so it is
+    important to be aware of this setting and to set it to `False` when testing, or if
+    you are sure that the utility will not be evaluated on problematic subsets.
 
 ### Computing some values
 
@@ -267,25 +277,33 @@ over, sliced, sorted, as well as converted to a [pandas.DataFrame][] using
 
 ### Learning the utility
 
-Since each evaluation of the utility entails a full retrain of the model on a new subset of the training data, it is natural to try to learn this mapping from subsets to scores. This is the idea behind **Data Utility Learning (DUL)**
+Since each evaluation of the utility entails a full retraining of the model on a
+new subset of the training data, it is natural to try to learn this mapping from
+subsets to scores. This is the idea behind **Data Utility Learning (DUL)**
 [@wang_improving_2022] and in pyDVL it's as simple as wrapping the
-`ModelUtility` inside [DataUtilityLearning][pydvl.valuation.utility.DataUtilityLearning]:
+[ModelUtility][pydvl.valuation.utility.ModelUtility] inside a
+[DataUtilityLearning][pydvl.valuation.utility.DataUtilityLearning] object:
 
 ```python
-from pydvl.valuation import ModelUtility, DataUtilityLearning, Dataset
+from pydvl.valuation import *
+from pydvl.valuation.types import Sample
 from sklearn.linear_model import LinearRegression, LogisticRegression
 from sklearn.datasets import load_iris
 
-dataset = Dataset.from_sklearn(load_iris())
-u = ModelUtility(LogisticRegression(), dataset)
+train, test = Dataset.from_sklearn(load_iris())
+scorer = SupervisedScorer("accuracy", test, default=0.0, range=(0, 1))
+u = ModelUtility(LogisticRegression(), scorer)
 training_budget = 3
-wrapped_u = DataUtilityLearning(u, training_budget, LinearRegression())
+utility_model = IndicatorUtilityModel(
+    predictor=LinearRegression(), n_data=len(train)
+)
+wrapped_u = DataUtilityLearning(u, training_budget, utility_model)
 
 # First 3 calls will be computed normally
 for i in range(training_budget):
-    _ = wrapped_u((i,))
+    _ = wrapped_u(Sample(None, train.indices[:i]))
 # Subsequent calls will be computed using the learned model for DUL
-wrapped_u((1, 2, 3))
+wrapped_u(Sample(None, train.indices))
 ```
 
 ## Problems of data values { #problems-of-data-values }

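The "Errors are hidden by default" admonition added above is easy to demonstrate. The following sketch is not part of the commit: it reuses the `Sample` calling convention from the DUL example and a one-element subset, which necessarily contains a single class and so breaks `LogisticRegression.fit()`:

```python
# Sketch of the behavior described in the admonition (illustrative, not from
# the commit). A one-point subset has a single class, so fitting fails.
from sklearn.datasets import load_iris
from sklearn.linear_model import LogisticRegression

from pydvl.valuation import Dataset, ModelUtility, SupervisedScorer
from pydvl.valuation.types import Sample

train, test = Dataset.from_sklearn(load_iris(), random_state=16)
scorer = SupervisedScorer("accuracy", test, default=0.0, range=(0, 1))
bad_subset = train.indices[:1]  # a single data point, hence a single class

u = ModelUtility(LogisticRegression(), scorer, catch_errors=True)
u(Sample(None, subset=bad_subset))  # warns and returns the scorer's default, 0.0

u = ModelUtility(LogisticRegression(), scorer, catch_errors=False)
u(Sample(None, subset=bad_subset))  # raises: fitting needs at least two classes
```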
src/pydvl/valuation/utility/__init__.py
Lines changed: 1 addition & 1 deletion

@@ -9,7 +9,7 @@ class for all utilities is [UtilityBase][pydvl.valuation.utility.base.UtilityBase]
 ## Utility for model-based methods
 
 [ModelUtility][pydvl.valuation.utility.modelutility.ModelUtility] holds information
-about model, and scoring function (the latter being what one usually understands under
+about the model and scoring function (the latter being what one usually understands under
 *utility* in the general definition of Shapley value). Model-based evaluation methods
 define the utility as a retraining of the model on a subset of the data, which is then
 [scored][pydvl.valuation.scorers]. Please see the documentation on [Computing Data

src/pydvl/valuation/utility/modelutility.py
Lines changed: 78 additions & 35 deletions

@@ -1,10 +1,85 @@
 """
 This module implements a utility function for supervised models.
 
-It is mostly geared towards sci-kit-learn models, but can be used with any object
-that implements the [BaseModel][pydvl.utils.types.BaseModel] protocol, i.e. that has a
+[ModelUtility][pydvl.valuation.utility.modelutility.ModelUtility] holds a model and a
+scorer. Each call to the utility will fit the model on a subset of the training data and
+evaluate the scorer on the test data. It is used by all the valuation methods in
+[pydvl.valuation][pydvl.valuation].
+
+This class is geared towards scikit-learn models, but can be used with any object that
+implements the [BaseModel][pydvl.utils.types.BaseModel] protocol, i.e. that has a
 `fit()` method.
 
+!!! danger "Errors are hidden by default"
+    During semi-value computations, the utility can be evaluated on subsets that
+    break the fitting process. For instance, a classifier might require at least two
+    classes to fit, but the utility is sometimes evaluated on subsets with only one
+    class. This will raise an error with most classifiers. To avoid this, we set
+    `catch_errors=True` by default upon instantiation, which will catch the error and
+    return the scorer's default value instead. While we show a warning to signal that
+    something went wrong, this suppression can lead to unexpected results, so it is
+    important to be aware of this setting and to set it to `False` when testing, or if
+    you are sure that the utility will not be evaluated on problematic subsets.
+
+
+## Examples
+
+??? Example "Standard usage"
+    The utility takes a model and a scorer and is passed to the valuation method.
+    Here's the basic usage:
+
+    ```python
+    from joblib import parallel_config
+    from pydvl.valuation import (
+        Dataset, MinUpdates, ModelUtility, SupervisedScorer, TMCShapleyValuation
+    )
+
+    train, test = Dataset.from_arrays(X, y, ...)
+    model = SomeModel()  # Implementing the basic scikit-learn interface
+    scorer = SupervisedScorer("r2", test, default=0.0, range=(-np.inf, 1.0))
+    utility = ModelUtility(model, scorer, catch_errors=True, show_warnings=True)
+    valuation = TMCShapleyValuation(utility, is_done=MinUpdates(1000))
+    with parallel_config(n_jobs=-1):
+        valuation.fit(train)
+    ```
+
+??? Example "Directly calling the utility"
+    The following code instantiates a utility object and calls it directly. The
+    underlying logistic regression model will be trained on the indices passed as
+    argument, and evaluated on the test data.
+
+    ```python
+    from pydvl.valuation.utility import ModelUtility
+    from pydvl.valuation.dataset import Dataset
+    from pydvl.valuation.scorers import SupervisedScorer
+    from pydvl.valuation.types import Sample
+    from sklearn.linear_model import LogisticRegression
+    from sklearn.datasets import load_iris
+
+    train, test = Dataset.from_sklearn(load_iris(), random_state=16)
+    scorer = SupervisedScorer("accuracy", test, default=0.0, range=(0.0, 1.0))
+    u = ModelUtility(LogisticRegression(random_state=16), scorer, catch_errors=True)
+    u(Sample(None, subset=train.indices))
+    ```
+
+??? Example "Enabling the cache"
+    In this example an in-memory cache is used. Note that caching is only useful
+    under certain conditions, and does not really speed up typical Monte Carlo
+    approximations. See [the introduction][getting-started-cache] and the [module
+    documentation][pydvl.utils.caching] for more.
+
+    ```python
+    (...)  # Imports as above
+    cache_backend = InMemoryCacheBackend()  # See other backends in the caching module
+    u = ModelUtility(
+        model=LogisticRegression(random_state=16),
+        scorer=SupervisedScorer("accuracy", test, default=0.0, range=(0.0, 1.0)),
+        cache_backend=cache_backend,
+        catch_errors=True
+    )
+    u(Sample(None, subset=train.indices))
+    u(Sample(None, subset=train.indices))  # The second call does not retrain the model
+    ```
+
 ## Data type of the underlying data arrays
 
 In principle, very few to no assumptions are made about the data type. As long as it is
@@ -109,38 +184,6 @@ class ModelUtility(UtilityBase[SampleT], Generic[SampleT, ModelT]):
         cached_func_options: Optional configuration object for cached utility evaluation.
         clone_before_fit: If `True`, the model will be cloned before calling
             `fit()`.
-
-    ??? Example
-        ``` pycon
-        >>> from pydvl.valuation.utility import ModelUtility, DataUtilityLearning
-        >>> from pydvl.valuation.dataset import Dataset
-        >>> from pydvl.valuation.scorers import SupervisedScorer
-        >>> from sklearn.linear_model import LinearRegression, LogisticRegression
-        >>> from sklearn.datasets import load_iris
-        >>> train, test = Dataset.from_sklearn(load_iris(), random_state=16)
-        >>> u = ModelUtility(LogisticRegression(random_state=16), SupervisedScorer("accuracy"))
-        >>> u(Sample(None, subset=train.indices))
-        0.9
-        ```
-
-        With caching enabled:
-
-        ```pycon
-        >>> from pydvl.valuation.utility import ModelUtility, DataUtilityLearning
-        >>> from pydvl.valuation.dataset import Dataset
-        >>> from pydvl.utils.caching.memory import InMemoryCacheBackend
-        >>> from sklearn.linear_model import LinearRegression, LogisticRegression
-        >>> from sklearn.datasets import load_iris
-        >>> train, test = Dataset.from_sklearn(load_iris(), random_state=16)
-        >>> cache_backend = InMemoryCacheBackend()
-        >>> u = ModelUtility(
-        ...     model=LogisticRegression(random_state=16),
-        ...     scorer=SupervisedScorer("accuracy"),
-        ...     cache_backend=cache_backend)
-        >>> u(Sample(None, subset=train.indices))
-        0.9
-        ```
-
     """
 
     model: ModelT
@@ -152,7 +195,7 @@ def __init__(
         scorer: Scorer,
         *,
         catch_errors: bool = False,
-        show_warnings: bool = False,
+        show_warnings: bool = True,
         cache_backend: CacheBackend | None = None,
         cached_func_options: CachedFuncConfig | None = None,
         clone_before_fit: bool = True,

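With the new default `show_warnings=True`, errors caught by `catch_errors=True` are now reported instead of silently swallowed. Two ways to adjust this, sketched under the assumption that the warning is emitted through Python's standard `warnings` machinery (the pyDVL calls are those shown in the diffs above):

```python
# Sketch (not from the commit): tuning the new warning behavior.
import warnings

from sklearn.datasets import load_iris
from sklearn.linear_model import LogisticRegression

from pydvl.valuation import Dataset, ModelUtility, SupervisedScorer
from pydvl.valuation.types import Sample

train, test = Dataset.from_sklearn(load_iris(), random_state=16)
scorer = SupervisedScorer("accuracy", test, default=0.0, range=(0, 1))

# 1. Opt back out of the warnings while keeping errors suppressed:
u = ModelUtility(LogisticRegression(), scorer, catch_errors=True, show_warnings=False)
u(Sample(None, subset=train.indices[:1]))  # silently returns the default, 0.0

# 2. When testing, escalate warnings to errors so hidden failures surface
#    (assumes the warning comes from warnings.warn):
u = ModelUtility(LogisticRegression(), scorer, catch_errors=True)  # warnings shown
with warnings.catch_warnings():
    warnings.simplefilter("error")
    u(Sample(None, subset=train.indices[:1]))  # the warning now raises
```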