diff --git a/docs/value/owen.md b/docs/value/owen.md index cc2910b1a..b987de7df 100644 --- a/docs/value/owen.md +++ b/docs/value/owen.md @@ -95,7 +95,7 @@ probability values $q$ between 0 and 0.5 at random and then generates two sample for each index, one using the probability $q$ for index draws, and another with probability $1-q$. -!!! Example +??? Example ```python from pydvl.valuation import AntitheticOwenSampler, ShapleyValuation, RankCorrelation ... diff --git a/src/pydvl/valuation/samplers/__init__.py b/src/pydvl/valuation/samplers/__init__.py index 7e137823e..76dbbcde5 100644 --- a/src/pydvl/valuation/samplers/__init__.py +++ b/src/pydvl/valuation/samplers/__init__.py @@ -156,6 +156,8 @@ def fit(self, data: Dataset): from typing import Union from .base import * + +from .ame import * # isort: skip from .classwise import * from .msr import * from .owen import * @@ -169,6 +171,7 @@ def fit(self, data: Dataset): StochasticSampler = Union[ UniformSampler, PermutationSampler, + AMESampler, AntitheticSampler, StratifiedSampler, OwenSampler, diff --git a/src/pydvl/valuation/samplers/ame.py b/src/pydvl/valuation/samplers/ame.py new file mode 100644 index 000000000..557f399d1 --- /dev/null +++ b/src/pydvl/valuation/samplers/ame.py @@ -0,0 +1,115 @@ +""" +AMESampler implements a generalized Average Marginal Effect sampling scheme. + +For each index $i$, it first samples a probability $q$ from a user defined distribution +$D_q$. Then it samples a subset from the complement of $i$, by including each element +independently, following a Bernoulli distribution with parameter $q$. + +This two-stage process was introduced in Lin et al. (2022)[^1] together with an +approximation using Lasso. There it was shown, that under a uniform $D_q$, the Monte +Carlo sum of marginal utilities is an unbiased estimator of the Shapley value, without +the need for a correction factor. This is essentially the same as the Owen sampling +scheme. + +However, one can also use other distributions for $D_q$, such as a beta distribution. In +this case, the Monte Carlo sum of marginal utilities no longer converges to Shapley +without the correction factor, and even then because of the very different shape of the +distribution, attempting to approximate Shapley values with this method is not advisable. + + +## References + +[^1]: Lin, Jinkun, Anqi Zhang, Mathias Lécuyer, Jinyang Li, Aurojit Panda, and + Siddhartha Sen. [Measuring the Effect of Training Data on Deep Learning Predictions + via Randomized Experiments](https://proceedings.mlr.press/v162/lin22h.html). In + Proceedings of the 39th International Conference on Machine Learning, 13468–504. + PMLR, 2022. +""" + +from __future__ import annotations + +from typing import Type + +import numpy as np +import scipy.stats.distributions as dist +from scipy.integrate import quad +from scipy.stats import rv_continuous + +from pydvl.utils.numeric import complement, random_subset +from pydvl.utils.types import Seed +from pydvl.valuation.samplers.powerset import ( + IndexIteration, + PowersetSampler, + SequentialIndexIteration, +) +from pydvl.valuation.samplers.utils import StochasticSamplerMixin +from pydvl.valuation.types import IndexSetT, Sample, SampleGenerator + + +class AMESampler(StochasticSamplerMixin, PowersetSampler): + """AMESampler implements the two-stage Average Marginal Effect sampling scheme. + + For each index i, it first samples a probability p from `p_distribution`. Then it + samples a subset from the complement of i by including each element independently + following a Bernoulli distribution with parameter p. + + Args: + p_distribution: A scipy + [continuous random variable][scipy.stats._distn_infrastructure.rv_continuous] + Defaults to [scipy.stats.distributions.uniform][]. + batch_size: Number of samples per batch. + index_iteration: Strategy to iterate over indices. + seed: Seed or random state. + """ + + def __init__( + self, + p_distribution: rv_continuous | None = None, + batch_size: int = 1, + index_iteration: Type[IndexIteration] = SequentialIndexIteration, + seed: Seed | None = None, + ): + super().__init__( + batch_size=batch_size, index_iteration=index_iteration, seed=seed + ) + self.p_distribution = p_distribution or dist.uniform + + def generate(self, indices: IndexSetT) -> SampleGenerator: + """For each index i, sample p ~ p_distribution and then generate a subset of + the complement by including each element independently with probability p. + """ + for idx in self.index_iterable(indices): + _complement = complement(indices, [idx]) + p = self.p_distribution.rvs(random_state=self._rng).item() + subset = random_subset(_complement, p, self._rng) + yield Sample(idx, subset) + + def sample_limit(self, indices: IndexSetT) -> int | None: + """The sample limit is determined by the outer loop over indices.""" + return self._index_iterator_cls.length(len(indices)) + + def log_weight(self, n: int, subset_len: int) -> float: + r"""Computes the probability of a sample of a given size. + + Let m = n - 1 or m = n depending on the index iteration strategy. For a given + index i and a sampled subset S of size k, The probability of obtaining S under + the two-stage scheme is the result of marginalizing over the distribution of p: + + $$P(S) = \int_0^1 f(p) p^k (1-p)^{m-k} dp.$$ + + Args: + n: The size of the dataset. + subset_len: The size of the sampled subset. + Returns: + The natural logarithm of P(S) + """ + m = self._index_iterator_cls.complement_size(n) + integrand = ( + lambda p: self.p_distribution.pdf(p) + * (p**subset_len) + * ((1 - p) ** (m - subset_len)) + ) + I, abserr = quad(integrand, 0, 1, epsabs=1e-12) # noqa + if I < 0: + raise ValueError("Computed integral is non-positive.") + return float(np.log(I) if I > 0 else -np.inf) diff --git a/src/pydvl/valuation/samplers/msr.py b/src/pydvl/valuation/samplers/msr.py index 33bcf281e..cf61a28ee 100644 --- a/src/pydvl/valuation/samplers/msr.py +++ b/src/pydvl/valuation/samplers/msr.py @@ -217,6 +217,11 @@ def log_weight(self, n: int, subset_len: int) -> float: """ return float(-(n - 1) * np.log(2)) if n > 0 else 0.0 + def sample_limit(self, indices: IndexSetT) -> int | None: + if len(indices) == 0: + return 0 + return None + def make_strategy( self, utility: UtilityBase, diff --git a/src/pydvl/valuation/samplers/owen.py b/src/pydvl/valuation/samplers/owen.py index 7d5b281d1..1b30be0f8 100644 --- a/src/pydvl/valuation/samplers/owen.py +++ b/src/pydvl/valuation/samplers/owen.py @@ -219,11 +219,10 @@ def log_weight(self, n: int, subset_len: int) -> float: $$ P (| S_{q_j} | = k) = \binom{n}{k} \ q_j^k (1 - q_j)^{n - k}.$$ So, if each $q_j$ is chosen with equal weight (or more generally with - probability $p_j$),then by total probability, the overall probability of - obtaining a subset of size $k$ is a mixture of the binomials: - $$ - P (| S | = k) = \sum_{j = 1}^N p_j \ \binom{n}{k} \ q_j^k (1 - q_j)^{n - k}. - $$ + probability $p_j=P(q_j=q)$ for any $q$),then by total probability, the overall + probability of obtaining a subset of size $k$ is a mixture of the binomials: + + $$P(|S| = k) = \sum_{j = 1}^N p_j \ \binom{n}{k} \ q_j^k (1 - q_j)^{n - k}.$$ In our case $p_j = 1/N$, so that $P(|S|=k) = \frac{1}{N} \sum_{j=1}^N P (| S_{q_j} | = k)$. For large enough $N$ this is diff --git a/src/pydvl/valuation/samplers/permutation.py b/src/pydvl/valuation/samplers/permutation.py index f3a06906a..3ad93d7ef 100644 --- a/src/pydvl/valuation/samplers/permutation.py +++ b/src/pydvl/valuation/samplers/permutation.py @@ -179,6 +179,11 @@ def skip_indices(self, indices: IndexSetT): estimates if not done carefully.""" self._skip_indices = indices + def sample_limit(self, indices: IndexSetT) -> int | None: + if len(indices) == 0: + return 0 + return None + def generate(self, indices: IndexSetT) -> SampleGenerator: """Generates the permutation samples. diff --git a/src/pydvl/valuation/samplers/powerset.py b/src/pydvl/valuation/samplers/powerset.py index 8b7d1d6bb..5ec267022 100644 --- a/src/pydvl/valuation/samplers/powerset.py +++ b/src/pydvl/valuation/samplers/powerset.py @@ -206,7 +206,10 @@ def complement_size(n: int) -> int: class FiniteRandomIndexIteration(FiniteIterationMixin, RandomIndexIteration): - """Samples indices at random, once""" + """Samples indices at random, once, without replacement. + + Each index is sampled exactly once. + """ def __iter__(self) -> Generator[IndexT, None, None]: if len(self._indices) == 0: @@ -585,6 +588,9 @@ def generate(self, indices: IndexSetT) -> SampleGenerator: subset = random_subset(complement(indices, [idx]), seed=self._rng) yield Sample(idx, subset) + def sample_limit(self, indices: IndexSetT) -> int | None: + return self._index_iterator_cls.length(len(indices)) + class AntitheticSampler(StochasticSamplerMixin, PowersetSampler): """A sampler that draws samples uniformly, followed by their complements. @@ -604,3 +610,7 @@ def generate(self, indices: IndexSetT) -> SampleGenerator: subset = random_subset(_complement, seed=self._rng) yield Sample(idx, subset) yield Sample(idx, complement(_complement, subset)) + + def sample_limit(self, indices: IndexSetT) -> int | None: + len_outer = self._index_iterator_cls.length(len(indices)) + return len_outer * 2 if len_outer is not None else None diff --git a/src/pydvl/valuation/samplers/stratified.py b/src/pydvl/valuation/samplers/stratified.py index 1367e275b..22b601faa 100644 --- a/src/pydvl/valuation/samplers/stratified.py +++ b/src/pydvl/valuation/samplers/stratified.py @@ -725,7 +725,13 @@ def __iter__(self) -> Generator[tuple[int, int], None, None]: class RandomSizeIteration(SampleSizeIteration): - """Draws a set size $k$ following the distribution of sizes given by the strategy.""" + """Draws a set size $k$ following the distribution of sizes given by the strategy. + + Args: + strategy: The strategy to use for computing the number of samples to take. + n_indices: The number of indices in the index set from which samples are taken. + seed: The seed for the random number generator. + """ def __init__( self, strategy: SampleSizeStrategy, n_indices: int, seed: Seed | None = None diff --git a/src/pydvl/valuation/samplers/utils.py b/src/pydvl/valuation/samplers/utils.py index 725d172fe..ae115d897 100644 --- a/src/pydvl/valuation/samplers/utils.py +++ b/src/pydvl/valuation/samplers/utils.py @@ -9,7 +9,6 @@ import numpy as np from pydvl.utils.types import Seed -from pydvl.valuation.types import IndexSetT class StochasticSamplerMixin: @@ -22,6 +21,3 @@ class StochasticSamplerMixin: def __init__(self, *args, seed: Seed | None = None, **kwargs): super().__init__(*args, **kwargs) self._rng = np.random.default_rng(seed) - - def sample_limit(self, indices: IndexSetT) -> int | None: - return None diff --git a/src/pydvl/valuation/types.py b/src/pydvl/valuation/types.py index 04a778774..8596af6ac 100644 --- a/src/pydvl/valuation/types.py +++ b/src/pydvl/valuation/types.py @@ -35,6 +35,7 @@ "ValueUpdateT", ] + IndexT: TypeAlias = np.int_ IndexSetT: TypeAlias = NDArray[IndexT] NameT: TypeAlias = Union[np.object_, np.int_, np.str_] diff --git a/tests/valuation/methods/test_semivalues.py b/tests/valuation/methods/test_semivalues.py index b4b0ddda9..8bcacc44a 100644 --- a/tests/valuation/methods/test_semivalues.py +++ b/tests/valuation/methods/test_semivalues.py @@ -66,7 +66,7 @@ def log_weight(self, n: int, j: int) -> float: ) @pytest.mark.parametrize( "sampler_cls, sampler_kwargs", - deterministic_samplers() + random_samplers(proper=True), + deterministic_samplers() + random_samplers(proper=True, stratified=False), ) @pytest.mark.parametrize( "valuation_cls, valuation_kwargs, exact_values_attr", @@ -95,10 +95,11 @@ def test_games( sampler_cls, sampler_kwargs, seed=seed, - # For stratified samplers: - lower_bound=1, - upper_bound=None, - n_samples=n_samples, # Required for cases using FiniteSequentialSizeIteration + # Testing stratified samplers is too slow and they are tested elsewhere. + # # For stratified samplers: + # lower_bound=1, + # upper_bound=None, + # n_samples=n_samples, # Required for cases using FiniteSequentialSizeIteration ) valuation = valuation_cls( utility=test_game.u, diff --git a/tests/valuation/samplers/test_sampler.py b/tests/valuation/samplers/test_sampler.py index 0b7a45f94..95be43476 100644 --- a/tests/valuation/samplers/test_sampler.py +++ b/tests/valuation/samplers/test_sampler.py @@ -43,6 +43,7 @@ UniformOwenStrategy, UniformSampler, ) +from pydvl.valuation.samplers.ame import AMESampler from pydvl.valuation.samplers.permutation import PermutationSamplerBase from pydvl.valuation.types import IndexSetT, Sample, SampleGenerator from pydvl.valuation.utility.base import UtilityBase @@ -151,6 +152,19 @@ def permutation_samplers(): ] +def amesampler_samplers(): + return [ + ( + AMESampler, + {"index_iteration": RandomIndexIteration, "seed": lambda seed: seed}, + ), + ( + AMESampler, + {"index_iteration": SequentialIndexIteration, "seed": lambda seed: seed}, + ), + ] + + def powerset_samplers(): ret = [] @@ -291,20 +305,24 @@ def owen_samplers(): ] -def random_samplers(proper: bool = False, n_samples_per_index: int = 32): +def random_samplers(proper: bool = False, stratified: bool = True): """Use this as parameter values in pytest.mark.parametrize for parameters "sampler_cls, sampler_kwargs" Build the objects with recursive_make(sampler_cls, sampler_kwargs, **lambda_args) where lambda args are named as the key in the dictionary that contains the lambda. """ - return ( + ret = ( permutation_samplers() + powerset_samplers() - + stratified_samplers(n_samples_per_index=n_samples_per_index) + + amesampler_samplers() + owen_samplers() - + (improper_samplers() if not proper else []) ) + if stratified: + ret += stratified_samplers() + if not proper: + ret += improper_samplers() + return ret @pytest.mark.parametrize( @@ -550,9 +568,13 @@ def test_length_of_infinite_samplers( # check that we can generate samples that are longer than size of powerset batches = list(islice(sampler.generate_batches(indices), n_batches)) if len(indices) > 0: + assert sampler.sample_limit(indices) is None assert ( len(list(flatten(batches))) == n_batches * batch_size == sampler.n_samples ) + else: + assert sampler.sample_limit(indices) == 0 + with pytest.raises(TypeError): len(sampler)