
Commit 8f58784

SebastianAment authored and facebook-github-bot committed
Introducing LogNoisyExpectedImprovement (#1577)
Summary:
Pull Request resolved: #1577

Follow up on D41890063 (d819b2d), continuing the logarithmification of improvement-based acquisition functions. Notably, the existing test case for `NEI` already exhibits acquisition values that are exactly zero and gradients on the order of machine epsilon. This is solved by `LogNEI`.

Reviewed By: Balandat

Differential Revision: D42109272

fbshipit-source-id: 96ca29dfeba3687b708beff2a21fb3da4ae98d2b
1 parent 37e5061 commit 8f58784
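For context, here is a minimal end-to-end sketch of how the new acquisition function is meant to be used. The toy data, fitting call, and optimizer settings are illustrative assumptions following standard BoTorch usage (`fit_gpytorch_mll`, `optimize_acqf`); they are not part of this commit:

import torch
from botorch.acquisition.analytic import LogNoisyExpectedImprovement
from botorch.fit import fit_gpytorch_mll
from botorch.models import FixedNoiseGP
from botorch.optim import optimize_acqf
from gpytorch.mlls import ExactMarginalLogLikelihood

# toy training data; shapes and noise level are illustrative only
train_X = torch.rand(10, 2, dtype=torch.double)
train_Y = (6.28 * train_X).sin().sum(dim=-1, keepdim=True)
train_Yvar = torch.full_like(train_Y, 1e-4)

# LogNEI requires a FixedNoiseGP so that fantasies can be treated as noiseless
model = FixedNoiseGP(train_X, train_Y, train_Yvar=train_Yvar)
fit_gpytorch_mll(ExactMarginalLogLikelihood(model.likelihood, model))

log_nei = LogNoisyExpectedImprovement(model, X_observed=train_X, num_fantasies=20)

candidate, acq_value = optimize_acqf(
    acq_function=log_nei,
    bounds=torch.tensor([[0.0, 0.0], [1.0, 1.0]], dtype=torch.double),
    q=1,  # LogNEI only supports q=1
    num_restarts=10,
    raw_samples=128,
)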

File tree

3 files changed: +184 −54 lines


botorch/acquisition/analytic.py

Lines changed: 133 additions & 42 deletions
@@ -14,6 +14,8 @@
 import math

 from abc import ABC
+
+from contextlib import nullcontext
 from copy import deepcopy
 from typing import Dict, Optional, Tuple, Union

@@ -441,6 +443,91 @@ def _compute_prob_feas(self, X: Tensor, means: Tensor, sigmas: Tensor) -> Tensor
         return prob_feas


+class LogNoisyExpectedImprovement(AnalyticAcquisitionFunction):
+    r"""Single-outcome Log Noisy Expected Improvement (via fantasies).
+
+    This computes Log Noisy Expected Improvement by averaging over the Expected
+    Improvement values of a number of fantasy models. Only supports the case
+    `q=1`. Assumes that the posterior distribution of the model is Gaussian.
+    The model must be single-outcome.
+
+    `LogNEI(x) = log(E(max(y - max Y_base, 0))), (y, Y_base) ~ f((x, X_base))`,
+    where `X_base` are previously observed points.
+
+    Note: This acquisition function currently relies on using a FixedNoiseGP (required
+    for noiseless fantasies).
+
+    Example:
+        >>> model = FixedNoiseGP(train_X, train_Y, train_Yvar=train_Yvar)
+        >>> LogNEI = LogNoisyExpectedImprovement(model, train_X)
+        >>> nei = LogNEI(test_X)
+    """
+
+    def __init__(
+        self,
+        model: GPyTorchModel,
+        X_observed: Tensor,
+        num_fantasies: int = 20,
+        maximize: bool = True,
+        posterior_transform: Optional[PosteriorTransform] = None,
+        **kwargs,
+    ) -> None:
+        r"""Single-outcome Noisy Log Expected Improvement (via fantasies).
+
+        Args:
+            model: A fitted single-outcome model.
+            X_observed: A `n x d` Tensor of observed points that are likely to
+                be the best observed points so far.
+            num_fantasies: The number of fantasies to generate. The higher this
+                number the more accurate the model (at the expense of model
+                complexity and performance).
+            maximize: If True, consider the problem a maximization problem.
+        """
+        if not isinstance(model, FixedNoiseGP):
+            raise UnsupportedError(
+                "Only FixedNoiseGPs are currently supported for fantasy LogNEI"
+            )
+        # sample fantasies
+        from botorch.sampling.normal import SobolQMCNormalSampler
+
+        # Drop gradients from model.posterior if X_observed does not require gradients
+        # as otherwise, gradients of the GP's kernel's hyper-parameters are tracked
+        # through the rsample_from_base_sample method of GPyTorchPosterior. These
+        # gradients are usually only required w.r.t. the marginal likelihood.
+        with nullcontext() if X_observed.requires_grad else torch.no_grad():
+            posterior = model.posterior(X=X_observed)
+            sampler = SobolQMCNormalSampler(sample_shape=torch.Size([num_fantasies]))
+            Y_fantasized = sampler(posterior).squeeze(-1)
+        batch_X_observed = X_observed.expand(num_fantasies, *X_observed.shape)
+        # The fantasy model will operate in batch mode
+        fantasy_model = _get_noiseless_fantasy_model(
+            model=model, batch_X_observed=batch_X_observed, Y_fantasized=Y_fantasized
+        )
+        super().__init__(
+            model=fantasy_model, posterior_transform=posterior_transform, **kwargs
+        )
+        best_f, _ = Y_fantasized.max(dim=-1) if maximize else Y_fantasized.min(dim=-1)
+        self.best_f, self.maximize = best_f, maximize
+
+    @t_batch_mode_transform(expected_q=1)
+    def forward(self, X: Tensor) -> Tensor:
+        r"""Evaluate logarithm of the mean Expected Improvement on the candidate set X.
+
+        Args:
+            X: A `b1 x ... bk x 1 x d`-dim batched tensor of `d`-dim design points.
+
+        Returns:
+            A `b1 x ... bk`-dim tensor of Log Noisy Expected Improvement values at
+            the given design points `X`.
+        """
+        # add batch dimension for broadcasting to fantasy models
+        mean, sigma = self._mean_and_sigma(X.unsqueeze(-3))
+        u = _scaled_improvement(mean, sigma, self.best_f, self.maximize)
+        log_ei = _log_ei_helper(u) + sigma.log()
+        # this is mathematically - though not numerically - equivalent to log(mean(ei))
+        return torch.logsumexp(log_ei, dim=-1) - math.log(log_ei.shape[-1])
+
+
 class NoisyExpectedImprovement(ExpectedImprovement):
     r"""Single-outcome Noisy Expected Improvement (via fantasies).
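The last line of `forward` above performs the averaging in log space: `logsumexp(log_ei, dim=-1) - log(n)` equals `log(mean(exp(log_ei)))` exactly in real arithmetic, but never materializes the underflow-prone `exp` values, so the result stays finite and differentiable. A small self-contained sketch of that identity (the values are illustrative only):

import math
import torch

# per-fantasy log-EI values; exp(-800) underflows to 0.0 in both float32 and float64
log_ei = torch.tensor([-800.0, -805.0, -810.0], dtype=torch.double)

# stable log-mean-exp, as returned by LogNEI's forward
log_mean_ei = torch.logsumexp(log_ei, dim=-1) - math.log(log_ei.shape[-1])
print(log_mean_ei.item())  # ~ -801.09, still finite

# the naive route forms exp() first and collapses to log(0) = -inf
print(log_ei.exp().mean().log().item())  # -inf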
@@ -486,10 +573,14 @@ def __init__(
         # sample fantasies
         from botorch.sampling.normal import SobolQMCNormalSampler

-        with torch.no_grad():
+        # Drop gradients from model.posterior if X_observed does not require gradients
+        # as otherwise, gradients of the GP's kernel's hyper-parameters are tracked
+        # through the rsample_from_base_sample method of GPyTorchPosterior. These
+        # gradients are usually only required w.r.t. the marginal likelihood.
+        with nullcontext() if X_observed.requires_grad else torch.no_grad():
             posterior = model.posterior(X=X_observed)
-        sampler = SobolQMCNormalSampler(sample_shape=torch.Size([num_fantasies]))
-        Y_fantasized = sampler(posterior).squeeze(-1)
+            sampler = SobolQMCNormalSampler(sample_shape=torch.Size([num_fantasies]))
+            Y_fantasized = sampler(posterior).squeeze(-1)
         batch_X_observed = X_observed.expand(num_fantasies, *X_observed.shape)
         # The fantasy model will operate in batch mode
         fantasy_model = _get_noiseless_fantasy_model(
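Both constructors now pick the gradient context conditionally: `nullcontext()` is a no-op, so gradients flow through the fantasy sampling when `X_observed` itself requires them, and `torch.no_grad()` is used otherwise so kernel hyperparameters do not pick up spurious gradients through the fantasy targets. A stripped-down sketch of the pattern with a toy function (not BoTorch code):

from contextlib import nullcontext

import torch


def sample_values(x: torch.Tensor) -> torch.Tensor:
    # Track gradients only if the caller passed a tensor that requires them;
    # otherwise run under no_grad so nothing is recorded on the autograd graph.
    with nullcontext() if x.requires_grad else torch.no_grad():
        return (2.0 * x).sum()


x_plain = torch.ones(3)                      # requires_grad=False
x_diff = torch.ones(3, requires_grad=True)   # requires_grad=True

print(sample_values(x_plain).requires_grad)  # False: computed under no_grad
print(sample_values(x_diff).requires_grad)   # True: graph kept for backprop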
@@ -515,45 +606,6 @@ def forward(self, X: Tensor) -> Tensor:
         return (sigma * _ei_helper(u)).mean(dim=-1)


-def _get_noiseless_fantasy_model(
-    model: FixedNoiseGP, batch_X_observed: Tensor, Y_fantasized: Tensor
-) -> FixedNoiseGP:
-    r"""Construct a fantasy model from a fitted model and provided fantasies.
-
-    The fantasy model uses the hyperparameters from the original fitted model and
-    assumes the fantasies are noiseless.
-
-    Args:
-        model: a fitted FixedNoiseGP
-        batch_X_observed: A `b x n x d` tensor of inputs where `b` is the number of
-            fantasies.
-        Y_fantasized: A `b x n` tensor of fantasized targets where `b` is the number of
-            fantasies.
-
-    Returns:
-        The fantasy model.
-    """
-    # initialize a copy of FixedNoiseGP on the original training inputs
-    # this makes FixedNoiseGP a non-batch GP, so that the same hyperparameters
-    # are used across all batches (by default, a GP with batched training data
-    # uses independent hyperparameters for each batch).
-    fantasy_model = FixedNoiseGP(
-        train_X=model.train_inputs[0],
-        train_Y=model.train_targets.unsqueeze(-1),
-        train_Yvar=model.likelihood.noise_covar.noise.unsqueeze(-1),
-    )
-    # update training inputs/targets to be batch mode fantasies
-    fantasy_model.set_train_data(
-        inputs=batch_X_observed, targets=Y_fantasized, strict=False
-    )
-    # use noiseless fantasies
-    fantasy_model.likelihood.noise_covar.noise = torch.full_like(Y_fantasized, 1e-7)
-    # load hyperparameters from original model
-    state_dict = deepcopy(model.state_dict())
-    fantasy_model.load_state_dict(state_dict)
-    return fantasy_model
-
-
 class UpperConfidenceBound(AnalyticAcquisitionFunction):
     r"""Single-outcome Upper Confidence Bound (UCB).
@@ -807,3 +859,42 @@ def _construct_dist(means: Tensor, sigmas: Tensor, inds: Tensor) -> Normal:
     mean = means.index_select(dim=-1, index=inds)
     sigma = sigmas.index_select(dim=-1, index=inds)
     return Normal(loc=mean, scale=sigma)
+
+
+def _get_noiseless_fantasy_model(
+    model: FixedNoiseGP, batch_X_observed: Tensor, Y_fantasized: Tensor
+) -> FixedNoiseGP:
+    r"""Construct a fantasy model from a fitted model and provided fantasies.
+
+    The fantasy model uses the hyperparameters from the original fitted model and
+    assumes the fantasies are noiseless.
+
+    Args:
+        model: a fitted FixedNoiseGP
+        batch_X_observed: A `b x n x d` tensor of inputs where `b` is the number of
+            fantasies.
+        Y_fantasized: A `b x n` tensor of fantasized targets where `b` is the number of
+            fantasies.
+
+    Returns:
+        The fantasy model.
+    """
+    # initialize a copy of FixedNoiseGP on the original training inputs
+    # this makes FixedNoiseGP a non-batch GP, so that the same hyperparameters
+    # are used across all batches (by default, a GP with batched training data
+    # uses independent hyperparameters for each batch).
+    fantasy_model = FixedNoiseGP(
+        train_X=model.train_inputs[0],
+        train_Y=model.train_targets.unsqueeze(-1),
+        train_Yvar=model.likelihood.noise_covar.noise.unsqueeze(-1),
+    )
+    # update training inputs/targets to be batch mode fantasies
+    fantasy_model.set_train_data(
+        inputs=batch_X_observed, targets=Y_fantasized, strict=False
+    )
+    # use noiseless fantasies
+    fantasy_model.likelihood.noise_covar.noise = torch.full_like(Y_fantasized, 1e-7)
+    # load hyperparameters from original model
+    state_dict = deepcopy(model.state_dict())
+    fantasy_model.load_state_dict(state_dict)
+    return fantasy_model
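`_get_noiseless_fantasy_model` is unchanged by this commit; it has only been moved to the end of the module. Its trick is to construct the `FixedNoiseGP` on non-batched data first, so that it carries a single shared set of hyperparameters, and only then to swap in the batched fantasy data via `set_train_data`. A rough sketch of the resulting shape behavior on toy data (this mimics the private helper rather than importing it; the printed shapes are what I would expect under these assumptions):

import torch
from botorch.models import FixedNoiseGP

n, d, num_fantasies = 8, 2, 5
train_X = torch.rand(n, d, dtype=torch.double)
train_Y = train_X.sum(dim=-1, keepdim=True)
train_Yvar = torch.full_like(train_Y, 1e-4)

# non-batched construction: one shared set of hyperparameters
fantasy_model = FixedNoiseGP(train_X, train_Y, train_Yvar=train_Yvar)

# swap in batched fantasy data without re-initializing hyperparameters
batch_X = train_X.expand(num_fantasies, n, d)
Y_fantasized = torch.randn(num_fantasies, n, dtype=torch.double)
fantasy_model.set_train_data(inputs=batch_X, targets=Y_fantasized, strict=False)
# treat the fantasies as (essentially) noiseless, as in the helper above
fantasy_model.likelihood.noise_covar.noise = torch.full_like(Y_fantasized, 1e-7)

# evaluating with an extra broadcast dimension (as LogNEI's forward does via
# X.unsqueeze(-3)) yields one posterior per fantasy per candidate
X_test = torch.rand(3, 1, d, dtype=torch.double)  # 3 candidates, q=1
posterior = fantasy_model.posterior(X_test.unsqueeze(-3))
print(posterior.mean.shape)  # expected: torch.Size([3, 5, 1, 1])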

botorch/acquisition/input_constructors.py

Lines changed: 2 additions & 1 deletion
@@ -33,6 +33,7 @@
     ConstrainedExpectedImprovement,
     ExpectedImprovement,
     LogExpectedImprovement,
+    LogNoisyExpectedImprovement,
     NoisyExpectedImprovement,
     PosteriorMean,
     ProbabilityOfImprovement,
@@ -380,7 +381,7 @@ def construct_inputs_constrained_ei(
     raise NotImplementedError  # pragma: nocover


-@acqf_input_constructor(NoisyExpectedImprovement)
+@acqf_input_constructor(NoisyExpectedImprovement, LogNoisyExpectedImprovement)
 def construct_inputs_noisy_ei(
     model: Model,
     training_data: MaybeDict[SupervisedDataset],
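Registering `LogNoisyExpectedImprovement` with the existing constructor means downstream helpers that assemble acquisition-function arguments from a dataset resolve it exactly like `NoisyExpectedImprovement`. A rough sketch of the lookup, assuming the module's `get_acqf_input_constructor` helper; the model and dataset are not constructed here:

from botorch.acquisition.analytic import LogNoisyExpectedImprovement
from botorch.acquisition.input_constructors import get_acqf_input_constructor

# the decorator above registers LogNEI under the same constructor as NEI,
# so the registry lookup returns construct_inputs_noisy_ei for both classes
constructor = get_acqf_input_constructor(LogNoisyExpectedImprovement)

# given a fitted `model` and a SupervisedDataset `dataset` (not shown here):
# kwargs = constructor(model=model, training_data=dataset, num_fantasies=20)
# acqf = LogNoisyExpectedImprovement(**kwargs)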

test/acquisition/test_analytic.py

Lines changed: 49 additions & 11 deletions
@@ -14,6 +14,7 @@
     ConstrainedExpectedImprovement,
     ExpectedImprovement,
     LogExpectedImprovement,
+    LogNoisyExpectedImprovement,
     NoisyExpectedImprovement,
     PosteriorMean,
     ProbabilityOfImprovement,
@@ -551,14 +552,33 @@ def test_noisy_expected_improvement(self):
         for dtype in (torch.float, torch.double):
             model = self._get_model(dtype=dtype)
             X_observed = model.train_inputs[0]
-            nEI = NoisyExpectedImprovement(model, X_observed, num_fantasies=5)
+            nfan = 5
+            nEI = NoisyExpectedImprovement(model, X_observed, num_fantasies=nfan)
+            LogNEI = LogNoisyExpectedImprovement(model, X_observed, num_fantasies=nfan)
+            # before assigning, check that the attributes exist
+            self.assertTrue(hasattr(LogNEI, "model"))
+            self.assertTrue(hasattr(LogNEI, "best_f"))
+            self.assertTrue(isinstance(LogNEI.model, FixedNoiseGP))
+            LogNEI.model = nEI.model  # let the two share their values and fantasies
+            LogNEI.best_f = nEI.best_f
+
             X_test = torch.tensor(
                 [[[0.25]], [[0.75]]],
                 device=X_observed.device,
                 dtype=dtype,
-                requires_grad=True,
             )
+            X_test_log = X_test.clone()
+            X_test.requires_grad = True
+            X_test_log.requires_grad = True
             val = nEI(X_test)
+            # testing logNEI yields the same result (also checks dtype)
+            log_val = LogNEI(X_test_log)
+            exp_log_val = log_val.exp()
+            # notably, val[1] is usually zero in this test, which is precisely what
+            # gives rise to problems during optimization, and what logNEI avoids
+            # since it generally takes a large negative number (<-2000) and has
+            # strong gradient signals in this regime.
+            self.assertTrue(torch.allclose(exp_log_val, val))
             # test basics
             self.assertEqual(val.dtype, dtype)
             self.assertEqual(val.device.type, X_observed.device.type)
@@ -569,17 +589,35 @@ def test_noisy_expected_improvement(self):
             # test gradient
             val.sum().backward()
             self.assertGreater(X_test.grad[0].abs().item(), 1e-5)
-            # test without gradient
-            with torch.no_grad():
-                nEI(X_test)
+            # testing gradient through exp of log computation
+            exp_log_val.sum().backward()
+            # testing that the first gradient element coincides. The second is in the
+            # regime where the naive implementation loses accuracy.
+            atol = 1e-5 if dtype == torch.float32 else 1e-14
+            self.assertTrue(
+                torch.allclose(X_test.grad[0], X_test_log.grad[0], atol=atol)
+            )
+
             # test non-FixedNoiseGP model
             other_model = SingleTaskGP(X_observed, model.train_targets.unsqueeze(-1))
-            with self.assertRaises(UnsupportedError):
-                NoisyExpectedImprovement(other_model, X_observed, num_fantasies=5)
-            # Test with minimize
-            nEI = NoisyExpectedImprovement(
-                model, X_observed, num_fantasies=5, maximize=False
-            )
+            for constructor in (NoisyExpectedImprovement, LogNoisyExpectedImprovement):
+                with self.assertRaises(UnsupportedError):
+                    constructor(other_model, X_observed, num_fantasies=5)
+                # Test constructor with minimize
+                acqf = constructor(model, X_observed, num_fantasies=5, maximize=False)
+                # test evaluation without gradients enabled
+                with torch.no_grad():
+                    acqf(X_test)
+
+                # testing gradients are only propagated if X_observed requires them
+                # i.e. kernel hyper-parameters are not tracked through to best_f
+                X_observed.requires_grad = False
+                acqf = constructor(model, X_observed, num_fantasies=5)
+                self.assertFalse(acqf.best_f.requires_grad)
+
+                X_observed.requires_grad = True
+                acqf = constructor(model, X_observed, num_fantasies=5)
+                self.assertTrue(acqf.best_f.requires_grad)


 class TestScalarizedPosteriorMean(BotorchTestCase):
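The regime described in the test comments is easy to reproduce directly with the module's helpers: once the EI term underflows to exactly zero, its gradient is zero as well, whereas the log-space term stays finite with a strong gradient. A standalone illustration using the private `_ei_helper`/`_log_ei_helper` functions from botorch/acquisition/analytic.py (printed values are approximate):

import torch
from botorch.acquisition.analytic import _ei_helper, _log_ei_helper

# a stand-in for the scaled improvement u at an uninteresting candidate point
u = torch.tensor(-40.0, dtype=torch.double, requires_grad=True)

# naive EI-style term: underflows to exactly 0, so its gradient is exactly 0
# as well -- no signal left for a gradient-based optimizer
ei = _ei_helper(u)
ei.backward()
print(ei.item(), u.grad.item())  # 0.0 0.0

# log-space term used by LogNEI: large negative but finite, with a usable gradient
u2 = torch.tensor(-40.0, dtype=torch.double, requires_grad=True)
log_ei = _log_ei_helper(u2)
log_ei.backward()
print(log_ei.item(), u2.grad.item())  # roughly -808 and ~40, respectively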
