     sigmoid,
 )
 from pytensor.tensor.blockwise import Blockwise
-from pytensor.tensor.einsum import _delta
 from pytensor.tensor.elemwise import DimShuffle
 from pytensor.tensor.exceptions import NotScalarConstantError
 from pytensor.tensor.linalg import cholesky, det, eigh, solve_triangular, trace
@@ -1213,12 +1212,8 @@ def rv_op(cls, n, eta, sd_dist, *, size=None):
         D = sd_dist.type(name="D")  # Make sd_dist opaque to OpFromGraph
         size = D.shape[:-1]
 
-        # We flatten the size to make operations easier, and then rebuild it
-        flat_size = pt.prod(size, dtype="int64")
-
-        next_rng, C = LKJCorrRV._random_corr_matrix(rng=rng, n=n, eta=eta, flat_size=flat_size)
-        D_matrix = D.reshape((flat_size, n))
-        C *= D_matrix[..., :, None] * D_matrix[..., None, :]
+        next_rng, C = LKJCorrRV._random_corr_matrix(rng=rng, n=n, eta=eta, size=size)
+        C *= D[..., :, None] * D[..., None, :]
 
         tril_idx = pt.tril_indices(n, k=0)
         samples = pt.linalg.cholesky(C)[..., tril_idx[0], tril_idx[1]]
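The scaling step now broadcasts the batched standard deviations `D` directly against the batched correlation matrices, instead of flattening everything to `(flat_size, n)` and reshaping back. A minimal NumPy sketch of the identity this relies on (shapes and names are illustrative, not part of the PR):

```python
import numpy as np

rng = np.random.default_rng(0)
D = rng.uniform(0.5, 2.0, size=(4, 3))            # batched std devs, shape (*size, n)
C = np.broadcast_to(np.eye(3), (4, 3, 3)).copy()  # batched correlation matrices

# Broadcasting D over the last two axes is the batched diag(D) @ C @ diag(D)
cov = C * D[..., :, None] * D[..., None, :]

b = 2
np.testing.assert_allclose(cov[b], np.diag(D[b]) @ C[b] @ np.diag(D[b]))
```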
@@ -1520,53 +1515,52 @@ def make_node(self, rng, size, n, eta):
 
     @classmethod
     def rv_op(cls, n: int, eta, *, rng=None, size=None):
-        # We flatten the size to make operations easier, and then rebuild it
+        # HACK: normalize_size_param doesn't handle size=() properly
+        if not size:
+            size = None
+
         n = pt.as_tensor(n, ndim=0, dtype=int)
         eta = pt.as_tensor(eta, ndim=0)
         rng = normalize_rng_param(rng)
         size = normalize_size_param(size)
 
-        if rv_size_is_none(size):
-            flat_size = 1
-        else:
-            flat_size = pt.prod(size, dtype="int64")
+        next_rng, C = cls._random_corr_matrix(rng=rng, n=n, eta=eta, size=size)
 
-        next_rng, C = cls._random_corr_matrix(rng=rng, n=n, eta=eta, flat_size=flat_size)
-        C = C[0] if rv_size_is_none(size) else C.reshape((*size, n, n))
-
-        return cls(
-            inputs=[rng, size, n, eta],
-            outputs=[next_rng, C],
-        )(rng, size, n, eta)
+        return cls(inputs=[rng, size, n, eta], outputs=[next_rng, C])(rng, size, n, eta)
 
     @classmethod
     def _random_corr_matrix(
-        cls, rng: Variable, n: int, eta: TensorVariable, flat_size: TensorVariable
+        cls, rng: Variable, n: int, eta: TensorVariable, size: TensorVariable
     ) -> tuple[Variable, TensorVariable]:
         # original implementation in R see:
         # https://github.com/rmcelreath/rethinking/blob/master/R/distributions.r
+        size = () if rv_size_is_none(size) else size
 
         beta = eta - 1.0 + n / 2.0
-        next_rng, beta_rvs = pt.random.beta(
-            alpha=beta, beta=beta, size=flat_size, rng=rng
-        ).owner.outputs
+        next_rng, beta_rvs = pt.random.beta(alpha=beta, beta=beta, size=size, rng=rng).owner.outputs
         r12 = 2.0 * beta_rvs - 1.0
-        P = pt.full((flat_size, n, n), pt.eye(n))
+
+        P = pt.full((*size, n, n), pt.eye(n))
         P = P[..., 0, 1].set(r12)
         P = P[..., 1, 1].set(pt.sqrt(1.0 - r12**2))
         n = get_underlying_scalar_constant_value(n)
 
         for mp1 in range(2, n):
             beta -= 0.5
+
             next_rng, y = pt.random.beta(
-                alpha=mp1 / 2.0, beta=beta, size=flat_size, rng=next_rng
+                alpha=mp1 / 2.0, beta=beta, size=size, rng=next_rng
             ).owner.outputs
+
             next_rng, z = pt.random.normal(
-                loc=0, scale=1, size=(flat_size, mp1), rng=next_rng
+                loc=0, scale=1, size=(*size, mp1), rng=next_rng
             ).owner.outputs
-            z = z / pt.sqrt(pt.einsum("ij,ij->i", z, z.copy()))[..., np.newaxis]
+
+            ein_sig_z = "i, i->" if z.ndim == 1 else "...ij, ...ij->...i"
+            z = z / pt.sqrt(pt.einsum(ein_sig_z, z, z.copy()))[..., np.newaxis]
             P = P[..., 0:mp1, mp1].set(pt.sqrt(y[..., np.newaxis]) * z)
             P = P[..., mp1, mp1].set(pt.sqrt(1.0 - y))
+
         C = pt.einsum("...ji,...jk->...ik", P, P.copy())
 
         return next_rng, C
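Threading the user-facing `size` through every `pt.random` call (with `size = ()` for the unbatched case) lets the onion construction run on arbitrary batch shapes without the flatten/reshape round-trip. For reference, a single-matrix NumPy transcription of the onion algorithm the loop above implements (a hedged sketch, not the PyTensor graph itself; the function name is illustrative):

```python
import numpy as np

def onion_corr(n: int, eta: float, rng: np.random.Generator) -> np.ndarray:
    """Sample one n x n correlation matrix via the onion method."""
    beta = eta - 1.0 + n / 2.0
    P = np.eye(n)
    r12 = 2.0 * rng.beta(beta, beta) - 1.0
    P[0, 1] = r12
    P[1, 1] = np.sqrt(1.0 - r12**2)
    for mp1 in range(2, n):
        beta -= 0.5
        y = rng.beta(mp1 / 2.0, beta)
        z = rng.normal(size=mp1)
        z /= np.sqrt(z @ z)            # project onto the unit sphere
        P[0:mp1, mp1] = np.sqrt(y) * z  # grow the matrix one column at a time
        P[mp1, mp1] = np.sqrt(1.0 - y)
    return P.T @ P                      # C = P^T P, matching einsum "...ji,...jk->...ik"

C = onion_corr(4, eta=2.0, rng=np.random.default_rng(0))
assert np.allclose(np.diag(C), 1.0)    # unit diagonal, i.e. a correlation matrix
```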
@@ -1584,10 +1578,7 @@ def dist(cls, n, eta, **kwargs):
 
     @staticmethod
     def support_point(rv: TensorVariable, *args):
-        ndim = rv.ndim
-
-        # Batched identity matrix
-        return _delta(rv.shape, (ndim - 2, ndim - 1)).astype(int)
+        return pt.broadcast_to(pt.eye(rv.shape[-1]), rv.shape)
 
     @staticmethod
     def logp(value: TensorVariable, n, eta):
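The new `support_point` replaces the private `_delta` helper (whose import is removed above) with a plain broadcast of the identity matrix, which covers any batch shape. A minimal NumPy sketch of the equivalent operation (the shape is illustrative):

```python
import numpy as np

shape = (5, 2, 3, 3)  # (*batch, n, n)
batched_eye = np.broadcast_to(np.eye(shape[-1]), shape)

assert batched_eye.shape == shape
assert np.allclose(batched_eye[4, 1], np.eye(3))
```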