
Commit b4d0a72

eodole and LarsKue authored
Subset arrays (#411)
* made initial backend functions for adapter subsetting; still need to make the squeeze function and link it to the front end
* added subsample functionality; still to do: adding it to the testing procedures
* made the take function and ran the linter
* changed name of subsampling function
* changed documentation to be consistent with external notation rather than internal shorthand
* small formatting change to documentation
* changed subsample to have sample size and axis in the constructor
* moved transforms in adapter.py so they're in alphabetical order like the other transforms
* changed random_subsample to a MapTransform rather than a FilterTransform
* updated documentation with new naming convention
* added arguments of take to the constructor
* added feature to specify a percentage of the data to subsample rather than only integer input
* changed subsample in adapter.py to allow a float as input for the sample size
* renamed subsample_array and associated classes/functions to RandomSubsample and random_subsample, respectively
* included TypeError to force users to only subsample one dataset at a time
* ran linter
* re-ran formatter
* clean up random subsample transform and docs
* clean up take transform and docs
* nitpick clean-up
* skip shape check for subsampled adapter transform inverse
* fix serialization of new transforms
* skip randomly subsampled key in serialization consistency check

---------

Co-authored-by: LarsKue <lars@kuehmichel.de>
1 parent 52bdb58 commit b4d0a72
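
In short, the commit gives the Adapter two new array-subsetting methods: random_subsample, which keeps a random subset of one batch entry along an axis, and take, which keeps a fixed set of indices. A minimal usage sketch based on the diffs below; the key name "observables" and the array shape are made up for illustration:

import numpy as np
from bayesflow.adapters import Adapter

adapter = (
    Adapter()
    # keep a random 50% of the entries along the last axis of "observables"
    .random_subsample("observables", sample_size=0.5, axis=-1)
    # then keep only the first three of the remaining entries
    .take("observables", indices=np.arange(0, 3), axis=-1)
)

data = {"observables": np.random.standard_normal(size=(32, 10))}
subset = adapter(data)  # "observables" shrinks along the last axis; other keys pass through unchanged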

File tree

7 files changed (+159, -11 lines):

.gitignore
bayesflow/adapters/adapter.py
bayesflow/adapters/transforms/__init__.py
bayesflow/adapters/transforms/random_subsample.py
bayesflow/adapters/transforms/take.py
tests/test_adapters/conftest.py
tests/test_adapters/test_adapters.py

.gitignore

Lines changed: 3 additions & 0 deletions
@@ -39,3 +39,6 @@ docs/

 # MacOS
 .DS_Store
+
+# Rproj
+.Rproj.user

bayesflow/adapters/adapter.py

Lines changed: 61 additions & 1 deletion
@@ -25,6 +25,8 @@
     Standardize,
     ToArray,
     Transform,
+    RandomSubsample,
+    Take,
 )
 from .transforms.filter_transform import Predicate

@@ -665,6 +667,28 @@ def one_hot(self, keys: str | Sequence[str], num_classes: int):
         self.transforms.append(transform)
         return self

+    def random_subsample(self, key: str, *, sample_size: int | float, axis: int = -1):
+        """
+        Append a :py:class:`~transforms.RandomSubsample` transform to the adapter.
+
+        Parameters
+        ----------
+        key : str
+            The name of the variable to subsample.
+        sample_size : int or float
+            The number of samples to draw, or a fraction between 0 and 1 of the total number of samples to draw.
+        axis : int, optional
+            Which axis to draw samples over. The last axis is used by default.
+        """
+
+        if not isinstance(key, str):
+            raise TypeError("Can only subsample one batch entry at a time.")
+
+        transform = MapTransform({key: RandomSubsample(sample_size=sample_size, axis=axis)})
+
+        self.transforms.append(transform)
+        return self
+
     def rename(self, from_key: str, to_key: str):
         """Append a :py:class:`~transforms.Rename` transform to the adapter.

@@ -741,7 +765,7 @@ def standardize(
             Names of variables to include in the transform.
         exclude : str or Sequence of str, optional
             Names of variables to exclude from the transform.
-        **kwargs : dict
+        **kwargs :
             Additional keyword arguments passed to the transform.
         """
         transform = FilterTransform(
@@ -754,6 +778,42 @@
         self.transforms.append(transform)
         return self

+    def take(
+        self,
+        include: str | Sequence[str] = None,
+        *,
+        indices: Sequence[int],
+        axis: int = -1,
+        predicate: Predicate = None,
+        exclude: str | Sequence[str] = None,
+    ):
+        """
+        Append a :py:class:`~transforms.Take` transform to the adapter.
+
+        Parameters
+        ----------
+        include : str or Sequence of str, optional
+            Names of variables to include in the transform.
+        indices : Sequence of int
+            Which indices to take from the data.
+        axis : int, optional
+            Which axis to take from. The last axis is used by default.
+        predicate : Predicate, optional
+            Function that indicates which variables should be transformed.
+        exclude : str or Sequence of str, optional
+            Names of variables to exclude from the transform.
+        """
+        transform = FilterTransform(
+            transform_constructor=Take,
+            predicate=predicate,
+            include=include,
+            exclude=exclude,
+            indices=indices,
+            axis=axis,
+        )
+        self.transforms.append(transform)
+        return self
+
     def to_array(
         self,
         include: str | Sequence[str] = None,
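
As the diff shows, the two methods are wired up differently: take goes through FilterTransform, so it can target several keys at once via include / exclude / predicate, while random_subsample wraps exactly one key in a MapTransform and rejects anything else with a TypeError. A small sketch of that difference (key names are hypothetical):

from bayesflow.adapters import Adapter

adapter = Adapter()

# take may be applied to several keys in one call
adapter.take(["x", "y"], indices=[0, 1, 2], axis=-1)

# random_subsample is restricted to a single key per call ...
adapter.random_subsample("x", sample_size=10, axis=0)

# ... so passing a list of keys is rejected
try:
    adapter.random_subsample(["x", "y"], sample_size=10, axis=0)
except TypeError as err:
    print(err)  # Can only subsample one batch entry at a time.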

bayesflow/adapters/transforms/__init__.py

Lines changed: 2 additions & 0 deletions
@@ -23,6 +23,8 @@
 from .to_array import ToArray
 from .to_dict import ToDict
 from .transform import Transform
+from .random_subsample import RandomSubsample
+from .take import Take

 from ...utils._docs import _add_imports_to_all

bayesflow/adapters/transforms/random_subsample.py

Lines changed: 48 additions & 0 deletions
@@ -0,0 +1,48 @@
+import numpy as np
+from bayesflow.utils.serialization import serializable, serialize
+from .elementwise_transform import ElementwiseTransform
+
+
+@serializable(package="bayesflow.adapters")
+class RandomSubsample(ElementwiseTransform):
+    """
+    A transform that takes a random subsample of the data within an axis.
+
+    Example: adapter.random_subsample("x", sample_size=3, axis=-1)
+
+    """
+
+    def __init__(
+        self,
+        sample_size: int | float,
+        axis: int = -1,
+    ):
+        super().__init__()
+        if isinstance(sample_size, float):
+            if sample_size <= 0 or sample_size >= 1:
+                raise ValueError("Sample size as a percentage must be a float between 0 and 1, exclusive.")
+        self.sample_size = sample_size
+        self.axis = axis
+
+    def forward(self, data: np.ndarray, **kwargs) -> np.ndarray:
+        axis = self.axis
+        max_sample_size = data.shape[axis]
+
+        if isinstance(self.sample_size, int):
+            sample_size = self.sample_size
+        else:
+            sample_size = int(np.round(self.sample_size * max_sample_size))
+
+        # random sample without replacement
+        sample_indices = np.random.permutation(max_sample_size)[:sample_size]
+
+        return np.take(data, sample_indices, axis)
+
+    def inverse(self, data: np.ndarray, **kwargs) -> np.ndarray:
+        # non-invertible transform
+        return data
+
+    def get_config(self) -> dict:
+        config = {"sample_size": self.sample_size, "axis": self.axis}
+
+        return serialize(config)
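
The forward pass above boils down to a standard numpy idiom: permute the index range of the chosen axis, keep the first sample_size entries, and gather them with np.take, which yields a draw without replacement. A standalone sketch (array shape and sizes are made up), covering both the integer and the fractional form of sample_size:

import numpy as np

x = np.random.standard_normal(size=(4, 10))  # 10 candidate entries along the last axis

# integer sample_size: keep exactly that many entries, drawn without replacement
idx = np.random.permutation(x.shape[-1])[:3]
print(np.take(x, idx, axis=-1).shape)  # (4, 3)

# float sample_size: treated as a fraction of the axis length, rounded to a count
frac = 0.5
count = int(np.round(frac * x.shape[-1]))
idx = np.random.permutation(x.shape[-1])[:count]
print(np.take(x, idx, axis=-1).shape)  # (4, 5)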

bayesflow/adapters/transforms/take.py

Lines changed: 31 additions & 0 deletions
@@ -0,0 +1,31 @@
+from collections.abc import Sequence
+import numpy as np
+
+from bayesflow.utils.serialization import serializable, serialize
+
+from .elementwise_transform import ElementwiseTransform
+
+
+@serializable(package="bayesflow.adapters")
+class Take(ElementwiseTransform):
+    """
+    A transform to reduce the dimensionality of arrays output by the summary network.
+    Example: adapter.take("x", indices=np.arange(0, 3), axis=-1)
+    """
+
+    def __init__(self, indices: Sequence[int], axis: int = -1):
+        super().__init__()
+        self.indices = indices
+        self.axis = axis
+
+    def forward(self, data: np.ndarray, **kwargs) -> np.ndarray:
+        return np.take(data, self.indices, self.axis)
+
+    def inverse(self, data: np.ndarray, **kwargs) -> np.ndarray:
+        # not a true inverse; Take is not invertible
+        return data
+
+    def get_config(self) -> dict:
+        config = {"indices": self.indices, "axis": self.axis}
+
+        return serialize(config)

tests/test_adapters/conftest.py

Lines changed: 7 additions & 10 deletions
@@ -11,7 +11,7 @@ def adapter():
     def serializable_fn(x):
         return x

-    d = (
+    return (
         Adapter()
         .to_array()
         .as_set(["s1", "s2"])
@@ -32,12 +32,12 @@ def serializable_fn(x):
         .standardize(exclude=["t1", "t2", "o1"])
         .drop("d1")
         .one_hot("o1", 10)
-        .keep(["x", "y", "z1", "p1", "p2", "s1", "s2", "t1", "t2", "o1", "split_1", "split_2"])
+        .keep(["x", "y", "z1", "p1", "p2", "s1", "s2", "s3", "t1", "t2", "o1", "split_1", "split_2"])
         .rename("o1", "o2")
+        .random_subsample("s3", sample_size=33, axis=0)
+        .take("s3", indices=np.arange(0, 32), axis=0)
     )

-    return d
-

 @pytest.fixture()
 def random_data():
@@ -58,6 +58,7 @@ def random_data():
         "d1": np.random.standard_normal(size=(32, 2)),
         "d2": np.random.standard_normal(size=(32, 2)),
         "o1": np.random.randint(0, 9, size=(32, 2)),
+        "s3": np.random.standard_normal(size=(35, 2)),
         "u1": np.random.uniform(low=-1, high=2, size=(32, 1)),
         "key_to_split": np.random.standard_normal(size=(32, 10)),
     }
@@ -67,7 +68,7 @@
 def adapter_log_det_jac():
     from bayesflow.adapters import Adapter

-    adapter = (
+    return (
         Adapter()
         .scale("x1", by=2)
         .log("p1", p1=True)
@@ -79,14 +80,12 @@
         .rename("u1", "u")
     )

-    return adapter
-

 @pytest.fixture()
 def adapter_log_det_jac_inverse():
     from bayesflow.adapters import Adapter

-    adapter = (
+    return (
         Adapter()
         .standardize("x1", mean=1, std=2)
         .log("p1")
@@ -96,5 +95,3 @@
         .constrain("u1", lower=-1, upper=2)
         .scale(["p1", "p2", "p3"], by=3.5)
     )
-
-    return adapter
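
Read together, the fixture changes give "s3" a shrinking shape: the raw draw has 35 rows on axis 0, random_subsample keeps 33 of them, and take then keeps indices 0-31, so 32 rows reach the tests. A standalone sketch of that chain outside the fixture:

import numpy as np

s3 = np.random.standard_normal(size=(35, 2))

subsampled = np.take(s3, np.random.permutation(35)[:33], axis=0)  # 33 of the 35 rows, without replacement
taken = np.take(subsampled, np.arange(0, 32), axis=0)             # then the first 32 of those rows

print(subsampled.shape, taken.shape)  # (33, 2) (32, 2)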

tests/test_adapters/test_adapters.py

Lines changed: 7 additions & 0 deletions
@@ -16,6 +16,9 @@ def test_cycle_consistency(adapter, random_data):
         if key in ["d1", "d2", "p3", "n1", "u1"]:
             # dropped
             continue
+        if key == "s3":
+            # we subsampled this key, so it is expected for its shape to change
+            continue
         assert key in deprocessed
         assert np.allclose(value, deprocessed[key])

@@ -31,6 +34,10 @@ def test_serialize_deserialize(adapter, random_data):
     random_data["foo"] = random_data["x1"]
     deserialized_processed = deserialized(random_data)
     for key, value in processed.items():
+        if key == "s3":
+            # skip this key because it is *randomly* subsampled
+            continue
+
         assert np.allclose(value, deserialized_processed[key])
