From e6794ae76ed9bd5051ee204fe75be9b77e6b22b5 Mon Sep 17 00:00:00 2001 From: Valentin Pratz Date: Mon, 16 Jun 2025 10:37:39 +0000 Subject: [PATCH 01/10] tests: remove diffusion_model marker --- tests/test_networks/conftest.py | 20 ++++++++------------ 1 file changed, 8 insertions(+), 12 deletions(-) diff --git a/tests/test_networks/conftest.py b/tests/test_networks/conftest.py index 678029d92..bc2999a23 100644 --- a/tests/test_networks/conftest.py +++ b/tests/test_networks/conftest.py @@ -158,12 +158,12 @@ def typical_point_inference_network_subnet(): "flow_matching", "free_form_flow", "consistency_model", - pytest.param("diffusion_model_edm_F", marks=pytest.mark.diffusion_model), - pytest.param("diffusion_model_edm_noise", marks=[pytest.mark.slow, pytest.mark.diffusion_model]), - pytest.param("diffusion_model_cosine_velocity", marks=[pytest.mark.slow, pytest.mark.diffusion_model]), - pytest.param("diffusion_model_cosine_F", marks=[pytest.mark.slow, pytest.mark.diffusion_model]), - pytest.param("diffusion_model_cosine_noise", marks=[pytest.mark.slow, pytest.mark.diffusion_model]), - pytest.param("diffusion_model_cosine_velocity", marks=[pytest.mark.slow, pytest.mark.diffusion_model]), + pytest.param("diffusion_model_edm_F"), + pytest.param("diffusion_model_edm_noise", marks=[pytest.mark.slow]), + pytest.param("diffusion_model_cosine_velocity", marks=[pytest.mark.slow]), + pytest.param("diffusion_model_cosine_F", marks=[pytest.mark.slow]), + pytest.param("diffusion_model_cosine_noise", marks=[pytest.mark.slow]), + pytest.param("diffusion_model_cosine_velocity", marks=[pytest.mark.slow]), ], scope="function", ) @@ -191,21 +191,19 @@ def inference_network_subnet(request): "flow_matching", "free_form_flow", "consistency_model", - pytest.param("diffusion_model_edm_F", marks=pytest.mark.diffusion_model), + pytest.param("diffusion_model_edm_F"), pytest.param( "diffusion_model_edm_noise", marks=[ pytest.mark.slow, - pytest.mark.diffusion_model, pytest.mark.skip("noise predicition not testable without prior training for numerical reasons."), ], ), - pytest.param("diffusion_model_cosine_velocity", marks=[pytest.mark.slow, pytest.mark.diffusion_model]), + pytest.param("diffusion_model_cosine_velocity", marks=[pytest.mark.slow]), pytest.param( "diffusion_model_cosine_F", marks=[ pytest.mark.slow, - pytest.mark.diffusion_model, pytest.mark.skip("skip to reduce load on CI."), ], ), @@ -213,7 +211,6 @@ def inference_network_subnet(request): "diffusion_model_cosine_noise", marks=[ pytest.mark.slow, - pytest.mark.diffusion_model, pytest.mark.skip("noise predicition not testable without prior training for numerical reasons."), ], ), @@ -221,7 +218,6 @@ def inference_network_subnet(request): "diffusion_model_cosine_velocity", marks=[ pytest.mark.slow, - pytest.mark.diffusion_model, pytest.mark.skip("skip to reduce load on CI."), ], ), From 69c1a9d292955269ba81d9dd4860de3936b0347e Mon Sep 17 00:00:00 2001 From: Valentin Pratz Date: Mon, 16 Jun 2025 10:40:06 +0000 Subject: [PATCH 02/10] remove scope="session" from fixtures to allow local overrides some tests need different sizes and dimensions, and the session scope is not required here but blocks overriding in more specific conftest.py files. 
--- tests/conftest.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/tests/conftest.py b/tests/conftest.py index 560b7c59b..95ec8f693 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -41,32 +41,32 @@ def pytest_make_parametrize_id(config, val, argname): return f"{argname}={repr(val)}" -@pytest.fixture(params=[2], scope="session") +@pytest.fixture(params=[2]) def batch_size(request): return request.param -@pytest.fixture(params=[None, 2, 3], scope="session") +@pytest.fixture(params=[None, 2, 3]) def conditions_size(request): return request.param -@pytest.fixture(params=[1, 4], scope="session") +@pytest.fixture(params=[1, 4]) def summary_dim(request): return request.param -@pytest.fixture(params=["two_moons"], scope="session") +@pytest.fixture(params=["two_moons"]) def dataset(request): return request.getfixturevalue(request.param) -@pytest.fixture(params=[2, 3], scope="session") +@pytest.fixture(params=[2, 3]) def feature_size(request): return request.param -@pytest.fixture(scope="session") +@pytest.fixture() def random_conditions(batch_size, conditions_size): if conditions_size is None: return None @@ -74,7 +74,7 @@ def random_conditions(batch_size, conditions_size): return keras.random.normal((batch_size, conditions_size)) -@pytest.fixture(scope="session") +@pytest.fixture() def random_samples(batch_size, feature_size): return keras.random.normal((batch_size, feature_size)) @@ -86,11 +86,11 @@ def random_seed(): return seed -@pytest.fixture(scope="session") +@pytest.fixture() def random_set(batch_size, set_size, feature_size): return keras.random.normal((batch_size, set_size, feature_size)) -@pytest.fixture(params=[2, 3], scope="session") +@pytest.fixture(params=[2, 3]) def set_size(request): return request.param From 73cfa348887a6fe1f7800e7acb5dc71bcc232734 Mon Sep 17 00:00:00 2001 From: Valentin Pratz Date: Mon, 16 Jun 2025 10:41:31 +0000 Subject: [PATCH 03/10] [WIP] add serialization compatiblity tests --- .gitignore | 2 +- noxfile.py | 44 ++++ tests/test_compatibility/conftest.py | 61 +++++ .../test_adapters/__init__.py | 0 .../test_adapters/conftest.py | 90 +++++++ .../test_adapters/test_adapters.py | 40 +++ .../test_approximators/__init__.py | 0 .../test_approximators/conftest.py | 72 ++++++ .../test_continuous_approximator/__init__.py | 0 .../test_continuous_approximator.py | 54 +++++ .../__init__.py | 0 .../conftest.py | 82 +++++++ .../test_model_comparison_approximator.py | 40 +++ .../test_distributions/__init__.py | 0 .../test_distributions/conftest.py | 32 +++ .../test_distributions/test_distributions.py | 32 +++ .../test_compatibility/test_links/__init__.py | 0 .../test_compatibility/test_links/conftest.py | 79 ++++++ .../test_links/test_links.py | 34 +++ .../test_metrics/__init__.py | 0 .../test_metrics/conftest.py | 33 +++ .../test_metrics/test_metrics.py | 31 +++ .../test_networks/__init__.py | 0 .../test_networks/conftest.py | 0 .../test_inference_networks/__init__.py | 0 .../test_inference_networks/conftest.py | 228 ++++++++++++++++++ .../test_inference_networks.py | 56 +++++ .../test_summary_networks/__init__.py | 0 .../test_summary_networks/conftest.py | 50 ++++ .../test_summary_networks.py | 37 +++ .../test_scores/__init__.py | 0 .../test_scores/conftest.py | 51 ++++ .../test_scores/test_scores.py | 45 ++++ .../test_workflows/__init__.py | 0 .../test_workflows/conftest.py | 65 +++++ .../test_workflows/test_workflows.py | 58 +++++ tests/test_compatibility/utils/__init__.py | 2 + tests/test_compatibility/utils/helpers.py | 
17 ++ tests/test_compatibility/utils/io.py | 48 ++++ 39 files changed, 1382 insertions(+), 1 deletion(-) create mode 100644 noxfile.py create mode 100644 tests/test_compatibility/conftest.py create mode 100644 tests/test_compatibility/test_adapters/__init__.py create mode 100644 tests/test_compatibility/test_adapters/conftest.py create mode 100644 tests/test_compatibility/test_adapters/test_adapters.py create mode 100644 tests/test_compatibility/test_approximators/__init__.py create mode 100644 tests/test_compatibility/test_approximators/conftest.py create mode 100644 tests/test_compatibility/test_approximators/test_continuous_approximator/__init__.py create mode 100644 tests/test_compatibility/test_approximators/test_continuous_approximator/test_continuous_approximator.py create mode 100644 tests/test_compatibility/test_approximators/test_model_comparison_approximators/__init__.py create mode 100644 tests/test_compatibility/test_approximators/test_model_comparison_approximators/conftest.py create mode 100644 tests/test_compatibility/test_approximators/test_model_comparison_approximators/test_model_comparison_approximator.py create mode 100644 tests/test_compatibility/test_distributions/__init__.py create mode 100644 tests/test_compatibility/test_distributions/conftest.py create mode 100644 tests/test_compatibility/test_distributions/test_distributions.py create mode 100644 tests/test_compatibility/test_links/__init__.py create mode 100644 tests/test_compatibility/test_links/conftest.py create mode 100644 tests/test_compatibility/test_links/test_links.py create mode 100644 tests/test_compatibility/test_metrics/__init__.py create mode 100644 tests/test_compatibility/test_metrics/conftest.py create mode 100644 tests/test_compatibility/test_metrics/test_metrics.py create mode 100644 tests/test_compatibility/test_networks/__init__.py create mode 100644 tests/test_compatibility/test_networks/conftest.py create mode 100644 tests/test_compatibility/test_networks/test_inference_networks/__init__.py create mode 100644 tests/test_compatibility/test_networks/test_inference_networks/conftest.py create mode 100644 tests/test_compatibility/test_networks/test_inference_networks/test_inference_networks.py create mode 100644 tests/test_compatibility/test_networks/test_summary_networks/__init__.py create mode 100644 tests/test_compatibility/test_networks/test_summary_networks/conftest.py create mode 100644 tests/test_compatibility/test_networks/test_summary_networks/test_summary_networks.py create mode 100644 tests/test_compatibility/test_scores/__init__.py create mode 100644 tests/test_compatibility/test_scores/conftest.py create mode 100644 tests/test_compatibility/test_scores/test_scores.py create mode 100644 tests/test_compatibility/test_workflows/__init__.py create mode 100644 tests/test_compatibility/test_workflows/conftest.py create mode 100644 tests/test_compatibility/test_workflows/test_workflows.py create mode 100644 tests/test_compatibility/utils/__init__.py create mode 100644 tests/test_compatibility/utils/helpers.py create mode 100644 tests/test_compatibility/utils/io.py diff --git a/.gitignore b/.gitignore index 1ca9eaef6..48bb9a4be 100644 --- a/.gitignore +++ b/.gitignore @@ -13,7 +13,7 @@ docsrc/source/contributing.md examples/checkpoints/ build docs/ - +_compatibility_data/ # mypy .mypy_cache diff --git a/noxfile.py b/noxfile.py new file mode 100644 index 000000000..9eb7c645b --- /dev/null +++ b/noxfile.py @@ -0,0 +1,44 @@ +import nox +import argparse +from pathlib import Path +import os + 
+ +@nox.session +def save_and_load(session: nox.Session): + """Save models and outputs to disk and compare outputs between versions. + + This session installs the bayesflow version specified by the `commit` argument, and runs the test suite either in + "save" or in "load" mode. In save mode, results are stored to disk and a within-version load test is performed. + In load mode, the stored models and outputs are loaded from disk, and old and new outputs are compared. + This helps to detect breaking serialization between versions. + + Important: The test code from the current checkout is used, not from the installed version. + """ + # parse the arguments + parser = argparse.ArgumentParser() + # add subparsers for the two different commands + subparsers = parser.add_subparsers(help="subcommand help", dest="mode") + # save command + parser_save = subparsers.add_parser("save") + parser_save.add_argument("commit", type=str, default=".") + # load command, additional "from" argument + parser_load = subparsers.add_parser("load") + parser_load.add_argument("commit", type=str, default=".") + parser.add_argument("--from", type=str, default="", required=False, dest="from_commit") + # keep unknown arguments, they will be forwarded to pytest below + args, unknownargs = parser.parse_known_args(session.posargs) + + # install dependencies, currently the jax backend is used, but we could add a configuration option for this + repo_path = Path(os.curdir).absolute().parent / "bf2" + session.install(f"git+file://{str(repo_path)}@{args.commit}") + session.install("jax") + session.install("pytest") + + # pass mode and commits to pytest, required for correct save and load behavior + cmd = ["pytest", "--mode", args.mode, "--commit", args.commit] + if args.mode == "load": + cmd += ["--from", args.from_commit] + cmd += unknownargs + + session.run(*cmd, env={"KERAS_BACKEND": "jax"}) diff --git a/tests/test_compatibility/conftest.py b/tests/test_compatibility/conftest.py new file mode 100644 index 000000000..6d9e1fc29 --- /dev/null +++ b/tests/test_compatibility/conftest.py @@ -0,0 +1,61 @@ +import pytest +from pathlib import Path + + +@pytest.fixture(autouse=True, scope="session") +def mode(request): + mode = request.config.getoption("--mode") + if not mode: + return "save" + return mode + + +@pytest.fixture(scope="session") +def commit(request): + return request.config.getoption("--commit") + + +@pytest.fixture(scope="session") +def from_commit(request): + return request.config.getoption("--from") + + +@pytest.fixture(autouse=True, scope="session") +def data_dir(request, commit, from_commit, tmp_path_factory): + # read config option to detect "unset" scenario + mode = request.config.getoption("--mode") + if mode == "save": + path = Path(".").absolute() / "_compatibility_data" / commit + return path + elif mode == "load": + path = Path(".").absolute() / "_compatibility_data" / from_commit + if not path.exists(): + pytest.exit(reason=f"Load path '{path}' does not exist. 
Please specify a valid load path", returncode=1) + return path + # if mode is unset, save and load from a temporary directory + return Path(tmp_path_factory.mktemp("_compatibility_data")) + + +@pytest.fixture(params=["sir", "fusion"]) +def simulator(request): + if request.param == "sir": + from bayesflow.simulators import SIR + + return SIR() + elif request.param == "fusion": + from bayesflow.simulators import Simulator + from bayesflow.types import Shape, Tensor + from bayesflow.utils.decorators import allow_batch_size + import numpy as np + + class FusionSimulator(Simulator): + @allow_batch_size + def sample(self, batch_shape: Shape, num_observations: int = 4) -> dict[str, Tensor]: + mean = np.random.normal(0.0, 0.1, size=batch_shape + (2,)) + noise = np.random.standard_normal(batch_shape + (num_observations, 2)) + + x = mean[:, None] + noise + + return dict(mean=mean, a=x, b=x) + + return FusionSimulator() diff --git a/tests/test_compatibility/test_adapters/__init__.py b/tests/test_compatibility/test_adapters/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/tests/test_compatibility/test_adapters/conftest.py b/tests/test_compatibility/test_adapters/conftest.py new file mode 100644 index 000000000..73dfa1eb3 --- /dev/null +++ b/tests/test_compatibility/test_adapters/conftest.py @@ -0,0 +1,90 @@ +import pytest +import numpy as np + + +@pytest.fixture() +def default_adapter(): + from bayesflow import Adapter + + return Adapter.create_default(["x1", "x2"]) + + +@pytest.fixture() +def complete_adapter(): + from bayesflow.adapters import Adapter + import keras + + @keras.saving.register_keras_serializable("custom") + def serializable_fn(x): + return x + + return ( + Adapter() + .to_array() + .as_set(["s1", "s2"]) + .broadcast("t1", to="t2") + .as_time_series(["t1", "t2"]) + .convert_dtype("float64", "float32", exclude="o1") + .concatenate(["x1", "x2"], into="x") + .concatenate(["y1", "y2"], into="y") + .expand_dims(["z1"], axis=2) + .squeeze("z1", axis=2) + .log("p1") + .constrain("p2", lower=0) + .apply(include="p2", forward="exp", inverse="log") + .apply(include="p2", forward="log1p") + .apply_serializable(include="x", forward=serializable_fn, inverse=serializable_fn) + .scale("x", by=[-1, 2]) + .shift("x", by=2) + .split("key_to_split", into=["split_1", "split_2"]) + .standardize(exclude=["t1", "t2", "o1"]) + .drop("d1") + .one_hot("o1", 10) + .keep(["x", "y", "z1", "p1", "p2", "s1", "s2", "s3", "t1", "t2", "o1", "split_1", "split_2"]) + .rename("o1", "o2") + .random_subsample("s3", sample_size=33, axis=0) + .take("s3", indices=np.arange(0, 32), axis=0) + .group(["p1", "p2"], into="ps", prefix="p") + .ungroup("ps", prefix="p") + ) + + +@pytest.fixture(params=["default_adapter", "complete_adapter"]) +def adapter(request): + return request.getfixturevalue(request.param) + + +def get_data(rng): + return { + "x1": rng.standard_normal(size=(32, 1)), + "x2": rng.standard_normal(size=(32, 1)), + "y1": rng.standard_normal(size=(32, 2)), + "y2": rng.standard_normal(size=(32, 2)), + "z1": rng.standard_normal(size=(32, 2)), + "p1": rng.lognormal(size=(32, 2)), + "p2": rng.lognormal(size=(32, 2)), + "p3": rng.lognormal(size=(32, 2)), + "n1": 1 - rng.lognormal(size=(32, 2)), + "s1": rng.standard_normal(size=(32, 3, 2)), + "s2": rng.standard_normal(size=(32, 3, 2)), + "t1": np.zeros((3, 2)), + "t2": np.ones((32, 3, 2)), + "d1": rng.standard_normal(size=(32, 2)), + "d2": rng.standard_normal(size=(32, 2)), + "o1": rng.integers(0, 9, size=(32, 2)), + "s3": 
rng.standard_normal(size=(35, 2)), + "u1": rng.uniform(low=-1, high=2, size=(32, 1)), + "key_to_split": rng.standard_normal(size=(32, 10)), + } + + +@pytest.fixture +def data_1(): + rng = np.random.default_rng(seed=1) + return get_data(rng) + + +@pytest.fixture +def data_2(): + rng = np.random.default_rng(seed=2) + return get_data(rng) diff --git a/tests/test_compatibility/test_adapters/test_adapters.py b/tests/test_compatibility/test_adapters/test_adapters.py new file mode 100644 index 000000000..70cf91a31 --- /dev/null +++ b/tests/test_compatibility/test_adapters/test_adapters.py @@ -0,0 +1,40 @@ +import pytest +from utils import SaveLoadTest, load_from_config, save_config, load_path, dump_path +import numpy as np + + +class TestAdapter(SaveLoadTest): + filenames = { + "model": "model.pickle", + "output": "output.pickle", + } + + @pytest.fixture + def setup(self, filepaths, mode, adapter, data_1, data_2): + if mode == "save": + _ = adapter(data_1) + save_config(adapter, filepaths["model"]) + + output = self.evaluate(adapter, data_2) + dump_path(output, filepaths["output"]) + + adapter = load_from_config(filepaths["model"]) + output = load_path(filepaths["output"]) + + return adapter, output + + def evaluate(self, adapter, data): + adapted = adapter(data) + cycled = adapter(adapted, inverse=True) + return {"adapted": adapted, "cycled": cycled} + + def test_output(self, setup, data_2): + adapter, reference = setup + output = self.evaluate(adapter, data_2) + for k, v in reference.items(): + for name, variable in v.items(): + if name == "s3": + continue + np.testing.assert_allclose( + variable, output[k][name], err_msg=f"Values for key '{k}/{name} do not match." + ) diff --git a/tests/test_compatibility/test_approximators/__init__.py b/tests/test_compatibility/test_approximators/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/tests/test_compatibility/test_approximators/conftest.py b/tests/test_compatibility/test_approximators/conftest.py new file mode 100644 index 000000000..16406c9dc --- /dev/null +++ b/tests/test_compatibility/test_approximators/conftest.py @@ -0,0 +1,72 @@ +import pytest + + +@pytest.fixture() +def batch_size(): + return 8 + + +@pytest.fixture() +def num_samples(): + return 100 + + +@pytest.fixture() +def adapter(): + import bayesflow as bf + + return bf.Adapter.create_default("parameters").rename("observables", "summary_variables") + + +@pytest.fixture(params=["coupling_flow", "flow_matching"]) +def inference_network(request): + if request.param == "coupling_flow": + from bayesflow.networks import CouplingFlow + + return CouplingFlow(depth=2) + + elif request.param == "flow_matching": + from bayesflow.networks import FlowMatching + + return FlowMatching(subnet_kwargs=dict(widths=(32, 32)), use_optimal_transport=False) + + +@pytest.fixture(params=["time_series_transformer", "fusion_transformer", "time_series_network", "custom"]) +def summary_network(request): + if request.param == "time_series_transformer": + from bayesflow.networks import TimeSeriesTransformer + + return TimeSeriesTransformer(embed_dims=(8, 8), mlp_widths=(16, 8), mlp_depths=(1, 1)) + + elif request.param == "fusion_transformer": + from bayesflow.networks import FusionTransformer + + return FusionTransformer( + embed_dims=(8, 8), mlp_widths=(8, 16), mlp_depths=(2, 1), template_dim=8, bidirectional=False + ) + + elif request.param == "time_series_network": + from bayesflow.networks import TimeSeriesNetwork + + return TimeSeriesNetwork(filters=4, skip_steps=2) + + elif 
request.param == "custom": + from bayesflow.networks import SummaryNetwork + from bayesflow.utils.serialization import serializable + import keras + + @serializable("test", disable_module_check=True) + class Custom(SummaryNetwork): + def __init__(self, **kwargs): + super().__init__(**kwargs) + self.inner = keras.Sequential([keras.layers.LSTM(8), keras.layers.Dense(4)]) + + def call(self, x, **kwargs): + return self.inner(x, training=kwargs.get("stage") == "training") + + return Custom() + + elif request.param == "fusion_network": + from bayesflow.networks import FusionNetwork, DeepSet + + return FusionNetwork({"a": DeepSet(), "b": keras.layers.Flatten()}, head=keras.layers.Dense(2)) diff --git a/tests/test_compatibility/test_approximators/test_continuous_approximator/__init__.py b/tests/test_compatibility/test_approximators/test_continuous_approximator/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/tests/test_compatibility/test_approximators/test_continuous_approximator/test_continuous_approximator.py b/tests/test_compatibility/test_approximators/test_continuous_approximator/test_continuous_approximator.py new file mode 100644 index 000000000..c5d2d36eb --- /dev/null +++ b/tests/test_compatibility/test_approximators/test_continuous_approximator/test_continuous_approximator.py @@ -0,0 +1,54 @@ +import pytest +from utils import SaveLoadTest, dump_path, load_path +import numpy as np +import keras + + +@pytest.mark.parametrize("inference_network", ["coupling_flow", "flow_matching"], indirect=True) +@pytest.mark.parametrize( + "summary_network,simulator,adapter", + [ + ["time_series_transformer", "sir", None], + ["fusion_transformer", "sir", None], + ], + indirect=True, +) +class TestContinuousApproximator(SaveLoadTest): + filenames = { + "approximator": "approximator.keras", + "input": "input.pickle", + "output": "output.pickle", + } + + @pytest.fixture() + def setup(self, filepaths, mode, inference_network, summary_network, simulator, adapter): + if mode == "save": + import bayesflow as bf + + approximator = bf.approximators.ContinuousApproximator( + adapter=adapter, + inference_network=inference_network, + summary_network=summary_network, + ) + approximator.compile("adamw") + approximator.fit(simulator=simulator, epochs=1, batch_size=8, num_batches=2, verbose=0) + keras.saving.save_model(approximator, filepaths["approximator"]) + + input = simulator.sample(4) + output = self.evaluate(approximator, input) + dump_path(input, filepaths["input"]) + dump_path(output, filepaths["output"]) + + approximator = keras.saving.load_model(filepaths["approximator"]) + input = load_path(filepaths["input"]) + output = load_path(filepaths["output"]) + + return approximator, input, output + + def evaluate(self, approximator, data): + return approximator.log_prob(data) + + def test_output(self, setup): + approximator, input, reference = setup + output = self.evaluate(approximator, input) + np.testing.assert_allclose(reference, output) diff --git a/tests/test_compatibility/test_approximators/test_model_comparison_approximators/__init__.py b/tests/test_compatibility/test_approximators/test_model_comparison_approximators/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/tests/test_compatibility/test_approximators/test_model_comparison_approximators/conftest.py b/tests/test_compatibility/test_approximators/test_model_comparison_approximators/conftest.py new file mode 100644 index 000000000..37a841fd4 --- /dev/null +++ 
b/tests/test_compatibility/test_approximators/test_model_comparison_approximators/conftest.py @@ -0,0 +1,82 @@ +import pytest +import numpy as np + + +@pytest.fixture +def simulator(): + from bayesflow import make_simulator + from bayesflow.simulators import ModelComparisonSimulator + + def context(batch_shape, n=None): + if n is None: + n = np.random.randint(2, 5) + return dict(n=n) + + def prior_null(): + return dict(mu=0.0) + + def prior_alternative(): + mu = np.random.normal(loc=0, scale=1) + return dict(mu=mu) + + def likelihood(n, mu): + x = np.random.normal(loc=mu, scale=1, size=n) + return dict(x=x) + + simulator_null = make_simulator([prior_null, likelihood]) + simulator_alternative = make_simulator([prior_alternative, likelihood]) + return ModelComparisonSimulator( + simulators=[simulator_null, simulator_alternative], + use_mixed_batches=True, + shared_simulator=context, + ) + + +@pytest.fixture +def adapter(): + from bayesflow import Adapter + + return ( + Adapter() + .sqrt("n") + .broadcast("n", to="x") + .as_set("x") + .rename("n", "classifier_conditions") + .rename("x", "summary_variables") + .drop("mu") + .convert_dtype("float64", "float32") + ) + + +@pytest.fixture +def summary_network(): + from bayesflow.networks import DeepSet + + return DeepSet(summary_dim=2, depth=1) + + +@pytest.fixture +def classifier_network(): + from bayesflow.networks import MLP + + return MLP(widths=[32, 32]) + + +@pytest.fixture +def approximator(adapter, classifier_network, summary_network, simulator, standardize): + from bayesflow.approximators import ModelComparisonApproximator + + return ModelComparisonApproximator( + num_models=len(simulator.simulators), + classifier_network=classifier_network, + adapter=adapter, + summary_network=summary_network, + # standardize=standardize, + ) + + +@pytest.fixture( + params=["all", None, "classifier_conditions", "summary_variables", ("classifier_conditions", "summary_variables")] +) +def standardize(request): + return request.param diff --git a/tests/test_compatibility/test_approximators/test_model_comparison_approximators/test_model_comparison_approximator.py b/tests/test_compatibility/test_approximators/test_model_comparison_approximators/test_model_comparison_approximator.py new file mode 100644 index 000000000..98fb4344c --- /dev/null +++ b/tests/test_compatibility/test_approximators/test_model_comparison_approximators/test_model_comparison_approximator.py @@ -0,0 +1,40 @@ +import pytest +from utils import SaveLoadTest, dump_path, load_path +import numpy as np +import keras + + +class TestModelComparisonApproximator(SaveLoadTest): + filenames = { + "approximator": "approximator.keras", + "input": "input.pickle", + "output": "output.pickle", + } + + @pytest.fixture() + def setup(self, filepaths, mode, simulator, approximator): + if mode == "save": + approximator.compile("adamw") + approximator.fit( + adapter=approximator.adapter, simulator=simulator, epochs=1, batch_size=8, num_batches=2, verbose=0 + ) + keras.saving.save_model(approximator, filepaths["approximator"]) + + input = simulator.sample(4) + output = self.evaluate(approximator, input) + dump_path(input, filepaths["input"]) + dump_path(output, filepaths["output"]) + + approximator = keras.saving.load_model(filepaths["approximator"]) + input = load_path(filepaths["input"]) + output = load_path(filepaths["output"]) + + return approximator, input, output + + def evaluate(self, approximator, data): + return approximator.predict(conditions=data) + + def test_output(self, setup): + approximator, 
input, reference = setup + output = self.evaluate(approximator, input) + np.testing.assert_allclose(reference, output) diff --git a/tests/test_compatibility/test_distributions/__init__.py b/tests/test_compatibility/test_distributions/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/tests/test_compatibility/test_distributions/conftest.py b/tests/test_compatibility/test_distributions/conftest.py new file mode 100644 index 000000000..92167e872 --- /dev/null +++ b/tests/test_compatibility/test_distributions/conftest.py @@ -0,0 +1,32 @@ +import pytest + + +@pytest.fixture() +def diagonal_normal(): + from bayesflow.distributions import DiagonalNormal + + return DiagonalNormal(mean=1.0, std=2.0) + + +@pytest.fixture() +def diagonal_student_t(): + from bayesflow.distributions import DiagonalStudentT + + return DiagonalStudentT(df=10, loc=1.0, scale=2.0) + + +@pytest.fixture() +def mixture(): + from bayesflow.distributions import DiagonalNormal, DiagonalStudentT, Mixture + + return Mixture([DiagonalNormal(mean=1.0, std=2.0), DiagonalStudentT(df=25, mean=1.0, std=2.0)]) + + +@pytest.fixture(params=["diagonal_normal", "diagonal_student_t", "mixture"]) +def distribution(request): + return request.getfixturevalue(request.param) + + +@pytest.fixture() +def shape(batch_size, feature_size): + return batch_size, feature_size diff --git a/tests/test_compatibility/test_distributions/test_distributions.py b/tests/test_compatibility/test_distributions/test_distributions.py new file mode 100644 index 000000000..a4b2bd0d7 --- /dev/null +++ b/tests/test_compatibility/test_distributions/test_distributions.py @@ -0,0 +1,32 @@ +from utils import SaveLoadTest, load_from_config, save_config +import numpy as np +import keras +import pytest + + +class TestDistribution(SaveLoadTest): + filenames = { + "model": "model.pickle", + "output": "output.npy", + } + + @pytest.fixture + def setup(self, filepaths, mode, distribution, random_samples): + if mode == "save": + distribution.build(keras.ops.shape(random_samples)) + save_config(distribution, filepaths["model"]) + + output = self.evaluate(distribution, random_samples) + np.save(filepaths["output"], output, allow_pickle=False) + + distribution = load_from_config(filepaths["model"]) + output = np.load(filepaths["output"]) + + return distribution, output + + def evaluate(self, distribution, random_samples): + return keras.ops.convert_to_numpy(distribution.log_prob(random_samples)) + + def test_output(self, setup, random_samples): + distribution, output = setup + np.testing.assert_allclose(self.evaluate(distribution, random_samples), output) diff --git a/tests/test_compatibility/test_links/__init__.py b/tests/test_compatibility/test_links/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/tests/test_compatibility/test_links/conftest.py b/tests/test_compatibility/test_links/conftest.py new file mode 100644 index 000000000..2b4017f5b --- /dev/null +++ b/tests/test_compatibility/test_links/conftest.py @@ -0,0 +1,79 @@ +import numpy as np +import keras +import pytest + + +@pytest.fixture() +def batch_size(): + return 16 + + +@pytest.fixture() +def feature_size(): + return 10 + + +@pytest.fixture() +def generic_preactivation(batch_size): + return keras.ops.ones((batch_size, 6)) + + +@pytest.fixture() +def ordered(): + from bayesflow.links import Ordered + + return Ordered(axis=1, anchor_index=2) + + +@pytest.fixture() +def ordered_quantiles(): + from bayesflow.links import OrderedQuantiles + + return OrderedQuantiles() + + 
+@pytest.fixture() +def cholesky_factor(): + from bayesflow.links import CholeskyFactor + + return CholeskyFactor() + + +@pytest.fixture() +def linear(): + return keras.layers.Activation("linear") + + +@pytest.fixture(params=["ordered", "ordered_quantiles", "cholesky_factor", "linear"], scope="function") +def link(request): + return request.getfixturevalue(request.param) + + +@pytest.fixture() +def num_quantiles(): + return 19 + + +@pytest.fixture() +def quantiles_np(num_quantiles): + return np.linspace(0, 1, num_quantiles + 2)[1:-1] + + +@pytest.fixture() +def quantiles_py(quantiles_np): + return list(quantiles_np) + + +@pytest.fixture() +def quantiles_keras(quantiles_np): + return keras.ops.convert_to_tensor(quantiles_np) + + +@pytest.fixture() +def none(): + return None + + +@pytest.fixture(params=["quantiles_np", "quantiles_py", "quantiles_keras", "none"], scope="function") +def quantiles(request): + return request.getfixturevalue(request.param) diff --git a/tests/test_compatibility/test_links/test_links.py b/tests/test_compatibility/test_links/test_links.py new file mode 100644 index 000000000..f4434016c --- /dev/null +++ b/tests/test_compatibility/test_links/test_links.py @@ -0,0 +1,34 @@ +import pytest +from utils import save_config, load_from_config, dump_path, load_path +from utils import SaveLoadTest +import numpy as np + + +class TestLink(SaveLoadTest): + filenames = { + "model": "model.pickle", + "output": "output.pickle", + } + + @pytest.fixture + def setup(self, filepaths, mode, link, random_samples): + if mode == "save": + _ = link(random_samples) + save_config(link, filepaths["model"]) + + output = self.evaluate(link, random_samples) + dump_path(output, filepaths["output"]) + + link = load_from_config(filepaths["model"]) + output = load_path(filepaths["output"]) + + return link, output + + def evaluate(self, link, data): + return link(data) + + def test_output(self, setup, random_samples): + link, reference = setup + print(reference) + output = self.evaluate(link, random_samples) + np.testing.assert_allclose(reference, output) diff --git a/tests/test_compatibility/test_metrics/__init__.py b/tests/test_compatibility/test_metrics/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/tests/test_compatibility/test_metrics/conftest.py b/tests/test_compatibility/test_metrics/conftest.py new file mode 100644 index 000000000..cbd7d487f --- /dev/null +++ b/tests/test_compatibility/test_metrics/conftest.py @@ -0,0 +1,33 @@ +import pytest +import numpy as np + + +@pytest.fixture() +def root_mean_squared_error(): + from bayesflow.metrics import RootMeanSquaredError + + return RootMeanSquaredError(normalize=True, name="rmse", dtype="float32") + + +@pytest.fixture() +def maximum_mean_discrepancy(): + from bayesflow.metrics import MaximumMeanDiscrepancy + + return MaximumMeanDiscrepancy(name="mmd", kernel="gaussian", unbiased=True, dtype="float32") + + +@pytest.fixture(params=["root_mean_squared_error", "maximum_mean_discrepancy"]) +def metric(request): + return request.getfixturevalue(request.param) + + +@pytest.fixture +def samples_1(): + rng = np.random.default_rng(seed=1) + return rng.normal(size=(2, 3)) + + +@pytest.fixture +def samples_2(): + rng = np.random.default_rng(seed=2) + return rng.normal(size=(2, 3)) diff --git a/tests/test_compatibility/test_metrics/test_metrics.py b/tests/test_compatibility/test_metrics/test_metrics.py new file mode 100644 index 000000000..6e29db69f --- /dev/null +++ b/tests/test_compatibility/test_metrics/test_metrics.py @@ -0,0 +1,31 
@@ +import pytest +from utils import SaveLoadTest, load_from_config, save_config +import numpy as np +import keras + + +class TestMetric(SaveLoadTest): + filenames = { + "model": "model.pickle", + "output": "output.npy", + } + + @pytest.fixture + def setup(self, filepaths, mode, metric, samples_1, samples_2): + if mode == "save": + save_config(metric, filepaths["model"]) + + output = self.evaluate(metric, samples_1, samples_2) + np.save(filepaths["output"], output, allow_pickle=False) + + metric = load_from_config(filepaths["model"]) + output = np.load(filepaths["output"]) + + return metric, output + + def evaluate(self, metric, samples_1, samples_2): + return keras.ops.convert_to_numpy(metric(samples_1, samples_2)) + + def test_output(self, setup, samples_1, samples_2): + metric, output = setup + np.testing.assert_allclose(self.evaluate(metric, samples_1, samples_2), output) diff --git a/tests/test_compatibility/test_networks/__init__.py b/tests/test_compatibility/test_networks/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/tests/test_compatibility/test_networks/conftest.py b/tests/test_compatibility/test_networks/conftest.py new file mode 100644 index 000000000..e69de29bb diff --git a/tests/test_compatibility/test_networks/test_inference_networks/__init__.py b/tests/test_compatibility/test_networks/test_inference_networks/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/tests/test_compatibility/test_networks/test_inference_networks/conftest.py b/tests/test_compatibility/test_networks/test_inference_networks/conftest.py new file mode 100644 index 000000000..5add35cba --- /dev/null +++ b/tests/test_compatibility/test_networks/test_inference_networks/conftest.py @@ -0,0 +1,228 @@ +import pytest + +from bayesflow.networks import MLP + + +@pytest.fixture() +def diffusion_model_edm_F(): + from bayesflow.experimental import DiffusionModel + + return DiffusionModel( + subnet=MLP([8, 8]), + integrate_kwargs={"method": "rk45", "steps": 10}, + noise_schedule="edm", + prediction_type="F", + ) + + +@pytest.fixture() +def diffusion_model_edm_velocity(): + from bayesflow.experimental import DiffusionModel + + return DiffusionModel( + subnet=MLP([8, 8]), + integrate_kwargs={"method": "rk45", "steps": 10}, + noise_schedule="edm", + prediction_type="velocity", + ) + + +@pytest.fixture() +def diffusion_model_edm_noise(): + from bayesflow.experimental import DiffusionModel + + return DiffusionModel( + subnet=MLP([8, 8]), + integrate_kwargs={"method": "rk45", "steps": 10}, + noise_schedule="edm", + prediction_type="noise", + ) + + +@pytest.fixture() +def diffusion_model_cosine_F(): + from bayesflow.experimental import DiffusionModel + + return DiffusionModel( + subnet=MLP([8, 8]), + integrate_kwargs={"method": "rk45", "steps": 10}, + noise_schedule="cosine", + prediction_type="F", + ) + + +@pytest.fixture() +def diffusion_model_cosine_velocity(): + from bayesflow.experimental import DiffusionModel + + return DiffusionModel( + subnet=MLP([8, 8]), + integrate_kwargs={"method": "rk45", "steps": 10}, + noise_schedule="cosine", + prediction_type="velocity", + ) + + +@pytest.fixture() +def diffusion_model_cosine_noise(): + from bayesflow.experimental import DiffusionModel + + return DiffusionModel( + subnet=MLP([8, 8]), + integrate_kwargs={"method": "rk45", "steps": 10}, + noise_schedule="cosine", + prediction_type="noise", + ) + + +@pytest.fixture() +def flow_matching(): + from bayesflow.networks import FlowMatching + + return FlowMatching( + subnet=MLP([8, 8]), + 
integrate_kwargs={"method": "rk45", "steps": 10}, + ) + + +@pytest.fixture() +def consistency_model(): + from bayesflow.networks import ConsistencyModel + + return ConsistencyModel(total_steps=100, subnet=MLP([8, 8])) + + +@pytest.fixture() +def affine_coupling_flow(): + from bayesflow.networks import CouplingFlow + + return CouplingFlow( + depth=2, subnet="mlp", subnet_kwargs=dict(widths=[8, 8]), transform="affine", transform_kwargs=dict(clamp=1.8) + ) + + +@pytest.fixture() +def spline_coupling_flow(): + from bayesflow.networks import CouplingFlow + + return CouplingFlow( + depth=2, subnet="mlp", subnet_kwargs=dict(widths=[8, 8]), transform="spline", transform_kwargs=dict(bins=8) + ) + + +@pytest.fixture() +def free_form_flow(): + from bayesflow.experimental import FreeFormFlow + + return FreeFormFlow(encoder_subnet=MLP([16, 16]), decoder_subnet=MLP([16, 16])) + + +@pytest.fixture() +def typical_point_inference_network(): + from bayesflow.networks import PointInferenceNetwork + from bayesflow.scores import MeanScore, MedianScore, QuantileScore, MultivariateNormalScore + + return PointInferenceNetwork( + scores=dict( + mean=MeanScore(), + median=MedianScore(), + quantiles=QuantileScore([0.1, 0.2, 0.5, 0.65]), + mvn=MultivariateNormalScore(), # currently not stable + ) + ) + + +@pytest.fixture() +def typical_point_inference_network_subnet(): + from bayesflow.networks import PointInferenceNetwork + from bayesflow.scores import MeanScore, MedianScore, QuantileScore, MultivariateNormalScore + + subnet = MLP([16, 8]) + + return PointInferenceNetwork( + scores=dict( + mean=MeanScore(subnets=dict(value=subnet)), + median=MedianScore(subnets=dict(value=subnet)), + quantiles=QuantileScore(subnets=dict(value=subnet)), + mvn=MultivariateNormalScore(subnets=dict(mean=subnet, covariance=subnet)), + ), + subnet=subnet, + ) + + +@pytest.fixture( + params=[ + "typical_point_inference_network", + "affine_coupling_flow", + "spline_coupling_flow", + "flow_matching", + "free_form_flow", + "consistency_model", + pytest.param("diffusion_model_edm_F"), + pytest.param("diffusion_model_edm_noise", marks=[pytest.mark.slow]), + pytest.param("diffusion_model_cosine_velocity", marks=[pytest.mark.slow]), + pytest.param("diffusion_model_cosine_F", marks=[pytest.mark.slow]), + pytest.param("diffusion_model_cosine_noise", marks=[pytest.mark.slow]), + pytest.param("diffusion_model_cosine_velocity", marks=[pytest.mark.slow]), + ], + scope="function", +) +def inference_network(request): + return request.getfixturevalue(request.param) + + +@pytest.fixture( + params=[ + "typical_point_inference_network_subnet", + "coupling_flow_subnet", + "flow_matching_subnet", + "free_form_flow_subnet", + ], + scope="function", +) +def inference_network_subnet(request): + return request.getfixturevalue(request.param) + + +@pytest.fixture( + params=[ + "affine_coupling_flow", + "spline_coupling_flow", + "flow_matching", + "free_form_flow", + "consistency_model", + pytest.param("diffusion_model_edm_F"), + pytest.param( + "diffusion_model_edm_noise", + marks=[ + pytest.mark.slow, + pytest.mark.skip("noise predicition not testable without prior training for numerical reasons."), + ], + ), + pytest.param("diffusion_model_cosine_velocity", marks=[pytest.mark.slow]), + pytest.param( + "diffusion_model_cosine_F", + marks=[ + pytest.mark.slow, + pytest.mark.skip("skip to reduce load on CI."), + ], + ), + pytest.param( + "diffusion_model_cosine_noise", + marks=[ + pytest.mark.slow, + pytest.mark.skip("noise predicition not testable without prior 
training for numerical reasons."), + ], + ), + pytest.param( + "diffusion_model_cosine_velocity", + marks=[ + pytest.mark.slow, + pytest.mark.skip("skip to reduce load on CI."), + ], + ), + ], + scope="function", +) +def generative_inference_network(request): + return request.getfixturevalue(request.param) diff --git a/tests/test_compatibility/test_networks/test_inference_networks/test_inference_networks.py b/tests/test_compatibility/test_networks/test_inference_networks/test_inference_networks.py new file mode 100644 index 000000000..2403a3bee --- /dev/null +++ b/tests/test_compatibility/test_networks/test_inference_networks/test_inference_networks.py @@ -0,0 +1,56 @@ +import pytest +from utils import SaveLoadTest, dump_path, load_path +import numpy as np +import keras + + +class TestInferenceNetwork(SaveLoadTest): + filenames = { + "model": "model.keras", + "output": "output.pickle", + } + + @pytest.fixture() + def setup(self, filepaths, mode, inference_network, random_samples, random_conditions): + if mode == "save": + xz_shape = keras.ops.shape(random_samples) + conditions_shape = keras.ops.shape(random_conditions) if random_conditions is not None else None + inference_network.build(xz_shape, conditions_shape) + + _ = inference_network.compute_metrics(random_samples, conditions=random_conditions) + keras.saving.save_model(inference_network, filepaths["model"]) + output = self.evaluate(inference_network, random_samples, random_conditions) + + dump_path(output, filepaths["output"]) + + inference_network = keras.saving.load_model(filepaths["model"]) + output = load_path(filepaths["output"]) + + return inference_network, random_samples, random_conditions, output + + def evaluate(self, inference_network, samples, conditions): + import bayesflow as bf + + if isinstance(inference_network, bf.networks.ConsistencyModel): + # not invertible, but inverse with steps=1 is deterministic + return keras.tree.map_structure( + keras.ops.convert_to_numpy, inference_network._inverse(samples, conditions, steps=1) + ) + if isinstance(inference_network, bf.networks.PointInferenceNetwork) and conditions is None: + pytest.skip("PointInferenceNetwork requires condition") + try: + return keras.tree.map_structure( + keras.ops.convert_to_numpy, inference_network.log_prob(samples, conditions=conditions) + ) + except NotImplementedError: + pytest.skip("log_prob not available") + + def test_output(self, setup): + approximator, samples, conditions, reference = setup + output = self.evaluate(approximator, samples, conditions) + print(reference) + from keras.tree import flatten + + for ref, out in zip(flatten(reference), flatten(output)): + print(ref, out) + np.testing.assert_allclose(ref, out) diff --git a/tests/test_compatibility/test_networks/test_summary_networks/__init__.py b/tests/test_compatibility/test_networks/test_summary_networks/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/tests/test_compatibility/test_networks/test_summary_networks/conftest.py b/tests/test_compatibility/test_networks/test_summary_networks/conftest.py new file mode 100644 index 000000000..48b73ed10 --- /dev/null +++ b/tests/test_compatibility/test_networks/test_summary_networks/conftest.py @@ -0,0 +1,50 @@ +import pytest + + +@pytest.fixture(scope="function") +def time_series_network(summary_dim): + from bayesflow.networks import TimeSeriesNetwork + + return TimeSeriesNetwork(summary_dim=summary_dim) + + +@pytest.fixture(scope="function") +def time_series_transformer(summary_dim): + from bayesflow.networks import 
TimeSeriesTransformer + + return TimeSeriesTransformer(summary_dim=summary_dim) + + +@pytest.fixture(scope="function") +def fusion_transformer(summary_dim): + from bayesflow.networks import FusionTransformer + + return FusionTransformer(summary_dim=summary_dim) + + +@pytest.fixture(scope="function") +def set_transformer(summary_dim): + from bayesflow.networks import SetTransformer + + return SetTransformer(summary_dim=summary_dim) + + +@pytest.fixture(scope="function") +def deep_set(summary_dim): + from bayesflow.networks import DeepSet + + return DeepSet(summary_dim=summary_dim) + + +@pytest.fixture( + params=[ + "time_series_network", + "time_series_transformer", + "fusion_transformer", + "set_transformer", + "deep_set", + ], + scope="function", +) +def summary_network(request, summary_dim): + return request.getfixturevalue(request.param) diff --git a/tests/test_compatibility/test_networks/test_summary_networks/test_summary_networks.py b/tests/test_compatibility/test_networks/test_summary_networks/test_summary_networks.py new file mode 100644 index 000000000..cfd9dc82f --- /dev/null +++ b/tests/test_compatibility/test_networks/test_summary_networks/test_summary_networks.py @@ -0,0 +1,37 @@ +import pytest +from utils import SaveLoadTest, dump_path, load_path +import numpy as np +import keras + + +class TestInferenceNetwork(SaveLoadTest): + filenames = { + "model": "model.keras", + "output": "output.pickle", + } + + @pytest.fixture() + def setup(self, filepaths, mode, summary_network, random_set): + if mode == "save": + shape = keras.ops.shape(random_set) + summary_network.build(shape) + + _ = summary_network(random_set) + keras.saving.save_model(summary_network, filepaths["model"]) + output = self.evaluate(summary_network, random_set) + + dump_path(output, filepaths["output"]) + + summary_network = keras.saving.load_model(filepaths["model"]) + output = load_path(filepaths["output"]) + + return summary_network, random_set, output + + def evaluate(self, summary_network, data): + return keras.ops.convert_to_numpy(summary_network(data)) + + def test_output(self, setup): + approximator, data, reference = setup + output = self.evaluate(approximator, data) + + np.testing.assert_allclose(reference, output) diff --git a/tests/test_compatibility/test_scores/__init__.py b/tests/test_compatibility/test_scores/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/tests/test_compatibility/test_scores/conftest.py b/tests/test_compatibility/test_scores/conftest.py new file mode 100644 index 000000000..6bde9076d --- /dev/null +++ b/tests/test_compatibility/test_scores/conftest.py @@ -0,0 +1,51 @@ +import keras +import pytest + + +@pytest.fixture() +def reference(batch_size, feature_size): + return keras.random.uniform((batch_size, feature_size)) + + +@pytest.fixture() +def median_score(): + from bayesflow.scores import MedianScore + + return MedianScore() + + +@pytest.fixture() +def mean_score(): + from bayesflow.scores import MeanScore + + return MeanScore() + + +@pytest.fixture() +def normed_diff_score(): + from bayesflow.scores import NormedDifferenceScore + + return NormedDifferenceScore(k=3) + + +@pytest.fixture(scope="function") +def quantile_score(): + from bayesflow.scores import QuantileScore + + return QuantileScore() + + +@pytest.fixture() +def multivariate_normal_score(): + from bayesflow.scores import MultivariateNormalScore + + return MultivariateNormalScore() + + +@pytest.fixture( + params=["median_score", "mean_score", "normed_diff_score", "quantile_score", 
"multivariate_normal_score"], + scope="function", +) +def scoring_rule(request): + print("initialize scoring rule in test_scores") + return request.getfixturevalue(request.param) diff --git a/tests/test_compatibility/test_scores/test_scores.py b/tests/test_compatibility/test_scores/test_scores.py new file mode 100644 index 000000000..d9d1af5e3 --- /dev/null +++ b/tests/test_compatibility/test_scores/test_scores.py @@ -0,0 +1,45 @@ +import pytest +from utils import SaveLoadTest, save_config, load_from_config, dump_path, load_path +import numpy as np +import keras + + +class TestScore(SaveLoadTest): + filenames = { + "model": "model.pickle", + "output": "output.pickle", + } + + @pytest.fixture + def setup(self, filepaths, mode, scoring_rule, random_samples, request): + if mode == "save": + save_config(scoring_rule, filepaths["model"]) + + output = self.evaluate(scoring_rule, random_samples) + dump_path(output, filepaths["output"]) + + scoring_rule = load_from_config(filepaths["model"]) + output = load_path(filepaths["output"]) + + return scoring_rule, output + + def evaluate(self, scoring_rule, data): + # Using random data also as targets for the purpose of this test. + head_shapes = scoring_rule.get_head_shapes_from_target_shape(data.shape) + estimates = {} + for key, output_shape in head_shapes.items(): + link = scoring_rule.get_link(key) + if hasattr(link, "compute_input_shape"): + link_input_shape = link.compute_input_shape(output_shape) + else: + link_input_shape = output_shape + dummy_input = keras.ops.ones((data.shape[0],) + link_input_shape) + estimates[key] = link(dummy_input) + + score = scoring_rule.score(estimates, data) + return score + + def test_output(self, setup, random_samples): + scoring_rule, reference = setup + output = self.evaluate(scoring_rule, random_samples) + np.testing.assert_allclose(reference, output) diff --git a/tests/test_compatibility/test_workflows/__init__.py b/tests/test_compatibility/test_workflows/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/tests/test_compatibility/test_workflows/conftest.py b/tests/test_compatibility/test_workflows/conftest.py new file mode 100644 index 000000000..98f6b8704 --- /dev/null +++ b/tests/test_compatibility/test_workflows/conftest.py @@ -0,0 +1,65 @@ +import pytest + +import keras + +from bayesflow.utils.serialization import serializable + + +@pytest.fixture(params=["coupling_flow", "flow_matching"]) +def inference_network(request): + if request.param == "coupling_flow": + from bayesflow.networks import CouplingFlow + + return CouplingFlow(depth=2) + + elif request.param == "flow_matching": + from bayesflow.networks import FlowMatching + + return FlowMatching(subnet_kwargs=dict(widths=(32, 32)), use_optimal_transport=False) + + +@pytest.fixture(params=["time_series_transformer", "fusion_transformer", "time_series_network", "custom"]) +def summary_network(request): + if request.param == "time_series_transformer": + from bayesflow.networks import TimeSeriesTransformer + + return TimeSeriesTransformer(embed_dims=(8, 8), mlp_widths=(16, 8), mlp_depths=(1, 1)) + + elif request.param == "fusion_transformer": + from bayesflow.networks import FusionTransformer + + return FusionTransformer( + embed_dims=(8, 8), mlp_widths=(8, 16), mlp_depths=(2, 1), template_dim=8, bidirectional=False + ) + + elif request.param == "time_series_network": + from bayesflow.networks import TimeSeriesNetwork + + return TimeSeriesNetwork(filters=4, skip_steps=2) + + elif request.param == "custom": + from bayesflow.networks 
import SummaryNetwork + + @serializable("test", disable_module_check=True) + class Custom(SummaryNetwork): + def __init__(self, **kwargs): + super().__init__(**kwargs) + self.inner = keras.Sequential([keras.layers.LSTM(8), keras.layers.Dense(4)]) + + def call(self, x, **kwargs): + return self.inner(x, training=kwargs.get("stage") == "training") + + return Custom() + + elif request.param == "fusion_network": + from bayesflow.networks import FusionNetwork, DeepSet + + return FusionNetwork({"a": DeepSet(), "b": keras.layers.Flatten()}, head=keras.layers.Dense(2)) + + +@pytest.fixture(params=["fusion_adapter", None]) +def adapter(request): + from bayesflow import Adapter + + if request.param == "fusion_adapter": + return Adapter.create_default(["mean"]).group(["a", "b"], "summary_variables") diff --git a/tests/test_compatibility/test_workflows/test_workflows.py b/tests/test_compatibility/test_workflows/test_workflows.py new file mode 100644 index 000000000..81789210d --- /dev/null +++ b/tests/test_compatibility/test_workflows/test_workflows.py @@ -0,0 +1,58 @@ +import pytest +from utils import SaveLoadTest, dump_path, load_path +import numpy as np +import keras + + +@pytest.mark.parametrize("inference_network", ["coupling_flow", "flow_matching"], indirect=True) +@pytest.mark.parametrize( + "summary_network,simulator,adapter", + [ + ["time_series_transformer", "sir", None], + ["fusion_transformer", "sir", None], + ["fusion_network", "fusion", "fusion_adapter"], + ], + indirect=True, +) +class TestWorkflow(SaveLoadTest): + filenames = { + "approximator": "approximator.keras", + "input": "input.pickle", + "output": "output.pickle", + } + + @pytest.fixture() + def setup(self, filepaths, mode, inference_network, summary_network, simulator, adapter): + if mode == "save": + import bayesflow as bf + + workflow = bf.BasicWorkflow( + adapter=adapter, + inference_network=inference_network, + summary_network=summary_network, + inference_variables=["parameters"], + summary_variables=["observables"], + simulator=simulator, + ) + workflow.fit_online(epochs=1, batch_size=8, num_batches_per_epoch=2, verbose=0) + keras.saving.save_model(workflow.approximator, filepaths["approximator"]) + + input = workflow.simulate(4) + output = self.evaluate(workflow.approximator, input) + dump_path(input, filepaths["input"]) + dump_path(output, filepaths["output"]) + + approximator = keras.saving.load_model(filepaths["approximator"]) + input = load_path(filepaths["input"]) + output = load_path(filepaths["output"]) + + return approximator, input, output + + def evaluate(self, approximator, data): + return approximator.log_prob(data) + + def test_output(self, setup): + approximator, input, reference = setup + output = self.evaluate(approximator, input) + print(reference) + np.testing.assert_allclose(reference, output) diff --git a/tests/test_compatibility/utils/__init__.py b/tests/test_compatibility/utils/__init__.py new file mode 100644 index 000000000..3f5c7efaa --- /dev/null +++ b/tests/test_compatibility/utils/__init__.py @@ -0,0 +1,2 @@ +from .io import * +from .helpers import * diff --git a/tests/test_compatibility/utils/helpers.py b/tests/test_compatibility/utils/helpers.py new file mode 100644 index 000000000..7ab9b98a5 --- /dev/null +++ b/tests/test_compatibility/utils/helpers.py @@ -0,0 +1,17 @@ +import pytest +from utils import get_valid_filename, get_path + + +class SaveLoadTest: + filenames = {} + + @pytest.fixture(autouse=True) + def filepaths(self, data_dir, mode, request): + prefix = 
get_valid_filename(request._pyfuncitem.name) + files = {} + for label, filename in self.filenames.items(): + path = get_path(data_dir, f"{prefix}__{filename}", create=mode == "save") + if mode == "load" and not path.exists(): + pytest.skip(f"Required file not available: {path}") + files[label] = path + return files diff --git a/tests/test_compatibility/utils/io.py b/tests/test_compatibility/utils/io.py new file mode 100644 index 000000000..c2a554617 --- /dev/null +++ b/tests/test_compatibility/utils/io.py @@ -0,0 +1,48 @@ +import inspect +from keras.saving import deserialize_keras_object, serialize_keras_object +from pathlib import Path +import pickle +import re + + +def get_path(data_dir: Path | str = "", filename: str = "", *, create: bool = False) -> Path: + frame = inspect.stack()[1] + base_path = Path(inspect.stack()[1].filename[:-3]) + function_name = frame.function + if "self" in frame[0].f_locals: + filepath = base_path / frame[0].f_locals["self"].__class__.__name__ / function_name + else: + filepath = base_path / function_name + filepath = Path(data_dir) / filepath.relative_to(Path("tests").absolute()) + if create is True: + filepath.mkdir(parents=True, exist_ok=True) + if filename: + return filepath / filename + return filepath + + +def get_valid_filename(name): + s = str(name).strip().replace(" ", "_") + s = re.sub(r"(?u)[^-\w.]", "_", s) + if s in {"", ".", ".."}: + raise ValueError("Could not derive file name from '%s'" % name) + return s + + +def dump_path(object, filepath: Path | str): + with open(filepath, "wb") as f: + pickle.dump(object, f) + + +def load_path(filepath: Path | str): + with open(filepath, "rb") as f: + return pickle.load(f) + + +def save_config(object, filepath: Path | str): + dump_path(serialize_keras_object(object), filepath) + + +def load_from_config(filepath: Path | str, custom_objects=None): + config = load_path(filepath) + return deserialize_keras_object(config, custom_objects=custom_objects) From e6e89d76dd8f18a0b9a0528bd7a701d17b065ca4 Mon Sep 17 00:00:00 2001 From: Valentin Pratz Date: Sat, 21 Jun 2025 11:03:15 +0000 Subject: [PATCH 04/10] fix trainable parameters in distributions --- bayesflow/distributions/diagonal_normal.py | 13 +++++----- bayesflow/distributions/diagonal_student_t.py | 25 ++++++++++--------- bayesflow/distributions/mixture.py | 2 +- 3 files changed, 21 insertions(+), 19 deletions(-) diff --git a/bayesflow/distributions/diagonal_normal.py b/bayesflow/distributions/diagonal_normal.py index f8d93b945..6b64445c7 100644 --- a/bayesflow/distributions/diagonal_normal.py +++ b/bayesflow/distributions/diagonal_normal.py @@ -58,7 +58,6 @@ def __init__( self.seed_generator = seed_generator or keras.random.SeedGenerator() self.dim = None - self.log_normalization_constant = None self._mean = None self._std = None @@ -71,17 +70,18 @@ def build(self, input_shape: Shape) -> None: self.mean = ops.cast(ops.broadcast_to(self.mean, (self.dim,)), "float32") self.std = ops.cast(ops.broadcast_to(self.std, (self.dim,)), "float32") - self.log_normalization_constant = -0.5 * self.dim * math.log(2.0 * math.pi) - ops.sum(ops.log(self.std)) - if self.trainable_parameters: self._mean = self.add_weight( shape=ops.shape(self.mean), - initializer=keras.initializers.get(self.mean), + initializer=keras.initializers.get(keras.ops.copy(self.mean)), dtype="float32", trainable=True, ) self._std = self.add_weight( - shape=ops.shape(self.std), initializer=keras.initializers.get(self.std), dtype="float32", trainable=True + shape=ops.shape(self.std), + 
initializer=keras.initializers.get(keras.ops.copy(self.std)), + dtype="float32", + trainable=True, ) else: self._mean = self.mean @@ -91,7 +91,8 @@ def log_prob(self, samples: Tensor, *, normalize: bool = True) -> Tensor: result = -0.5 * ops.sum((samples - self._mean) ** 2 / self._std**2, axis=-1) if normalize: - result += self.log_normalization_constant + log_normalization_constant = -0.5 * self.dim * math.log(2.0 * math.pi) - ops.sum(ops.log(self._std)) + result += log_normalization_constant return result diff --git a/bayesflow/distributions/diagonal_student_t.py b/bayesflow/distributions/diagonal_student_t.py index 98e3fb7eb..9b02ee821 100644 --- a/bayesflow/distributions/diagonal_student_t.py +++ b/bayesflow/distributions/diagonal_student_t.py @@ -63,7 +63,6 @@ def __init__( self.seed_generator = seed_generator or keras.random.SeedGenerator() - self.log_normalization_constant = None self.dim = None self._loc = None self._scale = None @@ -78,21 +77,16 @@ def build(self, input_shape: Shape) -> None: self.loc = ops.cast(ops.broadcast_to(self.loc, (self.dim,)), "float32") self.scale = ops.cast(ops.broadcast_to(self.scale, (self.dim,)), "float32") - self.log_normalization_constant = ( - -0.5 * self.dim * math.log(self.df) - - 0.5 * self.dim * math.log(math.pi) - - math.lgamma(0.5 * self.df) - + math.lgamma(0.5 * (self.df + self.dim)) - - ops.sum(keras.ops.log(self.scale)) - ) - if self.trainable_parameters: self._loc = self.add_weight( - shape=ops.shape(self.loc), initializer=keras.initializers.get(self.loc), dtype="float32", trainable=True + shape=ops.shape(self.loc), + initializer=keras.initializers.get(keras.ops.copy(self.loc)), + dtype="float32", + trainable=True, ) self._scale = self.add_weight( shape=ops.shape(self.scale), - initializer=keras.initializers.get(self.scale), + initializer=keras.initializers.get(keras.ops.copy(self.scale)), dtype="float32", trainable=True, ) @@ -105,7 +99,14 @@ def log_prob(self, samples: Tensor, *, normalize: bool = True) -> Tensor: result = -0.5 * (self.df + self.dim) * ops.log1p(mahalanobis_term / self.df) if normalize: - result += self.log_normalization_constant + log_normalization_constant = ( + -0.5 * self.dim * math.log(self.df) + - 0.5 * self.dim * math.log(math.pi) + - math.lgamma(0.5 * self.df) + + math.lgamma(0.5 * (self.df + self.dim)) + - ops.sum(keras.ops.log(self._scale)) + ) + result += log_normalization_constant return result diff --git a/bayesflow/distributions/mixture.py b/bayesflow/distributions/mixture.py index a7bf2ea27..e1f04e88f 100644 --- a/bayesflow/distributions/mixture.py +++ b/bayesflow/distributions/mixture.py @@ -144,7 +144,7 @@ def build(self, input_shape: Shape) -> None: self._mixture_logits = self.add_weight( shape=(len(self.distributions),), - initializer=keras.initializers.get(self.mixture_logits), + initializer=keras.initializers.get(keras.ops.copy(self.mixture_logits)), dtype="float32", trainable=self.trainable_mixture, ) From 8ed5dab289f3e690b1ef8051b5e7ed535ebb2ef2 Mon Sep 17 00:00:00 2001 From: Valentin Pratz Date: Sat, 21 Jun 2025 13:09:49 +0000 Subject: [PATCH 05/10] allow parametrization of kwargs --- tests/conftest.py | 6 + tests/test_compatibility/conftest.py | 180 +++++++++++-- .../test_adapters/conftest.py | 1 - .../test_approximators/conftest.py | 72 ----- .../test_continuous_approximator/conftest.py | 10 + .../test_continuous_approximator.py | 20 +- .../__init__.py | 0 .../conftest.py | 9 +- .../test_model_comparison_approximator.py | 3 +- .../test_point_approximator}/__init__.py | 0 
.../test_point_approximator/conftest.py | 54 ++++ .../test_point_approximator.py | 52 ++++ .../test_distributions/conftest.py | 26 +- .../test_distributions/test_distributions.py | 52 +++- .../test_compatibility/test_links/conftest.py | 81 ++---- .../test_links/test_links.py | 10 + .../test_metrics/conftest.py | 27 +- .../test_metrics/test_metrics.py | 10 + .../test_inference_networks/conftest.py | 255 +++--------------- .../test_inference_networks.py | 55 ++++ .../test_summary_networks/conftest.py | 14 +- .../test_summary_networks.py | 13 +- .../test_scores/conftest.py | 68 ++--- .../test_scores/test_scores.py | 11 + .../test_workflows/conftest.py | 65 ----- .../test_workflows/test_workflows.py | 58 ---- 26 files changed, 561 insertions(+), 591 deletions(-) create mode 100644 tests/test_compatibility/test_approximators/test_continuous_approximator/conftest.py rename tests/test_compatibility/test_approximators/{test_model_comparison_approximators => test_model_comparison_approximator}/__init__.py (100%) rename tests/test_compatibility/test_approximators/{test_model_comparison_approximators => test_model_comparison_approximator}/conftest.py (92%) rename tests/test_compatibility/test_approximators/{test_model_comparison_approximators => test_model_comparison_approximator}/test_model_comparison_approximator.py (88%) rename tests/test_compatibility/{test_workflows => test_approximators/test_point_approximator}/__init__.py (100%) create mode 100644 tests/test_compatibility/test_approximators/test_point_approximator/conftest.py create mode 100644 tests/test_compatibility/test_approximators/test_point_approximator/test_point_approximator.py delete mode 100644 tests/test_compatibility/test_workflows/conftest.py delete mode 100644 tests/test_compatibility/test_workflows/test_workflows.py diff --git a/tests/conftest.py b/tests/conftest.py index 95ec8f693..c0abb1f3d 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -6,6 +6,12 @@ BACKENDS = ["jax", "numpy", "tensorflow", "torch"] +def pytest_addoption(parser): + parser.addoption("--mode", choices=["save", "load"]) + parser.addoption("--commit", type=str) + parser.addoption("--from", type=str, required=False, dest="from_") + + def pytest_runtest_setup(item): """Skips backends by test markers. Unmarked tests are treated as backend-agnostic""" backend = keras.backend.backend() diff --git a/tests/test_compatibility/conftest.py b/tests/test_compatibility/conftest.py index 6d9e1fc29..a33a90f4a 100644 --- a/tests/test_compatibility/conftest.py +++ b/tests/test_compatibility/conftest.py @@ -36,26 +36,174 @@ def data_dir(request, commit, from_commit, tmp_path_factory): return Path(tmp_path_factory.mktemp("_compatibility_data")) +# reduce number of test configurations +@pytest.fixture(params=[None, 3]) +def conditions_size(request): + return request.param + + +@pytest.fixture(params=[1, 2]) +def summary_dim(request): + return request.param + + +@pytest.fixture(params=[4]) +def feature_size(request): + return request.param + + +# Generic fixtures for use as input to the tested classes. +# The classes to test are constructed in the respective subdirectories, to allow for more thorough configuation. 
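+#
+# Illustrative sketch only (not part of the suite): a test module selects a concrete
+# configuration for these fixtures via indirect parametrization, for example:
+#
+#     @pytest.mark.parametrize("inference_network", ["coupling_flow"], indirect=True)
+#     @pytest.mark.parametrize("adapter", ["summary", "direct"], indirect=True)
+#     def test_example(inference_network, adapter):
+#         ...
+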
+@pytest.fixture(params=[None, "all"]) +def standardize(request): + return request.param + + +@pytest.fixture() +def adapter(request): + import bayesflow as bf + + match request.param: + case "summary": + return bf.Adapter.create_default("parameters").rename("observables", "summary_variables") + case "direct": + return bf.Adapter.create_default("parameters").rename("observables", "direct_conditions") + case "default": + return bf.Adapter.create_default("parameters") + case "empty": + return bf.Adapter() + case None: + return None + case _: + raise ValueError(f"Invalid request parameter for adapter: {request.param}") + + +@pytest.fixture(params=["coupling_flow", "flow_matching"]) +def inference_network(request): + match request.param: + case "coupling_flow": + from bayesflow.networks import CouplingFlow + + return CouplingFlow(depth=2) + + case "flow_matching": + from bayesflow.networks import FlowMatching + + return FlowMatching(subnet_kwargs=dict(widths=(32, 32)), use_optimal_transport=False) + + case None: + return None + + case _: + raise ValueError(f"Invalid request parameter for inference_network: {request.param}") + + +@pytest.fixture(params=["time_series_transformer", "fusion_transformer", "time_series_network", "custom"]) +def summary_network(request): + match request.param: + case "time_series_transformer": + from bayesflow.networks import TimeSeriesTransformer + + return TimeSeriesTransformer(embed_dims=(8, 8), mlp_widths=(16, 8), mlp_depths=(1, 1)) + + case "fusion_transformer": + from bayesflow.networks import FusionTransformer + + return FusionTransformer( + embed_dims=(8, 8), mlp_widths=(8, 16), mlp_depths=(2, 1), template_dim=8, bidirectional=False + ) + + case "time_series_network": + from bayesflow.networks import TimeSeriesNetwork + + return TimeSeriesNetwork(filters=4, skip_steps=2) + + case "deep_set": + from bayesflow.networks import DeepSet + + return DeepSet(summary_dim=2, depth=1) + + case "custom": + from bayesflow.networks import SummaryNetwork + from bayesflow.utils.serialization import serializable + import keras + + @serializable("test", disable_module_check=True) + class Custom(SummaryNetwork): + def __init__(self, **kwargs): + super().__init__(**kwargs) + self.inner = keras.Sequential([keras.layers.LSTM(8), keras.layers.Dense(4)]) + + def call(self, x, **kwargs): + return self.inner(x, training=kwargs.get("stage") == "training") + + return Custom() + + case "flatten": + # very simple summary network for fast training + from bayesflow.networks import SummaryNetwork + from bayesflow.utils.serialization import serializable + import keras + + @serializable("test", disable_module_check=True) + class FlattenSummaryNetwork(SummaryNetwork): + def __init__(self, **kwargs): + super().__init__(**kwargs) + self.inner = keras.layers.Flatten() + + def call(self, x, **kwargs): + return self.inner(x, training=kwargs.get("stage") == "training") + + return FlattenSummaryNetwork() + + case "fusion_network": + from bayesflow.networks import FusionNetwork, DeepSet + + return FusionNetwork({"a": DeepSet(), "b": keras.layers.Flatten()}, head=keras.layers.Dense(2)) + case None: + return None + case _: + raise ValueError(f"Invalid request parameter for summary_network: {request.param}") + + @pytest.fixture(params=["sir", "fusion"]) def simulator(request): - if request.param == "sir": - from bayesflow.simulators import SIR + match request.param: + case "sir": + from bayesflow.simulators import SIR + + return SIR() + case "lotka_volterra": + from bayesflow.simulators import 
LotkaVolterra + + return LotkaVolterra() + + case "two_moons": + from bayesflow.simulators import TwoMoons + + return TwoMoons() + case "normal": + from tests.utils.normal_simulator import NormalSimulator - return SIR() - elif request.param == "fusion": - from bayesflow.simulators import Simulator - from bayesflow.types import Shape, Tensor - from bayesflow.utils.decorators import allow_batch_size - import numpy as np + return NormalSimulator() + case "fusion": + from bayesflow.simulators import Simulator + from bayesflow.types import Shape, Tensor + from bayesflow.utils.decorators import allow_batch_size + import numpy as np - class FusionSimulator(Simulator): - @allow_batch_size - def sample(self, batch_shape: Shape, num_observations: int = 4) -> dict[str, Tensor]: - mean = np.random.normal(0.0, 0.1, size=batch_shape + (2,)) - noise = np.random.standard_normal(batch_shape + (num_observations, 2)) + class FusionSimulator(Simulator): + @allow_batch_size + def sample(self, batch_shape: Shape, num_observations: int = 4) -> dict[str, Tensor]: + mean = np.random.normal(0.0, 0.1, size=batch_shape + (2,)) + noise = np.random.standard_normal(batch_shape + (num_observations, 2)) - x = mean[:, None] + noise + x = mean[:, None] + noise - return dict(mean=mean, a=x, b=x) + return dict(mean=mean, a=x, b=x) - return FusionSimulator() + return FusionSimulator() + case None: + return None + case _: + raise ValueError(f"Invalid request parameter for simulator: {request.param}") diff --git a/tests/test_compatibility/test_adapters/conftest.py b/tests/test_compatibility/test_adapters/conftest.py index 73dfa1eb3..bc8ad90d3 100644 --- a/tests/test_compatibility/test_adapters/conftest.py +++ b/tests/test_compatibility/test_adapters/conftest.py @@ -37,7 +37,6 @@ def serializable_fn(x): .scale("x", by=[-1, 2]) .shift("x", by=2) .split("key_to_split", into=["split_1", "split_2"]) - .standardize(exclude=["t1", "t2", "o1"]) .drop("d1") .one_hot("o1", 10) .keep(["x", "y", "z1", "p1", "p2", "s1", "s2", "s3", "t1", "t2", "o1", "split_1", "split_2"]) diff --git a/tests/test_compatibility/test_approximators/conftest.py b/tests/test_compatibility/test_approximators/conftest.py index 16406c9dc..e69de29bb 100644 --- a/tests/test_compatibility/test_approximators/conftest.py +++ b/tests/test_compatibility/test_approximators/conftest.py @@ -1,72 +0,0 @@ -import pytest - - -@pytest.fixture() -def batch_size(): - return 8 - - -@pytest.fixture() -def num_samples(): - return 100 - - -@pytest.fixture() -def adapter(): - import bayesflow as bf - - return bf.Adapter.create_default("parameters").rename("observables", "summary_variables") - - -@pytest.fixture(params=["coupling_flow", "flow_matching"]) -def inference_network(request): - if request.param == "coupling_flow": - from bayesflow.networks import CouplingFlow - - return CouplingFlow(depth=2) - - elif request.param == "flow_matching": - from bayesflow.networks import FlowMatching - - return FlowMatching(subnet_kwargs=dict(widths=(32, 32)), use_optimal_transport=False) - - -@pytest.fixture(params=["time_series_transformer", "fusion_transformer", "time_series_network", "custom"]) -def summary_network(request): - if request.param == "time_series_transformer": - from bayesflow.networks import TimeSeriesTransformer - - return TimeSeriesTransformer(embed_dims=(8, 8), mlp_widths=(16, 8), mlp_depths=(1, 1)) - - elif request.param == "fusion_transformer": - from bayesflow.networks import FusionTransformer - - return FusionTransformer( - embed_dims=(8, 8), mlp_widths=(8, 16), 
mlp_depths=(2, 1), template_dim=8, bidirectional=False - ) - - elif request.param == "time_series_network": - from bayesflow.networks import TimeSeriesNetwork - - return TimeSeriesNetwork(filters=4, skip_steps=2) - - elif request.param == "custom": - from bayesflow.networks import SummaryNetwork - from bayesflow.utils.serialization import serializable - import keras - - @serializable("test", disable_module_check=True) - class Custom(SummaryNetwork): - def __init__(self, **kwargs): - super().__init__(**kwargs) - self.inner = keras.Sequential([keras.layers.LSTM(8), keras.layers.Dense(4)]) - - def call(self, x, **kwargs): - return self.inner(x, training=kwargs.get("stage") == "training") - - return Custom() - - elif request.param == "fusion_network": - from bayesflow.networks import FusionNetwork, DeepSet - - return FusionNetwork({"a": DeepSet(), "b": keras.layers.Flatten()}, head=keras.layers.Dense(2)) diff --git a/tests/test_compatibility/test_approximators/test_continuous_approximator/conftest.py b/tests/test_compatibility/test_approximators/test_continuous_approximator/conftest.py new file mode 100644 index 000000000..1006b2c61 --- /dev/null +++ b/tests/test_compatibility/test_approximators/test_continuous_approximator/conftest.py @@ -0,0 +1,10 @@ +import pytest + + +@pytest.fixture +def approximator(adapter, inference_network, summary_network, standardize): + from bayesflow.approximators import ContinuousApproximator + + return ContinuousApproximator( + adapter=adapter, inference_network=inference_network, summary_network=summary_network, standardize=standardize + ) diff --git a/tests/test_compatibility/test_approximators/test_continuous_approximator/test_continuous_approximator.py b/tests/test_compatibility/test_approximators/test_continuous_approximator/test_continuous_approximator.py index c5d2d36eb..ddbc9c84b 100644 --- a/tests/test_compatibility/test_approximators/test_continuous_approximator/test_continuous_approximator.py +++ b/tests/test_compatibility/test_approximators/test_continuous_approximator/test_continuous_approximator.py @@ -4,12 +4,13 @@ import keras -@pytest.mark.parametrize("inference_network", ["coupling_flow", "flow_matching"], indirect=True) +@pytest.mark.parametrize("inference_network", ["coupling_flow"], indirect=True) @pytest.mark.parametrize( - "summary_network,simulator,adapter", + "summary_network,simulator,adapter,standardize", [ - ["time_series_transformer", "sir", None], - ["fusion_transformer", "sir", None], + ["deep_set", "sir", "summary", ["summary_variables", "inference_variables"]], # use deep_set for speed + [None, "two_moons", "direct", "all"], + [None, "two_moons", "direct", None], ], indirect=True, ) @@ -21,16 +22,9 @@ class TestContinuousApproximator(SaveLoadTest): } @pytest.fixture() - def setup(self, filepaths, mode, inference_network, summary_network, simulator, adapter): + def setup(self, filepaths, mode, approximator, adapter, inference_network, summary_network, standardize, simulator): if mode == "save": - import bayesflow as bf - - approximator = bf.approximators.ContinuousApproximator( - adapter=adapter, - inference_network=inference_network, - summary_network=summary_network, - ) - approximator.compile("adamw") + approximator.compile("adamw", run_eagerly=False) approximator.fit(simulator=simulator, epochs=1, batch_size=8, num_batches=2, verbose=0) keras.saving.save_model(approximator, filepaths["approximator"]) diff --git a/tests/test_compatibility/test_approximators/test_model_comparison_approximators/__init__.py 
b/tests/test_compatibility/test_approximators/test_model_comparison_approximator/__init__.py similarity index 100% rename from tests/test_compatibility/test_approximators/test_model_comparison_approximators/__init__.py rename to tests/test_compatibility/test_approximators/test_model_comparison_approximator/__init__.py diff --git a/tests/test_compatibility/test_approximators/test_model_comparison_approximators/conftest.py b/tests/test_compatibility/test_approximators/test_model_comparison_approximator/conftest.py similarity index 92% rename from tests/test_compatibility/test_approximators/test_model_comparison_approximators/conftest.py rename to tests/test_compatibility/test_approximators/test_model_comparison_approximator/conftest.py index 37a841fd4..b11b752e8 100644 --- a/tests/test_compatibility/test_approximators/test_model_comparison_approximators/conftest.py +++ b/tests/test_compatibility/test_approximators/test_model_comparison_approximator/conftest.py @@ -48,13 +48,6 @@ def adapter(): ) -@pytest.fixture -def summary_network(): - from bayesflow.networks import DeepSet - - return DeepSet(summary_dim=2, depth=1) - - @pytest.fixture def classifier_network(): from bayesflow.networks import MLP @@ -71,7 +64,7 @@ def approximator(adapter, classifier_network, summary_network, simulator, standa classifier_network=classifier_network, adapter=adapter, summary_network=summary_network, - # standardize=standardize, + standardize=standardize, ) diff --git a/tests/test_compatibility/test_approximators/test_model_comparison_approximators/test_model_comparison_approximator.py b/tests/test_compatibility/test_approximators/test_model_comparison_approximator/test_model_comparison_approximator.py similarity index 88% rename from tests/test_compatibility/test_approximators/test_model_comparison_approximators/test_model_comparison_approximator.py rename to tests/test_compatibility/test_approximators/test_model_comparison_approximator/test_model_comparison_approximator.py index 98fb4344c..e90f2fc36 100644 --- a/tests/test_compatibility/test_approximators/test_model_comparison_approximators/test_model_comparison_approximator.py +++ b/tests/test_compatibility/test_approximators/test_model_comparison_approximator/test_model_comparison_approximator.py @@ -4,6 +4,7 @@ import keras +@pytest.mark.parametrize("summary_network", ["deep_set"], indirect=True) class TestModelComparisonApproximator(SaveLoadTest): filenames = { "approximator": "approximator.keras", @@ -12,7 +13,7 @@ class TestModelComparisonApproximator(SaveLoadTest): } @pytest.fixture() - def setup(self, filepaths, mode, simulator, approximator): + def setup(self, filepaths, mode, simulator, approximator, classifier_network, summary_network): if mode == "save": approximator.compile("adamw") approximator.fit( diff --git a/tests/test_compatibility/test_workflows/__init__.py b/tests/test_compatibility/test_approximators/test_point_approximator/__init__.py similarity index 100% rename from tests/test_compatibility/test_workflows/__init__.py rename to tests/test_compatibility/test_approximators/test_point_approximator/__init__.py diff --git a/tests/test_compatibility/test_approximators/test_point_approximator/conftest.py b/tests/test_compatibility/test_approximators/test_point_approximator/conftest.py new file mode 100644 index 000000000..4ee38c142 --- /dev/null +++ b/tests/test_compatibility/test_approximators/test_point_approximator/conftest.py @@ -0,0 +1,54 @@ +import pytest + + +@pytest.fixture() +def batch_size(): + return 8 + + +@pytest.fixture() +def 
num_samples(): + return 100 + + +@pytest.fixture(params=["single_parametric", "multiple_parametric"]) +def point_inference_network(request): + match request.param: + case "single_parametric": + from bayesflow.networks import PointInferenceNetwork + from bayesflow.scores import NormedDifferenceScore, QuantileScore, MultivariateNormalScore + + return PointInferenceNetwork( + scores=dict( + mean=NormedDifferenceScore(k=2), + quantiles=QuantileScore(q=[0.1, 0.5, 0.9]), + mvn=MultivariateNormalScore(), + ), + subnet="mlp", + subnet_kwargs=dict(widths=(32, 32)), + ) + + case "multiple_parametric": + from bayesflow.networks import PointInferenceNetwork + from bayesflow.scores import MultivariateNormalScore + + return PointInferenceNetwork( + scores=dict( + mvn1=MultivariateNormalScore(), + mvn2=MultivariateNormalScore(), + ), + ) + case _: + raise ValueError(f"Invalid request parameter for point_inference_network: {request.param}") + + +@pytest.fixture +def approximator(adapter, point_inference_network, summary_network, standardize): + from bayesflow.approximators import PointApproximator + + return PointApproximator( + adapter=adapter, + inference_network=point_inference_network, + summary_network=summary_network, + standardize=standardize, + ) diff --git a/tests/test_compatibility/test_approximators/test_point_approximator/test_point_approximator.py b/tests/test_compatibility/test_approximators/test_point_approximator/test_point_approximator.py new file mode 100644 index 000000000..a45801e79 --- /dev/null +++ b/tests/test_compatibility/test_approximators/test_point_approximator/test_point_approximator.py @@ -0,0 +1,52 @@ +import pytest +from utils import SaveLoadTest, dump_path, load_path +import numpy as np +import keras + + +@pytest.mark.parametrize( + "summary_network,simulator,adapter,standardize", + [ + ["deep_set", "sir", "summary", "all"], # use deep_set for speed + [None, "two_moons", "direct", None], + ], + indirect=True, +) +class TestPointApproximator(SaveLoadTest): + filenames = { + "approximator": "approximator.keras", + "input": "input.pickle", + "output": "output.pickle", + } + + @pytest.fixture() + def setup( + self, filepaths, mode, approximator, adapter, point_inference_network, summary_network, standardize, simulator + ): + if mode == "save": + approximator.compile("adamw", run_eagerly=False) + approximator.fit(simulator=simulator, epochs=1, batch_size=8, num_batches=2, verbose=0) + keras.saving.save_model(approximator, filepaths["approximator"]) + + input = simulator.sample(4) + output = self.evaluate(approximator, input) + dump_path(input, filepaths["input"]) + dump_path(output, filepaths["output"]) + + approximator = keras.saving.load_model(filepaths["approximator"]) + input = load_path(filepaths["input"]) + output = load_path(filepaths["output"]) + + return approximator, input, output + + def evaluate(self, approximator, data): + return approximator.estimate(data) + + def test_output(self, setup): + approximator, input, reference = setup + output = self.evaluate(approximator, input) + + from keras.tree import flatten + + for ref, out in zip(flatten(reference), flatten(output)): + np.testing.assert_allclose(ref, out) diff --git a/tests/test_compatibility/test_distributions/conftest.py b/tests/test_compatibility/test_distributions/conftest.py index 92167e872..4c77bcf27 100644 --- a/tests/test_compatibility/test_distributions/conftest.py +++ b/tests/test_compatibility/test_distributions/conftest.py @@ -24,9 +24,25 @@ def mixture(): @pytest.fixture(params=["diagonal_normal", 
"diagonal_student_t", "mixture"]) def distribution(request): + name, kwargs = request.param + + match name: + case "diagonal_normal": + from bayesflow.distributions import DiagonalNormal + + return DiagonalNormal(mean=1.0, std=2.0, **kwargs) + case "diagonal_student_t": + from bayesflow.distributions import DiagonalStudentT + + return DiagonalStudentT(df=10, loc=1.0, scale=2.0, **kwargs) + case "mixture": + from bayesflow.distributions import DiagonalNormal, DiagonalStudentT, Mixture + + return Mixture( + [ + DiagonalNormal(mean=1.0, std=2.0, trainable_parameters=True), + DiagonalStudentT(df=25, mean=1.0, std=2.0), + ], + **kwargs, + ) return request.getfixturevalue(request.param) - - -@pytest.fixture() -def shape(batch_size, feature_size): - return batch_size, feature_size diff --git a/tests/test_compatibility/test_distributions/test_distributions.py b/tests/test_compatibility/test_distributions/test_distributions.py index a4b2bd0d7..6af2263c7 100644 --- a/tests/test_compatibility/test_distributions/test_distributions.py +++ b/tests/test_compatibility/test_distributions/test_distributions.py @@ -1,25 +1,67 @@ -from utils import SaveLoadTest, load_from_config, save_config +from utils import SaveLoadTest import numpy as np import keras import pytest +@pytest.mark.parametrize( + "distribution", + [ + ["diagonal_normal", dict(trainable_parameters=False)], + ["diagonal_normal", dict(trainable_parameters=True)], + ["diagonal_student_t", dict(trainable_parameters=False)], + ["diagonal_student_t", dict(trainable_parameters=True)], + ["mixture", dict(trainable_mixture=False)], + ["mixture", dict(trainable_mixture=True)], + ], + indirect=True, +) class TestDistribution(SaveLoadTest): filenames = { - "model": "model.pickle", + "model": "model.keras", "output": "output.npy", } @pytest.fixture def setup(self, filepaths, mode, distribution, random_samples): + from bayesflow.utils.serialization import serialize, deserialize + + class DummyModel(keras.Model): + def __init__(self, distribution, **kwargs): + super().__init__(**kwargs) + self.distribution = distribution + + def call(self, inputs): + return self.distribution.log_prob(inputs) + + def get_config(self): + base_config = super().get_config() + config = {"distribution": self.distribution} + return base_config | serialize(config) + + @classmethod + def from_config(cls, config, custom_objects=None): + return cls(**deserialize(config, custom_objects=custom_objects)) + if mode == "save": distribution.build(keras.ops.shape(random_samples)) - save_config(distribution, filepaths["model"]) - output = self.evaluate(distribution, random_samples) + model = DummyModel(distribution) + model.compile(loss=keras.losses.MeanSquaredError()) + model.fit( + random_samples, + keras.ops.ones(keras.ops.shape(random_samples)[:-1]), + batch_size=keras.ops.shape(random_samples)[0], + epochs=1, + ) + model.save(filepaths["model"]) + + output = self.evaluate(model.distribution, random_samples) np.save(filepaths["output"], output, allow_pickle=False) - distribution = load_from_config(filepaths["model"]) + distribution = keras.saving.load_model( + filepaths["model"], custom_objects={"DummyModel": DummyModel} + ).distribution output = np.load(filepaths["output"]) return distribution, output diff --git a/tests/test_compatibility/test_links/conftest.py b/tests/test_compatibility/test_links/conftest.py index 2b4017f5b..883393db3 100644 --- a/tests/test_compatibility/test_links/conftest.py +++ b/tests/test_compatibility/test_links/conftest.py @@ -1,4 +1,3 @@ -import numpy as np 
import keras import pytest @@ -13,67 +12,21 @@ def feature_size(): return 10 -@pytest.fixture() -def generic_preactivation(batch_size): - return keras.ops.ones((batch_size, 6)) - - -@pytest.fixture() -def ordered(): - from bayesflow.links import Ordered - - return Ordered(axis=1, anchor_index=2) - - -@pytest.fixture() -def ordered_quantiles(): - from bayesflow.links import OrderedQuantiles - - return OrderedQuantiles() - - -@pytest.fixture() -def cholesky_factor(): - from bayesflow.links import CholeskyFactor - - return CholeskyFactor() - - -@pytest.fixture() -def linear(): - return keras.layers.Activation("linear") - - -@pytest.fixture(params=["ordered", "ordered_quantiles", "cholesky_factor", "linear"], scope="function") +@pytest.fixture def link(request): - return request.getfixturevalue(request.param) - - -@pytest.fixture() -def num_quantiles(): - return 19 - - -@pytest.fixture() -def quantiles_np(num_quantiles): - return np.linspace(0, 1, num_quantiles + 2)[1:-1] - - -@pytest.fixture() -def quantiles_py(quantiles_np): - return list(quantiles_np) - - -@pytest.fixture() -def quantiles_keras(quantiles_np): - return keras.ops.convert_to_tensor(quantiles_np) - - -@pytest.fixture() -def none(): - return None - - -@pytest.fixture(params=["quantiles_np", "quantiles_py", "quantiles_keras", "none"], scope="function") -def quantiles(request): - return request.getfixturevalue(request.param) + name, kwargs = request.param + match name: + case "ordered": + from bayesflow.links import Ordered + + return Ordered(**kwargs) + case "ordered_quantiles": + from bayesflow.links import OrderedQuantiles + + return OrderedQuantiles(**kwargs) + case "cholesky_factor": + from bayesflow.links import CholeskyFactor + + return CholeskyFactor(**kwargs) + case "linear": + return keras.layers.Activation("linear", **kwargs) diff --git a/tests/test_compatibility/test_links/test_links.py b/tests/test_compatibility/test_links/test_links.py index f4434016c..9343b048d 100644 --- a/tests/test_compatibility/test_links/test_links.py +++ b/tests/test_compatibility/test_links/test_links.py @@ -4,6 +4,16 @@ import numpy as np +@pytest.mark.parametrize( + "link", + [ + ["ordered", dict(axis=1, anchor_index=2)], + ["ordered_quantiles", dict()], + ["cholesky_factor", dict()], + ["linear", dict()], + ], + indirect=True, +) class TestLink(SaveLoadTest): filenames = { "model": "model.pickle", diff --git a/tests/test_compatibility/test_metrics/conftest.py b/tests/test_compatibility/test_metrics/conftest.py index cbd7d487f..63475e68a 100644 --- a/tests/test_compatibility/test_metrics/conftest.py +++ b/tests/test_compatibility/test_metrics/conftest.py @@ -3,31 +3,28 @@ @pytest.fixture() -def root_mean_squared_error(): - from bayesflow.metrics import RootMeanSquaredError - - return RootMeanSquaredError(normalize=True, name="rmse", dtype="float32") - - -@pytest.fixture() -def maximum_mean_discrepancy(): - from bayesflow.metrics import MaximumMeanDiscrepancy +def metric(request): + name, kwargs = request.param - return MaximumMeanDiscrepancy(name="mmd", kernel="gaussian", unbiased=True, dtype="float32") + match name: + case "root_mean_squared_error": + from bayesflow.metrics import RootMeanSquaredError + return RootMeanSquaredError(**kwargs) + case "maximum_mean_discrepancy": + from bayesflow.metrics import MaximumMeanDiscrepancy -@pytest.fixture(params=["root_mean_squared_error", "maximum_mean_discrepancy"]) -def metric(request): - return request.getfixturevalue(request.param) + return MaximumMeanDiscrepancy(**kwargs) + raise 
ValueError(f"unknown name: {name}") @pytest.fixture def samples_1(): rng = np.random.default_rng(seed=1) - return rng.normal(size=(2, 3)) + return rng.normal(size=(2, 3)).astype(np.float32) @pytest.fixture def samples_2(): rng = np.random.default_rng(seed=2) - return rng.normal(size=(2, 3)) + return rng.normal(size=(2, 3)).astype(np.float32) diff --git a/tests/test_compatibility/test_metrics/test_metrics.py b/tests/test_compatibility/test_metrics/test_metrics.py index 6e29db69f..ce2d7267d 100644 --- a/tests/test_compatibility/test_metrics/test_metrics.py +++ b/tests/test_compatibility/test_metrics/test_metrics.py @@ -4,6 +4,16 @@ import keras +@pytest.mark.parametrize( + "metric", + [ + ["root_mean_squared_error", dict(normalize=True, dtype="float32")], + ["root_mean_squared_error", dict(normalize=False)], + ["maximum_mean_discrepancy", dict(kernel="gaussian", unbiased=True, dtype="float32")], + ["maximum_mean_discrepancy", dict(kernel="inverse_multiquadratic", unbiased=False)], + ], + indirect=True, +) class TestMetric(SaveLoadTest): filenames = { "model": "model.pickle", diff --git a/tests/test_compatibility/test_networks/test_inference_networks/conftest.py b/tests/test_compatibility/test_networks/test_inference_networks/conftest.py index 5add35cba..4d0126da2 100644 --- a/tests/test_compatibility/test_networks/test_inference_networks/conftest.py +++ b/tests/test_compatibility/test_networks/test_inference_networks/conftest.py @@ -3,226 +3,37 @@ from bayesflow.networks import MLP -@pytest.fixture() -def diffusion_model_edm_F(): - from bayesflow.experimental import DiffusionModel - - return DiffusionModel( - subnet=MLP([8, 8]), - integrate_kwargs={"method": "rk45", "steps": 10}, - noise_schedule="edm", - prediction_type="F", - ) - - -@pytest.fixture() -def diffusion_model_edm_velocity(): - from bayesflow.experimental import DiffusionModel - - return DiffusionModel( - subnet=MLP([8, 8]), - integrate_kwargs={"method": "rk45", "steps": 10}, - noise_schedule="edm", - prediction_type="velocity", - ) - - -@pytest.fixture() -def diffusion_model_edm_noise(): - from bayesflow.experimental import DiffusionModel - - return DiffusionModel( - subnet=MLP([8, 8]), - integrate_kwargs={"method": "rk45", "steps": 10}, - noise_schedule="edm", - prediction_type="noise", - ) - - -@pytest.fixture() -def diffusion_model_cosine_F(): - from bayesflow.experimental import DiffusionModel - - return DiffusionModel( - subnet=MLP([8, 8]), - integrate_kwargs={"method": "rk45", "steps": 10}, - noise_schedule="cosine", - prediction_type="F", - ) - - -@pytest.fixture() -def diffusion_model_cosine_velocity(): - from bayesflow.experimental import DiffusionModel - - return DiffusionModel( - subnet=MLP([8, 8]), - integrate_kwargs={"method": "rk45", "steps": 10}, - noise_schedule="cosine", - prediction_type="velocity", - ) - - -@pytest.fixture() -def diffusion_model_cosine_noise(): - from bayesflow.experimental import DiffusionModel - - return DiffusionModel( - subnet=MLP([8, 8]), - integrate_kwargs={"method": "rk45", "steps": 10}, - noise_schedule="cosine", - prediction_type="noise", - ) - - -@pytest.fixture() -def flow_matching(): - from bayesflow.networks import FlowMatching - - return FlowMatching( - subnet=MLP([8, 8]), - integrate_kwargs={"method": "rk45", "steps": 10}, - ) - - -@pytest.fixture() -def consistency_model(): - from bayesflow.networks import ConsistencyModel - - return ConsistencyModel(total_steps=100, subnet=MLP([8, 8])) - - -@pytest.fixture() -def affine_coupling_flow(): - from bayesflow.networks import 
CouplingFlow - - return CouplingFlow( - depth=2, subnet="mlp", subnet_kwargs=dict(widths=[8, 8]), transform="affine", transform_kwargs=dict(clamp=1.8) - ) - - -@pytest.fixture() -def spline_coupling_flow(): - from bayesflow.networks import CouplingFlow - - return CouplingFlow( - depth=2, subnet="mlp", subnet_kwargs=dict(widths=[8, 8]), transform="spline", transform_kwargs=dict(bins=8) - ) - - -@pytest.fixture() -def free_form_flow(): - from bayesflow.experimental import FreeFormFlow - - return FreeFormFlow(encoder_subnet=MLP([16, 16]), decoder_subnet=MLP([16, 16])) - - -@pytest.fixture() -def typical_point_inference_network(): - from bayesflow.networks import PointInferenceNetwork - from bayesflow.scores import MeanScore, MedianScore, QuantileScore, MultivariateNormalScore - - return PointInferenceNetwork( - scores=dict( - mean=MeanScore(), - median=MedianScore(), - quantiles=QuantileScore([0.1, 0.2, 0.5, 0.65]), - mvn=MultivariateNormalScore(), # currently not stable - ) - ) - - -@pytest.fixture() -def typical_point_inference_network_subnet(): - from bayesflow.networks import PointInferenceNetwork - from bayesflow.scores import MeanScore, MedianScore, QuantileScore, MultivariateNormalScore - - subnet = MLP([16, 8]) - - return PointInferenceNetwork( - scores=dict( - mean=MeanScore(subnets=dict(value=subnet)), - median=MedianScore(subnets=dict(value=subnet)), - quantiles=QuantileScore(subnets=dict(value=subnet)), - mvn=MultivariateNormalScore(subnets=dict(mean=subnet, covariance=subnet)), - ), - subnet=subnet, - ) - - -@pytest.fixture( - params=[ - "typical_point_inference_network", - "affine_coupling_flow", - "spline_coupling_flow", - "flow_matching", - "free_form_flow", - "consistency_model", - pytest.param("diffusion_model_edm_F"), - pytest.param("diffusion_model_edm_noise", marks=[pytest.mark.slow]), - pytest.param("diffusion_model_cosine_velocity", marks=[pytest.mark.slow]), - pytest.param("diffusion_model_cosine_F", marks=[pytest.mark.slow]), - pytest.param("diffusion_model_cosine_noise", marks=[pytest.mark.slow]), - pytest.param("diffusion_model_cosine_velocity", marks=[pytest.mark.slow]), - ], - scope="function", -) +@pytest.fixture def inference_network(request): - return request.getfixturevalue(request.param) - - -@pytest.fixture( - params=[ - "typical_point_inference_network_subnet", - "coupling_flow_subnet", - "flow_matching_subnet", - "free_form_flow_subnet", - ], - scope="function", -) -def inference_network_subnet(request): - return request.getfixturevalue(request.param) - - -@pytest.fixture( - params=[ - "affine_coupling_flow", - "spline_coupling_flow", - "flow_matching", - "free_form_flow", - "consistency_model", - pytest.param("diffusion_model_edm_F"), - pytest.param( - "diffusion_model_edm_noise", - marks=[ - pytest.mark.slow, - pytest.mark.skip("noise predicition not testable without prior training for numerical reasons."), - ], - ), - pytest.param("diffusion_model_cosine_velocity", marks=[pytest.mark.slow]), - pytest.param( - "diffusion_model_cosine_F", - marks=[ - pytest.mark.slow, - pytest.mark.skip("skip to reduce load on CI."), - ], - ), - pytest.param( - "diffusion_model_cosine_noise", - marks=[ - pytest.mark.slow, - pytest.mark.skip("noise predicition not testable without prior training for numerical reasons."), - ], - ), - pytest.param( - "diffusion_model_cosine_velocity", - marks=[ - pytest.mark.slow, - pytest.mark.skip("skip to reduce load on CI."), - ], - ), - ], - scope="function", -) -def generative_inference_network(request): - return 
request.getfixturevalue(request.param) + name, kwargs = request.param + from bayesflow.utils.dispatch import find_inference_network + + try: + return find_inference_network(name, **kwargs) + except ValueError: + # network not yet in find_inference_network + pass + match name: + case "diffusion_model": + from bayesflow.experimental import DiffusionModel + + return DiffusionModel(**kwargs) + case "free_form_flow": + from bayesflow.experimental import FreeFormFlow + + return FreeFormFlow(**kwargs) + case "point_inference_network": + from bayesflow.networks import PointInferenceNetwork + from bayesflow.scores import MeanScore, MedianScore, QuantileScore, MultivariateNormalScore + + return PointInferenceNetwork( + scores=dict( + mean=MeanScore(subnets=dict(value=MLP([16, 8]))), + median=MedianScore(subnets=dict(value=MLP([16, 8]))), + quantiles=QuantileScore(subnets=dict(value=MLP([16, 8]))), + mvn=MultivariateNormalScore(subnets=dict(mean=MLP([16, 8]), covariance=MLP([16, 8]))), + ), + **kwargs, + ) + case _: + raise ValueError(f"Invalid request parameter for inference_network: {name}") diff --git a/tests/test_compatibility/test_networks/test_inference_networks/test_inference_networks.py b/tests/test_compatibility/test_networks/test_inference_networks/test_inference_networks.py index 2403a3bee..99d8a5be6 100644 --- a/tests/test_compatibility/test_networks/test_inference_networks/test_inference_networks.py +++ b/tests/test_compatibility/test_networks/test_inference_networks/test_inference_networks.py @@ -4,6 +4,61 @@ import keras +@pytest.mark.parametrize( + "inference_network", + [ + [ + "coupling_flow", + dict( + depth=2, + subnet="mlp", + subnet_kwargs=dict(widths=[8, 8]), + transform="affine", + transform_kwargs=dict(clamp=1.8), + ), + ], + [ + "coupling_flow", + dict( + depth=2, + subnet="mlp", + subnet_kwargs=dict(widths=[8, 8]), + transform="spline", + transform_kwargs=dict(bins=8), + ), + ], + ["flow_matching", dict(integrate_kwargs={"method": "rk45", "steps": 10})], + ["consistency_model", dict(total_steps=10)], + [ + "diffusion_model", + dict(noise_schedule="edm", prediction_type="F", integrate_kwargs={"method": "rk45", "steps": 10}), + ], + [ + "diffusion_model", + dict(noise_schedule="edm", prediction_type="velocity", integrate_kwargs={"method": "euler", "steps": 10}), + ], + [ + "diffusion_model", + dict(noise_schedule="edm", prediction_type="noise", integrate_kwargs={"method": "euler", "steps": 10}), + ], + [ + "diffusion_model", + dict(noise_schedule="cosine", prediction_type="F", integrate_kwargs={"method": "euler", "steps": 10}), + ], + [ + "diffusion_model", + dict( + noise_schedule="cosine", prediction_type="velocity", integrate_kwargs={"method": "euler", "steps": 10} + ), + ], + [ + "free_form_flow", + dict(encoder_subnet_kwargs={"widths": [16, 16]}, decoder_subnet_kwargs={"widths": [16, 16]}), + ], + ["point_inference_network", dict(subnet_kwargs={"widths": [8, 8]})], + ], + indirect=True, +) class TestInferenceNetwork(SaveLoadTest): filenames = { "model": "model.keras", diff --git a/tests/test_compatibility/test_networks/test_summary_networks/conftest.py b/tests/test_compatibility/test_networks/test_summary_networks/conftest.py index 48b73ed10..078486c59 100644 --- a/tests/test_compatibility/test_networks/test_summary_networks/conftest.py +++ b/tests/test_compatibility/test_networks/test_summary_networks/conftest.py @@ -47,4 +47,16 @@ def deep_set(summary_dim): scope="function", ) def summary_network(request, summary_dim): - return request.getfixturevalue(request.param) + 
from bayesflow.utils.dispatch import find_summary_network + + name, kwargs = request.param + print(name) + try: + return find_summary_network(name, summary_dim=summary_dim, **kwargs) + except ValueError: + # network not in dispatch + pass + + match name: + case _: + raise ValueError(f"Invalid request parameter for summary_network: {name}") diff --git a/tests/test_compatibility/test_networks/test_summary_networks/test_summary_networks.py b/tests/test_compatibility/test_networks/test_summary_networks/test_summary_networks.py index cfd9dc82f..421ce55a7 100644 --- a/tests/test_compatibility/test_networks/test_summary_networks/test_summary_networks.py +++ b/tests/test_compatibility/test_networks/test_summary_networks/test_summary_networks.py @@ -4,6 +4,17 @@ import keras +@pytest.mark.parametrize( + "summary_network", + [ + ["time_series_network", dict()], + ["time_series_transformer", dict()], + ["fusion_transformer", dict()], + ["set_transformer", dict()], + ["deep_set", dict()], + ], + indirect=True, +) class TestInferenceNetwork(SaveLoadTest): filenames = { "model": "model.keras", @@ -11,7 +22,7 @@ class TestInferenceNetwork(SaveLoadTest): } @pytest.fixture() - def setup(self, filepaths, mode, summary_network, random_set): + def setup(self, filepaths, mode, summary_network, summary_dim, random_set): if mode == "save": shape = keras.ops.shape(random_set) summary_network.build(shape) diff --git a/tests/test_compatibility/test_scores/conftest.py b/tests/test_compatibility/test_scores/conftest.py index 6bde9076d..db3800f98 100644 --- a/tests/test_compatibility/test_scores/conftest.py +++ b/tests/test_compatibility/test_scores/conftest.py @@ -1,51 +1,31 @@ -import keras import pytest -@pytest.fixture() -def reference(batch_size, feature_size): - return keras.random.uniform((batch_size, feature_size)) - - -@pytest.fixture() -def median_score(): - from bayesflow.scores import MedianScore - - return MedianScore() - - -@pytest.fixture() -def mean_score(): - from bayesflow.scores import MeanScore - - return MeanScore() - - -@pytest.fixture() -def normed_diff_score(): - from bayesflow.scores import NormedDifferenceScore - - return NormedDifferenceScore(k=3) - - -@pytest.fixture(scope="function") -def quantile_score(): - from bayesflow.scores import QuantileScore - - return QuantileScore() - - -@pytest.fixture() -def multivariate_normal_score(): - from bayesflow.scores import MultivariateNormalScore - - return MultivariateNormalScore() - - @pytest.fixture( params=["median_score", "mean_score", "normed_diff_score", "quantile_score", "multivariate_normal_score"], - scope="function", ) def scoring_rule(request): - print("initialize scoring rule in test_scores") - return request.getfixturevalue(request.param) + name, kwargs = request.param + match name: + case "median_score": + from bayesflow.scores import MedianScore + + return MedianScore(**kwargs) + case "mean_score": + from bayesflow.scores import MeanScore + + return MeanScore(**kwargs) + case "normed_diff_score": + from bayesflow.scores import NormedDifferenceScore + + return NormedDifferenceScore(**kwargs) + case "quantile_score": + from bayesflow.scores import QuantileScore + + return QuantileScore(**kwargs) + case "multivariate_normal_score": + from bayesflow.scores import MultivariateNormalScore + + return MultivariateNormalScore(**kwargs) + case _: + raise ValueError(f"Invalid request parameter for scoring_rule: {name}") diff --git a/tests/test_compatibility/test_scores/test_scores.py b/tests/test_compatibility/test_scores/test_scores.py index 
d9d1af5e3..11be69c49 100644 --- a/tests/test_compatibility/test_scores/test_scores.py +++ b/tests/test_compatibility/test_scores/test_scores.py @@ -4,6 +4,17 @@ import keras +@pytest.mark.parametrize( + "scoring_rule", + [ + ["median_score", {}], + ["mean_score", {}], + ["normed_diff_score", dict(k=3)], + ["quantile_score", {}], + ["multivariate_normal_score", {}], + ], + indirect=True, +) class TestScore(SaveLoadTest): filenames = { "model": "model.pickle", diff --git a/tests/test_compatibility/test_workflows/conftest.py b/tests/test_compatibility/test_workflows/conftest.py deleted file mode 100644 index 98f6b8704..000000000 --- a/tests/test_compatibility/test_workflows/conftest.py +++ /dev/null @@ -1,65 +0,0 @@ -import pytest - -import keras - -from bayesflow.utils.serialization import serializable - - -@pytest.fixture(params=["coupling_flow", "flow_matching"]) -def inference_network(request): - if request.param == "coupling_flow": - from bayesflow.networks import CouplingFlow - - return CouplingFlow(depth=2) - - elif request.param == "flow_matching": - from bayesflow.networks import FlowMatching - - return FlowMatching(subnet_kwargs=dict(widths=(32, 32)), use_optimal_transport=False) - - -@pytest.fixture(params=["time_series_transformer", "fusion_transformer", "time_series_network", "custom"]) -def summary_network(request): - if request.param == "time_series_transformer": - from bayesflow.networks import TimeSeriesTransformer - - return TimeSeriesTransformer(embed_dims=(8, 8), mlp_widths=(16, 8), mlp_depths=(1, 1)) - - elif request.param == "fusion_transformer": - from bayesflow.networks import FusionTransformer - - return FusionTransformer( - embed_dims=(8, 8), mlp_widths=(8, 16), mlp_depths=(2, 1), template_dim=8, bidirectional=False - ) - - elif request.param == "time_series_network": - from bayesflow.networks import TimeSeriesNetwork - - return TimeSeriesNetwork(filters=4, skip_steps=2) - - elif request.param == "custom": - from bayesflow.networks import SummaryNetwork - - @serializable("test", disable_module_check=True) - class Custom(SummaryNetwork): - def __init__(self, **kwargs): - super().__init__(**kwargs) - self.inner = keras.Sequential([keras.layers.LSTM(8), keras.layers.Dense(4)]) - - def call(self, x, **kwargs): - return self.inner(x, training=kwargs.get("stage") == "training") - - return Custom() - - elif request.param == "fusion_network": - from bayesflow.networks import FusionNetwork, DeepSet - - return FusionNetwork({"a": DeepSet(), "b": keras.layers.Flatten()}, head=keras.layers.Dense(2)) - - -@pytest.fixture(params=["fusion_adapter", None]) -def adapter(request): - from bayesflow import Adapter - - if request.param == "fusion_adapter": - return Adapter.create_default(["mean"]).group(["a", "b"], "summary_variables") diff --git a/tests/test_compatibility/test_workflows/test_workflows.py b/tests/test_compatibility/test_workflows/test_workflows.py deleted file mode 100644 index 81789210d..000000000 --- a/tests/test_compatibility/test_workflows/test_workflows.py +++ /dev/null @@ -1,58 +0,0 @@ -import pytest -from utils import SaveLoadTest, dump_path, load_path -import numpy as np -import keras - - -@pytest.mark.parametrize("inference_network", ["coupling_flow", "flow_matching"], indirect=True) -@pytest.mark.parametrize( - "summary_network,simulator,adapter", - [ - ["time_series_transformer", "sir", None], - ["fusion_transformer", "sir", None], - ["fusion_network", "fusion", "fusion_adapter"], - ], - indirect=True, -) -class TestWorkflow(SaveLoadTest): - filenames = { - 
"approximator": "approximator.keras", - "input": "input.pickle", - "output": "output.pickle", - } - - @pytest.fixture() - def setup(self, filepaths, mode, inference_network, summary_network, simulator, adapter): - if mode == "save": - import bayesflow as bf - - workflow = bf.BasicWorkflow( - adapter=adapter, - inference_network=inference_network, - summary_network=summary_network, - inference_variables=["parameters"], - summary_variables=["observables"], - simulator=simulator, - ) - workflow.fit_online(epochs=1, batch_size=8, num_batches_per_epoch=2, verbose=0) - keras.saving.save_model(workflow.approximator, filepaths["approximator"]) - - input = workflow.simulate(4) - output = self.evaluate(workflow.approximator, input) - dump_path(input, filepaths["input"]) - dump_path(output, filepaths["output"]) - - approximator = keras.saving.load_model(filepaths["approximator"]) - input = load_path(filepaths["input"]) - output = load_path(filepaths["output"]) - - return approximator, input, output - - def evaluate(self, approximator, data): - return approximator.log_prob(data) - - def test_output(self, setup): - approximator, input, reference = setup - output = self.evaluate(approximator, input) - print(reference) - np.testing.assert_allclose(reference, output) From 09e5608a7a1355e98fb25f6bd571535a42a01281 Mon Sep 17 00:00:00 2001 From: Valentin Pratz Date: Sat, 21 Jun 2025 18:21:35 +0000 Subject: [PATCH 06/10] clean-up and fixes --- noxfile.py | 70 +++++++++++++++---- pyproject.toml | 2 + tests/conftest.py | 15 ++-- tests/test_compatibility/conftest.py | 27 +++---- .../test_summary_networks.py | 2 +- tests/test_compatibility/utils/helpers.py | 22 +++++- tests/test_compatibility/utils/io.py | 28 +------- 7 files changed, 96 insertions(+), 70 deletions(-) diff --git a/noxfile.py b/noxfile.py index 9eb7c645b..662af68a2 100644 --- a/noxfile.py +++ b/noxfile.py @@ -2,6 +2,14 @@ import argparse from pathlib import Path import os +import tempfile +import shutil + + +def git_rev_parse(session, commit): + print(f"Converting provided commit '{commit}' to Git revision...") + rev = session.run("git", "rev-parse", commit, external=True, silent=True).strip() + return rev @nox.session @@ -13,7 +21,7 @@ def save_and_load(session: nox.Session): In load mode, the stored models and outputs are loaded from disk, and old and new outputs are compared. This helps to detect breaking serialization between versions. - Important: The test code from the current checkout is used, not from the installed version. + Important: The test code from the current checkout, not from `commit`, is used. 
""" # parse the arguments parser = argparse.ArgumentParser() @@ -21,24 +29,60 @@ def save_and_load(session: nox.Session): subparsers = parser.add_subparsers(help="subcommand help", dest="mode") # save command parser_save = subparsers.add_parser("save") - parser_save.add_argument("commit", type=str, default=".") + parser_save.add_argument("--install", type=str, default=".", required=True, dest="commit") # load command, additional "from" argument parser_load = subparsers.add_parser("load") - parser_load.add_argument("commit", type=str, default=".") - parser.add_argument("--from", type=str, default="", required=False, dest="from_commit") + parser_load.add_argument("--from", type=str, required=True, dest="from_commit") + parser_load.add_argument("--install", type=str, required=True, dest="commit") + # keep unknown arguments, they will be forwarded to pytest below args, unknownargs = parser.parse_known_args(session.posargs) + if args.mode == "load": + if args.from_commit == ".": + from_commit = "local" + else: + from_commit = git_rev_parse(session, args.from_commit) + + from_path = Path("_compatibility_data").absolute() / from_commit + if not from_path.exists(): + raise FileNotFoundError( + f"The directory {from_path} does not exist, cannot load data.\n" + f"Please run 'nox -- save {args.from_commit}' to create it, and then rerun this command." + ) + + print(f"Data will be loaded from path {from_path}.") + # install dependencies, currently the jax backend is used, but we could add a configuration option for this - repo_path = Path(os.curdir).absolute().parent / "bf2" - session.install(f"git+file://{str(repo_path)}@{args.commit}") + repo_path = Path(os.curdir).absolute() + if args.commit == ".": + print("'.' provided, installing local state...") + if args.mode == "save": + print("Output will be saved to the alias 'local'") + commit = "local" + session.install(".[test]") + else: + commit = git_rev_parse(session, args.commit) + print("Installing specified revision...") + session.install(f"bayesflow[test] @ git+file://{str(repo_path)}@{commit}") session.install("jax") - session.install("pytest") - # pass mode and commits to pytest, required for correct save and load behavior - cmd = ["pytest", "--mode", args.mode, "--commit", args.commit] - if args.mode == "load": - cmd += ["--from", args.from_commit] - cmd += unknownargs + with tempfile.TemporaryDirectory() as tmpdirname: + # launch in temporary directory, as the local bayesflow would overshadow the installed one + tmpdirname = Path(tmpdirname) + # pass mode and data path to pytest, required for correct save and load behavior + if args.mode == "load": + data_path = from_path + else: + data_path = Path("_compatibility_data").absolute() / commit + if data_path.exists(): + print(f"Removing existing data directory {data_path}...") + shutil.rmtree(data_path) + + cmd = ["pytest", "tests/test_compatibility", f"--mode={args.mode}", f"--data-path={data_path}"] + cmd += unknownargs - session.run(*cmd, env={"KERAS_BACKEND": "jax"}) + print(f"Copying tests from working directory to temporary directory: {tmpdirname}") + shutil.copytree("tests", tmpdirname / "tests") + with session.chdir(tmpdirname): + session.run(*cmd, env={"KERAS_BACKEND": "jax"}) diff --git a/pyproject.toml b/pyproject.toml index b37a08e15..3efb03068 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -55,6 +55,7 @@ all = [ "sphinxcontrib-bibtex ~= 2.6", "snowballstemmer ~= 2.2.0", # test + "nox", "pytest", "pytest-cov", "pytest-rerunfailures", @@ -81,6 +82,7 @@ test = [ "nbconvert", 
"ipython", "ipykernel", + "nox", "pytest", "pytest-cov", "pytest-rerunfailures", diff --git a/tests/conftest.py b/tests/conftest.py index c0abb1f3d..a32d71c7a 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -8,8 +8,7 @@ def pytest_addoption(parser): parser.addoption("--mode", choices=["save", "load"]) - parser.addoption("--commit", type=str) - parser.addoption("--from", type=str, required=False, dest="from_") + parser.addoption("--data-path", type=str) def pytest_runtest_setup(item): @@ -73,16 +72,16 @@ def feature_size(request): @pytest.fixture() -def random_conditions(batch_size, conditions_size): +def random_conditions(random_seed, batch_size, conditions_size): if conditions_size is None: return None - return keras.random.normal((batch_size, conditions_size)) + return keras.random.normal((batch_size, conditions_size), seed=10) @pytest.fixture() -def random_samples(batch_size, feature_size): - return keras.random.normal((batch_size, feature_size)) +def random_samples(random_seed, batch_size, feature_size): + return keras.random.normal((batch_size, feature_size), seed=20) @pytest.fixture(scope="function", autouse=True) @@ -93,8 +92,8 @@ def random_seed(): @pytest.fixture() -def random_set(batch_size, set_size, feature_size): - return keras.random.normal((batch_size, set_size, feature_size)) +def random_set(random_seed, batch_size, set_size, feature_size): + return keras.random.normal((batch_size, set_size, feature_size), seed=30) @pytest.fixture(params=[2, 3]) diff --git a/tests/test_compatibility/conftest.py b/tests/test_compatibility/conftest.py index a33a90f4a..0bc74ca79 100644 --- a/tests/test_compatibility/conftest.py +++ b/tests/test_compatibility/conftest.py @@ -10,30 +10,21 @@ def mode(request): return mode -@pytest.fixture(scope="session") -def commit(request): - return request.config.getoption("--commit") - - -@pytest.fixture(scope="session") -def from_commit(request): - return request.config.getoption("--from") - - @pytest.fixture(autouse=True, scope="session") -def data_dir(request, commit, from_commit, tmp_path_factory): +def data_dir(request, tmp_path_factory): # read config option to detect "unset" scenario mode = request.config.getoption("--mode") - if mode == "save": - path = Path(".").absolute() / "_compatibility_data" / commit - return path + path = request.config.getoption("--data-path") + if not mode: + # if mode is unset, save and load from a temporary directory + return Path(tmp_path_factory.mktemp("_compatibility_data")) + elif not path: + pytest.exit(reason="Please provide the --data-path argument for model saving/loading.") elif mode == "load": - path = Path(".").absolute() / "_compatibility_data" / from_commit + path = Path(path) if not path.exists(): pytest.exit(reason=f"Load path '{path}' does not exist. 
Please specify a valid load path", returncode=1) - return path - # if mode is unset, save and load from a temporary directory - return Path(tmp_path_factory.mktemp("_compatibility_data")) + return path # reduce number of test configurations diff --git a/tests/test_compatibility/test_networks/test_summary_networks/test_summary_networks.py b/tests/test_compatibility/test_networks/test_summary_networks/test_summary_networks.py index 421ce55a7..05aec6793 100644 --- a/tests/test_compatibility/test_networks/test_summary_networks/test_summary_networks.py +++ b/tests/test_compatibility/test_networks/test_summary_networks/test_summary_networks.py @@ -15,7 +15,7 @@ ], indirect=True, ) -class TestInferenceNetwork(SaveLoadTest): +class TestSummaryNetwork(SaveLoadTest): filenames = { "model": "model.keras", "output": "output.pickle", diff --git a/tests/test_compatibility/utils/helpers.py b/tests/test_compatibility/utils/helpers.py index 7ab9b98a5..f42ef46ec 100644 --- a/tests/test_compatibility/utils/helpers.py +++ b/tests/test_compatibility/utils/helpers.py @@ -1,5 +1,7 @@ import pytest -from utils import get_valid_filename, get_path +import hashlib +import inspect +from pathlib import Path class SaveLoadTest: @@ -7,10 +9,24 @@ class SaveLoadTest: @pytest.fixture(autouse=True) def filepaths(self, data_dir, mode, request): - prefix = get_valid_filename(request._pyfuncitem.name) + # this name contains the config for the test and is therefore a unique identifier + test_config_str = request._pyfuncitem.name + # hash it, as it could be too long for the file system + prefix = hashlib.sha1(test_config_str.encode("utf-8")).hexdigest() + # use path to test file as base, remove ".py" suffix + base_path = Path(inspect.getsourcefile(type(self))[:-3]) + # add class name + directory = base_path / type(self).__name__ + # only keep the path relative to the tests directory + directory = directory.relative_to(Path("tests").absolute()) + directory = Path(data_dir) / directory + + if mode == "save": + directory.mkdir(parents=True, exist_ok=True) + files = {} for label, filename in self.filenames.items(): - path = get_path(data_dir, f"{prefix}__{filename}", create=mode == "save") + path = directory / f"{prefix}__{filename}" if mode == "load" and not path.exists(): pytest.skip(f"Required file not available: {path}") files[label] = path diff --git a/tests/test_compatibility/utils/io.py b/tests/test_compatibility/utils/io.py index c2a554617..618b4e26a 100644 --- a/tests/test_compatibility/utils/io.py +++ b/tests/test_compatibility/utils/io.py @@ -1,32 +1,6 @@ -import inspect from keras.saving import deserialize_keras_object, serialize_keras_object -from pathlib import Path import pickle -import re - - -def get_path(data_dir: Path | str = "", filename: str = "", *, create: bool = False) -> Path: - frame = inspect.stack()[1] - base_path = Path(inspect.stack()[1].filename[:-3]) - function_name = frame.function - if "self" in frame[0].f_locals: - filepath = base_path / frame[0].f_locals["self"].__class__.__name__ / function_name - else: - filepath = base_path / function_name - filepath = Path(data_dir) / filepath.relative_to(Path("tests").absolute()) - if create is True: - filepath.mkdir(parents=True, exist_ok=True) - if filename: - return filepath / filename - return filepath - - -def get_valid_filename(name): - s = str(name).strip().replace(" ", "_") - s = re.sub(r"(?u)[^-\w.]", "_", s) - if s in {"", ".", ".."}: - raise ValueError("Could not derive file name from '%s'" % name) - return s +from pathlib import Path def 
dump_path(object, filepath: Path | str): From da3bacb6d085477a9cf4ff098e2f393bde86248f Mon Sep 17 00:00:00 2001 From: Valentin Pratz Date: Sat, 21 Jun 2025 19:43:20 +0000 Subject: [PATCH 07/10] rename argument from --install to commit again --- noxfile.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/noxfile.py b/noxfile.py index 662af68a2..0a883baeb 100644 --- a/noxfile.py +++ b/noxfile.py @@ -29,11 +29,11 @@ def save_and_load(session: nox.Session): subparsers = parser.add_subparsers(help="subcommand help", dest="mode") # save command parser_save = subparsers.add_parser("save") - parser_save.add_argument("--install", type=str, default=".", required=True, dest="commit") + parser_save.add_argument("commit", type=str) # load command, additional "from" argument parser_load = subparsers.add_parser("load") parser_load.add_argument("--from", type=str, required=True, dest="from_commit") - parser_load.add_argument("--install", type=str, required=True, dest="commit") + parser_load.add_argument("commit", type=str) # keep unknown arguments, they will be forwarded to pytest below args, unknownargs = parser.parse_known_args(session.posargs) From be44a77581c040c7f3a540920444451122afd574 Mon Sep 17 00:00:00 2001 From: Valentin Pratz Date: Sat, 21 Jun 2025 20:23:23 +0000 Subject: [PATCH 08/10] adapt tests for torch --- .../test_point_approximator/conftest.py | 15 ++++++++++----- .../test_point_approximator.py | 5 ++--- .../test_distributions/test_distributions.py | 14 +++++++++++--- tests/test_compatibility/test_metrics/conftest.py | 8 +++----- 4 files changed, 26 insertions(+), 16 deletions(-) diff --git a/tests/test_compatibility/test_approximators/test_point_approximator/conftest.py b/tests/test_compatibility/test_approximators/test_point_approximator/conftest.py index 4ee38c142..cc344aadd 100644 --- a/tests/test_compatibility/test_approximators/test_point_approximator/conftest.py +++ b/tests/test_compatibility/test_approximators/test_point_approximator/conftest.py @@ -6,11 +6,6 @@ def batch_size(): return 8 -@pytest.fixture() -def num_samples(): - return 100 - - @pytest.fixture(params=["single_parametric", "multiple_parametric"]) def point_inference_network(request): match request.param: @@ -52,3 +47,13 @@ def approximator(adapter, point_inference_network, summary_network, standardize) summary_network=summary_network, standardize=standardize, ) + + +@pytest.fixture() +def adapter(): + from bayesflow import ContinuousApproximator + + return ContinuousApproximator.build_adapter( + inference_variables=["mean", "std"], + inference_conditions=["x"], + ) diff --git a/tests/test_compatibility/test_approximators/test_point_approximator/test_point_approximator.py b/tests/test_compatibility/test_approximators/test_point_approximator/test_point_approximator.py index a45801e79..c4d8801b2 100644 --- a/tests/test_compatibility/test_approximators/test_point_approximator/test_point_approximator.py +++ b/tests/test_compatibility/test_approximators/test_point_approximator/test_point_approximator.py @@ -5,10 +5,9 @@ @pytest.mark.parametrize( - "summary_network,simulator,adapter,standardize", + "summary_network,simulator,standardize", [ - ["deep_set", "sir", "summary", "all"], # use deep_set for speed - [None, "two_moons", "direct", None], + [None, "normal", "all"], ], indirect=True, ) diff --git a/tests/test_compatibility/test_distributions/test_distributions.py b/tests/test_compatibility/test_distributions/test_distributions.py index 6af2263c7..c53046e93 100644 --- 
a/tests/test_compatibility/test_distributions/test_distributions.py +++ b/tests/test_compatibility/test_distributions/test_distributions.py @@ -48,12 +48,20 @@ def from_config(cls, config, custom_objects=None): model = DummyModel(distribution) model.compile(loss=keras.losses.MeanSquaredError()) - model.fit( - random_samples, - keras.ops.ones(keras.ops.shape(random_samples)[:-1]), + fit_kwargs = dict( + x=random_samples, + y=keras.ops.ones(keras.ops.shape(random_samples)[:-1]), batch_size=keras.ops.shape(random_samples)[0], epochs=1, ) + if keras.backend.backend() == "torch": + import torch + + with torch.enable_grad(): + model.fit(**fit_kwargs) + else: + model.fit(**fit_kwargs) + model.save(filepaths["model"]) output = self.evaluate(model.distribution, random_samples) diff --git a/tests/test_compatibility/test_metrics/conftest.py b/tests/test_compatibility/test_metrics/conftest.py index 63475e68a..9de8610cc 100644 --- a/tests/test_compatibility/test_metrics/conftest.py +++ b/tests/test_compatibility/test_metrics/conftest.py @@ -1,5 +1,5 @@ import pytest -import numpy as np +import keras @pytest.fixture() @@ -20,11 +20,9 @@ def metric(request): @pytest.fixture def samples_1(): - rng = np.random.default_rng(seed=1) - return rng.normal(size=(2, 3)).astype(np.float32) + return keras.random.normal((2, 3), seed=1) @pytest.fixture def samples_2(): - rng = np.random.default_rng(seed=2) - return rng.normal(size=(2, 3)).astype(np.float32) + return keras.random.normal((2, 3), seed=2) From f9199ea5265b74698014f5c07c8fc666bdbf7099 Mon Sep 17 00:00:00 2001 From: han-ol Date: Mon, 23 Jun 2025 14:18:22 +0200 Subject: [PATCH 09/10] Fix variable name in adapter fixture --- tests/test_compatibility/conftest.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_compatibility/conftest.py b/tests/test_compatibility/conftest.py index 0bc74ca79..ad347cf55 100644 --- a/tests/test_compatibility/conftest.py +++ b/tests/test_compatibility/conftest.py @@ -58,7 +58,7 @@ def adapter(request): case "summary": return bf.Adapter.create_default("parameters").rename("observables", "summary_variables") case "direct": - return bf.Adapter.create_default("parameters").rename("observables", "direct_conditions") + return bf.Adapter.create_default("parameters").rename("observables", "inference_conditions") case "default": return bf.Adapter.create_default("parameters") case "empty": From 83da4edb4f0b9e570fab5b900aa054ee328a1f6f Mon Sep 17 00:00:00 2001 From: han-ol Date: Mon, 23 Jun 2025 19:02:26 +0200 Subject: [PATCH 10/10] Restore more complex point approximator tests --- .../test_point_approximator/conftest.py | 10 ---------- .../test_point_approximator/test_point_approximator.py | 5 +++-- 2 files changed, 3 insertions(+), 12 deletions(-) diff --git a/tests/test_compatibility/test_approximators/test_point_approximator/conftest.py b/tests/test_compatibility/test_approximators/test_point_approximator/conftest.py index cc344aadd..a0d9583a2 100644 --- a/tests/test_compatibility/test_approximators/test_point_approximator/conftest.py +++ b/tests/test_compatibility/test_approximators/test_point_approximator/conftest.py @@ -47,13 +47,3 @@ def approximator(adapter, point_inference_network, summary_network, standardize) summary_network=summary_network, standardize=standardize, ) - - -@pytest.fixture() -def adapter(): - from bayesflow import ContinuousApproximator - - return ContinuousApproximator.build_adapter( - inference_variables=["mean", "std"], - inference_conditions=["x"], - ) diff --git 
a/tests/test_compatibility/test_approximators/test_point_approximator/test_point_approximator.py b/tests/test_compatibility/test_approximators/test_point_approximator/test_point_approximator.py index c4d8801b2..a45801e79 100644 --- a/tests/test_compatibility/test_approximators/test_point_approximator/test_point_approximator.py +++ b/tests/test_compatibility/test_approximators/test_point_approximator/test_point_approximator.py @@ -5,9 +5,10 @@ @pytest.mark.parametrize( - "summary_network,simulator,standardize", + "summary_network,simulator,adapter,standardize", [ - [None, "normal", "all"], + ["deep_set", "sir", "summary", "all"], # use deep_set for speed + [None, "two_moons", "direct", None], ], indirect=True, )
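
Usage note (not part of the patches above): with the noxfile as changed in this series, a save run is started along the lines of `nox -- save .` (installing the local working tree under the alias "local" and writing artifacts to `_compatibility_data/local`), and a later load run with `nox -- load . --from <commit>` installs the current code and replays the compatibility tests against the artifacts written for `<commit>`. A new compatibility test then only needs to subclass `SaveLoadTest`, declare its `filenames`, and branch on the `mode` fixture; the seeded `random_samples`/`random_conditions`/`random_set` fixtures introduced above are what make the inputs identical across the save and load runs, so stored reference outputs stay comparable. The following is a minimal sketch of that pattern, not code from the suite: the class name `TestMyNetwork`, the toy `Sequential` model, the tolerances, the import paths, and the `load_path` helper (assumed as the counterpart of `dump_path` in `utils/io.py`) are all hypothetical stand-ins.

    import keras
    import numpy as np

    # import paths are assumptions; in the suite the helpers live under tests/test_compatibility/utils
    from utils.helpers import SaveLoadTest
    from utils.io import dump_path, load_path  # load_path assumed as the counterpart of dump_path


    class TestMyNetwork(SaveLoadTest):
        # labels map to the artifacts the save run writes and the load run reads back;
        # the filepaths fixture prefixes each file with a hash of the parametrized test name
        filenames = {
            "model": "model.keras",
            "output": "output.pickle",
        }

        def test_save_and_load(self, mode, filepaths, random_samples):
            if mode in (None, "save"):
                # save run (or a plain pytest run, where data_dir is a temporary directory):
                # build a toy model, save it, and store its output as the reference
                network = keras.Sequential([keras.layers.Dense(4), keras.layers.Dense(2)])
                output = network(random_samples)
                network.save(filepaths["model"])
                dump_path(output, filepaths["output"])

            if mode in (None, "load"):
                # load run: the artifacts may have been written by an older bayesflow revision
                network = keras.saving.load_model(filepaths["model"])
                reference_output = load_path(filepaths["output"])
                output = network(random_samples)
                # the seeded random_samples fixture yields the same inputs as in the save run,
                # so the deserialized model must reproduce the stored output
                np.testing.assert_allclose(
                    keras.ops.convert_to_numpy(output),
                    keras.ops.convert_to_numpy(reference_output),
                    rtol=1e-5,
                    atol=1e-6,
                )

Whether a real test saves and compares a model, a pickled output, or both is up to the individual test class; the only contract imposed by `SaveLoadTest` is the `filenames` mapping and the skip when a required file is missing in load mode.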