 import warnings
 from copy import copy, deepcopy
 from datetime import timedelta
-from typing import Callable, OrderedDict
+from typing import Any, Callable, OrderedDict, Sequence

 import torch.cuda
 from tensordict import TensorDict, TensorDictBase
@@ -131,6 +131,7 @@ def _distributed_init_collection_node(
     num_workers,
     env_make,
     policy,
+    policy_factory,
     frames_per_batch,
     collector_kwargs,
     verbose=True,
@@ -143,6 +144,7 @@ def _distributed_init_collection_node(
         num_workers,
         env_make,
         policy,
+        policy_factory,
         frames_per_batch,
         collector_kwargs,
         verbose=verbose,
@@ -156,6 +158,7 @@ def _run_collector(
     num_workers,
     env_make,
     policy,
+    policy_factory,
     frames_per_batch,
     collector_kwargs,
     verbose=True,
@@ -178,12 +181,17 @@ def _run_collector(
         policy_weights = TensorDict.from_module(policy)
         policy_weights = policy_weights.data.lock_()
     else:
-        warnings.warn(_NON_NN_POLICY_WEIGHTS)
+        if collector_kwargs.get("remote_weight_updater") is None and (
+            policy_factory is None
+            or (isinstance(policy_factory, Sequence) and not any(policy_factory))
+        ):
+            warnings.warn(_NON_NN_POLICY_WEIGHTS)
         policy_weights = TensorDict(lock=True)

     collector = collector_class(
         env_make,
         policy,
+        policy_factory=policy_factory,
         frames_per_batch=frames_per_batch,
         total_frames=-1,
         split_trajs=False,
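Review note (not part of the patch): the new warning gate above, restated as a standalone predicate for readability. The helper name is mine; the logic mirrors the hunk: `_NON_NN_POLICY_WEIGHTS` is only emitted when the policy is not an `nn.Module`, no `remote_weight_updater` arrives via `collector_kwargs`, and no usable `policy_factory` is available.

from typing import Sequence

from torch import nn


def should_warn_non_nn(policy, policy_factory, collector_kwargs) -> bool:
    # Sketch mirroring the condition added in the hunk above.
    if isinstance(policy, nn.Module):
        return False  # weights are extracted via TensorDict.from_module
    if collector_kwargs.get("remote_weight_updater") is not None:
        return False  # a dedicated updater owns weight synchronization
    if isinstance(policy_factory, Sequence):
        return not any(policy_factory)  # warn only if every entry is None
    return policy_factory is None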
@@ -278,8 +286,8 @@ class DistributedDataCollector(DataCollectorBase):
     pickled directly), the :arg:`policy_factory` should be used instead.

     Keyword Args:
-        policy_factory (Callable[[], Callable], optional): a callable that returns
-            a policy instance. This is exclusive with the `policy` argument.
+        policy_factory (Callable[[], Callable] or list of Callable[[], Callable], optional): a callable
+            (or list of callables) that returns a policy instance. This is exclusive with the `policy` argument.

         .. note:: `policy_factory` comes in handy whenever the policy cannot be serialized.

@@ -411,14 +419,16 @@ class DistributedDataCollector(DataCollectorBase):
             to learn more.
             Defaults to ``"submitit"``.
         tcp_port (int, optional): the TCP port to be used. Defaults to 10003.
-        local_weight_updater (LocalWeightUpdaterBase, optional): An instance of :class:`~torchrl.collectors.LocalWeightUpdaterBase`
+        local_weight_updater (LocalWeightUpdaterBase or constructor, optional): An instance of :class:`~torchrl.collectors.LocalWeightUpdaterBase`
             or its subclass, responsible for updating the policy weights on the local inference worker.
             This is typically not used in :class:`~torchrl.collectors.distributed.DistributedDataCollector` as it
             focuses on distributed environments.
-        remote_weight_updater (RemoteWeightUpdaterBase, optional): An instance of :class:`~torchrl.collectors.RemoteWeightUpdaterBase`
+            Consider using a constructor if the updater needs to be serialized.
+        remote_weight_updater (RemoteWeightUpdaterBase or constructor, optional): An instance of :class:`~torchrl.collectors.RemoteWeightUpdaterBase`
             or its subclass, responsible for updating the policy weights on distributed inference workers.
             If not provided, a :class:`~torchrl.collectors.distributed.DistributedRemoteWeightUpdater` will be used by
             default, which handles weight synchronization across distributed workers.
+            Consider using a constructor if the updater needs to be serialized.

     """

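Review note (not part of the patch): a minimal usage sketch of the keyword arguments documented above. The env constructor, policy factory, and updater are hypothetical stand-ins; only the argument names come from this patch. As enforced in `__init__` below, passing a `policy_factory` without a `remote_weight_updater` raises a `RuntimeError`.

from torchrl.collectors.distributed import DistributedDataCollector

make_env = ...      # hypothetical: a picklable env constructor
make_policy = ...   # hypothetical: builds the policy on each remote worker
my_updater = ...    # hypothetical: a RemoteWeightUpdaterBase instance or constructor

collector = DistributedDataCollector(
    [make_env] * 2,                    # one env constructor per node
    policy=None,                       # exclusive with policy_factory
    policy_factory=make_policy,        # a single factory, or one per node
    remote_weight_updater=my_updater,  # required whenever policy_factory is set
    frames_per_batch=200,
)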
@@ -429,31 +439,37 @@ def __init__(
         create_env_fn,
         policy: Callable[[TensorDictBase], TensorDictBase] | None = None,
         *,
-        policy_factory: Callable[[], Callable] | None = None,
+        policy_factory: Callable[[], Callable]
+        | list[Callable[[], Callable]]
+        | None = None,
         frames_per_batch: int,
         total_frames: int = -1,
-        device: torch.device | list[torch.device] = None,
-        storing_device: torch.device | list[torch.device] = None,
-        env_device: torch.device | list[torch.device] = None,
-        policy_device: torch.device | list[torch.device] = None,
+        device: torch.device | list[torch.device] | None = None,
+        storing_device: torch.device | list[torch.device] | None = None,
+        env_device: torch.device | list[torch.device] | None = None,
+        policy_device: torch.device | list[torch.device] | None = None,
         max_frames_per_traj: int = -1,
         init_random_frames: int = -1,
         reset_at_each_iter: bool = False,
         postproc: Callable | None = None,
         split_trajs: bool = False,
         exploration_type: ExporationType = DEFAULT_EXPLORATION_TYPE,  # noqa
         collector_class: type = SyncDataCollector,
-        collector_kwargs: dict = None,
+        collector_kwargs: dict[str, Any] | None = None,
         num_workers_per_collector: int = 1,
         sync: bool = False,
-        slurm_kwargs: dict | None = None,
+        slurm_kwargs: dict[str, Any] | None = None,
         backend: str = "gloo",
         update_after_each_batch: bool = False,
         max_weight_update_interval: int = -1,
         launcher: str = "submitit",
-        tcp_port: int = None,
-        remote_weight_updater: RemoteWeightUpdaterBase | None = None,
-        local_weight_updater: LocalWeightUpdaterBase | None = None,
+        tcp_port: int | None = None,
+        remote_weight_updater: RemoteWeightUpdaterBase
+        | Callable[[], RemoteWeightUpdaterBase]
+        | None = None,
+        local_weight_updater: LocalWeightUpdaterBase
+        | Callable[[], LocalWeightUpdaterBase]
+        | None = None,
     ):

         if collector_class == "async":
@@ -465,18 +481,22 @@ def __init__(
         self.collector_class = collector_class
         self.env_constructors = create_env_fn
         self.policy = policy
+        if not isinstance(policy_factory, Sequence):
+            policy_factory = [policy_factory for _ in range(len(self.env_constructors))]
+        self.policy_factory = policy_factory
         if isinstance(policy, nn.Module):
             policy_weights = TensorDict.from_module(policy)
             policy_weights = policy_weights.data.lock_()
-        elif policy_factory is not None:
+        elif any(policy_factory):
             policy_weights = None
             if remote_weight_updater is None:
                 raise RuntimeError(
                     "remote_weight_updater must be passed along with "
                     "a policy_factory."
                 )
         else:
-            warnings.warn(_NON_NN_POLICY_WEIGHTS)
+            if not any(policy_factory):
+                warnings.warn(_NON_NN_POLICY_WEIGHTS)
             policy_weights = TensorDict(lock=True)
         self.policy_weights = policy_weights
         self.num_workers = len(create_env_fn)
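Review note (not part of the patch): the broadcast added at the top of this hunk, extracted into a standalone helper for illustration (the helper name is mine). A lone callable, or None, is repeated once per environment constructor so that downstream code can always index `self.policy_factory[i]`:

from typing import Sequence


def broadcast_policy_factory(policy_factory, num_workers):
    # A single callable (or None) is repeated once per worker;
    # a sequence is kept as-is, one entry per collector node.
    if not isinstance(policy_factory, Sequence):
        policy_factory = [policy_factory for _ in range(num_workers)]
    return policy_factory


assert broadcast_policy_factory(None, 3) == [None, None, None]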
@@ -664,12 +684,15 @@ def _make_container(self):
         if self._VERBOSE:
             torchrl_logger.info("making container")
         env_constructor = self.env_constructors[0]
+        kwargs = self.collector_kwargs[0]
         pseudo_collector = SyncDataCollector(
             env_constructor,
-            self.policy,
+            policy=self.policy,
+            policy_factory=self.policy_factory[0],
             frames_per_batch=self._frames_per_batch_corrected,
             total_frames=-1,
             split_trajs=False,
+            **kwargs,
         )
         for _data in pseudo_collector:
             break
@@ -713,6 +736,7 @@ def _init_worker_dist_submitit(self, executor, i):
             self.num_workers_per_collector,
             env_make,
             self.policy,
+            self.policy_factory[i],
             self._frames_per_batch_corrected,
             self.collector_kwargs[i],
             self._VERBOSE,
@@ -734,6 +758,7 @@ def get_env_make(i):
                 "num_workers": self.num_workers_per_collector,
                 "env_make": get_env_make(i),
                 "policy": self.policy,
+                "policy_factory": self.policy_factory[i],
                 "frames_per_batch": self._frames_per_batch_corrected,
                 "collector_kwargs": self.collector_kwargs[i],
             }
@@ -760,6 +785,7 @@ def _init_worker_dist_mp(self, i):
                 self.num_workers_per_collector,
                 env_make,
                 self.policy,
+                self.policy_factory[i],
                 self._frames_per_batch_corrected,
                 self.collector_kwargs[i],
                 self._VERBOSE,