From 6588f85d68a67e8bc0fdb433d6ad6ec54a450d51 Mon Sep 17 00:00:00 2001 From: Kelly Guo Date: Mon, 10 Nov 2025 16:44:30 -0800 Subject: [PATCH 1/7] Adds RL device setting to allow CPU sim with a separate RL device --- scripts/demos/pick_and_place.py | 94 +++++++++++-------- .../reinforcement_learning/rl_games/play.py | 4 - .../reinforcement_learning/rl_games/train.py | 5 - .../reinforcement_learning/rsl_rl/train.py | 2 +- source/isaaclab_rl/config/extension.toml | 2 +- source/isaaclab_rl/docs/CHANGELOG.rst | 10 ++ .../isaaclab_rl/rl_games/rl_games.py | 4 + .../isaaclab_rl/rsl_rl/vecenv_wrapper.py | 31 +++++- 8 files changed, 98 insertions(+), 54 deletions(-) diff --git a/scripts/demos/pick_and_place.py b/scripts/demos/pick_and_place.py index cc14dcb0a72..ae3c1bd420d 100644 --- a/scripts/demos/pick_and_place.py +++ b/scripts/demos/pick_and_place.py @@ -11,6 +11,7 @@ # add argparse arguments parser = argparse.ArgumentParser(description="Keyboard control for Isaac Lab Pick and Place.") +parser.add_argument("--num_envs", type=int, default=32, help="Number of environments to spawn.") # append AppLauncher cli args AppLauncher.add_app_launcher_args(parser) # parse the arguments @@ -59,11 +60,16 @@ class PickAndPlaceEnvCfg(DirectRLEnvCfg): action_space = 4 observation_space = 6 state_space = 0 - device = "cpu" - # Simulation cfg. Note that we are forcing the simulation to run on CPU. - # This is because the surface gripper API is only supported on CPU backend for now. - sim: SimulationCfg = SimulationCfg(dt=1 / 60, render_interval=decimation, device="cpu") + # Simulation cfg. Surface grippers are currently only supported on CPU. + # Surface grippers also require scene query support to function. + sim: SimulationCfg = SimulationCfg( + dt=1 / 60, + device="cpu", + render_interval=decimation, + use_fabric=True, + enable_scene_query_support=True, + ) debug_vis = True # robot @@ -136,8 +142,8 @@ def __init__(self, cfg: PickAndPlaceEnvCfg, render_mode: str | None = None, **kw self.joint_vel = self.pick_and_place.data.joint_vel # Buffers - self.go_to_cube = False - self.go_to_target = False + self.go_to_cube = torch.zeros(self.num_envs, dtype=torch.bool, device=self.device) + self.go_to_target = torch.zeros(self.num_envs, dtype=torch.bool, device=self.device) self.target_pos = torch.zeros((self.num_envs, 3), device=self.device, dtype=torch.float32) self.instant_controls = torch.zeros((self.num_envs, 3), device=self.device, dtype=torch.float32) self.permanent_controls = torch.zeros((self.num_envs, 1), device=self.device, dtype=torch.float32) @@ -173,35 +179,36 @@ def set_up_keyboard(self): print("Keyboard set up!") print("The simulation is ready for you to try it out!") print("Your goal is pick up the purple cube and to drop it on the red sphere!") - print("Use the following controls to interact with the simulation:") - print("Press the 'A' key to have the gripper track the cube position.") - print("Press the 'D' key to have the gripper track the target position") - print("Press the 'W' or 'S' keys to move the gantry UP or DOWN respectively") - print("Press 'Q' or 'E' to OPEN or CLOSE the gripper respectively") + print(f"Number of environments: {self.num_envs}") + print("Use the following controls to interact with ALL environments simultaneously:") + print("Press the 'A' key to have all grippers track the cube position.") + print("Press the 'D' key to have all grippers track the target position") + print("Press the 'W' or 'S' keys to move all gantries UP or DOWN respectively") + print("Press 'Q' or 'E' to OPEN or CLOSE all 
grippers respectively") def _on_keyboard_event(self, event): """Checks for a keyboard event and assign the corresponding command control depending on key pressed.""" if event.type == carb.input.KeyboardEventType.KEY_PRESS: - # Logic on key press + # Logic on key press - apply to ALL environments if event.input.name == self._auto_aim_target: - self.go_to_target = True - self.go_to_cube = False + self.go_to_target[:] = True + self.go_to_cube[:] = False if event.input.name == self._auto_aim_cube: - self.go_to_cube = True - self.go_to_target = False + self.go_to_cube[:] = True + self.go_to_target[:] = False if event.input.name in self._instant_key_controls: - self.go_to_cube = False - self.go_to_target = False - self.instant_controls[0] = self._instant_key_controls[event.input.name] + self.go_to_cube[:] = False + self.go_to_target[:] = False + self.instant_controls[:] = self._instant_key_controls[event.input.name] if event.input.name in self._permanent_key_controls: - self.go_to_cube = False - self.go_to_target = False - self.permanent_controls[0] = self._permanent_key_controls[event.input.name] - # On key release, the robot stops moving + self.go_to_cube[:] = False + self.go_to_target[:] = False + self.permanent_controls[:] = self._permanent_key_controls[event.input.name] + # On key release, all robots stop moving elif event.type == carb.input.KeyboardEventType.KEY_RELEASE: - self.go_to_cube = False - self.go_to_target = False - self.instant_controls[0] = self._instant_key_controls["ZEROS"] + self.go_to_cube[:] = False + self.go_to_target[:] = False + self.instant_controls[:] = self._instant_key_controls["ZEROS"] def _setup_scene(self): self.pick_and_place = Articulation(self.cfg.robot_cfg) @@ -225,28 +232,30 @@ def _pre_physics_step(self, actions: torch.Tensor) -> None: def _apply_action(self) -> None: # We use the keyboard outputs as an action. 
- if self.go_to_cube: + # Process each environment independently + if self.go_to_cube.any(): # Effort based proportional controller to track the cube position - head_pos_x = self.pick_and_place.data.joint_pos[:, self._x_dof_idx[0]] - head_pos_y = self.pick_and_place.data.joint_pos[:, self._y_dof_idx[0]] - cube_pos_x = self.cube.data.root_pos_w[:, 0] - self.scene.env_origins[:, 0] - cube_pos_y = self.cube.data.root_pos_w[:, 1] - self.scene.env_origins[:, 1] + head_pos_x = self.pick_and_place.data.joint_pos[self.go_to_cube, self._x_dof_idx[0]] + head_pos_y = self.pick_and_place.data.joint_pos[self.go_to_cube, self._y_dof_idx[0]] + cube_pos_x = self.cube.data.root_pos_w[self.go_to_cube, 0] - self.scene.env_origins[self.go_to_cube, 0] + cube_pos_y = self.cube.data.root_pos_w[self.go_to_cube, 1] - self.scene.env_origins[self.go_to_cube, 1] d_cube_robot_x = cube_pos_x - head_pos_x d_cube_robot_y = cube_pos_y - head_pos_y - self.instant_controls[0] = torch.tensor( + self.instant_controls[self.go_to_cube] = torch.tensor( [d_cube_robot_x * 5.0, d_cube_robot_y * 5.0, 0.0], device=self.device ) - elif self.go_to_target: + elif self.go_to_target.any(): # Effort based proportional controller to track the target position - head_pos_x = self.pick_and_place.data.joint_pos[:, self._x_dof_idx[0]] - head_pos_y = self.pick_and_place.data.joint_pos[:, self._y_dof_idx[0]] - target_pos_x = self.target_pos[:, 0] - target_pos_y = self.target_pos[:, 1] + head_pos_x = self.pick_and_place.data.joint_pos[self.go_to_target, self._x_dof_idx[0]] + head_pos_y = self.pick_and_place.data.joint_pos[self.go_to_target, self._y_dof_idx[0]] + target_pos_x = self.target_pos[self.go_to_target, 0] + target_pos_y = self.target_pos[self.go_to_target, 1] d_target_robot_x = target_pos_x - head_pos_x d_target_robot_y = target_pos_y - head_pos_y - self.instant_controls[0] = torch.tensor( + self.instant_controls[self.go_to_target] = torch.tensor( [d_target_robot_x * 5.0, d_target_robot_y * 5.0, 0.0], device=self.device ) + # Set the joint effort targets for the picker self.pick_and_place.set_joint_effort_target( self.instant_controls[:, 0].unsqueeze(dim=1), joint_ids=self._x_dof_idx @@ -258,7 +267,7 @@ def _apply_action(self) -> None: self.permanent_controls[:, 0].unsqueeze(dim=1), joint_ids=self._z_dof_idx ) # Set the gripper command - self.gripper.set_grippers_command(self.instant_controls[:, 2].unsqueeze(dim=1)) + self.gripper.set_grippers_command(self.instant_controls[:, 2]) def _get_observations(self) -> dict: # Get the observations @@ -397,8 +406,11 @@ def _debug_vis_callback(self, event): def main(): """Main function.""" + # create environment configuration + env_cfg = PickAndPlaceEnvCfg() + env_cfg.scene.num_envs = args_cli.num_envs # create environment - pick_and_place = PickAndPlaceEnv(PickAndPlaceEnvCfg()) + pick_and_place = PickAndPlaceEnv(env_cfg) obs, _ = pick_and_place.reset() while simulation_app.is_running(): # check for selected robots @@ -409,4 +421,4 @@ def main(): if __name__ == "__main__": main() - simulation_app.close() + simulation_app.close() \ No newline at end of file diff --git a/scripts/reinforcement_learning/rl_games/play.py b/scripts/reinforcement_learning/rl_games/play.py index d6faec37316..135980e92c7 100644 --- a/scripts/reinforcement_learning/rl_games/play.py +++ b/scripts/reinforcement_learning/rl_games/play.py @@ -95,10 +95,6 @@ def main(env_cfg: ManagerBasedRLEnvCfg | DirectRLEnvCfg | DirectMARLEnvCfg, agen # override configurations with non-hydra CLI arguments env_cfg.scene.num_envs = 
args_cli.num_envs if args_cli.num_envs is not None else env_cfg.scene.num_envs env_cfg.sim.device = args_cli.device if args_cli.device is not None else env_cfg.sim.device - # update agent device to match simulation device - if args_cli.device is not None: - agent_cfg["params"]["config"]["device"] = args_cli.device - agent_cfg["params"]["config"]["device_name"] = args_cli.device # randomly sample a seed if seed = -1 if args_cli.seed == -1: diff --git a/scripts/reinforcement_learning/rl_games/train.py b/scripts/reinforcement_learning/rl_games/train.py index 634e5975676..d6900a3789f 100644 --- a/scripts/reinforcement_learning/rl_games/train.py +++ b/scripts/reinforcement_learning/rl_games/train.py @@ -102,11 +102,6 @@ def main(env_cfg: ManagerBasedRLEnvCfg | DirectRLEnvCfg | DirectMARLEnvCfg, agen "Please use GPU device (e.g., --device cuda) for distributed training." ) - # update agent device to match simulation device - if args_cli.device is not None: - agent_cfg["params"]["config"]["device"] = args_cli.device - agent_cfg["params"]["config"]["device_name"] = args_cli.device - # randomly sample a seed if seed = -1 if args_cli.seed == -1: args_cli.seed = random.randint(0, 10000) diff --git a/scripts/reinforcement_learning/rsl_rl/train.py b/scripts/reinforcement_learning/rsl_rl/train.py index 8b66feb28aa..ad739f4559a 100644 --- a/scripts/reinforcement_learning/rsl_rl/train.py +++ b/scripts/reinforcement_learning/rsl_rl/train.py @@ -182,7 +182,7 @@ def main(env_cfg: ManagerBasedRLEnvCfg | DirectRLEnvCfg | DirectMARLEnvCfg, agen env = gym.wrappers.RecordVideo(env, **video_kwargs) # wrap around environment for rsl-rl - env = RslRlVecEnvWrapper(env, clip_actions=agent_cfg.clip_actions) + env = RslRlVecEnvWrapper(env, clip_actions=agent_cfg.clip_actions, rl_device=agent_cfg.device) # create runner from rsl-rl if agent_cfg.class_name == "OnPolicyRunner": diff --git a/source/isaaclab_rl/config/extension.toml b/source/isaaclab_rl/config/extension.toml index 0e2f31470b6..35ce2649060 100644 --- a/source/isaaclab_rl/config/extension.toml +++ b/source/isaaclab_rl/config/extension.toml @@ -1,7 +1,7 @@ [package] # Note: Semantic Versioning is used: https://semver.org/ -version = "0.4.4" +version = "0.5.0" # Description title = "Isaac Lab RL" diff --git a/source/isaaclab_rl/docs/CHANGELOG.rst b/source/isaaclab_rl/docs/CHANGELOG.rst index e3d44a08d96..3698bf770e0 100644 --- a/source/isaaclab_rl/docs/CHANGELOG.rst +++ b/source/isaaclab_rl/docs/CHANGELOG.rst @@ -1,6 +1,16 @@ Changelog --------- +0.5.0 (2025-11-10) +~~~~~~~~~~~~~~~~~~ + +Added +^^^^^ + +* Added support for decoupling RL device from simulation device for the RL wrappers. + This allows users to run simulation on one device (e.g., CPU) while running RL training/inference on another device. + + 0.4.4 (2025-10-15) ~~~~~~~~~~~~~~~~~~ diff --git a/source/isaaclab_rl/isaaclab_rl/rl_games/rl_games.py b/source/isaaclab_rl/isaaclab_rl/rl_games/rl_games.py index 8c448c172ac..22df1e8bef4 100644 --- a/source/isaaclab_rl/isaaclab_rl/rl_games/rl_games.py +++ b/source/isaaclab_rl/isaaclab_rl/rl_games/rl_games.py @@ -319,6 +319,10 @@ def _process_obs(self, obs_dict: VecEnvObs) -> dict[str, torch.Tensor] | dict[st - ``"obs"``: either a concatenated tensor (``concate_obs_group=True``) or a Dict of group tensors. - ``"states"`` (optional): same structure as above when state groups are configured; omitted otherwise. 
""" + # move observations to RL device if different from sim device + if self._rl_device != self._sim_device: + obs_dict = {key: obs.to(device=self._rl_device) for key, obs in obs_dict.items()} + # clip the observations for key, obs in obs_dict.items(): obs_dict[key] = torch.clamp(obs, -self._clip_obs, self._clip_obs) diff --git a/source/isaaclab_rl/isaaclab_rl/rsl_rl/vecenv_wrapper.py b/source/isaaclab_rl/isaaclab_rl/rsl_rl/vecenv_wrapper.py index 73ceae04693..10377f84bd6 100644 --- a/source/isaaclab_rl/isaaclab_rl/rsl_rl/vecenv_wrapper.py +++ b/source/isaaclab_rl/isaaclab_rl/rsl_rl/vecenv_wrapper.py @@ -24,7 +24,7 @@ class RslRlVecEnvWrapper(VecEnv): https://github.com/leggedrobotics/rsl_rl/blob/master/rsl_rl/env/vec_env.py """ - def __init__(self, env: ManagerBasedRLEnv | DirectRLEnv, clip_actions: float | None = None): + def __init__(self, env: ManagerBasedRLEnv | DirectRLEnv, clip_actions: float | None = None, rl_device: str | None = None): """Initializes the wrapper. Note: @@ -33,6 +33,8 @@ def __init__(self, env: ManagerBasedRLEnv | DirectRLEnv, clip_actions: float | N Args: env: The environment to wrap around. clip_actions: The clipping value for actions. If ``None``, then no clipping is done. + rl_device: The device for RL agent/policy. If ``None``, uses the environment device. + This allows running the RL agent on a different device than the environment. Raises: ValueError: When the environment is not an instance of :class:`ManagerBasedRLEnv` or :class:`DirectRLEnv`. @@ -49,11 +51,21 @@ def __init__(self, env: ManagerBasedRLEnv | DirectRLEnv, clip_actions: float | N self.env = env self.clip_actions = clip_actions + # store the RL device (where policy/training happens) + # this may be different from env.device (where task buffers are) + if rl_device is None: + self.rl_device = self.unwrapped.device + else: + self.rl_device = rl_device + # store information required by wrapper self.num_envs = self.unwrapped.num_envs - self.device = self.unwrapped.device + self.device = self.rl_device self.max_episode_length = self.unwrapped.max_episode_length + # track the environment device separately + self.env_device = self.unwrapped.device + # obtain dimensions of the environment if hasattr(self.unwrapped, "action_manager"): self.num_actions = self.unwrapped.action_manager.total_action_dim @@ -139,6 +151,9 @@ def seed(self, seed: int = -1) -> int: # noqa: D102 def reset(self) -> tuple[TensorDict, dict]: # noqa: D102 # reset the environment obs_dict, extras = self.env.reset() + # move observations to RL device if different from env device + if self.rl_device != self.env_device: + obs_dict = {k: v.to(self.rl_device) if isinstance(v, torch.Tensor) else v for k, v in obs_dict.items()} return TensorDict(obs_dict, batch_size=[self.num_envs]), extras def get_observations(self) -> TensorDict: @@ -147,14 +162,26 @@ def get_observations(self) -> TensorDict: obs_dict = self.unwrapped.observation_manager.compute() else: obs_dict = self.unwrapped._get_observations() + # move observations to RL device if different from env device + if self.rl_device != self.env_device: + obs_dict = {k: v.to(self.rl_device) if isinstance(v, torch.Tensor) else v for k, v in obs_dict.items()} return TensorDict(obs_dict, batch_size=[self.num_envs]) def step(self, actions: torch.Tensor) -> tuple[TensorDict, torch.Tensor, torch.Tensor, dict]: + # move actions to env device if coming from different RL device + if self.rl_device != self.env_device: + actions = actions.to(self.env_device) # clip actions if self.clip_actions is not 
None: actions = torch.clamp(actions, -self.clip_actions, self.clip_actions) # record step information obs_dict, rew, terminated, truncated, extras = self.env.step(actions) + # move outputs to RL device if different from env device + if self.rl_device != self.env_device: + obs_dict = {k: v.to(self.rl_device) if isinstance(v, torch.Tensor) else v for k, v in obs_dict.items()} + rew = rew.to(self.rl_device) + terminated = terminated.to(self.rl_device) + truncated = truncated.to(self.rl_device) # compute dones for compatibility with RSL-RL dones = (terminated | truncated).to(dtype=torch.long) # move time out information to the extras dict From ee5a07eeeb7ecf97dcad455dce4e837eaf2712fa Mon Sep 17 00:00:00 2001 From: Kelly Guo Date: Mon, 10 Nov 2025 16:51:16 -0800 Subject: [PATCH 2/7] add test case --- scripts/demos/pick_and_place.py | 12 +- .../isaaclab_rl/rsl_rl/vecenv_wrapper.py | 4 +- .../test/test_rl_device_separation.py | 680 ++++++++++++++++++ 3 files changed, 689 insertions(+), 7 deletions(-) create mode 100644 source/isaaclab_tasks/test/test_rl_device_separation.py diff --git a/scripts/demos/pick_and_place.py b/scripts/demos/pick_and_place.py index ae3c1bd420d..249059c61f4 100644 --- a/scripts/demos/pick_and_place.py +++ b/scripts/demos/pick_and_place.py @@ -241,8 +241,8 @@ def _apply_action(self) -> None: cube_pos_y = self.cube.data.root_pos_w[self.go_to_cube, 1] - self.scene.env_origins[self.go_to_cube, 1] d_cube_robot_x = cube_pos_x - head_pos_x d_cube_robot_y = cube_pos_y - head_pos_y - self.instant_controls[self.go_to_cube] = torch.tensor( - [d_cube_robot_x * 5.0, d_cube_robot_y * 5.0, 0.0], device=self.device + self.instant_controls[self.go_to_cube] = torch.stack( + [d_cube_robot_x * 5.0, d_cube_robot_y * 5.0, torch.zeros_like(d_cube_robot_x)], dim=1 ) elif self.go_to_target.any(): # Effort based proportional controller to track the target position @@ -252,10 +252,10 @@ def _apply_action(self) -> None: target_pos_y = self.target_pos[self.go_to_target, 1] d_target_robot_x = target_pos_x - head_pos_x d_target_robot_y = target_pos_y - head_pos_y - self.instant_controls[self.go_to_target] = torch.tensor( - [d_target_robot_x * 5.0, d_target_robot_y * 5.0, 0.0], device=self.device + self.instant_controls[self.go_to_target] = torch.stack( + [d_target_robot_x * 5.0, d_target_robot_y * 5.0, torch.zeros_like(d_target_robot_x)], dim=1 ) - + # Set the joint effort targets for the picker self.pick_and_place.set_joint_effort_target( self.instant_controls[:, 0].unsqueeze(dim=1), joint_ids=self._x_dof_idx @@ -421,4 +421,4 @@ def main(): if __name__ == "__main__": main() - simulation_app.close() \ No newline at end of file + simulation_app.close() diff --git a/source/isaaclab_rl/isaaclab_rl/rsl_rl/vecenv_wrapper.py b/source/isaaclab_rl/isaaclab_rl/rsl_rl/vecenv_wrapper.py index 10377f84bd6..784892f7e37 100644 --- a/source/isaaclab_rl/isaaclab_rl/rsl_rl/vecenv_wrapper.py +++ b/source/isaaclab_rl/isaaclab_rl/rsl_rl/vecenv_wrapper.py @@ -24,7 +24,9 @@ class RslRlVecEnvWrapper(VecEnv): https://github.com/leggedrobotics/rsl_rl/blob/master/rsl_rl/env/vec_env.py """ - def __init__(self, env: ManagerBasedRLEnv | DirectRLEnv, clip_actions: float | None = None, rl_device: str | None = None): + def __init__( + self, env: ManagerBasedRLEnv | DirectRLEnv, clip_actions: float | None = None, rl_device: str | None = None + ): """Initializes the wrapper. 
Note: diff --git a/source/isaaclab_tasks/test/test_rl_device_separation.py b/source/isaaclab_tasks/test/test_rl_device_separation.py new file mode 100644 index 00000000000..ec3f7060877 --- /dev/null +++ b/source/isaaclab_tasks/test/test_rl_device_separation.py @@ -0,0 +1,680 @@ +# Copyright (c) 2022-2025, The Isaac Lab Project Developers (https://github.com/isaac-sim/IsaacLab/blob/main/CONTRIBUTORS.md). +# All rights reserved. +# +# SPDX-License-Identifier: BSD-3-Clause + +"""Test RL device separation across all supported RL libraries. + +This test verifies that RL library wrappers correctly handle device transfers when the +simulation device differs from the RL training device. + +Device Architecture: + 1. sim_device: Where physics simulation runs and environment buffers live + 2. rl_device: Where policy networks and training computations occur + +Test Scenarios: + - GPU simulation + GPU RL: Same device (no transfers needed, optimal performance) + - GPU simulation + CPU RL: Cross-device transfers (wrapper handles transfers) + - CPU simulation + CPU RL: CPU-only operation + +Each test verifies the wrapper correctly: + 1. Unwrapped env: operates entirely on sim_device + 2. Wrapper: accepts actions on rl_device (where policy generates them) + 3. Wrapper: internally transfers actions from rl_device → sim_device for env.step() + 4. Wrapper: transfers outputs from sim_device → rl_device (for policy to use) + +Tested Libraries: + - RSL-RL: TensorDict observations, explicit rl_device parameter + * Transfers observations and rewards to rl_device + - RL Games: Dict observations, explicit rl_device parameter + * Transfers observations and rewards to rl_device + - Stable-Baselines3: Numpy arrays (CPU-only by design) + * Always converts to/from numpy on CPU + - skrl: Dict observations, uses skrl.config.torch.device for RL device + * Keeps observations on sim_device (policy handles transfer) + * Only transfers actions from rl_device to sim_device + +IMPORTANT: Due to Isaac Sim limitations, only ONE test can be run per pytest invocation. +Run tests individually: + pytest test_rl_device_separation.py::test_rsl_rl_device_separation_gpu_to_gpu -v -s + pytest test_rl_device_separation.py::test_rsl_rl_device_separation_gpu_to_cpu -v -s + pytest test_rl_device_separation.py::test_rl_games_device_separation_gpu_to_gpu -v -s + ... +""" + +from isaaclab.app import AppLauncher + +# launch the simulator +app_launcher = AppLauncher(headless=True) +simulation_app = app_launcher.app + +"""Rest everything follows.""" + +import gymnasium as gym +import torch + +import carb +import omni.usd +import pytest + +import isaaclab_tasks # noqa: F401 +from isaaclab_tasks.utils.parse_cfg import parse_env_cfg + +# Test environment - use Cartpole as it's simple and fast +TEST_ENV = "Isaac-Cartpole-v0" +NUM_ENVS = 4 + + +def _test_rsl_rl_device_separation(sim_device: str, rl_device: str): + """Helper function to test RSL-RL with specified device configuration. 
+ + Args: + sim_device: Device for simulation (e.g., "cuda:0", "cpu") + rl_device: Device for RL agent (e.g., "cuda:0", "cpu") + """ + from tensordict import TensorDict + + from isaaclab_rl.rsl_rl import RslRlVecEnvWrapper + + print(f"\n{'=' * 60}") + print(f">>> Testing RSL-RL with sim_device={sim_device}, rl_device={rl_device}") + print(f"{'=' * 60}") + + # Create a new stage + omni.usd.get_context().new_stage() + # Reset the rtx sensors carb setting to False + carb.settings.get_settings().set_bool("/isaaclab/render/rtx_sensors", False) + + try: + # Parse environment config + print(" [1/6] Parsing environment config...") + env_cfg = parse_env_cfg(TEST_ENV, device=sim_device, num_envs=NUM_ENVS) + + # Create environment + print(" [2/6] Creating environment (may take 5-10s)...") + env = gym.make(TEST_ENV, cfg=env_cfg) + print(" [2/6] Environment created successfully") + except Exception as e: + # Try to close environment on exception + if "env" in locals() and hasattr(env, "_is_closed"): + env.close() + else: + if hasattr(e, "obj") and hasattr(e.obj, "_is_closed"): + e.obj.close() + pytest.fail(f"Failed to set-up the environment for task {TEST_ENV}. Error: {e}") + + # Disable control on stop + env.unwrapped.sim._app_control_on_stop_handle = None + + # Verify environment device + print(" [3/6] Verifying environment device...") + assert ( + env.unwrapped.device == sim_device + ), f"Environment device mismatch: expected {sim_device}, got {env.unwrapped.device}" + + # Test environment directly before wrapping to verify it returns data on sim device + print(" [3/6] Testing unwrapped environment returns data on sim_device...") + obs_dict, _ = env.reset() + for key, value in obs_dict.items(): + if isinstance(value, torch.Tensor): + assert ( + value.device.type == torch.device(sim_device).type + ), f"Unwrapped env obs '{key}' should be on {sim_device}, got {value.device}" + + # Step unwrapped environment to verify outputs are on sim device + action_space = env.unwrapped.single_action_space + test_action = torch.zeros(NUM_ENVS, action_space.shape[0], device=sim_device) + obs_dict, rew, term, trunc, extras = env.step(test_action) + assert ( + rew.device.type == torch.device(sim_device).type + ), f"Unwrapped env rewards should be on {sim_device}, got {rew.device}" + assert ( + term.device.type == torch.device(sim_device).type + ), f"Unwrapped env terminated should be on {sim_device}, got {term.device}" + print(f" [3/6] Verified: Unwrapped environment returns data on {sim_device}") + + # Create RSL-RL wrapper with RL device + print(" [4/6] Creating RSL-RL wrapper...") + env = RslRlVecEnvWrapper(env, rl_device=rl_device) + print(f" [4/6] Wrapper created (env_device={env.env_device}, rl_device={env.rl_device})") + + # Verify devices + assert env.env_device == sim_device, f"Wrapper env_device should be {sim_device}" + assert env.rl_device == rl_device, f"Wrapper RL device should be {rl_device}" + assert env.device == rl_device, f"Wrapper device property should be {rl_device}" + + # Reset and step to test device transfers + print(" [5/6] Testing reset and step operations...") + obs, extras = env.reset() + print(" [5/6] Reset completed") + + # Verify observations are on RL device (RSL-RL returns TensorDict) + assert isinstance(obs, TensorDict), f"Expected TensorDict, got {type(obs)}" + for key, value in obs.items(): + if isinstance(value, torch.Tensor): + assert ( + value.device.type == torch.device(rl_device).type + ), f"Observation '{key}' should be on {rl_device}, got {value.device}" + + # Sample random 
action on RL device (simulating policy output) + # RSL-RL: action_space.shape is (num_envs, action_dim) + action = 2 * torch.rand(env.action_space.shape, device=rl_device) - 1 + print(f" [5/6] Action created on rl_device: {action.device}, shape: {action.shape}") + + # Verify action is on RL device before calling step + assert ( + action.device.type == torch.device(rl_device).type + ), f"Action should be on {rl_device} before step, got {action.device}" + + # Step environment - wrapper should: + # 1. Accept action on rl_device + # 2. Transfer action from rl_device to sim_device internally + # 3. Call unwrapped env.step() with action on sim_device + # 4. Transfer outputs from sim_device to rl_device + obs, reward, dones, extras = env.step(action) + print(" [5/6] Step completed - wrapper handled device transfers") + + # Verify all outputs are on RL device (wrapper transferred from sim_device) + print(" [6/6] Verifying device transfers...") + assert isinstance(obs, TensorDict), f"Expected TensorDict, got {type(obs)}" + for key, value in obs.items(): + if isinstance(value, torch.Tensor): + assert ( + value.device.type == torch.device(rl_device).type + ), f"Step observation '{key}' should be on {rl_device}, got {value.device}" + assert reward.device.type == torch.device(rl_device).type, f"Rewards should be on {rl_device}, got {reward.device}" + assert dones.device.type == torch.device(rl_device).type, f"Dones should be on {rl_device}, got {dones.device}" + + # Cleanup + print(" [6/6] Cleaning up environment...") + env.close() + print(f"✓ RSL-RL test PASSED for sim_device={sim_device}, rl_device={rl_device}") + print(" Wrapper device transfer verified:") + print(f" 1. Unwrapped env: expects actions on {sim_device}, returns data on {sim_device}") + print(f" 2. Wrapper: accepts actions on {rl_device} (from policy)") + print(f" 3. Wrapper: internally transfers actions to {sim_device} for env.step()") + print(f" 4. Wrapper: transfers outputs from {sim_device} to {rl_device} (for policy)") + print("-" * 80) + + +def _test_rl_games_device_separation(sim_device: str, rl_device: str): + """Helper function to test RL Games with specified device configuration. + + Args: + sim_device: Device for simulation (e.g., "cuda:0", "cpu") + rl_device: Device for RL agent (e.g., "cuda:0", "cpu") + """ + from isaaclab_rl.rl_games import RlGamesVecEnvWrapper + + print(f"\n{'=' * 60}") + print(f">>> Testing RL Games with sim_device={sim_device}, rl_device={rl_device}") + print(f"{'=' * 60}") + + # Create a new stage + omni.usd.get_context().new_stage() + # Reset the rtx sensors carb setting to False + carb.settings.get_settings().set_bool("/isaaclab/render/rtx_sensors", False) + + try: + # Parse environment config + print(" [1/5] Parsing environment config...") + env_cfg = parse_env_cfg(TEST_ENV, device=sim_device, num_envs=NUM_ENVS) + + # Create environment + print(" [2/5] Creating environment (may take 5-10s)...") + env = gym.make(TEST_ENV, cfg=env_cfg) + print(" [2/5] Environment created successfully") + except Exception as e: + # Try to close environment on exception + if "env" in locals() and hasattr(env, "_is_closed"): + env.close() + else: + if hasattr(e, "obj") and hasattr(e.obj, "_is_closed"): + e.obj.close() + pytest.fail(f"Failed to set-up the environment for task {TEST_ENV}. 
Error: {e}") + + # Disable control on stop + env.unwrapped.sim._app_control_on_stop_handle = None + + # Verify environment device + print(" [3/5] Verifying environment device...") + assert ( + env.unwrapped.device == sim_device + ), f"Environment device mismatch: expected {sim_device}, got {env.unwrapped.device}" + + # Test environment directly before wrapping to verify it returns data on sim device + print(" [3/5] Testing unwrapped environment returns data on sim_device...") + obs_dict, _ = env.reset() + for key, value in obs_dict.items(): + if isinstance(value, torch.Tensor): + assert ( + value.device.type == torch.device(sim_device).type + ), f"Unwrapped env obs '{key}' should be on {sim_device}, got {value.device}" + + # Step unwrapped environment to verify outputs are on sim device + action_space = env.unwrapped.single_action_space + test_action = torch.zeros(NUM_ENVS, action_space.shape[0], device=sim_device) + obs_dict, rew, term, trunc, extras = env.step(test_action) + assert ( + rew.device.type == torch.device(sim_device).type + ), f"Unwrapped env rewards should be on {sim_device}, got {rew.device}" + assert ( + term.device.type == torch.device(sim_device).type + ), f"Unwrapped env terminated should be on {sim_device}, got {term.device}" + print(f" [3/5] Verified: Unwrapped environment returns data on {sim_device}") + + # Create RL Games wrapper with RL device + print(" [3/5] Creating RL Games wrapper...") + env = RlGamesVecEnvWrapper(env, rl_device=rl_device, clip_obs=10.0, clip_actions=1.0) + + # Reset and step to test device transfers + print(" [4/5] Testing reset and step operations...") + obs = env.reset() + print(" [4/5] Reset completed") + + # Verify observations are on RL device + if isinstance(obs, dict): + for key, value in obs.items(): + assert ( + value.device.type == torch.device(rl_device).type + ), f"Observation '{key}' should be on {rl_device}, got {value.device}" + else: + assert ( + obs.device.type == torch.device(rl_device).type + ), f"Observation should be on {rl_device}, got {obs.device}" + + # Sample random action on RL device (simulating policy output) + action = 2 * torch.rand(NUM_ENVS, *env.action_space.shape, device=rl_device) - 1 + print(f" [4/5] Action created on rl_device: {action.device}, shape: {action.shape}") + + # Verify action is on RL device before calling step + assert ( + action.device.type == torch.device(rl_device).type + ), f"Action should be on {rl_device} before step, got {action.device}" + + # Step environment - wrapper should: + # 1. Accept action on rl_device + # 2. Transfer action from rl_device to sim_device internally + # 3. Call unwrapped env.step() with action on sim_device + # 4. 
Transfer outputs from sim_device to rl_device + obs, reward, dones, info = env.step(action) + print(" [4/5] Step completed - wrapper handled device transfers") + + # Verify all outputs are on RL device (wrapper transferred from sim_device) + print(" [5/5] Verifying device transfers...") + # RL Games returns flat tensor for observations + if isinstance(obs, dict): + for key, value in obs.items(): + assert ( + value.device.type == torch.device(rl_device).type + ), f"Observation '{key}' should be on {rl_device}, got {value.device}" + else: + assert ( + obs.device.type == torch.device(rl_device).type + ), f"Observations should be on {rl_device}, got {obs.device}" + assert reward.device.type == torch.device(rl_device).type, f"Rewards should be on {rl_device}, got {reward.device}" + assert dones.device.type == torch.device(rl_device).type, f"Dones should be on {rl_device}, got {dones.device}" + + # Cleanup + print(" [5/5] Cleaning up environment...") + env.close() + print(f"✓ RL Games test PASSED for sim_device={sim_device}, rl_device={rl_device}") + print(" Wrapper device transfer verified:") + print(f" 1. Unwrapped env: expects actions on {sim_device}, returns data on {sim_device}") + print(f" 2. Wrapper: accepts actions on {rl_device} (from policy)") + print(f" 3. Wrapper: internally transfers actions to {sim_device} for env.step()") + print(f" 4. Wrapper: transfers outputs from {sim_device} to {rl_device} (for policy)") + print("-" * 80) + + +def _test_sb3_device_separation(sim_device: str): + """Helper function to test Stable-Baselines3 with specified device configuration. + + Note: SB3 always converts to CPU/numpy, so we don't test rl_device parameter. + + Args: + sim_device: Device for simulation (e.g., "cuda:0", "cpu") + """ + import numpy as np + + from isaaclab_rl.sb3 import Sb3VecEnvWrapper + + print(f"\n{'=' * 60}") + print(f">>> Testing SB3 with sim_device={sim_device}") + print(f"{'=' * 60}") + + # Create a new stage + omni.usd.get_context().new_stage() + # Reset the rtx sensors carb setting to False + carb.settings.get_settings().set_bool("/isaaclab/render/rtx_sensors", False) + + try: + # Parse environment config + print(" [1/5] Parsing environment config...") + env_cfg = parse_env_cfg(TEST_ENV, device=sim_device, num_envs=NUM_ENVS) + + # Create environment + print(" [2/5] Creating environment (may take 5-10s)...") + env = gym.make(TEST_ENV, cfg=env_cfg) + print(" [2/5] Environment created successfully") + except Exception as e: + # Try to close environment on exception + if "env" in locals() and hasattr(env, "_is_closed"): + env.close() + else: + if hasattr(e, "obj") and hasattr(e.obj, "_is_closed"): + e.obj.close() + pytest.fail(f"Failed to set-up the environment for task {TEST_ENV}. 
Error: {e}") + + # Disable control on stop + env.unwrapped.sim._app_control_on_stop_handle = None + + # Verify environment device + print(" [3/5] Verifying environment device...") + assert ( + env.unwrapped.device == sim_device + ), f"Environment device mismatch: expected {sim_device}, got {env.unwrapped.device}" + + # Test environment directly before wrapping to verify it returns data on sim device + print(" [3/5] Testing unwrapped environment returns data on sim_device...") + obs_dict, _ = env.reset() + for key, value in obs_dict.items(): + if isinstance(value, torch.Tensor): + assert ( + value.device.type == torch.device(sim_device).type + ), f"Unwrapped env obs '{key}' should be on {sim_device}, got {value.device}" + print(f" [3/5] Verified: Unwrapped environment returns data on {sim_device}") + + # Create SB3 wrapper (always converts to numpy/CPU) + print(" [3/5] Creating SB3 wrapper...") + env = Sb3VecEnvWrapper(env) + + # Reset and step to test device transfers + print(" [4/5] Testing reset and step operations...") + obs = env.reset() + print(" [4/5] Reset completed") + + # SB3 observations should always be numpy arrays (on CPU) + assert isinstance(obs, np.ndarray), f"SB3 observations should be numpy arrays, got {type(obs)}" + + # Sample random action (SB3 uses numpy) + action = 2 * np.random.rand(env.num_envs, *env.action_space.shape) - 1 + assert isinstance(action, np.ndarray), f"Action should be numpy array, got {type(action)}" + print(f" [4/5] Action sampled (numpy array), shape: {action.shape}") + + # Step environment - wrapper should: + # 1. Convert numpy action to torch tensor on sim_device internally + # 2. Call unwrapped env.step() with action on sim_device + # 3. Convert outputs from sim_device tensors to numpy arrays + obs, reward, done, info = env.step(action) + print(" [4/5] Step completed, outputs converted to numpy") + + # Verify all outputs are numpy arrays (wrapper transferred and converted) + print(" [5/5] Verifying numpy conversions...") + assert isinstance(obs, np.ndarray), f"Observations should be numpy arrays, got {type(obs)}" + assert isinstance(reward, np.ndarray), f"Rewards should be numpy arrays, got {type(reward)}" + assert isinstance(done, np.ndarray), f"Dones should be numpy arrays, got {type(done)}" + + # Cleanup + print(" [5/5] Cleaning up environment...") + env.close() + print(f"✓ SB3 test PASSED for sim_device={sim_device}") + print(" Wrapper device transfer verified:") + print(f" 1. Unwrapped env: expects actions on {sim_device}, returns data on {sim_device}") + print(" 2. Wrapper: accepts numpy arrays (from policy on CPU)") + print(f" 3. Wrapper: internally converts to tensors on {sim_device} for env.step()") + print(f" 4. Wrapper: converts outputs from {sim_device} tensors to numpy arrays (for policy)") + print("-" * 80) + + +def _test_skrl_device_separation(sim_device: str, rl_device: str): + """Helper function to test skrl with specified device configuration. + + Note: skrl uses skrl.config.torch.device for device configuration. + This can be set via agent_cfg["device"] for consistency with other libraries. 
+ + Args: + sim_device: Device for simulation (e.g., "cuda:0", "cpu") + rl_device: Device for RL agent (e.g., "cuda:0", "cpu") - set via skrl.config.torch.device + """ + try: + import skrl + from skrl.envs.wrappers.torch import wrap_env + except ImportError: + pytest.skip("skrl not installed") + + print(f"\n{'=' * 60}") + print(f">>> Testing skrl with sim_device={sim_device}, rl_device={rl_device}") + print(f" Using skrl.config.torch.device = {rl_device}") + print(f"{'=' * 60}") + + # Create agent config with device parameter (for demonstration/consistency) + agent_cfg = {"device": rl_device} + + # Configure skrl device (can be set from agent_cfg for consistency with other libraries) + if "device" in agent_cfg: + skrl.config.torch.device = torch.device(agent_cfg["device"]) + else: + skrl.config.torch.device = torch.device(rl_device) + + # Create a new stage + omni.usd.get_context().new_stage() + # Reset the rtx sensors carb setting to False + carb.settings.get_settings().set_bool("/isaaclab/render/rtx_sensors", False) + + try: + # Parse environment config + print(" [1/6] Parsing environment config...") + env_cfg = parse_env_cfg(TEST_ENV, device=sim_device, num_envs=NUM_ENVS) + + # Create environment + print(" [2/6] Creating environment (may take 5-10s)...") + env = gym.make(TEST_ENV, cfg=env_cfg) + print(" [2/6] Environment created successfully") + except Exception as e: + # Try to close environment on exception + if "env" in locals() and hasattr(env, "_is_closed"): + env.close() + else: + if hasattr(e, "obj") and hasattr(e.obj, "_is_closed"): + e.obj.close() + pytest.fail(f"Failed to set-up the environment for task {TEST_ENV}. Error: {e}") + + # Disable control on stop + env.unwrapped.sim._app_control_on_stop_handle = None + + # Verify environment device + print(" [3/6] Verifying environment device...") + assert ( + env.unwrapped.device == sim_device + ), f"Environment device mismatch: expected {sim_device}, got {env.unwrapped.device}" + + # Test environment directly before wrapping to verify it returns data on sim device + print(" [3/6] Testing unwrapped environment returns data on sim_device...") + obs_dict, _ = env.reset() + for key, value in obs_dict.items(): + if isinstance(value, torch.Tensor): + assert ( + value.device.type == torch.device(sim_device).type + ), f"Unwrapped env obs '{key}' should be on {sim_device}, got {value.device}" + print(f" [3/6] Verified: Unwrapped environment returns data on {sim_device}") + + # Wrap with skrl (will use skrl.config.torch.device for policy) + print(" [3/6] Creating skrl wrapper...") + env = wrap_env(env, wrapper="isaaclab") + + # Reset to test basic functionality + print(" [4/6] Testing reset and step operations...") + obs, info = env.reset() + print(" [4/6] Reset completed") + + # Verify observations are tensors or dict + # skrl can return either dict or tensor depending on configuration + if isinstance(obs, dict): + assert isinstance(obs["policy"], torch.Tensor), f"Observations should be tensors, got {type(obs['policy'])}" + else: + assert isinstance(obs, torch.Tensor), f"Observations should be tensors, got {type(obs)}" + + # Sample random action on RL device (simulating policy output - skrl always uses GPU for training) + rl_device_obj = skrl.config.torch.device + action = 2 * torch.rand(NUM_ENVS, *env.action_space.shape, device=rl_device_obj) - 1 + print(f" [4/6] Action created on rl_device: {rl_device_obj}, shape: {action.shape}") + + # Verify action is on RL device before calling step + assert ( + action.device.type == 
rl_device_obj.type + ), f"Action should be on {rl_device_obj} before step, got {action.device}" + + # Step environment - wrapper should: + # 1. Accept action on rl_device + # 2. Transfer action from rl_device to sim_device internally + # 3. Call unwrapped env.step() with action on sim_device + # 4. Return outputs on sim_device (skrl policy handles device transfer) + print(" [5/6] Testing step with action on rl_device...") + transition = env.step(action) + print(" [5/6] Step completed - wrapper handled action device transfer") + + # Verify outputs are tensors + # Note: skrl wrapper returns outputs on sim_device, not rl_device + # The policy is responsible for transferring observations when needed + print(" [6/6] Verifying outputs are on sim_device (skrl behavior)...") + if len(transition) == 5: + obs, reward, terminated, truncated, info = transition + # Check observations (can be dict or tensor) + if isinstance(obs, dict): + assert isinstance(obs["policy"], torch.Tensor), "Observations should be tensors" + assert ( + obs["policy"].device.type == torch.device(sim_device).type + ), f"Observations should be on {sim_device}, got {obs['policy'].device}" + else: + assert isinstance(obs, torch.Tensor), "Observations should be tensors" + assert ( + obs.device.type == torch.device(sim_device).type + ), f"Observations should be on {sim_device}, got {obs.device}" + assert isinstance(reward, torch.Tensor), "Rewards should be tensors" + assert ( + reward.device.type == torch.device(sim_device).type + ), f"Rewards should be on {sim_device}, got {reward.device}" + assert isinstance(terminated, torch.Tensor), "Terminated should be tensors" + assert ( + terminated.device.type == torch.device(sim_device).type + ), f"Terminated should be on {sim_device}, got {terminated.device}" + assert isinstance(truncated, torch.Tensor), "Truncated should be tensors" + assert ( + truncated.device.type == torch.device(sim_device).type + ), f"Truncated should be on {sim_device}, got {truncated.device}" + elif len(transition) == 4: + obs, reward, done, info = transition + # Check observations (can be dict or tensor) + if isinstance(obs, dict): + assert isinstance(obs["policy"], torch.Tensor), "Observations should be tensors" + assert ( + obs["policy"].device.type == torch.device(sim_device).type + ), f"Observations should be on {sim_device}, got {obs['policy'].device}" + else: + assert isinstance(obs, torch.Tensor), "Observations should be tensors" + assert ( + obs.device.type == torch.device(sim_device).type + ), f"Observations should be on {sim_device}, got {obs.device}" + assert isinstance(reward, torch.Tensor), "Rewards should be tensors" + assert ( + reward.device.type == torch.device(sim_device).type + ), f"Rewards should be on {sim_device}, got {reward.device}" + assert isinstance(done, torch.Tensor), "Dones should be tensors" + assert done.device.type == torch.device(sim_device).type, f"Dones should be on {sim_device}, got {done.device}" + else: + pytest.fail(f"Unexpected number of return values from step: {len(transition)}") + + # Cleanup + print(" [6/6] Cleaning up environment...") + env.close() + print(f"✓ skrl test PASSED for sim_device={sim_device}, rl_device={rl_device_obj}") + print(" Wrapper device transfer verified (skrl-specific behavior):") + print(f" 1. Unwrapped env: expects actions on {sim_device}, returns data on {sim_device}") + print(f" 2. Wrapper: accepts actions on {rl_device_obj} (from policy)") + print(f" 3. Wrapper: internally transfers actions to {sim_device} for env.step()") + print(f" 4. 
Wrapper: returns outputs on {sim_device} (policy handles obs device transfer)") + print(" Note: Unlike RSL-RL/RL-Games, skrl keeps observations on sim_device") + print("-" * 80) + + +# ============================================================================ +# Test Functions +# ============================================================================ + + +def test_rsl_rl_device_separation_gpu_to_gpu(): + """Test RSL-RL with GPU simulation and GPU RL (default configuration).""" + try: + import isaaclab_rl.rsl_rl # noqa: F401 + except ImportError: + pytest.skip("RSL-RL not installed") + + _test_rsl_rl_device_separation(sim_device="cuda:0", rl_device="cuda:0") + + +def test_rsl_rl_device_separation_gpu_to_cpu(): + """Test RSL-RL with GPU simulation and CPU RL (cross-device transfer).""" + try: + import isaaclab_rl.rsl_rl # noqa: F401 + except ImportError: + pytest.skip("RSL-RL not installed") + + _test_rsl_rl_device_separation(sim_device="cuda:0", rl_device="cpu") + + +def test_rl_games_device_separation_gpu_to_gpu(): + """Test RL Games with GPU simulation and GPU RL (default configuration).""" + try: + import isaaclab_rl.rl_games # noqa: F401 + except ImportError: + pytest.skip("RL Games not installed") + + _test_rl_games_device_separation(sim_device="cuda:0", rl_device="cuda:0") + + +def test_rl_games_device_separation_gpu_to_cpu(): + """Test RL Games with GPU simulation and CPU RL (cross-device transfer).""" + try: + import isaaclab_rl.rl_games # noqa: F401 + except ImportError: + pytest.skip("RL Games not installed") + + _test_rl_games_device_separation(sim_device="cuda:0", rl_device="cpu") + + +def test_sb3_device_separation_gpu(): + """Test Stable-Baselines3 with GPU simulation. + + Note: SB3 always converts to CPU/numpy, so only GPU simulation is tested. + """ + try: + import isaaclab_rl.sb3 # noqa: F401 + except ImportError: + pytest.skip("Stable-Baselines3 not installed") + + _test_sb3_device_separation(sim_device="cuda:0") + + +def test_skrl_device_separation_gpu(): + """Test skrl with GPU simulation and GPU policy (matching devices).""" + try: + import skrl # noqa: F401 + except ImportError: + pytest.skip("skrl not installed") + + _test_skrl_device_separation(sim_device="cuda:0", rl_device="cuda:0") + + +def test_skrl_device_separation_cpu_to_gpu(): + """Test skrl with CPU simulation and GPU policy. + + Note: Uses skrl.config.torch.device to set the policy device to GPU + while the environment runs on CPU. 
+ """ + try: + import skrl # noqa: F401 + except ImportError: + pytest.skip("skrl not installed") + + _test_skrl_device_separation(sim_device="cpu", rl_device="cuda:0") From b40845ec9428df053e9fa552f3e90bacc6743f64 Mon Sep 17 00:00:00 2001 From: Kelly Guo Date: Mon, 10 Nov 2025 16:53:23 -0800 Subject: [PATCH 3/7] isolate pick_and_place change --- scripts/demos/pick_and_place.py | 98 +++++++++++++++------------------ 1 file changed, 43 insertions(+), 55 deletions(-) diff --git a/scripts/demos/pick_and_place.py b/scripts/demos/pick_and_place.py index 249059c61f4..bc6d35940f0 100644 --- a/scripts/demos/pick_and_place.py +++ b/scripts/demos/pick_and_place.py @@ -11,7 +11,6 @@ # add argparse arguments parser = argparse.ArgumentParser(description="Keyboard control for Isaac Lab Pick and Place.") -parser.add_argument("--num_envs", type=int, default=32, help="Number of environments to spawn.") # append AppLauncher cli args AppLauncher.add_app_launcher_args(parser) # parse the arguments @@ -60,16 +59,11 @@ class PickAndPlaceEnvCfg(DirectRLEnvCfg): action_space = 4 observation_space = 6 state_space = 0 + device = "cpu" - # Simulation cfg. Surface grippers are currently only supported on CPU. - # Surface grippers also require scene query support to function. - sim: SimulationCfg = SimulationCfg( - dt=1 / 60, - device="cpu", - render_interval=decimation, - use_fabric=True, - enable_scene_query_support=True, - ) + # Simulation cfg. Note that we are forcing the simulation to run on CPU. + # This is because the surface gripper API is only supported on CPU backend for now. + sim: SimulationCfg = SimulationCfg(dt=1 / 60, render_interval=decimation, device="cpu") debug_vis = True # robot @@ -142,8 +136,8 @@ def __init__(self, cfg: PickAndPlaceEnvCfg, render_mode: str | None = None, **kw self.joint_vel = self.pick_and_place.data.joint_vel # Buffers - self.go_to_cube = torch.zeros(self.num_envs, dtype=torch.bool, device=self.device) - self.go_to_target = torch.zeros(self.num_envs, dtype=torch.bool, device=self.device) + self.go_to_cube = False + self.go_to_target = False self.target_pos = torch.zeros((self.num_envs, 3), device=self.device, dtype=torch.float32) self.instant_controls = torch.zeros((self.num_envs, 3), device=self.device, dtype=torch.float32) self.permanent_controls = torch.zeros((self.num_envs, 1), device=self.device, dtype=torch.float32) @@ -179,36 +173,35 @@ def set_up_keyboard(self): print("Keyboard set up!") print("The simulation is ready for you to try it out!") print("Your goal is pick up the purple cube and to drop it on the red sphere!") - print(f"Number of environments: {self.num_envs}") - print("Use the following controls to interact with ALL environments simultaneously:") - print("Press the 'A' key to have all grippers track the cube position.") - print("Press the 'D' key to have all grippers track the target position") - print("Press the 'W' or 'S' keys to move all gantries UP or DOWN respectively") - print("Press 'Q' or 'E' to OPEN or CLOSE all grippers respectively") + print("Use the following controls to interact with the simulation:") + print("Press the 'A' key to have the gripper track the cube position.") + print("Press the 'D' key to have the gripper track the target position") + print("Press the 'W' or 'S' keys to move the gantry UP or DOWN respectively") + print("Press 'Q' or 'E' to OPEN or CLOSE the gripper respectively") def _on_keyboard_event(self, event): """Checks for a keyboard event and assign the corresponding command control depending on key pressed.""" if 
event.type == carb.input.KeyboardEventType.KEY_PRESS: - # Logic on key press - apply to ALL environments + # Logic on key press if event.input.name == self._auto_aim_target: - self.go_to_target[:] = True - self.go_to_cube[:] = False + self.go_to_target = True + self.go_to_cube = False if event.input.name == self._auto_aim_cube: - self.go_to_cube[:] = True - self.go_to_target[:] = False + self.go_to_cube = True + self.go_to_target = False if event.input.name in self._instant_key_controls: - self.go_to_cube[:] = False - self.go_to_target[:] = False - self.instant_controls[:] = self._instant_key_controls[event.input.name] + self.go_to_cube = False + self.go_to_target = False + self.instant_controls[0] = self._instant_key_controls[event.input.name] if event.input.name in self._permanent_key_controls: - self.go_to_cube[:] = False - self.go_to_target[:] = False - self.permanent_controls[:] = self._permanent_key_controls[event.input.name] - # On key release, all robots stop moving + self.go_to_cube = False + self.go_to_target = False + self.permanent_controls[0] = self._permanent_key_controls[event.input.name] + # On key release, the robot stops moving elif event.type == carb.input.KeyboardEventType.KEY_RELEASE: - self.go_to_cube[:] = False - self.go_to_target[:] = False - self.instant_controls[:] = self._instant_key_controls["ZEROS"] + self.go_to_cube = False + self.go_to_target = False + self.instant_controls[0] = self._instant_key_controls["ZEROS"] def _setup_scene(self): self.pick_and_place = Articulation(self.cfg.robot_cfg) @@ -232,30 +225,28 @@ def _pre_physics_step(self, actions: torch.Tensor) -> None: def _apply_action(self) -> None: # We use the keyboard outputs as an action. - # Process each environment independently - if self.go_to_cube.any(): + if self.go_to_cube: # Effort based proportional controller to track the cube position - head_pos_x = self.pick_and_place.data.joint_pos[self.go_to_cube, self._x_dof_idx[0]] - head_pos_y = self.pick_and_place.data.joint_pos[self.go_to_cube, self._y_dof_idx[0]] - cube_pos_x = self.cube.data.root_pos_w[self.go_to_cube, 0] - self.scene.env_origins[self.go_to_cube, 0] - cube_pos_y = self.cube.data.root_pos_w[self.go_to_cube, 1] - self.scene.env_origins[self.go_to_cube, 1] + head_pos_x = self.pick_and_place.data.joint_pos[:, self._x_dof_idx[0]] + head_pos_y = self.pick_and_place.data.joint_pos[:, self._y_dof_idx[0]] + cube_pos_x = self.cube.data.root_pos_w[:, 0] - self.scene.env_origins[:, 0] + cube_pos_y = self.cube.data.root_pos_w[:, 1] - self.scene.env_origins[:, 1] d_cube_robot_x = cube_pos_x - head_pos_x d_cube_robot_y = cube_pos_y - head_pos_y - self.instant_controls[self.go_to_cube] = torch.stack( - [d_cube_robot_x * 5.0, d_cube_robot_y * 5.0, torch.zeros_like(d_cube_robot_x)], dim=1 + self.instant_controls[0] = torch.tensor( + [d_cube_robot_x * 5.0, d_cube_robot_y * 5.0, 0.0], device=self.device ) - elif self.go_to_target.any(): + elif self.go_to_target: # Effort based proportional controller to track the target position - head_pos_x = self.pick_and_place.data.joint_pos[self.go_to_target, self._x_dof_idx[0]] - head_pos_y = self.pick_and_place.data.joint_pos[self.go_to_target, self._y_dof_idx[0]] - target_pos_x = self.target_pos[self.go_to_target, 0] - target_pos_y = self.target_pos[self.go_to_target, 1] + head_pos_x = self.pick_and_place.data.joint_pos[:, self._x_dof_idx[0]] + head_pos_y = self.pick_and_place.data.joint_pos[:, self._y_dof_idx[0]] + target_pos_x = self.target_pos[:, 0] + target_pos_y = self.target_pos[:, 1] d_target_robot_x = 
target_pos_x - head_pos_x d_target_robot_y = target_pos_y - head_pos_y - self.instant_controls[self.go_to_target] = torch.stack( - [d_target_robot_x * 5.0, d_target_robot_y * 5.0, torch.zeros_like(d_target_robot_x)], dim=1 + self.instant_controls[0] = torch.tensor( + [d_target_robot_x * 5.0, d_target_robot_y * 5.0, 0.0], device=self.device ) - # Set the joint effort targets for the picker self.pick_and_place.set_joint_effort_target( self.instant_controls[:, 0].unsqueeze(dim=1), joint_ids=self._x_dof_idx @@ -267,7 +258,7 @@ def _apply_action(self) -> None: self.permanent_controls[:, 0].unsqueeze(dim=1), joint_ids=self._z_dof_idx ) # Set the gripper command - self.gripper.set_grippers_command(self.instant_controls[:, 2]) + self.gripper.set_grippers_command(self.instant_controls[:, 2].unsqueeze(dim=1)) def _get_observations(self) -> dict: # Get the observations @@ -406,11 +397,8 @@ def _debug_vis_callback(self, event): def main(): """Main function.""" - # create environment configuration - env_cfg = PickAndPlaceEnvCfg() - env_cfg.scene.num_envs = args_cli.num_envs # create environment - pick_and_place = PickAndPlaceEnv(env_cfg) + pick_and_place = PickAndPlaceEnv(PickAndPlaceEnvCfg()) obs, _ = pick_and_place.reset() while simulation_app.is_running(): # check for selected robots @@ -421,4 +409,4 @@ if __name__ == "__main__": main() - simulation_app.close() + simulation_app.close() \ No newline at end of file From c176374a0e6d4f15c9f0c37a412b123eb436d1f9 Mon Sep 17 00:00:00 2001 From: Kelly Guo Date: Mon, 10 Nov 2025 16:59:36 -0800 Subject: [PATCH 4/7] update --- source/isaaclab_rl/config/extension.toml | 2 +- source/isaaclab_rl/docs/CHANGELOG.rst | 8 ++++---- source/isaaclab_rl/isaaclab_rl/rsl_rl/vecenv_wrapper.py | 2 +- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/source/isaaclab_rl/config/extension.toml b/source/isaaclab_rl/config/extension.toml index 35ce2649060..494f39f7456 100644 --- a/source/isaaclab_rl/config/extension.toml +++ b/source/isaaclab_rl/config/extension.toml @@ -1,7 +1,7 @@ [package] # Note: Semantic Versioning is used: https://semver.org/ -version = "0.5.0" +version = "0.4.5" # Description title = "Isaac Lab RL" diff --git a/source/isaaclab_rl/docs/CHANGELOG.rst b/source/isaaclab_rl/docs/CHANGELOG.rst index 3698bf770e0..0305f5a99b1 100644 --- a/source/isaaclab_rl/docs/CHANGELOG.rst +++ b/source/isaaclab_rl/docs/CHANGELOG.rst @@ -1,13 +1,13 @@ Changelog --------- -0.5.0 (2025-11-10) +0.4.5 (2025-11-10) ~~~~~~~~~~~~~~~~~~ -Added -^^^^^ +Changed +^^^^^^^ -* Added support for decoupling RL device from simulation device for the RL wrappers. +* Added support for decoupling RL device from simulation device for the RL Games wrapper. This allows users to run simulation on one device (e.g., CPU) while running RL training/inference on another device. 
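For reference, the decoupled-device workflow described by the changelog entry above can be exercised through the RL Games wrapper's rl_device argument. The sketch below is a minimal illustration that mirrors the calls used in the test added later in this series; the task name, environment count, and clipping values are placeholders, and launching the simulation app beforehand (as the test does via AppLauncher) is omitted for brevity.

import gymnasium as gym
import torch

from isaaclab_rl.rl_games import RlGamesVecEnvWrapper
from isaaclab_tasks.utils.parse_cfg import parse_env_cfg

# Simulation runs on CPU while the RL policy lives on the GPU.
env_cfg = parse_env_cfg("Isaac-Cartpole-v0", device="cpu", num_envs=4)
env = gym.make("Isaac-Cartpole-v0", cfg=env_cfg)
env = RlGamesVecEnvWrapper(env, rl_device="cuda:0", clip_obs=10.0, clip_actions=1.0)

obs = env.reset()  # observations are returned on cuda:0
action = 2 * torch.rand(4, *env.action_space.shape, device="cuda:0") - 1
obs, reward, dones, info = env.step(action)  # wrapper moves the action to the CPU sim

Observations and rewards come back on the RL device, so the policy never needs to know the simulation is on CPU.
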
diff --git a/source/isaaclab_rl/isaaclab_rl/rsl_rl/vecenv_wrapper.py b/source/isaaclab_rl/isaaclab_rl/rsl_rl/vecenv_wrapper.py index 784892f7e37..6561ace8c93 100644 --- a/source/isaaclab_rl/isaaclab_rl/rsl_rl/vecenv_wrapper.py +++ b/source/isaaclab_rl/isaaclab_rl/rsl_rl/vecenv_wrapper.py @@ -213,4 +213,4 @@ def _modify_action_space(self): ) self.env.unwrapped.action_space = gym.vector.utils.batch_space( self.env.unwrapped.single_action_space, self.num_envs - ) + ) \ No newline at end of file From cd1764cf231681fb11ecda782f23c20a940c1678 Mon Sep 17 00:00:00 2001 From: Kelly Guo Date: Mon, 10 Nov 2025 17:04:43 -0800 Subject: [PATCH 5/7] format --- scripts/demos/pick_and_place.py | 2 +- .../reinforcement_learning/rsl_rl/train.py | 2 +- .../isaaclab_rl/rsl_rl/vecenv_wrapper.py | 35 ++----------------- 3 files changed, 5 insertions(+), 34 deletions(-) diff --git a/scripts/demos/pick_and_place.py b/scripts/demos/pick_and_place.py index bc6d35940f0..cc14dcb0a72 100644 --- a/scripts/demos/pick_and_place.py +++ b/scripts/demos/pick_and_place.py @@ -409,4 +409,4 @@ def main(): if __name__ == "__main__": main() - simulation_app.close() \ No newline at end of file + simulation_app.close() diff --git a/scripts/reinforcement_learning/rsl_rl/train.py b/scripts/reinforcement_learning/rsl_rl/train.py index ad739f4559a..8b66feb28aa 100644 --- a/scripts/reinforcement_learning/rsl_rl/train.py +++ b/scripts/reinforcement_learning/rsl_rl/train.py @@ -182,7 +182,7 @@ def main(env_cfg: ManagerBasedRLEnvCfg | DirectRLEnvCfg | DirectMARLEnvCfg, agen env = gym.wrappers.RecordVideo(env, **video_kwargs) # wrap around environment for rsl-rl - env = RslRlVecEnvWrapper(env, clip_actions=agent_cfg.clip_actions, rl_device=agent_cfg.device) + env = RslRlVecEnvWrapper(env, clip_actions=agent_cfg.clip_actions) # create runner from rsl-rl if agent_cfg.class_name == "OnPolicyRunner": diff --git a/source/isaaclab_rl/isaaclab_rl/rsl_rl/vecenv_wrapper.py b/source/isaaclab_rl/isaaclab_rl/rsl_rl/vecenv_wrapper.py index 6561ace8c93..73ceae04693 100644 --- a/source/isaaclab_rl/isaaclab_rl/rsl_rl/vecenv_wrapper.py +++ b/source/isaaclab_rl/isaaclab_rl/rsl_rl/vecenv_wrapper.py @@ -24,9 +24,7 @@ class RslRlVecEnvWrapper(VecEnv): https://github.com/leggedrobotics/rsl_rl/blob/master/rsl_rl/env/vec_env.py """ - def __init__( - self, env: ManagerBasedRLEnv | DirectRLEnv, clip_actions: float | None = None, rl_device: str | None = None - ): + def __init__(self, env: ManagerBasedRLEnv | DirectRLEnv, clip_actions: float | None = None): """Initializes the wrapper. Note: @@ -35,8 +33,6 @@ def __init__( Args: env: The environment to wrap around. clip_actions: The clipping value for actions. If ``None``, then no clipping is done. - rl_device: The device for RL agent/policy. If ``None``, uses the environment device. - This allows running the RL agent on a different device than the environment. Raises: ValueError: When the environment is not an instance of :class:`ManagerBasedRLEnv` or :class:`DirectRLEnv`. 
@@ -53,21 +49,11 @@ def __init__( self.env = env self.clip_actions = clip_actions - # store the RL device (where policy/training happens) - # this may be different from env.device (where task buffers are) - if rl_device is None: - self.rl_device = self.unwrapped.device - else: - self.rl_device = rl_device - # store information required by wrapper self.num_envs = self.unwrapped.num_envs - self.device = self.rl_device + self.device = self.unwrapped.device self.max_episode_length = self.unwrapped.max_episode_length - # track the environment device separately - self.env_device = self.unwrapped.device - # obtain dimensions of the environment if hasattr(self.unwrapped, "action_manager"): self.num_actions = self.unwrapped.action_manager.total_action_dim @@ -153,9 +139,6 @@ def seed(self, seed: int = -1) -> int: # noqa: D102 def reset(self) -> tuple[TensorDict, dict]: # noqa: D102 # reset the environment obs_dict, extras = self.env.reset() - # move observations to RL device if different from env device - if self.rl_device != self.env_device: - obs_dict = {k: v.to(self.rl_device) if isinstance(v, torch.Tensor) else v for k, v in obs_dict.items()} return TensorDict(obs_dict, batch_size=[self.num_envs]), extras def get_observations(self) -> TensorDict: @@ -164,26 +147,14 @@ def get_observations(self) -> TensorDict: obs_dict = self.unwrapped.observation_manager.compute() else: obs_dict = self.unwrapped._get_observations() - # move observations to RL device if different from env device - if self.rl_device != self.env_device: - obs_dict = {k: v.to(self.rl_device) if isinstance(v, torch.Tensor) else v for k, v in obs_dict.items()} return TensorDict(obs_dict, batch_size=[self.num_envs]) def step(self, actions: torch.Tensor) -> tuple[TensorDict, torch.Tensor, torch.Tensor, dict]: - # move actions to env device if coming from different RL device - if self.rl_device != self.env_device: - actions = actions.to(self.env_device) # clip actions if self.clip_actions is not None: actions = torch.clamp(actions, -self.clip_actions, self.clip_actions) # record step information obs_dict, rew, terminated, truncated, extras = self.env.step(actions) - # move outputs to RL device if different from env device - if self.rl_device != self.env_device: - obs_dict = {k: v.to(self.rl_device) if isinstance(v, torch.Tensor) else v for k, v in obs_dict.items()} - rew = rew.to(self.rl_device) - terminated = terminated.to(self.rl_device) - truncated = truncated.to(self.rl_device) # compute dones for compatibility with RSL-RL dones = (terminated | truncated).to(dtype=torch.long) # move time out information to the extras dict @@ -213,4 +184,4 @@ def _modify_action_space(self): ) self.env.unwrapped.action_space = gym.vector.utils.batch_space( self.env.unwrapped.single_action_space, self.num_envs - ) \ No newline at end of file + ) From 422f8e182da304754d58bcf303a56e967f7d532c Mon Sep 17 00:00:00 2001 From: Kelly Guo Date: Mon, 10 Nov 2025 19:41:57 -0800 Subject: [PATCH 6/7] fix test --- .../test/test_rl_device_separation.py | 545 ++++-------------- 1 file changed, 118 insertions(+), 427 deletions(-) diff --git a/source/isaaclab_tasks/test/test_rl_device_separation.py b/source/isaaclab_tasks/test/test_rl_device_separation.py index ec3f7060877..2faeabbe1f0 100644 --- a/source/isaaclab_tasks/test/test_rl_device_separation.py +++ b/source/isaaclab_tasks/test/test_rl_device_separation.py @@ -24,22 +24,15 @@ 4. 
Wrapper: transfers outputs from sim_device → rl_device (for policy to use) Tested Libraries: - - RSL-RL: TensorDict observations, explicit rl_device parameter - * Transfers observations and rewards to rl_device - - RL Games: Dict observations, explicit rl_device parameter - * Transfers observations and rewards to rl_device + - RSL-RL: TensorDict observations, device separation via OnPolicyRunner (agent_cfg.device) + * Wrapper returns data on sim_device, Runner handles transfers to rl_device + - RL Games: Dict observations, explicit rl_device parameter in wrapper + * Wrapper transfers data from sim_device to rl_device - Stable-Baselines3: Numpy arrays (CPU-only by design) - * Always converts to/from numpy on CPU + * Wrapper converts tensors to/from numpy on CPU - skrl: Dict observations, uses skrl.config.torch.device for RL device - * Keeps observations on sim_device (policy handles transfer) - * Only transfers actions from rl_device to sim_device - -IMPORTANT: Due to Isaac Sim limitations, only ONE test can be run per pytest invocation. -Run tests individually: - pytest test_rl_device_separation.py::test_rsl_rl_device_separation_gpu_to_gpu -v -s - pytest test_rl_device_separation.py::test_rsl_rl_device_separation_gpu_to_cpu -v -s - pytest test_rl_device_separation.py::test_rl_games_device_separation_gpu_to_gpu -v -s - ... + * Wrapper keeps observations on sim_device, only transfers actions + """ from isaaclab.app import AppLauncher @@ -65,35 +58,23 @@ NUM_ENVS = 4 -def _test_rsl_rl_device_separation(sim_device: str, rl_device: str): - """Helper function to test RSL-RL with specified device configuration. +def _create_env(sim_device: str): + """Create and initialize a test environment. Args: sim_device: Device for simulation (e.g., "cuda:0", "cpu") - rl_device: Device for RL agent (e.g., "cuda:0", "cpu") - """ - from tensordict import TensorDict - - from isaaclab_rl.rsl_rl import RslRlVecEnvWrapper - - print(f"\n{'=' * 60}") - print(f">>> Testing RSL-RL with sim_device={sim_device}, rl_device={rl_device}") - print(f"{'=' * 60}") + Returns: + Initialized gym environment + """ # Create a new stage omni.usd.get_context().new_stage() # Reset the rtx sensors carb setting to False carb.settings.get_settings().set_bool("/isaaclab/render/rtx_sensors", False) try: - # Parse environment config - print(" [1/6] Parsing environment config...") env_cfg = parse_env_cfg(TEST_ENV, device=sim_device, num_envs=NUM_ENVS) - - # Create environment - print(" [2/6] Creating environment (may take 5-10s)...") env = gym.make(TEST_ENV, cfg=env_cfg) - print(" [2/6] Environment created successfully") except Exception as e: # Try to close environment on exception if "env" in locals() and hasattr(env, "_is_closed"): @@ -105,96 +86,92 @@ def _test_rsl_rl_device_separation(sim_device: str, rl_device: str): # Disable control on stop env.unwrapped.sim._app_control_on_stop_handle = None + return env + + +def _verify_unwrapped_env(env, sim_device: str): + """Verify unwrapped environment operates entirely on sim_device. 
- # Verify environment device - print(" [3/6] Verifying environment device...") - assert ( - env.unwrapped.device == sim_device - ), f"Environment device mismatch: expected {sim_device}, got {env.unwrapped.device}" + Args: + env: Unwrapped gym environment + sim_device: Expected simulation device + """ + assert env.unwrapped.device == sim_device, \ + f"Environment device mismatch: expected {sim_device}, got {env.unwrapped.device}" - # Test environment directly before wrapping to verify it returns data on sim device - print(" [3/6] Testing unwrapped environment returns data on sim_device...") + # Verify reset returns data on sim device obs_dict, _ = env.reset() for key, value in obs_dict.items(): if isinstance(value, torch.Tensor): - assert ( - value.device.type == torch.device(sim_device).type - ), f"Unwrapped env obs '{key}' should be on {sim_device}, got {value.device}" + assert value.device.type == torch.device(sim_device).type, \ + f"Unwrapped env obs '{key}' should be on {sim_device}, got {value.device}" - # Step unwrapped environment to verify outputs are on sim device + # Verify step returns data on sim device action_space = env.unwrapped.single_action_space test_action = torch.zeros(NUM_ENVS, action_space.shape[0], device=sim_device) obs_dict, rew, term, trunc, extras = env.step(test_action) - assert ( - rew.device.type == torch.device(sim_device).type - ), f"Unwrapped env rewards should be on {sim_device}, got {rew.device}" - assert ( - term.device.type == torch.device(sim_device).type - ), f"Unwrapped env terminated should be on {sim_device}, got {term.device}" - print(f" [3/6] Verified: Unwrapped environment returns data on {sim_device}") - - # Create RSL-RL wrapper with RL device - print(" [4/6] Creating RSL-RL wrapper...") - env = RslRlVecEnvWrapper(env, rl_device=rl_device) - print(f" [4/6] Wrapper created (env_device={env.env_device}, rl_device={env.rl_device})") - - # Verify devices - assert env.env_device == sim_device, f"Wrapper env_device should be {sim_device}" - assert env.rl_device == rl_device, f"Wrapper RL device should be {rl_device}" - assert env.device == rl_device, f"Wrapper device property should be {rl_device}" - - # Reset and step to test device transfers - print(" [5/6] Testing reset and step operations...") - obs, extras = env.reset() - print(" [5/6] Reset completed") + assert rew.device.type == torch.device(sim_device).type, \ + f"Unwrapped env rewards should be on {sim_device}, got {rew.device}" + assert term.device.type == torch.device(sim_device).type, \ + f"Unwrapped env terminated should be on {sim_device}, got {term.device}" + + +def _verify_tensor_device(data, expected_device: str, name: str): + """Verify tensor or dict of tensors is on expected device. + + Args: + data: Tensor, dict of tensors, or numpy array + expected_device: Expected device string + name: Name for error messages + """ + if isinstance(data, torch.Tensor): + assert data.device.type == torch.device(expected_device).type, \ + f"{name} should be on {expected_device}, got {data.device}" + elif isinstance(data, dict): + for key, value in data.items(): + if isinstance(value, torch.Tensor): + assert value.device.type == torch.device(expected_device).type, \ + f"{name}['{key}'] should be on {expected_device}, got {value.device}" + + +def _test_rsl_rl_device_separation(sim_device: str, rl_device: str): + """Helper function to test RSL-RL with specified device configuration. + + Note: RSL-RL device separation is handled by the OnPolicyRunner, not the wrapper. 
+ The wrapper returns observations on sim_device, and the runner handles device transfers. + This test verifies the wrapper works correctly when actions come from a different device. + + Args: + sim_device: Device for simulation (e.g., "cuda:0", "cpu") + rl_device: Device for RL agent (e.g., "cuda:0", "cpu") - where policy generates actions + """ + from tensordict import TensorDict + from isaaclab_rl.rsl_rl import RslRlVecEnvWrapper + + env = _create_env(sim_device) + _verify_unwrapped_env(env, sim_device) - # Verify observations are on RL device (RSL-RL returns TensorDict) + # Create wrapper - it uses sim_device, runner handles rl_device + env = RslRlVecEnvWrapper(env) + assert env.device == sim_device, f"Wrapper device should be {sim_device}" + + # Test reset - wrapper returns observations on sim_device + obs, extras = env.reset() assert isinstance(obs, TensorDict), f"Expected TensorDict, got {type(obs)}" - for key, value in obs.items(): - if isinstance(value, torch.Tensor): - assert ( - value.device.type == torch.device(rl_device).type - ), f"Observation '{key}' should be on {rl_device}, got {value.device}" + _verify_tensor_device(obs, sim_device, "Observation") - # Sample random action on RL device (simulating policy output) - # RSL-RL: action_space.shape is (num_envs, action_dim) + # Test step with action from RL device (simulating policy output) + # The wrapper should handle transferring action to sim_device internally action = 2 * torch.rand(env.action_space.shape, device=rl_device) - 1 - print(f" [5/6] Action created on rl_device: {action.device}, shape: {action.shape}") - - # Verify action is on RL device before calling step - assert ( - action.device.type == torch.device(rl_device).type - ), f"Action should be on {rl_device} before step, got {action.device}" - - # Step environment - wrapper should: - # 1. Accept action on rl_device - # 2. Transfer action from rl_device to sim_device internally - # 3. Call unwrapped env.step() with action on sim_device - # 4. Transfer outputs from sim_device to rl_device obs, reward, dones, extras = env.step(action) - print(" [5/6] Step completed - wrapper handled device transfers") - # Verify all outputs are on RL device (wrapper transferred from sim_device) - print(" [6/6] Verifying device transfers...") + # Verify outputs are on sim_device (runner would transfer to rl_device) assert isinstance(obs, TensorDict), f"Expected TensorDict, got {type(obs)}" - for key, value in obs.items(): - if isinstance(value, torch.Tensor): - assert ( - value.device.type == torch.device(rl_device).type - ), f"Step observation '{key}' should be on {rl_device}, got {value.device}" - assert reward.device.type == torch.device(rl_device).type, f"Rewards should be on {rl_device}, got {reward.device}" - assert dones.device.type == torch.device(rl_device).type, f"Dones should be on {rl_device}, got {dones.device}" - - # Cleanup - print(" [6/6] Cleaning up environment...") + _verify_tensor_device(obs, sim_device, "Step observation") + _verify_tensor_device(reward, sim_device, "Reward") + _verify_tensor_device(dones, sim_device, "Dones") + env.close() - print(f"✓ RSL-RL test PASSED for sim_device={sim_device}, rl_device={rl_device}") - print(" Wrapper device transfer verified:") - print(f" 1. Unwrapped env: expects actions on {sim_device}, returns data on {sim_device}") - print(f" 2. Wrapper: accepts actions on {rl_device} (from policy)") - print(f" 3. Wrapper: internally transfers actions to {sim_device} for env.step()") - print(f" 4. 
Wrapper: transfers outputs from {sim_device} to {rl_device} (for policy)") - print("-" * 80) def _test_rl_games_device_separation(sim_device: str, rl_device: str): @@ -206,125 +183,26 @@ def _test_rl_games_device_separation(sim_device: str, rl_device: str): """ from isaaclab_rl.rl_games import RlGamesVecEnvWrapper - print(f"\n{'=' * 60}") - print(f">>> Testing RL Games with sim_device={sim_device}, rl_device={rl_device}") - print(f"{'=' * 60}") + env = _create_env(sim_device) + _verify_unwrapped_env(env, sim_device) - # Create a new stage - omni.usd.get_context().new_stage() - # Reset the rtx sensors carb setting to False - carb.settings.get_settings().set_bool("/isaaclab/render/rtx_sensors", False) - - try: - # Parse environment config - print(" [1/5] Parsing environment config...") - env_cfg = parse_env_cfg(TEST_ENV, device=sim_device, num_envs=NUM_ENVS) - - # Create environment - print(" [2/5] Creating environment (may take 5-10s)...") - env = gym.make(TEST_ENV, cfg=env_cfg) - print(" [2/5] Environment created successfully") - except Exception as e: - # Try to close environment on exception - if "env" in locals() and hasattr(env, "_is_closed"): - env.close() - else: - if hasattr(e, "obj") and hasattr(e.obj, "_is_closed"): - e.obj.close() - pytest.fail(f"Failed to set-up the environment for task {TEST_ENV}. Error: {e}") - - # Disable control on stop - env.unwrapped.sim._app_control_on_stop_handle = None - - # Verify environment device - print(" [3/5] Verifying environment device...") - assert ( - env.unwrapped.device == sim_device - ), f"Environment device mismatch: expected {sim_device}, got {env.unwrapped.device}" - - # Test environment directly before wrapping to verify it returns data on sim device - print(" [3/5] Testing unwrapped environment returns data on sim_device...") - obs_dict, _ = env.reset() - for key, value in obs_dict.items(): - if isinstance(value, torch.Tensor): - assert ( - value.device.type == torch.device(sim_device).type - ), f"Unwrapped env obs '{key}' should be on {sim_device}, got {value.device}" - - # Step unwrapped environment to verify outputs are on sim device - action_space = env.unwrapped.single_action_space - test_action = torch.zeros(NUM_ENVS, action_space.shape[0], device=sim_device) - obs_dict, rew, term, trunc, extras = env.step(test_action) - assert ( - rew.device.type == torch.device(sim_device).type - ), f"Unwrapped env rewards should be on {sim_device}, got {rew.device}" - assert ( - term.device.type == torch.device(sim_device).type - ), f"Unwrapped env terminated should be on {sim_device}, got {term.device}" - print(f" [3/5] Verified: Unwrapped environment returns data on {sim_device}") - - # Create RL Games wrapper with RL device - print(" [3/5] Creating RL Games wrapper...") + # Create wrapper env = RlGamesVecEnvWrapper(env, rl_device=rl_device, clip_obs=10.0, clip_actions=1.0) - # Reset and step to test device transfers - print(" [4/5] Testing reset and step operations...") + # Test reset obs = env.reset() - print(" [4/5] Reset completed") - - # Verify observations are on RL device - if isinstance(obs, dict): - for key, value in obs.items(): - assert ( - value.device.type == torch.device(rl_device).type - ), f"Observation '{key}' should be on {rl_device}, got {value.device}" - else: - assert ( - obs.device.type == torch.device(rl_device).type - ), f"Observation should be on {rl_device}, got {obs.device}" + _verify_tensor_device(obs, rl_device, "Observation") - # Sample random action on RL device (simulating policy output) + # Test step with 
action on RL device action = 2 * torch.rand(NUM_ENVS, *env.action_space.shape, device=rl_device) - 1 - print(f" [4/5] Action created on rl_device: {action.device}, shape: {action.shape}") - - # Verify action is on RL device before calling step - assert ( - action.device.type == torch.device(rl_device).type - ), f"Action should be on {rl_device} before step, got {action.device}" - - # Step environment - wrapper should: - # 1. Accept action on rl_device - # 2. Transfer action from rl_device to sim_device internally - # 3. Call unwrapped env.step() with action on sim_device - # 4. Transfer outputs from sim_device to rl_device obs, reward, dones, info = env.step(action) - print(" [4/5] Step completed - wrapper handled device transfers") - - # Verify all outputs are on RL device (wrapper transferred from sim_device) - print(" [5/5] Verifying device transfers...") - # RL Games returns flat tensor for observations - if isinstance(obs, dict): - for key, value in obs.items(): - assert ( - value.device.type == torch.device(rl_device).type - ), f"Observation '{key}' should be on {rl_device}, got {value.device}" - else: - assert ( - obs.device.type == torch.device(rl_device).type - ), f"Observations should be on {rl_device}, got {obs.device}" - assert reward.device.type == torch.device(rl_device).type, f"Rewards should be on {rl_device}, got {reward.device}" - assert dones.device.type == torch.device(rl_device).type, f"Dones should be on {rl_device}, got {dones.device}" - - # Cleanup - print(" [5/5] Cleaning up environment...") + + # Verify outputs are on RL device + _verify_tensor_device(obs, rl_device, "Observation") + _verify_tensor_device(reward, rl_device, "Reward") + _verify_tensor_device(dones, rl_device, "Dones") + env.close() - print(f"✓ RL Games test PASSED for sim_device={sim_device}, rl_device={rl_device}") - print(" Wrapper device transfer verified:") - print(f" 1. Unwrapped env: expects actions on {sim_device}, returns data on {sim_device}") - print(f" 2. Wrapper: accepts actions on {rl_device} (from policy)") - print(f" 3. Wrapper: internally transfers actions to {sim_device} for env.step()") - print(f" 4. Wrapper: transfers outputs from {sim_device} to {rl_device} (for policy)") - print("-" * 80) def _test_sb3_device_separation(sim_device: str): @@ -336,106 +214,39 @@ def _test_sb3_device_separation(sim_device: str): sim_device: Device for simulation (e.g., "cuda:0", "cpu") """ import numpy as np - from isaaclab_rl.sb3 import Sb3VecEnvWrapper - print(f"\n{'=' * 60}") - print(f">>> Testing SB3 with sim_device={sim_device}") - print(f"{'=' * 60}") + env = _create_env(sim_device) + _verify_unwrapped_env(env, sim_device) - # Create a new stage - omni.usd.get_context().new_stage() - # Reset the rtx sensors carb setting to False - carb.settings.get_settings().set_bool("/isaaclab/render/rtx_sensors", False) - - try: - # Parse environment config - print(" [1/5] Parsing environment config...") - env_cfg = parse_env_cfg(TEST_ENV, device=sim_device, num_envs=NUM_ENVS) - - # Create environment - print(" [2/5] Creating environment (may take 5-10s)...") - env = gym.make(TEST_ENV, cfg=env_cfg) - print(" [2/5] Environment created successfully") - except Exception as e: - # Try to close environment on exception - if "env" in locals() and hasattr(env, "_is_closed"): - env.close() - else: - if hasattr(e, "obj") and hasattr(e.obj, "_is_closed"): - e.obj.close() - pytest.fail(f"Failed to set-up the environment for task {TEST_ENV}. 
Error: {e}") - - # Disable control on stop - env.unwrapped.sim._app_control_on_stop_handle = None - - # Verify environment device - print(" [3/5] Verifying environment device...") - assert ( - env.unwrapped.device == sim_device - ), f"Environment device mismatch: expected {sim_device}, got {env.unwrapped.device}" - - # Test environment directly before wrapping to verify it returns data on sim device - print(" [3/5] Testing unwrapped environment returns data on sim_device...") - obs_dict, _ = env.reset() - for key, value in obs_dict.items(): - if isinstance(value, torch.Tensor): - assert ( - value.device.type == torch.device(sim_device).type - ), f"Unwrapped env obs '{key}' should be on {sim_device}, got {value.device}" - print(f" [3/5] Verified: Unwrapped environment returns data on {sim_device}") - - # Create SB3 wrapper (always converts to numpy/CPU) - print(" [3/5] Creating SB3 wrapper...") + # Create wrapper env = Sb3VecEnvWrapper(env) - # Reset and step to test device transfers - print(" [4/5] Testing reset and step operations...") + # Test reset - SB3 should return numpy arrays obs = env.reset() - print(" [4/5] Reset completed") - - # SB3 observations should always be numpy arrays (on CPU) assert isinstance(obs, np.ndarray), f"SB3 observations should be numpy arrays, got {type(obs)}" - # Sample random action (SB3 uses numpy) + # Test step with numpy action action = 2 * np.random.rand(env.num_envs, *env.action_space.shape) - 1 - assert isinstance(action, np.ndarray), f"Action should be numpy array, got {type(action)}" - print(f" [4/5] Action sampled (numpy array), shape: {action.shape}") - - # Step environment - wrapper should: - # 1. Convert numpy action to torch tensor on sim_device internally - # 2. Call unwrapped env.step() with action on sim_device - # 3. Convert outputs from sim_device tensors to numpy arrays obs, reward, done, info = env.step(action) - print(" [4/5] Step completed, outputs converted to numpy") - # Verify all outputs are numpy arrays (wrapper transferred and converted) - print(" [5/5] Verifying numpy conversions...") + # Verify outputs are numpy arrays assert isinstance(obs, np.ndarray), f"Observations should be numpy arrays, got {type(obs)}" assert isinstance(reward, np.ndarray), f"Rewards should be numpy arrays, got {type(reward)}" assert isinstance(done, np.ndarray), f"Dones should be numpy arrays, got {type(done)}" - # Cleanup - print(" [5/5] Cleaning up environment...") env.close() - print(f"✓ SB3 test PASSED for sim_device={sim_device}") - print(" Wrapper device transfer verified:") - print(f" 1. Unwrapped env: expects actions on {sim_device}, returns data on {sim_device}") - print(" 2. Wrapper: accepts numpy arrays (from policy on CPU)") - print(f" 3. Wrapper: internally converts to tensors on {sim_device} for env.step()") - print(f" 4. Wrapper: converts outputs from {sim_device} tensors to numpy arrays (for policy)") - print("-" * 80) def _test_skrl_device_separation(sim_device: str, rl_device: str): """Helper function to test skrl with specified device configuration. Note: skrl uses skrl.config.torch.device for device configuration. - This can be set via agent_cfg["device"] for consistency with other libraries. + Observations remain on sim_device; only actions are transferred from rl_device. 
Args: sim_device: Device for simulation (e.g., "cuda:0", "cpu") - rl_device: Device for RL agent (e.g., "cuda:0", "cpu") - set via skrl.config.torch.device + rl_device: Device for RL agent (e.g., "cuda:0", "cpu") """ try: import skrl @@ -443,159 +254,39 @@ def _test_skrl_device_separation(sim_device: str, rl_device: str): except ImportError: pytest.skip("skrl not installed") - print(f"\n{'=' * 60}") - print(f">>> Testing skrl with sim_device={sim_device}, rl_device={rl_device}") - print(f" Using skrl.config.torch.device = {rl_device}") - print(f"{'=' * 60}") - - # Create agent config with device parameter (for demonstration/consistency) - agent_cfg = {"device": rl_device} - - # Configure skrl device (can be set from agent_cfg for consistency with other libraries) - if "device" in agent_cfg: - skrl.config.torch.device = torch.device(agent_cfg["device"]) - else: - skrl.config.torch.device = torch.device(rl_device) + # Configure skrl device + skrl.config.torch.device = torch.device(rl_device) - # Create a new stage - omni.usd.get_context().new_stage() - # Reset the rtx sensors carb setting to False - carb.settings.get_settings().set_bool("/isaaclab/render/rtx_sensors", False) + env = _create_env(sim_device) + _verify_unwrapped_env(env, sim_device) - try: - # Parse environment config - print(" [1/6] Parsing environment config...") - env_cfg = parse_env_cfg(TEST_ENV, device=sim_device, num_envs=NUM_ENVS) - - # Create environment - print(" [2/6] Creating environment (may take 5-10s)...") - env = gym.make(TEST_ENV, cfg=env_cfg) - print(" [2/6] Environment created successfully") - except Exception as e: - # Try to close environment on exception - if "env" in locals() and hasattr(env, "_is_closed"): - env.close() - else: - if hasattr(e, "obj") and hasattr(e.obj, "_is_closed"): - e.obj.close() - pytest.fail(f"Failed to set-up the environment for task {TEST_ENV}. 
Error: {e}") - - # Disable control on stop - env.unwrapped.sim._app_control_on_stop_handle = None - - # Verify environment device - print(" [3/6] Verifying environment device...") - assert ( - env.unwrapped.device == sim_device - ), f"Environment device mismatch: expected {sim_device}, got {env.unwrapped.device}" - - # Test environment directly before wrapping to verify it returns data on sim device - print(" [3/6] Testing unwrapped environment returns data on sim_device...") - obs_dict, _ = env.reset() - for key, value in obs_dict.items(): - if isinstance(value, torch.Tensor): - assert ( - value.device.type == torch.device(sim_device).type - ), f"Unwrapped env obs '{key}' should be on {sim_device}, got {value.device}" - print(f" [3/6] Verified: Unwrapped environment returns data on {sim_device}") - - # Wrap with skrl (will use skrl.config.torch.device for policy) - print(" [3/6] Creating skrl wrapper...") + # Wrap with skrl env = wrap_env(env, wrapper="isaaclab") - # Reset to test basic functionality - print(" [4/6] Testing reset and step operations...") + # Test reset obs, info = env.reset() - print(" [4/6] Reset completed") + assert isinstance(obs, (dict, torch.Tensor)), f"Observations should be dict or tensor, got {type(obs)}" - # Verify observations are tensors or dict - # skrl can return either dict or tensor depending on configuration - if isinstance(obs, dict): - assert isinstance(obs["policy"], torch.Tensor), f"Observations should be tensors, got {type(obs['policy'])}" - else: - assert isinstance(obs, torch.Tensor), f"Observations should be tensors, got {type(obs)}" - - # Sample random action on RL device (simulating policy output - skrl always uses GPU for training) - rl_device_obj = skrl.config.torch.device - action = 2 * torch.rand(NUM_ENVS, *env.action_space.shape, device=rl_device_obj) - 1 - print(f" [4/6] Action created on rl_device: {rl_device_obj}, shape: {action.shape}") - - # Verify action is on RL device before calling step - assert ( - action.device.type == rl_device_obj.type - ), f"Action should be on {rl_device_obj} before step, got {action.device}" - - # Step environment - wrapper should: - # 1. Accept action on rl_device - # 2. Transfer action from rl_device to sim_device internally - # 3. Call unwrapped env.step() with action on sim_device - # 4. 
Return outputs on sim_device (skrl policy handles device transfer) - print(" [5/6] Testing step with action on rl_device...") + # Test step with action on RL device + action = 2 * torch.rand(NUM_ENVS, *env.action_space.shape, device=skrl.config.torch.device) - 1 transition = env.step(action) - print(" [5/6] Step completed - wrapper handled action device transfer") - # Verify outputs are tensors - # Note: skrl wrapper returns outputs on sim_device, not rl_device - # The policy is responsible for transferring observations when needed - print(" [6/6] Verifying outputs are on sim_device (skrl behavior)...") + # Verify outputs - skrl keeps them on sim_device if len(transition) == 5: obs, reward, terminated, truncated, info = transition - # Check observations (can be dict or tensor) - if isinstance(obs, dict): - assert isinstance(obs["policy"], torch.Tensor), "Observations should be tensors" - assert ( - obs["policy"].device.type == torch.device(sim_device).type - ), f"Observations should be on {sim_device}, got {obs['policy'].device}" - else: - assert isinstance(obs, torch.Tensor), "Observations should be tensors" - assert ( - obs.device.type == torch.device(sim_device).type - ), f"Observations should be on {sim_device}, got {obs.device}" - assert isinstance(reward, torch.Tensor), "Rewards should be tensors" - assert ( - reward.device.type == torch.device(sim_device).type - ), f"Rewards should be on {sim_device}, got {reward.device}" - assert isinstance(terminated, torch.Tensor), "Terminated should be tensors" - assert ( - terminated.device.type == torch.device(sim_device).type - ), f"Terminated should be on {sim_device}, got {terminated.device}" - assert isinstance(truncated, torch.Tensor), "Truncated should be tensors" - assert ( - truncated.device.type == torch.device(sim_device).type - ), f"Truncated should be on {sim_device}, got {truncated.device}" + _verify_tensor_device(obs, sim_device, "Observation") + _verify_tensor_device(reward, sim_device, "Reward") + _verify_tensor_device(terminated, sim_device, "Terminated") + _verify_tensor_device(truncated, sim_device, "Truncated") elif len(transition) == 4: obs, reward, done, info = transition - # Check observations (can be dict or tensor) - if isinstance(obs, dict): - assert isinstance(obs["policy"], torch.Tensor), "Observations should be tensors" - assert ( - obs["policy"].device.type == torch.device(sim_device).type - ), f"Observations should be on {sim_device}, got {obs['policy'].device}" - else: - assert isinstance(obs, torch.Tensor), "Observations should be tensors" - assert ( - obs.device.type == torch.device(sim_device).type - ), f"Observations should be on {sim_device}, got {obs.device}" - assert isinstance(reward, torch.Tensor), "Rewards should be tensors" - assert ( - reward.device.type == torch.device(sim_device).type - ), f"Rewards should be on {sim_device}, got {reward.device}" - assert isinstance(done, torch.Tensor), "Dones should be tensors" - assert done.device.type == torch.device(sim_device).type, f"Dones should be on {sim_device}, got {done.device}" + _verify_tensor_device(obs, sim_device, "Observation") + _verify_tensor_device(reward, sim_device, "Reward") + _verify_tensor_device(done, sim_device, "Done") else: pytest.fail(f"Unexpected number of return values from step: {len(transition)}") - # Cleanup - print(" [6/6] Cleaning up environment...") env.close() - print(f"✓ skrl test PASSED for sim_device={sim_device}, rl_device={rl_device_obj}") - print(" Wrapper device transfer verified (skrl-specific behavior):") - print(f" 
1. Unwrapped env: expects actions on {sim_device}, returns data on {sim_device}") - print(f" 2. Wrapper: accepts actions on {rl_device_obj} (from policy)") - print(f" 3. Wrapper: internally transfers actions to {sim_device} for env.step()") - print(f" 4. Wrapper: returns outputs on {sim_device} (policy handles obs device transfer)") - print(" Note: Unlike RSL-RL/RL-Games, skrl keeps observations on sim_device") - print("-" * 80) # ============================================================================ From ec417d443d4c465f8318d318f2705b5fed1e3151 Mon Sep 17 00:00:00 2001 From: Kelly Guo Date: Mon, 10 Nov 2025 19:42:14 -0800 Subject: [PATCH 7/7] format --- .../test/test_rl_device_separation.py | 32 ++++++++++++------- 1 file changed, 20 insertions(+), 12 deletions(-) diff --git a/source/isaaclab_tasks/test/test_rl_device_separation.py b/source/isaaclab_tasks/test/test_rl_device_separation.py index 2faeabbe1f0..3dc588b3a6c 100644 --- a/source/isaaclab_tasks/test/test_rl_device_separation.py +++ b/source/isaaclab_tasks/test/test_rl_device_separation.py @@ -96,24 +96,28 @@ def _verify_unwrapped_env(env, sim_device: str): env: Unwrapped gym environment sim_device: Expected simulation device """ - assert env.unwrapped.device == sim_device, \ - f"Environment device mismatch: expected {sim_device}, got {env.unwrapped.device}" + assert ( + env.unwrapped.device == sim_device + ), f"Environment device mismatch: expected {sim_device}, got {env.unwrapped.device}" # Verify reset returns data on sim device obs_dict, _ = env.reset() for key, value in obs_dict.items(): if isinstance(value, torch.Tensor): - assert value.device.type == torch.device(sim_device).type, \ - f"Unwrapped env obs '{key}' should be on {sim_device}, got {value.device}" + assert ( + value.device.type == torch.device(sim_device).type + ), f"Unwrapped env obs '{key}' should be on {sim_device}, got {value.device}" # Verify step returns data on sim device action_space = env.unwrapped.single_action_space test_action = torch.zeros(NUM_ENVS, action_space.shape[0], device=sim_device) obs_dict, rew, term, trunc, extras = env.step(test_action) - assert rew.device.type == torch.device(sim_device).type, \ - f"Unwrapped env rewards should be on {sim_device}, got {rew.device}" - assert term.device.type == torch.device(sim_device).type, \ - f"Unwrapped env terminated should be on {sim_device}, got {term.device}" + assert ( + rew.device.type == torch.device(sim_device).type + ), f"Unwrapped env rewards should be on {sim_device}, got {rew.device}" + assert ( + term.device.type == torch.device(sim_device).type + ), f"Unwrapped env terminated should be on {sim_device}, got {term.device}" def _verify_tensor_device(data, expected_device: str, name: str): @@ -125,13 +129,15 @@ def _verify_tensor_device(data, expected_device: str, name: str): name: Name for error messages """ if isinstance(data, torch.Tensor): - assert data.device.type == torch.device(expected_device).type, \ - f"{name} should be on {expected_device}, got {data.device}" + assert ( + data.device.type == torch.device(expected_device).type + ), f"{name} should be on {expected_device}, got {data.device}" elif isinstance(data, dict): for key, value in data.items(): if isinstance(value, torch.Tensor): - assert value.device.type == torch.device(expected_device).type, \ - f"{name}['{key}'] should be on {expected_device}, got {value.device}" + assert ( + value.device.type == torch.device(expected_device).type + ), f"{name}['{key}'] should be on {expected_device}, got {value.device}" def 
_test_rsl_rl_device_separation(sim_device: str, rl_device: str): @@ -146,6 +152,7 @@ def _test_rsl_rl_device_separation(sim_device: str, rl_device: str): rl_device: Device for RL agent (e.g., "cuda:0", "cpu") - where policy generates actions """ from tensordict import TensorDict + from isaaclab_rl.rsl_rl import RslRlVecEnvWrapper env = _create_env(sim_device) @@ -214,6 +221,7 @@ def _test_sb3_device_separation(sim_device: str): sim_device: Device for simulation (e.g., "cuda:0", "cpu") """ import numpy as np + from isaaclab_rl.sb3 import Sb3VecEnvWrapper env = _create_env(sim_device)
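
With the rl_device plumbing removed from the RSL-RL wrapper, device separation on that path is owned entirely by the runner. A minimal sketch of the resulting pattern, mirroring the train-script change earlier in this series; agent_cfg, log_dir, and an already-created env are assumed to be in scope, and the runner call follows the standard rsl_rl OnPolicyRunner signature:

    from rsl_rl.runners import OnPolicyRunner

    from isaaclab_rl.rsl_rl import RslRlVecEnvWrapper

    # The wrapper no longer accepts rl_device: it stays on the simulation device.
    env = RslRlVecEnvWrapper(env, clip_actions=agent_cfg.clip_actions)
    assert env.device == env.unwrapped.device  # e.g., "cpu" for CPU simulation

    # The runner owns the RL device: it moves observations and rewards to
    # agent_cfg.device (e.g., "cuda:0") and produces actions there.
    runner = OnPolicyRunner(env, agent_cfg.to_dict(), log_dir=log_dir, device=agent_cfg.device)
    runner.learn(num_learning_iterations=agent_cfg.max_iterations, init_at_random_ep_len=True)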