From 0f01472edf0442ce3bca72b9b7bcd9b862a64472 Mon Sep 17 00:00:00 2001 From: DynamoDuan <2030524470@qq.com> Date: Sat, 8 Nov 2025 09:38:17 -0800 Subject: [PATCH 1/2] finish most sim --- openreal2sim/simulation/README.md | 0 openreal2sim/simulation/isaaclab/README.md | 0 .../isaaclab/learning/hybrid_ppo.py | 0 .../isaaclab/learning/simple_ppo.py | 0 openreal2sim/simulation/isaaclab/sim_base.py | 394 +++++++++++- .../simulation/isaaclab/sim_env_factory.py | 75 ++- .../isaaclab/sim_heuristic_manip.py | 472 +++++++++++--- .../isaaclab/sim_preprocess/usd_conversion.py | 10 +- .../isaaclab/sim_randomize_rollout.py | 600 ++++++++++++++++++ .../isaaclab/sim_utils/calibration_utils.py | 0 .../isaaclab/sim_utils/sim_utils.py | 11 +- .../isaaclab/sim_utils/transform_utils.py | 0 .../simulation/isaaclab/task_interface.py | 0 .../simulation/modules/demo_motion_process.py | 392 ++++++++++++ .../simulation/modules/envs/randomzier.py | 348 ++++++++++ .../simulation/modules/envs/task_cfg.py | 86 +++ .../simulation/modules/envs/task_construct.py | 227 +++++++ .../grasp_preprocess}/grasp_generation.py | 18 +- .../grasp_preprocess}/grasp_utils.py | 0 .../hand_preprocess/affordance_extraction.py | 0 .../hand_preprocess/grasp_point_extraction.py | 331 ++++++++++ .../hand_preprocess/hand_extraction.py | 213 +++++++ openreal2sim/simulation/sim_agent.py | 248 ++++++++ .../simulation/utils/compose_config.py | 17 + openreal2sim/simulation/utils/notification.py | 42 ++ 25 files changed, 3375 insertions(+), 109 deletions(-) mode change 100644 => 100755 openreal2sim/simulation/README.md mode change 100644 => 100755 openreal2sim/simulation/isaaclab/README.md mode change 100644 => 100755 openreal2sim/simulation/isaaclab/learning/hybrid_ppo.py mode change 100644 => 100755 openreal2sim/simulation/isaaclab/learning/simple_ppo.py mode change 100644 => 100755 openreal2sim/simulation/isaaclab/sim_base.py mode change 100644 => 100755 openreal2sim/simulation/isaaclab/sim_env_factory.py mode change 100644 => 100755 openreal2sim/simulation/isaaclab/sim_heuristic_manip.py mode change 100644 => 100755 openreal2sim/simulation/isaaclab/sim_preprocess/usd_conversion.py create mode 100755 openreal2sim/simulation/isaaclab/sim_randomize_rollout.py mode change 100644 => 100755 openreal2sim/simulation/isaaclab/sim_utils/calibration_utils.py mode change 100644 => 100755 openreal2sim/simulation/isaaclab/sim_utils/sim_utils.py mode change 100644 => 100755 openreal2sim/simulation/isaaclab/sim_utils/transform_utils.py create mode 100755 openreal2sim/simulation/isaaclab/task_interface.py create mode 100755 openreal2sim/simulation/modules/demo_motion_process.py create mode 100755 openreal2sim/simulation/modules/envs/randomzier.py create mode 100755 openreal2sim/simulation/modules/envs/task_cfg.py create mode 100644 openreal2sim/simulation/modules/envs/task_construct.py rename openreal2sim/simulation/{isaaclab/sim_preprocess => modules/grasp_preprocess}/grasp_generation.py (94%) mode change 100644 => 100755 rename openreal2sim/simulation/{isaaclab/sim_preprocess => modules/grasp_preprocess}/grasp_utils.py (100%) mode change 100644 => 100755 create mode 100755 openreal2sim/simulation/modules/hand_preprocess/affordance_extraction.py create mode 100755 openreal2sim/simulation/modules/hand_preprocess/grasp_point_extraction.py create mode 100755 openreal2sim/simulation/modules/hand_preprocess/hand_extraction.py create mode 100755 openreal2sim/simulation/sim_agent.py create mode 100755 openreal2sim/simulation/utils/compose_config.py create mode 100755 
openreal2sim/simulation/utils/notification.py diff --git a/openreal2sim/simulation/README.md b/openreal2sim/simulation/README.md old mode 100644 new mode 100755 diff --git a/openreal2sim/simulation/isaaclab/README.md b/openreal2sim/simulation/isaaclab/README.md old mode 100644 new mode 100755 diff --git a/openreal2sim/simulation/isaaclab/learning/hybrid_ppo.py b/openreal2sim/simulation/isaaclab/learning/hybrid_ppo.py old mode 100644 new mode 100755 diff --git a/openreal2sim/simulation/isaaclab/learning/simple_ppo.py b/openreal2sim/simulation/isaaclab/learning/simple_ppo.py old mode 100644 new mode 100755 diff --git a/openreal2sim/simulation/isaaclab/sim_base.py b/openreal2sim/simulation/isaaclab/sim_base.py old mode 100644 new mode 100755 index f8f204a..fa8a3cd --- a/openreal2sim/simulation/isaaclab/sim_base.py +++ b/openreal2sim/simulation/isaaclab/sim_base.py @@ -4,13 +4,21 @@ import os import json import random +import shutil from pathlib import Path -from typing import Any, Optional, Dict +from typing import Any, Optional, Dict, Sequence, Tuple from typing import List import numpy as np import torch import imageio +import cv2 +import h5py +import sys +file_path = Path(__file__).resolve() +sys.path.append(str(file_path.parent)) +sys.path.append(str(file_path.parent.parent)) +from modules.envs.task_cfg import CameraInfo, TaskCfg, TrajectoryCfg # Isaac Lab import isaaclab.sim as sim_utils @@ -74,6 +82,10 @@ def __init__( set_physics_props: bool = True, enable_motion_planning: bool = True, debug_level: int = 1, + demo_dir: Optional[Path] = None, + data_dir: Optional[Path] = None, + task_cfg: Optional[TaskCfg] = None, + traj_cfg_list: Optional[List[TrajectoryCfg]] = None, ) -> None: # basic simulation setup self.sim: sim_utils.SimulationContext = sim @@ -86,7 +98,13 @@ def __init__( self.cam_dict = cam_dict self.out_dir: Path = out_dir self.img_folder: str = img_folder - + self.defined_demo_dir: Optional[Path] = demo_dir + if self.defined_demo_dir is not None: + self.defined_demo_dir.mkdir(parents=True, exist_ok=True) + + self.data_dir: Optional[Path] = data_dir + if self.data_dir is not None: + self.data_dir.mkdir(parents=True, exist_ok=True) # scene entities self.robot = scene["robot"] if robot_pose.ndim == 1: @@ -95,9 +113,19 @@ def __init__( assert robot_pose.shape[0] == self.num_envs and robot_pose.shape[1] == 7, \ f"robot_pose must be [B,7], got {robot_pose.shape}" self.robot_pose = robot_pose.to(self.robot.device).contiguous() - - self.object_prim = scene["object_00"] - self.other_object_prims = [scene[key] for key in scene.keys() if f"object_" in key and key != "object_00"] + self.task_cfg = task_cfg + self.traj_cfg_list = traj_cfg_list + # Get object prim based on selected_object_id + # Default to object_00 for backward compatibility + # Note: selected_object_id is set in subclasses after super().__init__() + # So we use a helper method that can be called later + self._selected_object_id = None # Will be set by subclasses + self.object_prim = scene["object_00"] # Default, will be updated if needed + self._update_object_prim() + + # Get all other object prims (excluding the main object) + self.other_object_prims = [scene[key] for key in scene.keys() + if f"object_" in key and key != "object_00"] self.background_prim = scene["background"] self.camera: Camera = scene["camera"] @@ -155,14 +183,17 @@ def __init__( self.save_dict = { "rgb": [], "depth": [], "segmask": [], "joint_pos": [], "joint_vel": [], "actions": [], - "gripper_pos": [], "gripper_cmd": [] + "gripper_pos": [], 
"gripper_cmd": [], + "composed_rgb": [] # composed rgb image with background and foreground } # visualization self.selected_object_id = 0 + self._selected_object_id = 0 self.debug_level = debug_level self.goal_vis_list = [] + if self.debug_level > 0: for b in range(self.num_envs): cfg = VisualizationMarkersCfg( @@ -183,6 +214,23 @@ def __init__( print(f"prepare curobo motion planning: {enable_motion_planning}") self.prepare_curobo() print("curobo motion planning ready.") + + def _update_object_prim(self): + """Update object_prim based on selected_object_id. Called after selected_object_id is set.""" + if self._selected_object_id is None: + return + try: + from sim_env_factory import get_prim_name_from_oid + oid_str = str(self._selected_object_id) + prim_name = get_prim_name_from_oid(oid_str) + if prim_name in self.scene: + self.object_prim = self.scene[prim_name] + # Update other_object_prims + self.other_object_prims = [self.scene[key] for key in self.scene.keys() + if f"object_" in key and key != prim_name] + except (ImportError, ValueError, KeyError) as e: + # Fallback to object_00 if mapping not available + pass # -------- Curobo Motion Planning ---------- def prepare_curobo(self): @@ -445,7 +493,7 @@ def move_to_ik(self, position: torch.Tensor, quaternion: torch.Tensor, steps: in # Compute next joint command joint_pos_des = self.diff_ik_controller.compute(ee_pos_b, ee_quat_b, jacobian, joint_pos) - + # Apply self.apply_actions(joint_pos_des, gripper_open=gripper_open) @@ -521,6 +569,7 @@ def reset(self, env_ids=None): env_origins_robot = env_origins.to(self.robot.device) # (M,3) robot_pose_world = rp_local.clone() robot_pose_world[:, :3] = env_origins_robot + robot_pose_world[:, :3] + #print(f"[INFO] robot_pose_world: {robot_pose_world}") self.robot.write_root_pose_to_sim(robot_pose_world, env_ids=env_ids_t) self.robot.write_root_velocity_to_sim( torch.zeros((M, 6), device=self.robot.device, dtype=torch.float32), env_ids=env_ids_t @@ -615,22 +664,40 @@ def clear_data(self): for key in self.save_dict.keys(): self.save_dict[key] = [] + + def _demo_dir(self) -> Path: - return self.out_dir / self.img_folder / "demos" / f"demo_{self.demo_id}" + if self.defined_demo_dir is not None: + return self.defined_demo_dir + else: + return self.out_dir / self.img_folder / "demos" / f"demo_{self.demo_id}" def _env_dir(self, base: Path, b: int) -> Path: d = base / f"env_{b:03d}" d.mkdir(parents=True, exist_ok=True) return d + + def _get_next_demo_dir(self, base: Path) -> Path: + already_existing_num = len(list(base.iterdir())) + return base / f"demo_{already_existing_num:03d}" - def save_data(self, ignore_keys: List[str] = []): + def save_data(self, ignore_keys: List[str] = [], env_ids: Optional[List[int]] = None): save_root = self._demo_dir() save_root.mkdir(parents=True, exist_ok=True) - + stacked = {k: np.array(v) for k, v in self.save_dict.items()} - - for b in range(self.num_envs): - env_dir = self._env_dir(save_root, b) + if env_ids is None: + env_ids = self._all_env_ids.cpu().numpy() + + composed_rgb = [] + hdf5_names = [] + for b in env_ids: + if self.defined_demo_dir is None: + env_dir = save_root / f"env_{b:03d}" + else: + env_dir = self._get_next_demo_dir(save_root) + hdf5_names.append(env_dir.name) + env_dir.mkdir(parents=True, exist_ok=True) for key, arr in stacked.items(): if key in ignore_keys: # skip the keys for storage continue @@ -657,8 +724,25 @@ def save_data(self, ignore_keys: List[str] = []): writer.append_data(depth_norm[t]) writer.close() np.save(env_dir / f"{key}.npy", 
depth_seq) - else: + elif key != "composed_rgb": + #import pdb; pdb.set_trace() np.save(env_dir / f"{key}.npy", arr[:, b]) + video_path = env_dir / "real_video.mp4" + writer = imageio.get_writer(video_path, fps=50, macro_block_size=None) + rgb = np.array(self.save_dict["rgb"]) + mask = np.array(self.save_dict["segmask"]) + bg_rgb_path = self.task_cfg.bg_rgb_path + self.bg_rgb = imageio.imread(bg_rgb_path) + for t in range(rgb.shape[0]): + #import ipdb; ipdb.set_trace() + composed = self.convert_real(mask[t, b], self.bg_rgb, rgb[t, b]) + writer.append_data(composed) + writer.close() + composed_rgb.append(composed) + + + self.export_batch_data_to_hdf5(hdf5_names) + print("[INFO]: Demonstration is saved at: ", save_root) @@ -676,6 +760,243 @@ def save_data(self, ignore_keys: List[str] = []): os.system(f"cp -r {save_root}/* {demo_save_path}") print("[INFO]: Demonstration is saved at: ", demo_save_path) + # def collect_data_from_folder(self, folder_path: Path): + # """ + # Load `.npy` files saved by `save_data` and repopulate `self.save_dict`. + + # The provided directory may either be: + # • A demo folder containing multiple `env_XXX` subdirectories, or + # • A single `env_XXX` directory. + + # Args: + # folder_path: Path to the data directory. + + # Returns: + # A tuple ``(stacked, env_dirs)`` where ``stacked`` maps data keys to + # stacked numpy arrays of shape `[T, B, ...]` (``T`` timesteps, + # ``B`` environments) and ``env_dirs`` is the ordered list of + # environment directories discovered in ``folder_path``. + # """ + # folder_path = Path(folder_path) + # if not folder_path.exists(): + # raise FileNotFoundError(f"[collect_data_from_folder] path does not exist: {folder_path}") + # if folder_path.is_file(): + # raise NotADirectoryError(f"[collect_data_from_folder] expected a directory, got file: {folder_path}") + + # if folder_path.name.startswith("env_"): + # env_dirs = [folder_path] + # else: + # env_dirs = sorted( + # [p for p in folder_path.iterdir() if p.is_dir() and p.name.startswith("env_")], + # key=lambda p: p.name, + # ) + + # if len(env_dirs) == 0: + # raise ValueError(f"[collect_data_from_folder] no env_XXX directories found in {folder_path}") + + # aggregated: Dict[str, List[np.ndarray]] = {} + # for env_dir in env_dirs: + # for npy_file in sorted(env_dir.glob("*.npy")): + # key = npy_file.stem + # try: + # arr = np.load(npy_file, allow_pickle=False) + # except Exception as exc: + # print(f"[collect_data_from_folder] skip {npy_file}: {exc}") + # continue + # aggregated.setdefault(key, []).append(arr) + + # if len(aggregated) == 0: + # raise ValueError(f"[collect_data_from_folder] no npy data found in {folder_path}") + + # stacked: Dict[str, np.ndarray] = {} + # for key, env_slices in aggregated.items(): + # reference_shape = env_slices[0].shape + # for idx, slice_arr in enumerate(env_slices[1:], start=1): + # if slice_arr.shape != reference_shape: + # raise ValueError( + # f"[collect_data_from_folder] inconsistent shapes for '{key}': " + # f"{reference_shape} vs {slice_arr.shape} (env idx {idx})" + # ) + # stacked[key] = np.stack(env_slices, axis=1) # [T, B, ...] + + # for key in self.save_dict.keys(): + # if key not in stacked: + # self.save_dict[key] = [] + # continue + # data = stacked[key] # [T, B, ...] 
+ # self.save_dict[key] = [data[t] for t in range(data.shape[0])] + + # return stacked, env_dirs + + def _encode_rgb_sequence(self, frames: np.ndarray) -> tuple[list[bytes], int]: + """Encode a sequence of RGB frames into JPEG bytes and return padded bytes.""" + if frames.shape[0] == 0: + return [], 1 + + encoded: List[bytes] = [] + max_len = 1 + for frame in frames: + frame_np = np.asarray(frame) + if frame_np.dtype != np.uint8: + frame_np = np.clip(frame_np, 0, 255).astype(np.uint8) + if frame_np.ndim == 3 and frame_np.shape[2] == 4: + frame_np = frame_np[..., :3] + if frame_np.ndim == 3 and frame_np.shape[2] == 3: + frame_np = cv2.cvtColor(frame_np, cv2.COLOR_RGB2BGR) + success, buffer = cv2.imencode(".jpg", frame_np) + if not success: + raise RuntimeError("Failed to encode RGB frame into JPEG for RoboTwin export.") + data_bytes = buffer.tobytes() + encoded.append(data_bytes) + if len(data_bytes) > max_len: + max_len = len(data_bytes) + + padded = [frame_bytes.ljust(max_len, b"\0") for frame_bytes in encoded] + return padded, max_len + + def export_batch_data_to_hdf5(self, hdf5_names: List[str]) -> int: + """Export buffered trajectories to RoboTwin-style HDF5 episodes.""" + if self.data_dir is not None: + target_root = self.data_dir + else: + target_root = self._demo_dir() / "hdf5" + data_dir = Path(target_root) + data_dir.mkdir(parents=True, exist_ok=True) + + + num_envs = len(hdf5_names) + stacked = {k: np.array(v) for k, v in self.save_dict.items()} + + episode_names = [] + for idx, name in enumerate(hdf5_names): + name = str(name) + episode_names.append(name.replace("demo_", "episode_")) + handled_keys = { + "rgb", + "depth", + "segmask", + "joint_pos", + "joint_vel", + "gripper_pos", + "gripper_cmd", + "actions", + } + + camera_params = self._get_camera_parameters() + + for env_idx, episode_name in enumerate(episode_names): + hdf5_path = data_dir / f"{episode_name}.hdf5" + hdf5_path.parent.mkdir(parents=True, exist_ok=True) + + with h5py.File(hdf5_path, "w") as f: + obs_grp = f.create_group("observation") + camera_group_name = "head_camera" if camera_params is not None else "camera" + cam_grp = obs_grp.create_group(camera_group_name) + if camera_params is not None: + intrinsics, extrinsics, resolution = camera_params + cam_grp.create_dataset("intrinsics", data=intrinsics) + cam_grp.create_dataset("extrinsics", data=extrinsics) + cam_grp.attrs["resolution"] = resolution + + if "rgb" in stacked: + rgb_frames = stacked["rgb"][:, env_idx] + encoded_frames, max_len = self._encode_rgb_sequence(rgb_frames) + dtype = f"S{max_len}" if max_len > 0 else "S1" + cam_grp.create_dataset("rgb", data=np.asarray(encoded_frames, dtype=dtype)) + cam_grp.attrs["encoding"] = "jpeg" + cam_grp.attrs["channels"] = 3 + cam_grp.attrs["original_shape"] = rgb_frames.shape + if camera_params is None: + cam_grp.create_dataset("intrinsics", data=np.zeros((3, 3), dtype=np.float32)) + cam_grp.create_dataset("extrinsics", data=np.zeros((4, 4), dtype=np.float32)) + + if "depth" in stacked: + depth_ds = cam_grp.create_dataset("depth", data=stacked["depth"][:, env_idx]) + depth_ds.attrs["encoding"] = "float32" + depth_ds.attrs["unit"] = "meter" + if "segmask" in stacked: + seg_ds = cam_grp.create_dataset( + "segmentation", + data=stacked["segmask"][:, env_idx].astype(np.uint8), + ) + seg_ds.attrs["encoding"] = "uint8" + seg_ds.attrs["color_mapping"] = "instance_id" + + joint_grp = f.create_group("joint_action") + if "joint_pos" in stacked: + joint_grp.create_dataset( + "joint_pos", data=stacked["joint_pos"][:, 
env_idx].astype(np.float32) + ) + if "joint_vel" in stacked: + joint_grp.create_dataset( + "joint_vel", data=stacked["joint_vel"][:, env_idx].astype(np.float32) + ) + if "gripper_cmd" in stacked: + joint_grp.create_dataset( + "gripper_cmd", data=stacked["gripper_cmd"][:, env_idx].astype(np.float32) + ) + if len(joint_grp.keys()) == 0: + del f["joint_action"] + + if "gripper_pos" in stacked: + endpose_grp = f.create_group("endpose") + endpose_grp.create_dataset( + "gripper_pos", data=stacked["gripper_pos"][:, env_idx].astype(np.float32) + ) + if "gripper_cmd" in stacked: + endpose_grp.create_dataset( + "gripper_cmd", data=stacked["gripper_cmd"][:, env_idx].astype(np.float32) + ) + + if "actions" in stacked: + f.create_dataset( + "actions", data=stacked["actions"][:, env_idx].astype(np.float32) + ) + + extras = {} + for key, value in stacked.items(): + if key in handled_keys: + continue + + extras_grp = f.create_group("extras") + if len(extras) > 0: + for key, value in extras.items(): + extras_grp.create_dataset(key, data=value) + + if self.task_cfg is not None: + + extras_grp.create_dataset("task_desc", data=self.task_cfg.task_desc) + + if self.traj_cfg_list is not None: + traj_i = self.traj_cfg_list[env_idx] + traj_grp = extras_grp.create_group("traj") + traj_grp.create_dataset("robot_pose", data=traj_i.robot_pose) + traj_grp.create_dataset("pregrasp_pose", data=traj_i.pregrasp_pose) + traj_grp.create_dataset("grasp_pose", data=traj_i.grasp_pose) + + + frame_count = stacked["rgb"].shape[0] + meta_grp = f.create_group("meta") + meta_grp.attrs["env_index"] = int(env_idx) + meta_grp.attrs["frame_dt"] = float(self.sim_dt) + meta_grp.attrs["frame_count"] = int(frame_count) + meta_grp.attrs["source"] = "OpenReal2Sim" + meta_grp.attrs["episode_name"] = episode_name + meta_grp.create_dataset("frame_indices", data=np.arange(frame_count, dtype=np.int32)) + + print(f"[INFO]: Exported {num_envs} HDF5 episodes to {data_dir}") + return num_envs + + + + def convert_real(self,segmask, bg_rgb, fg_rgb): + #import pdb; pdb.set_trace() + segmask_2d = segmask[..., 0] + composed = bg_rgb.copy() + composed[segmask_2d] = fg_rgb[segmask_2d] + return composed + + def delete_data(self): save_path = self._demo_dir() failure_root = self.out_dir / self.img_folder / "demos_failures" @@ -686,3 +1007,48 @@ def delete_data(self): for key in self.save_dict.keys(): self.save_dict[key] = [] print("[INFO]: Clear up the folder: ", save_path) + + def _quat_to_rot(self, quat: Sequence[float]) -> np.ndarray: + w, x, y, z = quat + rot = np.array( + [ + [1 - 2 * (y ** 2 + z ** 2), 2 * (x * y - z * w), 2 * (x * z + y * w)], + [2 * (x * y + z * w), 1 - 2 * (x ** 2 + z ** 2), 2 * (y * z - x * w)], + [2 * (x * z - y * w), 2 * (y * z + x * w), 1 - 2 * (x ** 2 + y ** 2)], + ], + dtype=np.float32, + ) + return rot + + def _get_camera_parameters(self) -> Optional[Tuple[np.ndarray, np.ndarray, Tuple[int, int]]]: + if self.task_cfg is None: + return None + camera_info = getattr(self.task_cfg, "camera_info", None) + if camera_info is None: + return None + + intrinsics = np.array( + [ + [camera_info.fx, 0.0, camera_info.cx], + [0.0, camera_info.fy, camera_info.cy], + [0.0, 0.0, 1.0], + ], + dtype=np.float32, + ) + + if getattr(camera_info, "camera_opencv_to_world", None) is not None: + extrinsics = np.array(camera_info.camera_opencv_to_world, dtype=np.float32) + else: + extrinsics = np.eye(4, dtype=np.float32) + if getattr(camera_info, "camera_heading_wxyz", None) is not None: + rot = self._quat_to_rot(camera_info.camera_heading_wxyz) + else: + 
rot = np.eye(3, dtype=np.float32) + extrinsics[:3, :3] = rot + if getattr(camera_info, "camera_position", None) is not None: + extrinsics[:3, 3] = np.array(camera_info.camera_position, dtype=np.float32) + resolution = ( + int(camera_info.width), + int(camera_info.height), + ) + return intrinsics, extrinsics, resolution diff --git a/openreal2sim/simulation/isaaclab/sim_env_factory.py b/openreal2sim/simulation/isaaclab/sim_env_factory.py old mode 100644 new mode 100755 index f3d662e..429e72e --- a/openreal2sim/simulation/isaaclab/sim_env_factory.py +++ b/openreal2sim/simulation/isaaclab/sim_env_factory.py @@ -52,6 +52,8 @@ class SceneCtx: obj_physics: List[Dict] = None use_ground_plane: bool = False ground_z: Optional[float] = None + oid_to_index: Optional[Dict[str, int]] = None # Maps oid (str) to prim index + index_to_oid: Optional[Dict[int, str]] = None # Maps prim index to oid (str) _SCENE_CTX: Optional[SceneCtx] = None @@ -247,7 +249,23 @@ def init_scene_from_scene_dict( If robot pose not provided, sample one with robot_placement_candidates_v2(). """ cam_dict = cfgs["cam_cfg"] - obj_paths = [o["usd"] for o in scene["objects"].values()] + + # Sort objects by oid to ensure consistent ordering + # scene["objects"] keys are oid strings (e.g., "1", "2", ...) + sorted_objects = sorted(scene["objects"].items(), key=lambda x: int(x[0])) + + # Build oid <-> index mapping + oid_to_index = {} + index_to_oid = {} + obj_paths = [] + obj_physics_list = [] + + for index, (oid_str, obj_data) in enumerate(sorted_objects): + oid_to_index[oid_str] = index + index_to_oid[index] = oid_str + obj_paths.append(obj_data["usd"]) + obj_physics_list.append(obj_data.get("physics", None)) + background_path = scene["background"]["usd"] # overwrite physics @@ -255,12 +273,12 @@ def init_scene_from_scene_dict( obj_physics = cfgs["physics_cfg"]["obj_physics"] bg_physics = cfgs["physics_cfg"]["bg_physics"] if obj_physics is None: - obj_physics = [o.get("physics", None) for o in scene["objects"].values()] + obj_physics = obj_physics_list # Use physics from sorted objects elif isinstance(obj_physics, dict): - obj_physics = [obj_physics for _ in scene["objects"].values()] + obj_physics = [obj_physics for _ in range(len(obj_paths))] elif isinstance(obj_physics, list): - assert len(obj_physics) == len(scene["objects"]), \ - "obj_physics must be a list of the same length as scene['objects'] if provided." + assert len(obj_physics) == len(obj_paths), \ + f"obj_physics must be a list of the same length as objects ({len(obj_paths)}), got {len(obj_physics)}." pass else: raise TypeError("obj_physics must be None, a dict, or a list of dicts.") @@ -288,6 +306,8 @@ def init_scene_from_scene_dict( obj_physics=list(obj_physics), use_ground_plane=use_ground_plane, ground_z=ground_z, + oid_to_index=oid_to_index, + index_to_oid=index_to_oid, ) @@ -370,6 +390,51 @@ def load_scene_json(key: str) -> dict: return json.load(open(scene_path)) +def get_prim_name_from_oid(oid: str | int) -> str: + """ + Get the prim name (e.g., "object_00") from object id. + + Args: + oid: Object ID as string (e.g., "1") or integer (e.g., 1) + + Returns: + Prim name string (e.g., "object_00") + """ + assert _SCENE_CTX is not None, "init_scene_from_scene_dict must be called first." + oid_str = str(oid) + if _SCENE_CTX.oid_to_index is None: + raise ValueError("oid_to_index mapping not available. 
Scene may not have been initialized from scene dict.") + index = _SCENE_CTX.oid_to_index.get(oid_str) + if index is None: + raise ValueError(f"OID '{oid_str}' not found in scene. Available OIDs: {list(_SCENE_CTX.oid_to_index.keys())}") + return f"object_{index:02d}" + + +def get_oid_from_prim_name(prim_name: str) -> str: + """ + Get the object id from prim name (e.g., "object_00" -> "1"). + + Args: + prim_name: Prim name string (e.g., "object_00") + + Returns: + OID as string (e.g., "1") + """ + assert _SCENE_CTX is not None, "init_scene_from_scene_dict must be called first." + if not prim_name.startswith("object_"): + raise ValueError(f"Invalid prim name format: {prim_name}") + try: + index = int(prim_name.split("_")[1]) + except (IndexError, ValueError): + raise ValueError(f"Invalid prim name format: {prim_name}") + if _SCENE_CTX.index_to_oid is None: + raise ValueError("index_to_oid mapping not available. Scene may not have been initialized from scene dict.") + oid = _SCENE_CTX.index_to_oid.get(index) + if oid is None: + raise ValueError(f"Index {index} not found in scene. Available indices: {list(_SCENE_CTX.index_to_oid.keys())}") + return oid + + def make_env( cfgs: dict, num_envs: int = 1, diff --git a/openreal2sim/simulation/isaaclab/sim_heuristic_manip.py b/openreal2sim/simulation/isaaclab/sim_heuristic_manip.py old mode 100644 new mode 100755 index a4910a9..8719d69 --- a/openreal2sim/simulation/isaaclab/sim_heuristic_manip.py +++ b/openreal2sim/simulation/isaaclab/sim_heuristic_manip.py @@ -5,27 +5,34 @@ from __future__ import annotations # ─────────── AppLauncher ─────────── -import argparse, os, json, random, transforms3d +import argparse, os, json, random, transforms3d, typing +from typing import Optional from pathlib import Path import numpy as np import torch import yaml from isaaclab.app import AppLauncher +import sys +from modules.envs.task_cfg import TaskCfg, TaskType, SuccessMetric, SuccessMetricType, TrajectoryCfg, RobotType +from modules.envs.task_construct import construct_task_config, add_reference_trajectory, load_task_cfg +file_path = Path(__file__).resolve() +sys.path.append(str(file_path.parent)) +sys.path.append(str(file_path.parent.parent)) # ─────────── CLI ─────────── -parser = argparse.ArgumentParser("sim_policy") -parser.add_argument("--key", type=str, default="demo_video", help="scene key (outputs/)") -parser.add_argument("--robot", type=str, default="franka") -parser.add_argument("--num_envs", type=int, default=1) -parser.add_argument("--num_trials", type=int, default=1) -parser.add_argument("--teleop_device", type=str, default="keyboard") -parser.add_argument("--sensitivity", type=float, default=1.0) -AppLauncher.add_app_launcher_args(parser) -args_cli = parser.parse_args() -args_cli.enable_cameras = True -args_cli.headless = False # headless mode for batch execution -app_launcher = AppLauncher(vars(args_cli)) -simulation_app = app_launcher.app +# parser = argparse.ArgumentParser("sim_policy") +# parser.add_argument("--key", type=str, default="demo_video", help="scene key (outputs/)") +# parser.add_argument("--robot", type=str, default="franka") +# parser.add_argument("--num_envs", type=int, default=1) +# parser.add_argument("--num_trials", type=int, default=1) +# parser.add_argument("--teleop_device", type=str, default="keyboard") +# parser.add_argument("--sensitivity", type=float, default=1.0) +# AppLauncher.add_app_launcher_args(parser) +# args_cli = parser.parse_args() +# args_cli.enable_cameras = True +# args_cli.headless = True # headless mode for 
batch execution +# app_launcher = AppLauncher(vars(args_cli)) +# simulation_app = app_launcher.app # ─────────── Runtime imports ─────────── import isaaclab.sim as sim_utils @@ -37,13 +44,11 @@ # ─────────── Simulation environments ─────────── from sim_base import BaseSimulator, get_next_demo_id from sim_env_factory import make_env -from sim_preprocess.grasp_utils import get_best_grasp_with_hints +from modules.grasp_preprocess.grasp_utils import get_best_grasp_with_hints from sim_utils.transform_utils import pose_to_mat, mat_to_pose, grasp_to_world, grasp_approach_axis_batch from sim_utils.sim_utils import load_sim_parameters -BASE_DIR = Path.cwd() -img_folder = args_cli.key -out_dir = BASE_DIR / "outputs" + # ──────────────────────────── Heuristic Manipulation ──────────────────────────── @@ -54,28 +59,29 @@ class HeuristicManipulation(BaseSimulator): • Every attempt does reset → pre → grasp → close → lift → check; • Early stops when any env succeeds; then re-exec for logging. """ - def __init__(self, sim, scene, sim_cfgs: dict): + def __init__(self, sim, scene, sim_cfgs: dict, args, out_dir: Path, img_folder: str, demo_dir: Path, data_dir: Path): robot_pose = torch.tensor( sim_cfgs["robot_cfg"]["robot_pose"], dtype=torch.float32, device=sim.device ) super().__init__( - sim=sim, scene=scene, args=args_cli, + sim=sim, scene=scene, args=args, robot_pose=robot_pose, cam_dict=sim_cfgs["cam_cfg"], - out_dir=out_dir, img_folder=img_folder, + out_dir=out_dir, img_folder=img_folder, demo_dir = demo_dir, data_dir = data_dir, enable_motion_planning=True, set_physics_props=True, debug_level=0, ) - + self.final_gripper_closed = sim_cfgs["demo_cfg"]["final_gripper_closed"] self.selected_object_id = sim_cfgs["demo_cfg"]["manip_object_id"] - self.traj_key = sim_cfgs["demo_cfg"]["traj_key"] self.traj_path = sim_cfgs["demo_cfg"]["traj_path"] self.goal_offset = [0, 0, sim_cfgs["demo_cfg"]["goal_offset"]] self.grasp_path = sim_cfgs["demo_cfg"]["grasp_path"] self.grasp_idx = sim_cfgs["demo_cfg"]["grasp_idx"] self.grasp_pre = sim_cfgs["demo_cfg"]["grasp_pre"] self.grasp_delta = sim_cfgs["demo_cfg"]["grasp_delta"] + self.task_type = sim_cfgs["demo_cfg"]["task_type"] + self.robot_type = args.robot self.load_obj_goal_traj() def load_obj_goal_traj(self): @@ -91,7 +97,7 @@ def load_obj_goal_traj(self): """ # —— 1) Load Δ_w —— rel = np.load(self.traj_path).astype(np.float32) - self.obj_rel_traj = rel # (T,4,4) + self.obj_rel_traj = rel[1:, :, :] # (T,4,4) self.reset() @@ -108,6 +114,7 @@ def load_obj_goal_traj(self): # Note: here the relative traj Δ_w is defined in world frame with origin (0,0,0), # Hence, we need to normalize it to each env's origin frame. 
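        # Sketch of the composition done in step 3 below (assuming Δ_w is the
        # [T,4,4] relative trajectory loaded from self.traj_path):
        #     T_goal[b, t] ≈ Δ_w[t] @ T_obj_start[b]
        # where T_obj_start[b] is the object's initial pose after subtracting
        # scene.env_origins[b] from its translation, so a single world-frame
        # Δ_w can be reused for every environment.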
origins = self.scene.env_origins.detach().cpu().numpy().astype(np.float32) # (B,3) + obj_state_np[:, :3] -= origins # normalize to env origin frame # —— 3) Precompute absolute object goals for all envs —— @@ -121,7 +128,7 @@ def load_obj_goal_traj(self): obj_goal[b, t] = goal self.obj_goal_traj_w = obj_goal # [B, T, 4, 4] - + def follow_object_goals(self, start_joint_pos, sample_step=1, visualize=True): """ follow precompute object absolute trajectory: self.obj_goal_traj_w: @@ -173,9 +180,103 @@ def follow_object_goals(self, start_joint_pos, sample_step=1, visualize=True): joint_pos, success = self.move_to(ee_pos_b, ee_quat_b, gripper_open=False) self.save_dict["actions"].append(np.concatenate([ee_pos_b.cpu().numpy(), ee_quat_b.cpu().numpy(), np.ones((B, 1))], axis=1)) + success_ids = self.is_success() + joint_pos = self.wait(gripper_open=not self.final_gripper_closed, steps=30) + return joint_pos, success_ids + + def follow_object_relative_goals(self, start_joint_pos, sample_step=1, visualize=True): + B = self.scene.num_envs + obj_goal_all = self.obj_goal_traj_w # [B, T, 4, 4] + T = obj_goal_all.shape[1] + + ee_w = self.robot.data.body_state_w[:, self.robot_entity_cfg.body_ids[0], 0:7] # [B,7] + # obj_w = self.object_prim.data.root_com_state_w[:, :7] # [B,7] + current_ee_pos_w = ee_w[:, :3] + current_ee_quat_w = ee_w[:, 3:7] + + joint_pos = start_joint_pos + root_w = self.robot.data.root_state_w[:, 0:7] # robot base poses per env + + t_iter = list(range(0, T, sample_step)) + t_iter = t_iter + [T-1] if t_iter[-1] != T-1 else t_iter + + for t in t_iter: + goal_pos_list, goal_quat_list = [], [] + print(f"[INFO] follow object goal step {t}/{T}") + for b in range(B): + # (4,4) + current_T_ee = pose_to_mat(current_ee_pos_w[b], current_ee_quat_w[b]) + T_ee_goal = self.obj_rel_traj[t] @ current_T_ee + pos_b, quat_b = mat_to_pose(T_ee_goal) + goal_pos_list.append(pos_b.astype(np.float32)) + goal_quat_list.append(quat_b.astype(np.float32)) + + goal_pos = torch.as_tensor(np.stack(goal_pos_list), dtype=torch.float32, device=self.sim.device) + goal_quat = torch.as_tensor(np.stack(goal_quat_list), dtype=torch.float32, device=self.sim.device) + + if visualize: + self.show_goal(goal_pos, goal_quat) + ee_pos_b, ee_quat_b = subtract_frame_transforms( + root_w[:, :3], root_w[:, 3:7], goal_pos, goal_quat + ) + joint_pos, success = self.move_to(ee_pos_b, ee_quat_b, gripper_open=False) + current_ee_pos_w = self.robot.data.body_state_w[:, self.robot_entity_cfg.body_ids[0], 0:3] + current_ee_quat_w = self.robot.data.body_state_w[:, self.robot_entity_cfg.body_ids[0], 3:7] + + self.save_dict["actions"].append(np.concatenate([ee_pos_b.cpu().numpy(), ee_quat_b.cpu().numpy(), np.ones((B, 1))], axis=1)) + + success_ids = self.is_success() + joint_pos = self.wait(gripper_open=not self.final_gripper_closed, steps=30) + return joint_pos, success_ids + + + def follow_object_centers(self, start_joint_pos, sample_step=1, visualize=True): + B = self.scene.num_envs + obj_goal_all = self.obj_goal_traj_w # [B, T, 4, 4] + T = obj_goal_all.shape[1] + + ee_w = self.robot.data.body_state_w[:, self.robot_entity_cfg.body_ids[0], 0:7] # [B,7] + # obj_w = self.object_prim.data.root_com_state_w[:, :7] # [B,7] + obj_w = self.object_prim.data.root_state_w[:, :7] # [B,7] + + T_ee_in_obj = [] + for b in range(B): + T_ee_w = pose_to_mat(ee_w[b, :3], ee_w[b, 3:7]) + T_obj_w = pose_to_mat(obj_w[b, :3], obj_w[b, 3:7]) + T_ee_in_obj.append((np.linalg.inv(T_obj_w) @ T_ee_w).astype(np.float32)) + + joint_pos = start_joint_pos + root_w = 
self.robot.data.root_state_w[:, 0:7] # robot base poses per env + + t_iter = list(range(0, T, sample_step)) + t_iter = t_iter + [T-1] if t_iter[-1] != T-1 else t_iter + + for t in t_iter: + goal_pos_list, goal_quat_list = [], [] + print(f"[INFO] follow object goal step {t}/{T}") + for b in range(B): + T_obj_goal = obj_goal_all[b, t] # (4,4) + T_ee_goal = T_obj_goal @ T_ee_in_obj[b] # (4,4) + pos_b, quat_b = mat_to_pose(T_ee_goal) + + goal_pos_list.append(pos_b.astype(np.float32)) + goal_quat_list.append(quat_b.astype(np.float32)) + + goal_pos = torch.as_tensor(np.stack(goal_pos_list), dtype=torch.float32, device=self.sim.device) + goal_quat = ee_w[:, 3:7] + + if visualize: + self.show_goal(goal_pos, goal_quat) + ee_pos_b, ee_quat_b = subtract_frame_transforms( + root_w[:, :3], root_w[:, 3:7], goal_pos, goal_quat + ) + joint_pos, success = self.move_to(ee_pos_b, ee_quat_b, gripper_open=False) + self.save_dict["actions"].append(np.concatenate([ee_pos_b.cpu().numpy(), ee_quat_b.cpu().numpy(), np.ones((B, 1))], axis=1)) + is_success = self.is_success() + joint_pos = self.wait(gripper_open=not self.final_gripper_closed, steps=30) + return joint_pos, is_success + - joint_pos = self.wait(gripper_open=True, steps=10) - return joint_pos def viz_object_goals(self, sample_step=1, hold_steps=20): self.reset() @@ -278,7 +379,7 @@ def execute_and_lift_once_batch(self, info: dict, lift_height=0.12) -> tuple[np. score[lifted] = 1000.0 return lifted.detach().cpu().numpy().astype(bool), score.detach().cpu().numpy().astype(np.float32) - def lift_up(self, height=0.12, gripper_open=False): + def lift_up(self, height=0.12, gripper_open=False, steps=8): """ Lift up by a certain height (m) from current EE pose. """ @@ -290,17 +391,16 @@ def lift_up(self, height=0.12, gripper_open=False): p_lift_b, q_lift_b = subtract_frame_transforms( root[:, 0:3], root[:, 3:7], target_p, ee_w[:, 3:7] - ) + ) # [B,3], [B,4] jp, success = self.move_to(p_lift_b, q_lift_b, gripper_open=gripper_open) - jp = self.wait(gripper_open=gripper_open, steps=8) + jp = self.wait(gripper_open=gripper_open, steps=steps) return jp - - def is_success(self) -> bool: + + def is_success(self) -> torch.Tensor: ee_w = self.robot.data.body_state_w[:, self.robot_entity_cfg.body_ids[0], 0:7] obj_w = self.object_prim.data.root_com_pos_w[:, 0:3] - dist = torch.norm(obj_w[:, :3] - ee_w[:, :3], dim=1).mean().item() - print("Success!" if dist < 0.10 else "Fail!") - return dist < 0.10 + dist = torch.norm(obj_w[:, :3] - ee_w[:, :3], dim=1) # [B] + return (dist < 0.15).to(torch.bool) def build_grasp_info( self, @@ -371,7 +471,7 @@ def grasp_trials(self, gg, std: float = 0.0005): self.show_goal(grasp_pos_w_batch, grasp_quat_w_batch) # random disturbance along approach axis pre_dist_batch = np.full((B,), pre_dist_const, dtype=np.float32) - delta_batch = rng.normal(0.0, std, size=(B,)).astype(np.float32) + delta_batch = rng.normal(-0.002, std, size=(B,)).astype(np.float32) info = self.build_grasp_info(grasp_pos_w_batch, grasp_quat_w_batch, pre_dist_batch, delta_batch) @@ -423,7 +523,7 @@ def replay_actions(self, actions: np.ndarray): jp = self.wait(gripper_open=g_b, steps=3) return True - def inference(self, std: float = 0.0) -> bool: + def inference(self, std: float = 0.0) -> list[int]: """ Main function of the heuristic manipulation policy. 
Physical trial-and-error grasping with approach-axis perturbation: @@ -439,14 +539,14 @@ def inference(self, std: float = 0.0) -> bool: npy_path = self.grasp_path if npy_path is None or (not os.path.exists(npy_path)): print(f"[ERR] grasps npy not found: {npy_path}") - return False + return [] gg = GraspGroup().from_npy(npy_file_path=npy_path) gg = get_best_grasp_with_hints(gg, point=None, direction=[0, 0, -1]) # [0, 0, -1] if self.grasp_idx >= 0: if self.grasp_idx >= len(gg): print(f"[ERR] grasp_idx {self.grasp_idx} out of range [0,{len(gg)})") - return False + return [] print(f"[INFO] using fixed grasp index {self.grasp_idx} for all envs.") p_w, q_w = grasp_to_world(gg[int(self.grasp_idx)]) ret = { @@ -460,7 +560,9 @@ def inference(self, std: float = 0.0) -> bool: ret = self.grasp_trials(gg, std=std) print("[INFO] Re-exec all envs with the winning grasp, then follow object goals.") - + if ret is None or ret["success"] == False: + print("[ERR] no proposal succeeded to lift after full search.") + return [] p_win, q_win = ret["chosen_pose_w"] p_all = np.repeat(p_win.reshape(1, 3), B, axis=0) q_all = np.repeat(q_win.reshape(1, 4), B, axis=0) @@ -471,7 +573,7 @@ def inference(self, std: float = 0.0) -> bool: # reset and conduct main process: open→pre→grasp→close→follow_object_goals self.reset() - + #print(self.object_prim.data.root_state_w[:, :7].cpu().numpy()) cam_p = self.camera.data.pos_w cam_q = self.camera.data.quat_w_ros gp_w = torch.as_tensor(info_all["p_w"], dtype=torch.float32, device=self.sim.device) @@ -479,20 +581,23 @@ def inference(self, std: float = 0.0) -> bool: pre_w = torch.as_tensor(info_all["pre_p_w"], dtype=torch.float32, device=self.sim.device) gp_cam, gq_cam = subtract_frame_transforms(cam_p, cam_q, gp_w, gq_w) pre_cam, pre_qcm = subtract_frame_transforms(cam_p, cam_q, pre_w, gq_w) - self.save_dict["grasp_pose_cam"] = torch.cat([gp_cam, gq_cam], dim=1).unsqueeze(0).cpu().numpy() - self.save_dict["pregrasp_pose_cam"] = torch.cat([pre_cam, pre_qcm], dim=1).unsqueeze(0).cpu().numpy() + + self.save_dict["grasp_pose_w"] = torch.cat([gp_w, gq_w], dim=1).cpu().numpy() + self.save_dict["pregrasp_pose_w"] = torch.cat([pre_w, gq_w], dim=1).cpu().numpy() + self.save_dict["grasp_pose_cam"] = torch.cat([gp_cam, gq_cam], dim=1).cpu().numpy() + self.save_dict["pregrasp_pose_cam"] = torch.cat([pre_cam, pre_qcm], dim=1).cpu().numpy() jp = self.robot.data.joint_pos[:, self.robot_entity_cfg.joint_ids] self.wait(gripper_open=True, steps=4) # pre → grasp jp, success = self.move_to(info_all["pre_p_b"], info_all["pre_q_b"], gripper_open=True) - if torch.any(success==False): return False + if torch.any(success==False): return [] self.save_dict["actions"].append(np.concatenate([info_all["pre_p_b"].cpu().numpy(), info_all["pre_q_b"].cpu().numpy(), np.zeros((B, 1))], axis=1)) jp = self.wait(gripper_open=True, steps=3) jp, success = self.move_to(info_all["p_b"], info_all["q_b"], gripper_open=True) - if torch.any(success==False): return False + if torch.any(success==False): return [] self.save_dict["actions"].append(np.concatenate([info_all["p_b"].cpu().numpy(), info_all["q_b"].cpu().numpy(), np.zeros((B, 1))], axis=1)) # close gripper @@ -500,43 +605,244 @@ def inference(self, std: float = 0.0) -> bool: self.save_dict["actions"].append(np.concatenate([info_all["p_b"].cpu().numpy(), info_all["q_b"].cpu().numpy(), np.ones((B, 1))], axis=1)) # object goal following - # self.lift_up(height=0.05, gripper_open=False) - jp = self.follow_object_goals(jp, sample_step=5, visualize=True) + 
self.lift_up(height=self.goal_offset[2], gripper_open=False, steps=8) + #jp = self.follow_object_goals(jp, sample_step=5, visualize=True) + if self.task_type == "simple_pick_place" or self.task_type == "simple_pick": + jp, success_ids = self.follow_object_centers(jp, sample_step=1, visualize=True) + elif self.task_type == "targetted_pick_place": + jp, success_ids = self.follow_object_goals(jp, sample_step=1, visualize=True) + else: + raise ValueError(f"[ERR] Invalid task type: {self.task_type}") + + object_prim_world_pose = self.object_prim.data.root_state_w[:, :7].cpu().numpy() + object_prim_world_pose[:, :3] = object_prim_world_pose[:, :3] - self.scene.env_origins.cpu().numpy() + self.save_dict["final_object_world_pose"] = object_prim_world_pose + + robot_world_pose = self.robot.data.root_state_w[:, :7].cpu().numpy() + robot_world_pose[:, :3] = robot_world_pose[:, :3] - self.scene.env_origins.cpu().numpy() + self.save_dict["final_robot_world_pose"] = robot_world_pose + + + + # Properly handle the case when success_ids is a numpy array + # Convert it to a torch tensor if needed, before calling torch.where + print(f"[INFO] success_ids: {success_ids}") + # If success_ids is already a tensor, we keep as-is + success_ids = torch.where(success_ids)[0].cpu().numpy() + + + return success_ids + + + def from_data_to_task_cfg(self, key:str) -> TaskCfg: + BASE_DIR = Path.cwd() + scene_json_path = BASE_DIR / "outputs" / key / "simulation" / "scene.json" + task_base_folder = BASE_DIR / "tasks" + scene_dict = json.load(open(scene_json_path)) + task_cfg, base_folder = construct_task_config(key, scene_dict, task_base_folder) + robot_pose = self.save_dict["final_robot_world_pose"] + robot_pose = robot_pose[0] + robot_pose = robot_pose.tolist() + + object_trajectory = np.load(self.traj_path).astype(np.float32) + pose_quat_traj = [] + for pose_mat in object_trajectory: + pose, quat = mat_to_pose(pose_mat) + pose_quat = np.concatenate([np.array(pose), np.array(quat)]) + pose_quat_traj.append(pose_quat) + pose_quat_traj = np.array(pose_quat_traj).reshape(-1, 7).tolist() + + # Convert pregrasp and grasp poses from world to env-local frame + pregrasp_pose_world = np.array(self.save_dict["pregrasp_pose_w"]) # [B, 7] + grasp_pose_world = np.array(self.save_dict["grasp_pose_w"]) # [B, 7] + env_origins = self.scene.env_origins.cpu().numpy() # [B, 3] + + pregrasp_pose_local = np.array(pregrasp_pose_world.copy()) + pregrasp_pose_local[:, :3] = pregrasp_pose_world[:, :3] - env_origins + pregrasp_pose = pregrasp_pose_local[0].tolist() # Take first env + + grasp_pose_local = grasp_pose_world.copy() + grasp_pose_local[:, :3] = grasp_pose_world[:, :3] - env_origins + grasp_pose = grasp_pose_local[0].tolist() # Take first env + + final_gripper_close = self.final_gripper_closed + + if task_cfg.task_type == TaskType.TARGETTED_PICK_PLACE: + success_metric = SuccessMetric( + success_metric_type = SuccessMetricType.TARGET_POINT, + end_pose = self.save_dict["final_object_world_pose"][0].tolist(), + final_gripper_close = final_gripper_close, + ) + elif task_cfg.task_type == TaskType.SIMPLE_PICK_PLACE: + final_object_world_pose = self.save_dict["final_object_world_pose"][0] + ground_value = float(final_object_world_pose[2]) + success_metric = SuccessMetric( + success_metric_type = SuccessMetricType.TARGET_PLANE, + ground_value = ground_value, + final_gripper_close = final_gripper_close, + end_pose = self.save_dict["final_object_world_pose"][0].tolist() + ) + else: + success_metric = SuccessMetric( + success_metric_type = 
SuccessMetricType.SIMPLE_LIFT, + lift_height = 0.05, + final_gripper_close = final_gripper_close, + end_pose = self.save_dict["final_object_world_pose"][0].tolist() + ) + object_poses = {} + for obj in task_cfg.objects: + object_poses[obj.object_id] = [0, 0, 0, 1, 0, 0, 0] + + if self.robot_type == 'franka': + robot_type = RobotType.FRANKA + elif self.robot_type == 'ur5': + robot_type = RobotType.UR5 + else: + raise ValueError(f"[ERR] Invalid robot type: {self.robot_type}") + trajectory_cfg = TrajectoryCfg( + robot_pose = robot_pose, + object_poses = object_poses, + object_trajectory = pose_quat_traj, + final_gripper_close = final_gripper_close, + success_metric = success_metric, + pregrasp_pose = pregrasp_pose, + grasp_pose = grasp_pose, + robot_type = robot_type, + ) + add_reference_trajectory(task_cfg, trajectory_cfg, base_folder) + return task_cfg + + +# ─────robot_pose─────────────────────── Entry Point ──────────────────────────── + + + + +# def main(): +# sim_cfgs = load_sim_parameters(BASE_DIR, args_cli.key) +# env, _ = make_env( +# cfgs=sim_cfgs, num_envs=args_cli.num_envs, +# device=args_cli.device, +# bg_simplify=False, +# ) +# sim, scene = env.sim, env.scene + +# my_sim = HeuristicManipulation( +# sim, scene, sim_cfgs=sim_cfgs, +# args=args_cli, out_dir=out_dir, img_folder=img_folder +# ) + +# demo_root = (out_dir / img_folder / "demos").resolve() + +# for _ in range(args_cli.num_trials): + +# robot_pose = torch.tensor(sim_cfgs["robot_cfg"]["robot_pose"], dtype=torch.float32, device=my_sim.sim.device) # [7], pos(3)+quat(wxyz)(4) +# my_sim.set_robot_pose(robot_pose) +# my_sim.demo_id = get_next_demo_id(demo_root) +# my_sim.reset() +# print(f"[INFO] start simulation demo_{my_sim.demo_id}") +# # Note: if you set viz_object_goals(), remember to disable gravity and collision for object +# # my_sim.viz_object_goals(sample_step=10, hold_steps=40) +# success_ids = my_sim.inference() +# print(f"[INFO] success_ids: {success_ids}") +# if success_ids is not None and len(success_ids) > 0: +# success = True +# my_sim.from_data_to_task_cfg(args_cli.key) +# break +# else: +# success = False + +# # actions = np.load("outputs/lab16/demos/demo_5/env_000/actions.npy") +# # my_sim.replay_actions(actions) + +# env.close() +# simulation_app.close() + + + + + +def sim_heuristic_manip(keys: list[str], args_cli: argparse.Namespace, config_path: Optional[str] = None, config_dict: Optional[dict] = None): + """` + Run heuristic manipulation simulation. 
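+    For each key this builds the Isaac Lab scene and runs up to num_trials
+    grasp-and-follow attempts, retrying with a small grasp perturbation
+    (std 0.001) after the first failed trial. The first successful
+    environment's rollout is saved (demos under tasks/<key>/demos, HDF5
+    episodes under data/<key>) and its reference trajectory is registered in
+    the task config via from_data_to_task_cfg.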
+ + Args: + keys: List of scene keys to run (e.g., ["demo_video"]) + args_cli: Command-line arguments (argparse.Namespace) + config_path: Path to config file (yaml/json) - alternative to args_cli + config_dict: Config dictionary - alternative to args_cli + + Usage: + # Option 1: Use command-line args (original) + sim_heuristic_manip(["demo_video"], args_cli) + + # Option 2: Use config file + sim_heuristic_manip(["demo_video"], config_path="config.yaml") + + # Option 3: Use config dict + sim_heuristic_manip(["demo_video"], config_dict={"num_envs": 4, "num_trials": 10}) + """ + # Create args from config if not provided + # if args_cli is None: + # args_cli = create_args_from_config(config_path=config_path, config_dict=config_dict) + BASE_DIR = Path.cwd() + + + out_dir = BASE_DIR / "outputs" + local_out_dir = BASE_DIR / "outputs" + + for key in keys: + args_cli.key = key + local_img_folder = key + demo_dir = BASE_DIR / "tasks" / key / "demos" + data_dir = BASE_DIR / "data" / key + sim_cfgs = load_sim_parameters(BASE_DIR, key) + env, _ = make_env( + cfgs=sim_cfgs, num_envs=args_cli.num_envs, + device=args_cli.device, + bg_simplify=False, + ) + sim, scene = env.sim, env.scene - self.save_data(ignore_keys=["segmask", "depth"]) - return True + my_sim = HeuristicManipulation( + sim, scene, sim_cfgs=sim_cfgs, + args=args_cli, out_dir=local_out_dir, img_folder=local_img_folder, + demo_dir = demo_dir, data_dir = data_dir ) -# ──────────────────────────── Entry Point ──────────────────────────── -def main(): - sim_cfgs = load_sim_parameters(BASE_DIR, args_cli.key) - env, _ = make_env( - cfgs=sim_cfgs, num_envs=args_cli.num_envs, - device=args_cli.device, - bg_simplify=False, - ) - sim, scene = env.sim, env.scene - - my_sim = HeuristicManipulation(sim, scene, sim_cfgs=sim_cfgs) - - demo_root = (out_dir / img_folder / "demos").resolve() - - for _ in range(args_cli.num_trials): - - robot_pose = torch.tensor(sim_cfgs["robot_cfg"]["robot_pose"], dtype=torch.float32, device=my_sim.sim.device) # [7], pos(3)+quat(wxyz)(4) - my_sim.set_robot_pose(robot_pose) - my_sim.demo_id = get_next_demo_id(demo_root) - my_sim.reset() - print(f"[INFO] start simulation demo_{my_sim.demo_id}") - # Note: if you set viz_object_goals(), remember to disable gravity and collision for object - # my_sim.viz_object_goals(sample_step=10, hold_steps=40) - my_sim.inference() - # actions = np.load("outputs/lab16/demos/demo_5/env_000/actions.npy") - # my_sim.replay_actions(actions) - - env.close() - simulation_app.close() - -if __name__ == "__main__": - main() - os.system("quit()") - simulation_app.close() + demo_root = (local_out_dir / key / "demos").resolve() + success = False + for i in range(args_cli.num_trials): + + robot_pose = torch.tensor(sim_cfgs["robot_cfg"]["robot_pose"], dtype=torch.float32, device=my_sim.sim.device) # [7], pos(3)+quat(wxyz)(4) + my_sim.set_robot_pose(robot_pose) + my_sim.demo_id = get_next_demo_id(demo_root) + my_sim.reset() + print(f"[INFO] start simulation demo_{my_sim.demo_id}") + # Note: if you set viz_object_goals(), remember to disable gravity and collision for object + # my_sim.viz_object_goals(sample_step=10, hold_steps=40) + std = 0 + if i > 0: + std = std + 0.001 + success_ids = my_sim.inference(std=std) + print(f"[INFO] success_ids: {success_ids}") + if len(success_ids) > 0: + success = True + task_cfg = my_sim.from_data_to_task_cfg(key) + my_sim.task_cfg = task_cfg + my_sim.trajectory_cfg_list = [task_cfg.reference_trajectory] + my_sim.save_data(ignore_keys=["segmask", "depth"], 
env_ids=success_ids[:1]) + break + # actions = np.load("outputs/lab16/demos/demo_5/env_000/actions.npy") + # my_sim.replay_actions(actions) + if success == False: + print("[ERR] no successful environments!") + env.close() + #simulation_app.close() + return True + + + +#if __name__ == "__main__": + #main() + #os.system("quit()") + #simulation_app.close() diff --git a/openreal2sim/simulation/isaaclab/sim_preprocess/usd_conversion.py b/openreal2sim/simulation/isaaclab/sim_preprocess/usd_conversion.py old mode 100644 new mode 100755 index 6ce5594..1056fcd --- a/openreal2sim/simulation/isaaclab/sim_preprocess/usd_conversion.py +++ b/openreal2sim/simulation/isaaclab/sim_preprocess/usd_conversion.py @@ -381,7 +381,7 @@ def count_rigid_api(usd_path): def run_conversion_for_key(key: str): global args - scene_json = out_dir / key / "scene" / "scene.json" + scene_json = out_dir / key / "simulation" / "scene.json" assert scene_json.exists(), f"[{key}] scene.json not found: {scene_json}" scene_dict = json.load(open(scene_json, "r")) @@ -552,6 +552,14 @@ def run_conversion(scene_dict: dict): log.info(f'Rigid body count: {count_rigid_api(output_path)}') log.info(f"Saved USD file to {os.path.abspath(output_path)}") + +def usd_conversion(keys): + for key in keys: + log.info(f"\n========== [USD Convert] key: {key} ==========") + run_conversion_for_key(key) + simulation_app.close() + print('[Info] USD conversion completed.') + ######################################################## ## Batch entry ######################################################## diff --git a/openreal2sim/simulation/isaaclab/sim_randomize_rollout.py b/openreal2sim/simulation/isaaclab/sim_randomize_rollout.py new file mode 100755 index 0000000..530995a --- /dev/null +++ b/openreal2sim/simulation/isaaclab/sim_randomize_rollout.py @@ -0,0 +1,600 @@ +""" +Heuristic manipulation policy in Isaac Lab simulation. +Using grasping and motion planning to perform object manipulation tasks. 
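+This variant rolls out recorded reference trajectories: per-env initial object
+and robot poses come from a list of TrajectoryCfg objects (see
+compute_poses_from_traj_cfg and RandomizeExecution.reset).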
+""" +from __future__ import annotations + +# ─────────── AppLauncher ─────────── +import argparse, os, json, random, transforms3d +from pathlib import Path +import numpy as np +import torch +import yaml +import sys +from isaaclab.app import AppLauncher +from typing import Optional, List +from modules.envs.task_cfg import TaskCfg, TaskType, SuccessMetric, SuccessMetricType, TrajectoryCfg +from modules.envs.task_construct import construct_task_config, add_reference_trajectory, load_task_cfg +from modules.envs.randomizer import Randomizer +file_path = Path(__file__).resolve() +import imageio + +sys.path.append(str(file_path.parent)) +sys.path.append(str(file_path.parent.parent)) + +# ─────────── CLI ─────────── +# parser = argparse.ArgumentParser("sim_policy") +# parser.add_argument("--key", type=str, default="demo_video", help="scene key (outputs/)") +# parser.add_argument("--robot", type=str, default="franka") +# parser.add_argument("--num_envs", type=int, default=1) +# parser.add_argument("--num_trials", type=int, default=1) +# parser.add_argument("--teleop_device", type=str, default="keyboard") +# parser.add_argument("--sensitivity", type=float, default=1.0) +# AppLauncher.add_app_launcher_args(parser) +# args_cli = parser.parse_args() +# args_cli.enable_cameras = True +# args_cli.headless = False # headless mode for batch execution +# app_launcher = AppLauncher(vars(args_cli)) +# simulation_app = app_launcher.app + +# ─────────── Runtime imports ─────────── +import isaaclab.sim as sim_utils +from isaaclab.utils.math import subtract_frame_transforms + +from graspnetAPI.grasp import GraspGroup + + +# ─────────── Simulation environments ─────────── +from sim_base import BaseSimulator, get_next_demo_id +from sim_env_factory import make_env +from modules.grasp_preprocess.grasp_utils import get_best_grasp_with_hints +from isaaclab.sim_utils.transform_utils import pose_to_mat, mat_to_pose, grasp_to_world, grasp_approach_axis_batch +from isaaclab.sim_utils.sim_utils import load_sim_parameters + +BASE_DIR = Path.cwd() + +out_dir = BASE_DIR / "outputs" + +def pose_distance(T1, T2): + """ + Compute translation and rotation (angle) distance between two SE(3) transformation matrices in torch. + Args: + T1, T2: [..., 4, 4] torch.Tensor or np.ndarray, can be batched + Returns: + trans_dist: translation distance(s) + angle: rotational angle(s) + """ + if not torch.is_tensor(T1): + T1 = torch.tensor(T1, dtype=torch.float32) + if not torch.is_tensor(T2): + T2 = torch.tensor(T2, dtype=torch.float32) + + # Translation distance + t1 = T1[..., :3, 3] + t2 = T2[..., :3, 3] + trans_dist = torch.norm(t2 - t1, dim=-1) + + # Rotation difference (angle) + R1 = T1[..., :3, :3] + R2 = T2[..., :3, :3] + dR = torch.matmul(R2, R1.transpose(-2, -1)) + trace = dR[..., 0, 0] + dR[..., 1, 1] + dR[..., 2, 2] + cos_angle = (trace - 1) / 2 + + # Handle numerical precision + cos_angle = torch.clamp(cos_angle, -1.0, 1.0) + angle = torch.acos(cos_angle) + return trans_dist, angle + +def compute_poses_from_traj_cfg(traj_cfg_list): + """ + Extract poses and trajectories from a list of TrajectoryCfg objects. 
+ + Args: + traj_cfg_list: List of TrajectoryCfg objects + + Returns: + robot_poses_list: List of robot poses [7] for each trajectory + object_poses_dict: Dict mapping oid -> list of (pos, quat) tuples + object_trajectory_list: List of object trajectories + final_gripper_state_list: List of final gripper states + grasping_phase_list: List of grasping phases + placing_phrase_list: List of placing phases + """ + robot_poses_list = [] + object_poses_dict = {} # {oid: [(pos, quat), ...]} + object_trajectory_list = [] + final_gripper_state_list = [] + pregrasp_pose_list = [] + grasp_pose_list = [] + final_gripper_close_list = [] + end_pose_list = [] + + + for traj_cfg in traj_cfg_list: + robot_poses_list.append(traj_cfg.robot_pose) + + # Extract object poses: traj_cfg.object_poses is a list of (oid, pose) tuples + for oid in traj_cfg.object_poses.keys(): + pose = traj_cfg.object_poses[oid] + oid_str = str(oid) + if oid_str not in object_poses_dict: + object_poses_dict[oid_str] = [] + object_poses_dict[oid_str].append(np.array(pose, dtype=np.float32)) + traj = [] + for i in range(len(traj_cfg.object_trajectory)): + mat = pose_to_mat(traj_cfg.object_trajectory[i][:3], traj_cfg.object_trajectory[i][3:7]) + traj.append(mat) + object_trajectory_list.append(np.array(traj, dtype=np.float32)) + final_gripper_state_list.append(traj_cfg.final_gripper_close) + pregrasp_pose_list.append(np.array(traj_cfg.pregrasp_pose, dtype=np.float32)) + grasp_pose_list.append(np.array(traj_cfg.grasp_pose, dtype=np.float32)) + final_gripper_close_list.append(traj_cfg.final_gripper_close) + end_pose_list.append(np.array(traj_cfg.success_metric.end_pose, dtype=np.float32)) + + return robot_poses_list, object_poses_dict, object_trajectory_list, final_gripper_state_list, pregrasp_pose_list, grasp_pose_list, end_pose_list + + + +# ────────────────────────────Heuristic Manipulation ──────────────────────────── +class RandomizeExecution(BaseSimulator): + """ + Physical trial-and-error grasping with approach-axis perturbation: + • Multiple grasp proposals executed in parallel; + • Every attempt does reset → pre → grasp → close → lift → check; + • Early stops when any env succeeds; then re-exec for logging. 
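+    • In this rollout variant the pre-grasp pose, grasp pose and object
+      trajectory are taken from the recorded TrajectoryCfg
+      (see compute_poses_from_traj_cfg) rather than searched for online.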
+ """ + def __init__(self, sim, scene, sim_cfgs: dict, demo_dir: Path, data_dir: Path, record: bool = True, args_cli: Optional[argparse.Namespace] = None, bg_rgb: Optional[np.ndarray] = None): + robot_pose = torch.tensor( + sim_cfgs["robot_cfg"]["robot_pose"], + dtype=torch.float32, + device=sim.device, + + ) + super().__init__( + sim=sim, scene=scene, args=args_cli, + robot_pose=robot_pose, cam_dict=sim_cfgs["cam_cfg"], + out_dir=out_dir, img_folder=args_cli.key, data_dir=data_dir, + demo_dir=demo_dir, + enable_motion_planning=True, + set_physics_props=True, debug_level=0, + ) + + self.selected_object_id = sim_cfgs["demo_cfg"]["manip_object_id"] + self._selected_object_id = str(self.selected_object_id) # Store as string for mapping + self._update_object_prim() # Update object_prim based on selected_object_id + self.record = record # Store whether to record data + #self.traj_cfg_list = traj_cfg_list + + self.task_type = sim_cfgs["demo_cfg"]["task_type"] + self.goal_offset = [0, 0, sim_cfgs["demo_cfg"]["goal_offset"]] + + + + + def reset(self, env_ids=None): + super().reset(env_ids) + device = self.object_prim.device + if env_ids is None: + env_ids_t = self._all_env_ids.to(device) # (B,) + else: + env_ids_t = torch.as_tensor(env_ids, device=device, dtype=torch.long).view(-1) # (M,) + M = int(env_ids_t.shape[0]) + + # --- object pose/vel: set object at env origins with identity quat --- + env_origins = self.scene.env_origins.to(device)[env_ids_t] # (M,3) + + # Set poses for all objects from object_poses_dict + from sim_env_factory import get_prim_name_from_oid + for oid in self.object_poses_dict.keys(): + # Get prim name from oid + prim_name = get_prim_name_from_oid(str(oid)) + + object_prim = self.scene[prim_name] + + # Get pose for this object (first pose in the list for now) + # object_poses_dict[oid] is a list of (pos, quat) tuples from mat_to_pose + if len(self.object_poses_dict[oid]) == 0: + continue + #import ipdb; ipdb.set_trace() + pos, quat = np.array(self.object_poses_dict[oid], dtype = np.float32)[env_ids_t.cpu().numpy(), :3], np.array(self.object_poses_dict[oid], dtype = np.float32)[env_ids_t.cpu().numpy(), 3:7] + object_pose = torch.zeros((M, 7), device=device, dtype=torch.float32) + object_pose[:, :3] = env_origins + torch.tensor(pos, dtype=torch.float32, device=device) + object_pose[:, 3:7] = torch.tensor(quat, dtype=torch.float32, device=device) # wxyz + + object_prim.write_root_pose_to_sim(object_pose, env_ids=env_ids_t) + object_prim.write_root_velocity_to_sim( + torch.zeros((M, 6), device=device, dtype=torch.float32), env_ids=env_ids_t + ) + + object_prim.write_data_to_sim() + + rp_local = np.array(self.robot_poses_list, dtype=np.float32) + env_origins_robot = self.scene.env_origins.to(device)[env_ids_t] + import copy + robot_pose_world = copy.deepcopy(rp_local) + robot_pose_world[:, :3] = env_origins_robot.cpu().numpy() + robot_pose_world[env_ids_t.cpu().numpy(), :3] + #robot_pose_world[:, 3:7] = [1.0, 0.0, 0.0, 0.0] + self.robot.write_root_pose_to_sim(torch.tensor(robot_pose_world, dtype=torch.float32, device=device), env_ids=env_ids_t) + self.robot.write_root_velocity_to_sim( + torch.zeros((M, 6), device=device, dtype=torch.float32), env_ids=env_ids_t + ) + + + joint_pos = self.robot.data.default_joint_pos.to(self.robot.device)[env_ids_t] # (M,7) + joint_vel = self.robot.data.default_joint_vel.to(self.robot.device)[env_ids_t] # (M,7) + self.robot.write_joint_state_to_sim(joint_pos, joint_vel, env_ids=env_ids_t) + self.robot.write_data_to_sim() + + self.clear_data() + + 
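+    # Frame-convention note with an illustrative example (numbers made up): every pose stored in a
+    # TrajectoryCfg is expressed in the per-env local frame, so reset() above adds
+    # scene.env_origins before writing to the sim. For instance, with env_origins[b] = (2.0, 0.0, 0.0)
+    # and a local object pose (0.3, 0.1, 0.02, 1, 0, 0, 0), the object is spawned at world
+    # position (2.3, 0.1, 0.02); is_success() below subtracts the same origin again before
+    # comparing against success_metric.end_pose, roughly as in:
+    #   obj_w[:, :3] -= self.scene.env_origins                    # world -> env-local
+    #   trans_dist, angle = pose_distance(obj_pose_l, goal_pose_l)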
def compute_components(self): + self.robot_poses_list, self.object_poses_dict, self.object_trajectory_list, self.final_gripper_state_list, self.pregrasp_pose_list, self.grasp_pose_list, self.end_pose_list = compute_poses_from_traj_cfg(self.traj_cfg_list) + + + def compute_object_goal_traj(self): + B = self.scene.num_envs + # obj_state = self.object_prim.data.root_com_state_w[:, :7] # [B,7], pos(3)+quat(wxyz)(4) + obj_state = self.object_prim.data.root_state_w[:, :7] # [B,7], pos(3)+quat(wxyz)(4) + self.show_goal(obj_state[:, :3], obj_state[:, 3:7]) + + obj_state_np = obj_state.detach().cpu().numpy().astype(np.float32) + offset_np = np.asarray(self.goal_offset, dtype=np.float32) + obj_state_np[:, :3] += offset_np # raise a bit to avoid collision + + # Note: here the relative traj Δ_w is defined in world frame with origin (0,0,0), + # Hence, we need to normalize it to each env's origin frame. + origins = self.scene.env_origins.detach().cpu().numpy().astype(np.float32) # (B,3) + obj_state_np[:, :3] -= origins # normalize to env origin frame + + # —— 3) Precompute absolute object goals for all envs —— + T = self.object_trajectory_list[0].shape[0] + obj_goal = np.zeros((B, T, 4, 4), dtype=np.float32) + for b in range(B): + T_init = pose_to_mat(obj_state_np[b, :3], obj_state_np[b, 3:7]) # (4,4) + for t in range(1, T): + goal = self.object_trajectory_list[b][t] @ T_init + goal[:3, 3] += origins[b] # back to world frame + obj_goal[b, t] = goal + + self.obj_goal_traj_w = obj_goal + + def follow_object_goals(self, start_joint_pos, sample_step=1, visualize=True): + """ + follow precompute object absolute trajectory: self.obj_goal_traj_w: + T_obj_goal[t] = Δ_w[t] @ T_obj_init + + EE-object transform is fixed: + T_ee_goal[t] = T_obj_goal[t] @ (T_obj_grasp^{-1} @ T_ee_grasp) + Here T_obj_grasp / T_ee_grasp is the transform at the moment of grasping. 
+ """ + + B = self.scene.num_envs + obj_goal_all = self.obj_goal_traj_w # [B, T, 4, 4] + T = obj_goal_all.shape[1] + + ee_w = self.robot.data.body_state_w[:, self.robot_entity_cfg.body_ids[0], 0:7] # [B,7] + # obj_w = self.object_prim.data.root_com_state_w[:, :7] # [B,7] + obj_w = self.object_prim.data.root_state_w[:, :7] # [B,7] + + T_ee_in_obj = [] + for b in range(B): + T_ee_w = pose_to_mat(ee_w[b, :3], ee_w[b, 3:7]) + T_obj_w = pose_to_mat(obj_w[b, :3], obj_w[b, 3:7]) + T_ee_in_obj.append((np.linalg.inv(T_obj_w) @ T_ee_w).astype(np.float32)) + + joint_pos = start_joint_pos + root_w = self.robot.data.root_state_w[:, 0:7] # robot base poses per env + + t_iter = list(range(0, T, sample_step)) + t_iter = t_iter + [T-1] if t_iter[-1] != T-1 else t_iter + + for t in t_iter: + goal_pos_list, goal_quat_list = [], [] + print(f"[INFO] follow object goal step {t}/{T}") + for b in range(B): + T_obj_goal = obj_goal_all[b, t] # (4,4) + T_ee_goal = T_obj_goal @ T_ee_in_obj[b] # (4,4) + pos_b, quat_b = mat_to_pose(T_ee_goal) + goal_pos_list.append(pos_b.astype(np.float32)) + goal_quat_list.append(quat_b.astype(np.float32)) + + goal_pos = torch.as_tensor(np.stack(goal_pos_list), dtype=torch.float32, device=self.sim.device) + goal_quat = torch.as_tensor(np.stack(goal_quat_list), dtype=torch.float32, device=self.sim.device) + + if visualize: + self.show_goal(goal_pos, goal_quat) + ee_pos_b, ee_quat_b = subtract_frame_transforms( + root_w[:, :3], root_w[:, 3:7], goal_pos, goal_quat + ) + joint_pos, success = self.move_to(ee_pos_b, ee_quat_b, gripper_open=False, record = self.record) + self.save_dict["actions"].append(np.concatenate([ee_pos_b.cpu().numpy(), ee_quat_b.cpu().numpy(), np.ones((B, 1))], axis=1)) + + is_grasp_success = self.is_grasp_success() + for b in range(B): + if self.final_gripper_state_list[b]: + self.wait(gripper_open=False, steps=10, record = self.record) + else: + self.wait(gripper_open=True, steps=10, record = self.record) + + return joint_pos, is_grasp_success + + + def follow_object_centers(self, start_joint_pos, sample_step=1, visualize=True): + B = self.scene.num_envs + obj_goal_all = self.obj_goal_traj_w # [B, T, 4, 4] + T = obj_goal_all.shape[1] + + ee_w = self.robot.data.body_state_w[:, self.robot_entity_cfg.body_ids[0], 0:7] # [B,7] + # obj_w = self.object_prim.data.root_com_state_w[:, :7] # [B,7] + obj_w = self.object_prim.data.root_state_w[:, :7] # [B,7] + + T_ee_in_obj = [] + for b in range(B): + T_ee_w = pose_to_mat(ee_w[b, :3], ee_w[b, 3:7]) + T_obj_w = pose_to_mat(obj_w[b, :3], obj_w[b, 3:7]) + T_ee_in_obj.append((np.linalg.inv(T_obj_w) @ T_ee_w).astype(np.float32)) + + joint_pos = start_joint_pos + root_w = self.robot.data.root_state_w[:, 0:7] # robot base poses per env + + t_iter = list(range(0, T, sample_step)) + t_iter = t_iter + [T-1] if t_iter[-1] != T-1 else t_iter + + for t in t_iter: + goal_pos_list, goal_quat_list = [], [] + print(f"[INFO] follow object goal step {t}/{T}") + for b in range(B): + T_obj_goal = obj_goal_all[b, t] # (4,4) + T_ee_goal = T_obj_goal @ T_ee_in_obj[b] # (4,4) + pos_b, quat_b = mat_to_pose(T_ee_goal) + + goal_pos_list.append(pos_b.astype(np.float32)) + goal_quat_list.append(quat_b.astype(np.float32)) + + goal_pos = torch.as_tensor(np.stack(goal_pos_list), dtype=torch.float32, device=self.sim.device) + goal_quat = ee_w[:, 3:7] + + if visualize: + self.show_goal(goal_pos, goal_quat) + ee_pos_b, ee_quat_b = subtract_frame_transforms( + root_w[:, :3], root_w[:, 3:7], goal_pos, goal_quat + ) + joint_pos, success = self.move_to(ee_pos_b, 
ee_quat_b, gripper_open=False, record = self.record)
+            self.save_dict["actions"].append(np.concatenate([ee_pos_b.cpu().numpy(), ee_quat_b.cpu().numpy(), np.ones((B, 1))], axis=1))
+        is_grasp_success = self.is_grasp_success()
+        for b in range(B):
+            if self.final_gripper_state_list[b]:
+                self.wait(gripper_open=False, steps=10, record = self.record)
+            else:
+                self.wait(gripper_open=True, steps=10, record = self.record)
+
+        return joint_pos, is_grasp_success
+
+
+    def is_success(self):
+        obj_w = self.object_prim.data.root_state_w[:, :7]
+        origins = self.scene.env_origins
+        obj_w[:, :3] -= origins
+        obj_pose_l = pose_to_mat(obj_w[:, :3], obj_w[:, 3:7])
+        goal_pose_l = np.array(self.end_pose_list, dtype=np.float32)
+        goal_pose_l = pose_to_mat(goal_pose_l[:, :3], goal_pose_l[:, 3:7])
+        trans_dist, angle = pose_distance(obj_pose_l, goal_pose_l)
+        print(f"[INFO] trans_dist: {trans_dist}, angle: {angle}")
+        if self.task_type == "simple_pick_place" or self.task_type == "simple_pick":
+            is_success = trans_dist < 0.10
+        elif self.task_type == "targetted_pick_place":
+            is_success = (trans_dist < 0.10) & (angle < np.radians(10))
+        else:
+            raise ValueError(f"[ERR] Invalid task type: {self.task_type}")
+        return is_success.cpu().numpy()
+
+    def is_grasp_success(self):
+        ee_w = self.robot.data.body_state_w[:, self.robot_entity_cfg.body_ids[0], 0:7]
+        obj_w = self.object_prim.data.root_com_pos_w[:, 0:3]
+        dist = torch.norm(obj_w[:, :3] - ee_w[:, :3], dim=1)  # [B]
+        print(f"[INFO] dist: {dist}")
+        return (dist < 0.15).cpu().numpy()
+
+
+    def viz_object_goals(self, sample_step=1, hold_steps=20):
+        self.reset()
+        self.wait(gripper_open=True, steps=10, record = False)
+        B = self.scene.num_envs
+        env_ids = torch.arange(B, device=self.object_prim.device, dtype=torch.long)
+        goals = self.obj_goal_traj_w
+        t_iter = list(range(0, goals.shape[1], sample_step))
+        t_iter = t_iter + [goals.shape[1]-1] if t_iter[-1] != goals.shape[1]-1 else t_iter
+        t_iter = t_iter[-1:]
+        for t in t_iter:
+            print(f"[INFO] viz object goal step {t}/{goals.shape[1]}")
+            pos_list, quat_list = [], []
+            for b in range(B):
+                pos, quat = mat_to_pose(goals[b, t])
+                pos_list.append(pos.astype(np.float32))
+                quat_list.append(quat.astype(np.float32))
+            pose = self.object_prim.data.root_state_w[:, :7]
+            # pose = self.object_prim.data.root_com_state_w[:, :7]
+            pose[:, :3] = torch.tensor(np.stack(pos_list), dtype=torch.float32, device=pose.device)
+            pose[:, 3:7] = torch.tensor(np.stack(quat_list), dtype=torch.float32, device=pose.device)
+            self.show_goal(pose[:, :3], pose[:, 3:7])
+
+            for _ in range(hold_steps):
+                self.object_prim.write_root_pose_to_sim(pose, env_ids=env_ids)
+                self.object_prim.write_data_to_sim()
+                self.step()
+
+    # ---------- Helpers ----------
+    def _to_base(self, pos_w: np.ndarray | torch.Tensor, quat_w: np.ndarray | torch.Tensor):
+        """World → robot base frame for all envs."""
+        root = self.robot.data.root_state_w[:, 0:7]  # [B,7]
+        p_w, q_w = self._ensure_batch_pose(pos_w, quat_w)
+        pb, qb = subtract_frame_transforms(
+            root[:, 0:3], root[:, 3:7], p_w, q_w
+        )
+        return pb, qb  # [B,3], [B,4]
+
+    # ---------- Batched execution & lift-check ----------
+    def build_grasp_info(
+        self,
+        grasp_pos_w_batch: np.ndarray,   # (B,3) GraspNet proposal in world frame
+        grasp_quat_w_batch: np.ndarray,  # (B,4) wxyz
+        pregrasp_pos_w_batch: np.ndarray,
+        pregrasp_quat_w_batch: np.ndarray,
+
+    ) -> dict:
+        """
+        Returns the same structure as _build_info, but each env's grasp can be different.
+        """
+        B = self.scene.num_envs
+        p_w = np.asarray(grasp_pos_w_batch, dtype=np.float32).reshape(B, 3)
+
q_w = np.asarray(grasp_quat_w_batch, dtype=np.float32).reshape(B, 4) + pre_p_w = np.asarray(pregrasp_pos_w_batch, dtype=np.float32).reshape(B, 3) + pre_q_w = np.asarray(pregrasp_quat_w_batch, dtype=np.float32).reshape(B, 4) + + + origins = self.scene.env_origins.detach().cpu().numpy().astype(np.float32) # (B,3) + pre_p_w = pre_p_w + origins + p_w = p_w + origins + + pre_pb, pre_qb = self._to_base(pre_p_w, pre_q_w) + pb, qb = self._to_base(p_w, q_w) + + return { + "pre_p_w": pre_p_w, "p_w": p_w, "q_w": q_w, + "pre_p_b": pre_pb, "pre_q_b": pre_qb, + "p_b": pb, "q_b": qb, + } + + + def inference(self, std: float = 0.0) -> list[int]: + """ + Main function of the heuristic manipulation policy. + Physical trial-and-error grasping with approach-axis perturbation: + • Multiple grasp proposals executed in parallel; + • Every attempt does reset → pre → grasp → close → lift → check; + • Early stops when any env succeeds; then re-exec for logging. + """ + B = self.scene.num_envs + + self.wait(gripper_open=True, steps=10, record = self.record) + + + # reset and conduct main process: open→pre→grasp→close→follow_object_goals + self.reset() + #self.viz_object_goals() + + + cam_p = self.camera.data.pos_w + cam_q = self.camera.data.quat_w_ros + gp_w = torch.as_tensor(np.array(self.grasp_pose_list, dtype=np.float32)[:,:3], dtype=torch.float32, device=self.sim.device) + gq_w = torch.as_tensor(np.array(self.grasp_pose_list, dtype=np.float32)[:,3:7], dtype=torch.float32, device=self.sim.device) + pre_w = torch.as_tensor(np.array(self.pregrasp_pose_list, dtype=np.float32)[:,:3], dtype=torch.float32, device=self.sim.device) + gp_cam, gq_cam = subtract_frame_transforms(cam_p, cam_q, gp_w, gq_w) + pre_cam, pre_qcm = subtract_frame_transforms(cam_p, cam_q, pre_w, gq_w) + self.save_dict["grasp_pose_cam"] = torch.cat([gp_cam, gq_cam], dim=1).unsqueeze(0).cpu().numpy() + self.save_dict["pregrasp_pose_cam"] = torch.cat([pre_cam, pre_qcm], dim=1).unsqueeze(0).cpu().numpy() + + jp = self.robot.data.joint_pos[:, self.robot_entity_cfg.joint_ids] + self.wait(gripper_open=True, steps=4, record = self.record) + + # pre → grasp + info_all = self.build_grasp_info(gp_w.cpu().numpy(), gq_w.cpu().numpy(), pre_w.cpu().numpy(), gq_w.cpu().numpy()) + jp, success = self.move_to(info_all["pre_p_b"], info_all["pre_q_b"], gripper_open=True, record = self.record) + if torch.any(success==False): return [] + self.save_dict["actions"].append(np.concatenate([info_all["pre_p_b"].cpu().numpy(), info_all["pre_q_b"].cpu().numpy(), np.zeros((B, 1))], axis=1)) + jp = self.wait(gripper_open=True, steps=3, record = self.record) + + jp, success = self.move_to(info_all["p_b"], info_all["q_b"], gripper_open=True, record = self.record) + if torch.any(success==False): return [] + self.save_dict["actions"].append(np.concatenate([info_all["p_b"].cpu().numpy(), info_all["q_b"].cpu().numpy(), np.zeros((B, 1))], axis=1)) + + # close gripper + jp = self.wait(gripper_open=False, steps=50, record = self.record) + self.save_dict["actions"].append(np.concatenate([info_all["p_b"].cpu().numpy(), info_all["q_b"].cpu().numpy(), np.ones((B, 1))], axis=1)) + + # object goal following + # self.lift_up(height=0.05, gripper_open=False) + if self.task_type == "simple_pick_place" or self.task_type == "simple_pick": + jp, is_success = self.follow_object_centers(jp, sample_step=1, visualize=True) + elif self.task_type == "targetted_pick_place": + jp, is_success = self.follow_object_goals(jp, sample_step=1, visualize=True) + else: + raise ValueError(f"[ERR] Invalid task type: 
{self.task_type}") + #jp = self.follow_object_goals(jp, sample_step=1, visualize=True) + + is_success = is_success & self.is_success() + # Arrange the output: we want to collect only the successful env ids as a list. + is_success = torch.tensor(is_success, dtype=torch.bool, device=self.sim.device) + success_env_ids = torch.where(is_success)[0].cpu().numpy().tolist() + + print(f"[INFO] success_env_ids: {success_env_ids}") + if self.record: + self.save_data(ignore_keys=["segmask", "depth"], env_ids=success_env_ids) + + return success_env_ids + + def run_batch_trajectory(self, traj_cfg_list: List[TrajectoryCfg]): + self.traj_cfg_list = traj_cfg_list + self.compute_components() + self.compute_object_goal_traj() + + return self.inference() + + + +# ──────────────────────────── Entry Point ──────────────────────────── + + + + +def sim_randomize_rollout(keys: list[str], args_cli: argparse.Namespace): + for key in keys: + success_trajectory_config_list = [] + total_require_traj_num = 50 + task_json_path = BASE_DIR / "tasks" / key / "task.json" + task_cfg = load_task_cfg(task_json_path) + randomizer = Randomizer(task_cfg) + random_task_cfg_list = randomizer.generate_randomized_scene_cfg(grid_dist=0.0 , grid_num=3, angle_random_range=1/20 * np.pi, angle_random_num=10, traj_randomize_num=10, scene_randomize_num=1, robot_pose_randomize_range=0, robot_pose_randomize_angle=np.pi/180, robot_pose_randomize_num=10) + + args_cli.key = key + sim_cfgs = load_sim_parameters(BASE_DIR, key) + + demo_dir = BASE_DIR / "tasks" / key / "demos" + data_dir = BASE_DIR / "data" / key + current_timestep = 0 + env, _ = make_env( + cfgs=sim_cfgs, num_envs=3, + device=args_cli.device, + bg_simplify=False, + ) + sim, scene = env.sim, env.scene + bg_rgb_path = task_cfg.bg_rgb_path + + bg_rgb = imageio.imread(bg_rgb_path) + my_sim = RandomizeExecution(sim, scene, sim_cfgs=sim_cfgs, demo_dir=demo_dir, data_dir=data_dir, record=True, args_cli=args_cli, bg_rgb=bg_rgb) + my_sim.task_cfg = task_cfg + while len(success_trajectory_config_list) < total_require_traj_num: + traj_cfg_list = random_task_cfg_list[current_timestep: current_timestep + 3] + current_timestep += 3 + + success_env_ids = my_sim.run_batch_trajectory(traj_cfg_list) + + env.close() + if len(success_env_ids) > 0: + for env_id in success_env_ids: + success_trajectory_config_list.append(traj_cfg_list[env_id]) + + print(f"[INFO] success_trajectory_config_list: {len(success_trajectory_config_list)}") + print(total_require_traj_num) + + # for timestep in range(len(success_trajectory_config_list),10): + # traj_cfg_list = random_task_cfg_list[timestep: min(timestep + 10, len(random_task_cfg_list))] + # my_sim = RandomizeExecution(sim, scene, sim_cfgs=sim_cfgs, traj_cfg_list=traj_cfg_list, demo_dir=demo_dir, record=True) + # success_env_ids = my_sim.inference() + # del my_sim + # torch.cuda.empty_cache() + + + return success_trajectory_config_list + diff --git a/openreal2sim/simulation/isaaclab/sim_utils/calibration_utils.py b/openreal2sim/simulation/isaaclab/sim_utils/calibration_utils.py old mode 100644 new mode 100755 diff --git a/openreal2sim/simulation/isaaclab/sim_utils/sim_utils.py b/openreal2sim/simulation/isaaclab/sim_utils/sim_utils.py old mode 100644 new mode 100755 index 08dc60f..062a3dc --- a/openreal2sim/simulation/isaaclab/sim_utils/sim_utils.py +++ b/openreal2sim/simulation/isaaclab/sim_utils/sim_utils.py @@ -9,7 +9,7 @@ from sim_utils.calibration_utils import calibration_to_robot_pose, load_extrinsics def load_sim_parameters(basedir, key) -> dict: - 
scene_json_path = Path(basedir) / "outputs" / key / "scene" / "scene.json" + scene_json_path = Path(basedir) / "outputs" / key / "simulation" / "scene.json" scene_json = json.load(open(scene_json_path, "r")) exp_config = yaml.load(open(Path(basedir) / "config/config.yaml"), Loader=yaml.FullLoader) exp_config = exp_config["local"][key]["simulation"] @@ -44,23 +44,24 @@ def load_sim_parameters(basedir, key) -> dict: } # demo configs + goal_offset = exp_config.get("goal_offset", 0) grasp_idx = exp_config.get("grasp_idx", -1) grasp_pre = exp_config.get("grasp_pre", None) grasp_delta = exp_config.get("grasp_delta", None) - traj_key = exp_config.get("traj_key", "fdpose_trajs") # "fdpose_trajs", "simple_trajs", "hybrid_trajs" - manip_object_id = exp_config.get("manip_object_id", "1") - traj_path = scene_json["objects"][manip_object_id][traj_key] + manip_object_id = scene_json["manipulated_oid"] + traj_path = scene_json["objects"][manip_object_id]["final_trajs"] grasp_path = scene_json["objects"][manip_object_id].get("grasps", None) + final_gripper_closed = scene_json["final_gripper_closed"] demo_cfg = { "manip_object_id": manip_object_id, - "traj_key": traj_key, "traj_path": traj_path, "goal_offset": goal_offset, "grasp_idx": grasp_idx, "grasp_pre": grasp_pre, "grasp_delta": grasp_delta, "grasp_path": grasp_path, + "final_gripper_closed": final_gripper_closed, } # physics configs diff --git a/openreal2sim/simulation/isaaclab/sim_utils/transform_utils.py b/openreal2sim/simulation/isaaclab/sim_utils/transform_utils.py old mode 100644 new mode 100755 diff --git a/openreal2sim/simulation/isaaclab/task_interface.py b/openreal2sim/simulation/isaaclab/task_interface.py new file mode 100755 index 0000000..e69de29 diff --git a/openreal2sim/simulation/modules/demo_motion_process.py b/openreal2sim/simulation/modules/demo_motion_process.py new file mode 100755 index 0000000..6566aee --- /dev/null +++ b/openreal2sim/simulation/modules/demo_motion_process.py @@ -0,0 +1,392 @@ +#### + +### assign object types, determine start and end frame, downsample traj, compute stacking sequence and assign mask. 
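+### Example (illustrative, made-up numbers): downsample_traj() below keeps frame 0 and then any
+### frame whose pose differs from the last kept frame by >= trans_threshold or >= rot_threshold.
+### With trans_threshold = 0.03 m and object positions at 0, 1, 4, 5, 9 cm along one axis,
+### the kept indices are [0, 2, 4]: frame 2 is the first to move >= 3 cm from frame 0, and the
+### final frame is appended as its own keyframe because it moved a further 5 cm past frame 2.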
+### TODO: also bring in an "ending" mesh here. That's all.
+import yaml
+import numpy as np
+from pyquaternion import Quaternion
+from pathlib import Path
+import pickle
+import cv2
+import trimesh
+import sys
+import json
+base_dir = Path.cwd()
+sys.path.append(str(base_dir / "openreal2sim" / "simulation"))
+
+def recompute_traj(traj, c2w, extrinsics):
+    for i in range(len(traj)):
+        traj[i] = c2w @ np.linalg.inv(extrinsics[0]) @ extrinsics[i] @ np.linalg.inv(c2w) @ traj[i]
+    return traj
+
+def recompute_all_traj(scene_json_dict, scene_dict):
+    for obj_id, obj in scene_json_dict["objects"].items():
+        for key in ('fdpose_trajs', 'simple_trajs', 'hybrid_trajs'):
+            traj_path = obj[key]
+            traj = np.load(traj_path)
+            traj = traj.reshape(-1, 4, 4)
+            c2w = np.array(scene_json_dict["camera"]["camera_opencv_to_world"]).astype(np.float64).reshape(4,4)
+            extrinsics = np.array(scene_dict["extrinsics"]).astype(np.float64).reshape(-1,4,4)
+            traj = recompute_traj(traj, c2w, extrinsics)
+            new_traj_path = traj_path.replace(".npy", "_recomputed.npy").replace("reconstruction", "simulation").replace("motions","")
+            scene_json_dict["objects"][obj_id][f"{key}_recomputed"] = str(new_traj_path)
+            np.save(new_traj_path, traj)
+
+
+def get_mask_from_frame(scene_dict, frame_index, oid):
+    mask = scene_dict["mask"][frame_index][oid]["mask"]
+    return mask
+
+
+def get_object_bbox(mesh_in_path):
+    mesh = trimesh.load(mesh_in_path)
+    bbox = mesh.bounds
+    return bbox
+
+
+
+def lift_traj(traj, height=0.02):
+    new_traj = []
+    for i in range(len(traj)):
+        new_traj.append(traj[i])
+    for i in range(1, len(new_traj) - 1):
+        new_traj[i][2][3] += height
+    return new_traj
+
+def determine_stacking_bbox(bbox_0, bbox_1):
+    """
+    Determine if two bboxes have intersection on x and y axes.
+
+    bbox_0, bbox_1: np.ndarray, (2,3) or (2,2) or (2,?) where axis 0 is (min,max) and axis 1 is x,y,(z)
+
+    Returns: True if x and y intervals overlap (i.e. stacking is possible), False otherwise
+
+    Generally this is enough for start frame.
+    """
+    # Extract x/y intervals
+    x0_min, x0_max = bbox_0[0][0], bbox_0[1][0]
+    y0_min, y0_max = bbox_0[0][1], bbox_0[1][1]
+    x1_min, x1_max = bbox_1[0][0], bbox_1[1][0]
+    y1_min, y1_max = bbox_1[0][1], bbox_1[1][1]
+    z0_min, z0_max = bbox_0[0][2], bbox_0[1][2]
+    z1_min, z1_max = bbox_1[0][2], bbox_1[1][2]
+
+    x_overlap = not (x0_max < x1_min or x1_max < x0_min)
+    y_overlap = not (y0_max < y1_min or y1_max < y0_min)
+    if x_overlap and y_overlap:
+        return True
+
+    return False
+
+
+
+def load_obj_masks(data: dict):
+    """
+    Return object list for frame-0:
+        [{'mask': bool array, 'name': name, 'bbox': (x1,y1,x2,y2)}, ...]
+ Filter out names: 'ground' / 'hand' / 'robot' + """ + frame_objs = data.get(0, {}) # only frame 0 + objs = [] + for oid, item in frame_objs.items(): + name = item["name"] + if name in ("ground", "hand", "robot"): + continue + objs.append({ + "oid": oid, + "mask": item["mask"].astype(bool), + "name": name, + "bbox": item["bbox"] # used for cropping + }) + # Keep original behavior: sort by mask area (desc) + objs.sort(key=lambda x: int(x["oid"])) + return objs + + + +def pose_distance(T1, T2): + # Translation distance + t1 = T1[:3, 3] + t2 = T2[:3, 3] + trans_dist = np.linalg.norm(t2 - t1) + # Rotation difference (angle) + R1 = T1[:3, :3] + R2 = T2[:3, :3] + dR = R2 @ R1.T + # numerical errors can make the trace slightly out of bounds + cos_angle = (np.trace(dR) - 1) / 2 + cos_angle = np.clip(cos_angle, -1.0, 1.0) + angle = np.arccos(cos_angle) + return trans_dist, angle + +def build_mask_array( + oid: int, + scene_dict: dict[str, any] +) -> np.ndarray: + + H, W, N = scene_dict["height"], scene_dict["width"], scene_dict["n_frames"] + out = np.zeros((N, H, W), dtype=np.uint8) + for i in range(N): + m = scene_dict["mask"][i][oid]["mask"] + out[i] = m.astype(np.uint8) + return out + + + +def downsample_traj(trajectory, trans_threshold=0.05, rot_threshold=np.radians(20)): + """ + Downsample pose trajectory where each element is a 4x4 transformation matrix (SE3). + Retains keyframes with sufficient translation or rotation from the previous kept frame. + Returns: list of indices for retained frames. + """ + if len(trajectory) <= 2: + return list(range(len(trajectory))) + + + downsampled_indices = [0] + prev_idx = 0 + for i in range(1, len(trajectory) - 1): + trans_dist, angle = pose_distance(trajectory[prev_idx], trajectory[i]) + if trans_dist >= trans_threshold or angle >= rot_threshold: + downsampled_indices.append(i) + prev_idx = i + if len(downsampled_indices) > 0: + last_kept_idx = downsampled_indices[-1] + trans_dist, angle = pose_distance(trajectory[last_kept_idx], trajectory[-1]) + if trans_dist > 0.6 * trans_threshold or angle > 0.6 * rot_threshold: + downsampled_indices.append(len(trajectory) - 1) + elif trans_dist > 0.1 * trans_threshold or angle > 0.1 * rot_threshold: + downsampled_indices[-1] = len(trajectory) - 1 + + return downsampled_indices + +def refine_end_frame(hand_masks, object_masks): + N = min(len(hand_masks), len(object_masks)) + kernel = np.ones((10, 10), np.uint8) + for i in range(N-1, -1, -1): + if hand_masks[i] is None: + continue + hand_mask = hand_masks[i].astype(np.uint8) + obj_mask = object_masks[i].astype(np.uint8) + obj_mask_dilated = cv2.dilate(obj_mask, kernel, iterations=1) + overlap = (hand_mask & obj_mask_dilated).sum() + if overlap > 0: + return i + 1 + return 0 + + +def demo_motion_process(keys, key_scene_dicts, key_cfgs): + base_dir = Path.cwd() + for key in keys: + scene_dict = key_scene_dicts[key] + scene_json_dict_path = base_dir / f"outputs/{key}/simulation/scene.json" + with open(scene_json_dict_path, "r") as f: + scene_json_dict = json.load(f) + recompute_all_traj(scene_json_dict, scene_dict) + key_cfg = key_cfgs[key] + max_placement_oid = None + max_placement_distance = 0.0 + placement_distances = {} + fd_traj_dict = {} + for obj_id, obj in scene_json_dict["objects"].items(): + obj_traj = obj["simple_trajs_recomputed"] + obj_traj = np.load(obj_traj) + obj_traj = obj_traj.reshape(-1, 4, 4) + abs_distance, _ = pose_distance(obj_traj[0], obj_traj[-1]) + fd_traj = obj["fdpose_trajs_recomputed"] + fd_traj = np.load(fd_traj) + fd_traj = 
fd_traj.reshape(-1, 4, 4)
+            fd_dist, fd_angle = pose_distance(fd_traj[0], fd_traj[-1])
+            fd_traj_dict[obj_id] = {
+                "fd_distance": fd_dist,
+                "fd_angle": fd_angle
+            }
+            print(f"Object: {obj_id}, distance: {abs_distance}")
+            if abs_distance > max_placement_distance:
+                max_placement_distance = abs_distance
+                max_placement_oid = obj_id
+            scene_json_dict["objects"][obj_id]["type"] = "static"
+            placement_distances[obj_id] = abs_distance
+        scene_json_dict["manipulated_oid"] = max_placement_oid
+        name = scene_json_dict["objects"][max_placement_oid]["name"]
+        scene_json_dict["objects"][max_placement_oid]["type"] = "manipulated"
+        print(f"Manipulated object: {max_placement_oid}, distance: {max_placement_distance}")
+        manipulated_fd_trajs_path = scene_json_dict["objects"][max_placement_oid]["fdpose_trajs_recomputed"]
+        manipulated_fd_trajs = np.load(manipulated_fd_trajs_path)
+        angle_displacement = [pose_distance(manipulated_fd_trajs[i], manipulated_fd_trajs[i+1])[1] for i in range(len(manipulated_fd_trajs)-1)]
+        angle_displacement_sum = sum(angle_displacement)
+
+        fd_traj_dict.pop(max_placement_oid)
+        traj_key = 'fdpose_trajs_recomputed'
+        if len(fd_traj_dict.keys()) > 0:
+            max_fd_distance = max(x["fd_distance"] for x in fd_traj_dict.values())
+            max_fd_angle = max(x["fd_angle"] for x in fd_traj_dict.values())
+        else:
+            max_fd_distance = 0.0
+            max_fd_angle = 0.0
+        if np.degrees(angle_displacement_sum) > 180 or max_fd_angle > np.radians(20):
+            traj_key = 'simple_trajs_recomputed'
+        else:
+            if max_fd_distance > 0.05:
+                traj_key = 'hybrid_trajs_recomputed'
+        print(f"Traj key: {traj_key}")
+        scene_json_dict["traj_key"] = traj_key
+        manipulated_trajs_path = scene_json_dict["objects"][max_placement_oid][traj_key]
+        manipulated_trajs = np.load(manipulated_trajs_path)
+        object_masks = build_mask_array(int(max_placement_oid), scene_dict)
+        hand_masks = scene_dict["simulation"]["hand_masks"]
+        end_frame = refine_end_frame(hand_masks, object_masks)
+        print(f"End frame: {end_frame}")
+        trajs = manipulated_trajs
+        if end_frame == len(manipulated_trajs) - 1:
+            scene_json_dict["gripper_closed"] = True
+        elif end_frame == 0:
+            if np.abs(manipulated_trajs[0][2][3]) < 0.03:
+                scene_json_dict["gripper_closed"] = False
+            else:
+                scene_json_dict["gripper_closed"] = True
+        else:
+            scene_json_dict["gripper_closed"] = False
+        #trajs = manipulated_trajs[:end_frame]
+
+        # require_calib = False
+        # placement_distances = {}
+        # for obj_id, distance in placement_distances.items():
+        #     if obj_id == max_placement_oid:
+        #         placement_distances.pop(obj_id)
+        #         break
+        # for obj_id, distance in placement_distances.items():
+        #     if distance > 0.05:
+        #         require_calib = True
+        #         break
+        # if require_calib:
+        #     new_manip_traj = []
+        #     calib_goal_id = np.argmax(placement_distances.values())
+        #     calib_goal_oid = list(placement_distances.keys())[calib_goal_id]
+        #     calib_trajs_path = scene_dict["info"]["objects"][calib_goal_oid]["fdpose_trajs"]
+        #     calib_trajs = np.load(calib_trajs_path)
+        #     calib_ref_pose = calib_trajs[0].reshape(4, 4)  # reference pose of the calib object (ideal static state)
+        #     for (single_manip_traj, single_calib_traj) in zip(manipulated_trajs, calib_trajs):
+        #         single_manip_traj = single_manip_traj.reshape(4, 4)
+        #         single_calib_traj = single_calib_traj.reshape(4, 4)
+        #         # Apply the object's transform relative to the current calib pose onto the original calib pose (removes calib drift)
+        #         # Steps: 1) compute the object's transform relative to the current calib: inv(single_calib_traj) @ single_manip_traj
+        #         #        2) apply this relative transform to the reference calib: calib_ref_pose @ (relative transform)
+        #         # import pdb; pdb.set_trace()
+        #         new_single_traj = calib_ref_pose @ np.linalg.inv(single_calib_traj) @ single_manip_traj
+
# new_manip_traj.append(new_single_traj) + # new_manip_traj = np.array(new_manip_traj) + # np.save(manipulated_trajs_path, new_manip_traj) + # trajs = new_manip_traj[:end_frame] + # else: + # trajs = manipulated_trajs[:end_frame] + downsampled_indices = downsample_traj(trajs, trans_threshold=0.03, rot_threshold=np.radians(20)) + scene_json_dict["chosen_indices"] = downsampled_indices + trajs = trajs[downsampled_indices] + + + trajs = lift_traj(trajs, height=0.02) + print(f"Downsampled indices: {[downsampled_indices]}") + downsampled_traj_path = base_dir / f"outputs/{key}/simulation" / f"{max_placement_oid}_final_traj.npy" + + + print(f"Trajs: {trajs[-1]}") + np.save(downsampled_traj_path, trajs) + scene_json_dict["objects"][max_placement_oid]["final_trajs"] = str(downsampled_traj_path) + scene_json_dict["start_frame_idx"] = downsampled_indices[1] + + end_pose = trajs[-1] + mesh_in_path = scene_json_dict["objects"][max_placement_oid]["optimized"] + mesh = trimesh.load(mesh_in_path) + mesh.apply_transform(end_pose) + path = base_dir / f"outputs/{key}/simulation/{max_placement_oid}_{name}_end.glb" + mesh.export(path) + scene_json_dict["objects"][max_placement_oid]["end_mesh"] = str(path) + bbox_end = get_object_bbox(path) + + start_related = [] + end_related = [] + mesh_in_path = scene_json_dict["objects"][max_placement_oid]["optimized"] + bbox_start = get_object_bbox(mesh_in_path) + for oid, obj in scene_json_dict["objects"].items(): + if not isinstance(oid, int): + continue + if oid == max_placement_oid: + continue + mesh_in_path = obj["optimized"] + bbox = get_object_bbox(mesh_in_path) + if determine_stacking_bbox(bbox_start, bbox): + start_related.append(oid) + if determine_stacking_bbox(bbox_end, bbox): + end_related.append(oid) + + print(f"Start related: {start_related}") + print(f"End related: {end_related}") + + scene_json_dict["start_related"] = list(start_related) + scene_json_dict["end_related"] = list(end_related) + + scene_path = scene_json_dict["scene_mesh"]["optimized"] + scene_mesh = trimesh.load(scene_path) + scene_mesh = scene_mesh + mesh + path = base_dir / f"outputs/{key}/simulation/scene_end.glb" + scene_mesh.export(path) + + + + if len(end_related) > 0: + target_oid = end_related[0] + scene_json_dict["target_oid"] = target_oid + scene_json_dict["task_type"] = "targetted_pick_place" + if len(scene_json_dict["task_desc"]) > 0: + task_desc = "Pick up the " + name + " and place it on the " + target_oid + "." + if task_desc not in scene_json_dict["task_desc"]: + scene_json_dict["task_desc"].append(task_desc) + else: + task_desc = "Pick up the " + name + " and place it on the " + target_oid + "." + if task_desc not in scene_json_dict["task_desc"]: + scene_json_dict["task_desc"] = [task_desc] + else: + if bbox_end[0][2] < 0.03: + scene_json_dict["task_type"] = "simple_pick_place" + task_desc = "Pick up the " + name + " and place it on the ground." + if len(scene_json_dict.get("task_desc", [])) == 0: + scene_json_dict["task_desc"] = [task_desc] + if task_desc not in scene_json_dict["task_desc"]: + scene_json_dict["task_desc"].append(task_desc) + else: + scene_json_dict["task_type"] = "simple_pick" + task_desc = "Pick up the " + name + "." 
+ if len(scene_json_dict.get("task_desc", [])) == 0: + scene_json_dict["task_desc"] = [task_desc] + if task_desc not in scene_json_dict["task_desc"]: + scene_json_dict["task_desc"].append(task_desc) + + print(f"Task description: {scene_json_dict['task_desc']}") + print(f"Task type: {scene_json_dict['task_type']}") + + with open(base_dir / f"outputs/{key}/simulation/scene.json", "w") as f: + json.dump(scene_json_dict, f, indent=2) + with open(base_dir / f"outputs/{key}/scene/scene.pkl", "wb") as f: + pickle.dump(scene_dict, f) + return key_scene_dicts + + +if __name__ == "__main__": + base_dir = Path.cwd() + cfg_path = base_dir / "config" / "config.yaml" + cfg = yaml.safe_load(cfg_path.open("r")) + keys = cfg["keys"] + from utils.compose_config import compose_configs + key_cfgs = {key: compose_configs(key, cfg) for key in keys} + print(f"Key cfgs: {key_cfgs}") + key_scene_dicts = {} + for key in keys: + scene_pkl = base_dir / f'outputs/{key}/scene/scene.pkl' + with open(scene_pkl, 'rb') as f: + scene_dict = pickle.load(f) + key_scene_dicts[key] = scene_dict + demo_motion_process(keys, key_scene_dicts, key_cfgs) + + + diff --git a/openreal2sim/simulation/modules/envs/randomzier.py b/openreal2sim/simulation/modules/envs/randomzier.py new file mode 100755 index 0000000..fde28fa --- /dev/null +++ b/openreal2sim/simulation/modules/envs/randomzier.py @@ -0,0 +1,348 @@ +import numpy as np +import random +from scipy.spatial.transform import Rotation as R +from scipy.spatial.transform import Slerp +from .task_cfg import TaskCfg, TaskType, SuccessMetric, SuccessMetricType, TrajectoryCfg + +import torch +import transforms3d + +def pose_to_mat(pos, quat): + if torch.is_tensor(pos): pos = pos.cpu().numpy() + if torch.is_tensor(quat): quat = quat.cpu().numpy() + m = np.eye(4, dtype=np.float32) + m[:3, :3] = transforms3d.quaternions.quat2mat(quat) + m[:3, 3] = pos + return m + +def mat_to_pose(mat): + pos = mat[:3, 3] + quat = transforms3d.quaternions.mat2quat(mat[:3, :3]) + return pos, quat + + + +class Randomizer(TaskCfg): + def __init__(self, task_cfg: TaskCfg): + self.task_cfg = task_cfg + + + def generate_randomized_scene_cfg(self, grid_dist: float, grid_num: int, angle_random_range: float, angle_random_num: int, traj_randomize_num:int, scene_randomize_num: int, robot_pose_randomize_range, robot_pose_randomize_angle, robot_pose_randomize_num): + # Step 1: Generate candidate transforms + + candidate_transforms = [] + for i in range(-grid_num, grid_num): + for j in range(-grid_num, grid_num): + random_angles = np.random.uniform(-angle_random_range, angle_random_range, angle_random_num) + for angle in random_angles: + orig = np.eye(4) + orig[0, 3] = i * grid_dist + orig[1, 3] = j * grid_dist + orig[0, 0] = np.cos(angle) + orig[0, 1] = -np.sin(angle) + orig[1, 0] = np.sin(angle) + orig[1, 1] = np.cos(angle) + candidate_transforms.append(orig) + + # Step 2: Generate traj_randomize_num combinations of (start_pose, end_pose) + traj_pose_pairs = [] + for _ in range(traj_randomize_num): + start_pose = random.choice(candidate_transforms) + if self.task_cfg.task_type == TaskType.SIMPLE_PICK: + end_pose = start_pose.copy() # For SIMPLE_PICK, end same as start + else: + end_pose = random.choice(candidate_transforms) + traj_pose_pairs.append((start_pose, end_pose)) + + # Step 3: Generate scene_randomize_num combinations for other objects + other_object_ids = [obj.object_id for obj in self.task_cfg.objects + if obj.object_id != self.task_cfg.manipulated_oid and obj.object_id not in self.task_cfg.start_related and 
obj.object_id not in self.task_cfg.end_related] + + scene_poses_combinations = [] + for _ in range(scene_randomize_num): + # Create list of (oid, pose) pairs + other_object_poses = [(oid, random.choice(candidate_transforms)) + for oid in other_object_ids] + scene_poses_combinations.append(other_object_poses) + + # Step 4: Generate robot_pose_randomize_num random robot poses + # robot_pose format: [x, y, z, w, x, y, z] (position + quaternion wxyz) + ref_traj = self.task_cfg.reference_trajectory + assert ref_traj is not None, "Reference trajectory is not found" + ref_robot_pose = np.array(ref_traj.robot_pose) + robot_pose_mat = pose_to_mat(ref_robot_pose[:3], ref_robot_pose[3:7]) + + robot_poses = [] + for _ in range(robot_pose_randomize_num): + # Random translation within range + random_trans = np.random.uniform( + -robot_pose_randomize_range, + robot_pose_randomize_range, + 3 + ) + random_rot = np.random.uniform( + -robot_pose_randomize_angle, + robot_pose_randomize_angle, + ) + + rotate_matrix = np.eye(4) + rotate_matrix[:3, :3] = R.from_euler('z', random_rot).as_matrix() + new_robot_pose = robot_pose_mat @ rotate_matrix + new_robot_pose[:3, 3] += random_trans + # Combine position and quaternion [x, y, z, w, x, y, z] + pos, quat = mat_to_pose(new_robot_pose) + robot_pose_7d = np.concatenate([pos, quat]) + robot_poses.append(robot_pose_7d.tolist()) + + # Step 5: Generate trajectories for all combinations + generated_trajectories = [] + ref_traj = self.task_cfg.reference_trajectory + for start_pose, end_pose in traj_pose_pairs: + for other_object_poses in scene_poses_combinations: + for robot_pose in robot_poses: + # Generate trajectory for manipulated object + if self.task_cfg.task_type == TaskType.SIMPLE_PICK: + new_traj_mats = [] + for traj_pose_7d in ref_traj.object_trajectory: + pose_mat = pose_to_mat(np.array(traj_pose_7d[:3]), np.array(traj_pose_7d[3:7])) + new_traj_mats.append(start_pose @ pose_mat) + # Convert back to 7D format + new_traj_7d = [] + for mat in new_traj_mats: + pos, quat = mat_to_pose(mat) + new_traj_7d.append(np.concatenate([pos, quat]).tolist()) + + # Transform pregrasp and grasp poses + if ref_traj.pregrasp_pose: + pregrasp_mat = start_pose @ pose_to_mat(np.array(ref_traj.pregrasp_pose[:3]), np.array(ref_traj.pregrasp_pose[3:7])) + pos, quat = mat_to_pose(pregrasp_mat) + pregrasp_pose = np.concatenate([pos, quat]).tolist() + else: + pregrasp_pose = None + + if ref_traj.grasp_pose: + grasp_mat = start_pose @ pose_to_mat(np.array(ref_traj.grasp_pose[:3]), np.array(ref_traj.grasp_pose[3:7])) + pos, quat = mat_to_pose(grasp_mat) + grasp_pose = np.concatenate([pos, quat]).tolist() + else: + grasp_pose = None + else: + # Convert reference trajectory to mat format + ref_traj_mats = [] + for traj_pose_7d in ref_traj.object_trajectory: + ref_traj_mats.append(pose_to_mat(np.array(traj_pose_7d[:3]), np.array(traj_pose_7d[3:7]))) + ref_traj_mats = np.array(ref_traj_mats) + + new_traj_mats = self.compute_new_traj(start_pose, end_pose, ref_traj_mats) + + # Convert back to 7D format + new_traj_7d = [] + for mat in new_traj_mats: + pos, quat = mat_to_pose(mat) + new_traj_7d.append(np.concatenate([pos, quat]).tolist()) + #new_traj_7d = lift_traj(new_traj_7d) + # Transform pregrasp and grasp poses + if ref_traj.pregrasp_pose: + pregrasp_mat = start_pose @ pose_to_mat(np.array(ref_traj.pregrasp_pose[:3]), np.array(ref_traj.pregrasp_pose[3:7])) + pos, quat = mat_to_pose(pregrasp_mat) + pregrasp_pose = np.concatenate([pos, quat]).tolist() + else: + pregrasp_pose = None + + if 
ref_traj.grasp_pose: + grasp_mat = start_pose @ pose_to_mat(np.array(ref_traj.grasp_pose[:3]), np.array(ref_traj.grasp_pose[3:7])) + pos, quat = mat_to_pose(grasp_mat) + grasp_pose = np.concatenate([pos, quat]).tolist() + else: + grasp_pose = None + + # Apply scene randomization to other objects (if needed, update object_poses here) + # other_object_poses is now a list of (oid, pose) pairs + + # Create success metric + + ref_end_pose_mat = pose_to_mat( + np.array(ref_traj.success_metric.end_pose[:3]), + np.array(ref_traj.success_metric.end_pose[3:7]) + ) + end_pose_metric_mat = end_pose @ ref_end_pose_mat + pos, quat = mat_to_pose(end_pose_metric_mat) + end_pose_metric_7d = np.concatenate([pos, quat]).tolist() + + success_metric_type = ref_traj.success_metric.success_metric_type + if success_metric_type == SuccessMetricType.SIMPLE_LIFT: + success_metric = SuccessMetric( + success_metric_type=SuccessMetricType.SIMPLE_LIFT, + lift_height=ref_traj.success_metric.lift_height, + final_gripper_close=ref_traj.success_metric.final_gripper_close, + end_pose=end_pose_metric_7d + ) + elif success_metric_type == SuccessMetricType.TARGET_POINT: + # Convert end_pose from 7D to mat, transform, then back to 7D + success_metric = SuccessMetric( + success_metric_type=SuccessMetricType.TARGET_POINT, + final_gripper_close=ref_traj.success_metric.final_gripper_close, + end_pose=end_pose_metric_7d + ) + elif success_metric_type == SuccessMetricType.TARGET_PLANE: + success_metric = SuccessMetric( + success_metric_type=SuccessMetricType.TARGET_PLANE, + ground_value=ref_traj.success_metric.ground_value, + final_gripper_close=ref_traj.success_metric.final_gripper_close, + end_pose=end_pose_metric_7d + ) + + # Store object poses in 7D format + object_poses = {} + for oid, pose_mat in other_object_poses: + pos, quat = mat_to_pose(pose_mat) + object_poses[oid] = np.concatenate([pos, quat]).tolist() + + # Add start and end related objects + start_pos, start_quat = mat_to_pose(start_pose) + start_pose_7d = np.concatenate([start_pos, start_quat]).tolist() + for oid in self.task_cfg.start_related: + object_poses[oid] = start_pose_7d + + end_pos, end_quat = mat_to_pose(end_pose) + end_pose_7d = np.concatenate([end_pos, end_quat]).tolist() + for oid in self.task_cfg.end_related: + object_poses[oid] = end_pose_7d + + object_poses[self.task_cfg.manipulated_oid] = start_pose_7d + + robot_type = ref_traj.robot_type + generated_trajectories.append(TrajectoryCfg( + robot_pose=robot_pose, + object_poses=object_poses, + object_trajectory=new_traj_7d, + final_gripper_close=ref_traj.final_gripper_close, + pregrasp_pose=pregrasp_pose, + grasp_pose=grasp_pose, + success_metric=success_metric, + robot_type=robot_type + )) + + # Total trajectories = traj_randomize_num * scene_randomize_num * robot_pose_randomize_num + assert len(generated_trajectories) == traj_randomize_num * scene_randomize_num * robot_pose_randomize_num, \ + f"Expected {traj_randomize_num * scene_randomize_num * robot_pose_randomize_num} trajectories, got {len(generated_trajectories)}" + + random.shuffle(generated_trajectories) + return generated_trajectories + + @staticmethod + def compute_new_traj(start_trans: np.ndarray, end_trans: np.ndarray, reference_traj: np.ndarray, + interp_mode: str = 'slerp') -> np.ndarray: + """ + Compute interpolated trajectory using Real2Render2Real's relative shape preservation method. 
+ + This maintains the relative motion pattern of the reference trajectory by normalizing + the progress along the reference and mapping it to the new start and end positions. + + Args: + start_trans: 4x4 transformation matrix from original start to new start + end_trans: 4x4 transformation matrix from original end to new end + reference_traj: Reference trajectory in SE(3), shape (N, 4, 4) or (N, 16) + interp_mode: 'linear' or 'slerp' for rotation interpolation (default: 'slerp') + + Returns: + New interpolated trajectory with same shape as reference_traj + """ + # Ensure reference_traj is in (N, 4, 4) format + if reference_traj.ndim == 2 and reference_traj.shape[1] == 16: + reference_traj = reference_traj.reshape(-1, 4, 4) + elif reference_traj.ndim == 3 and reference_traj.shape[1] == 4 and reference_traj.shape[2] == 4: + reference_traj = reference_traj.copy() + else: + raise ValueError(f"Invalid reference_traj shape: {reference_traj.shape}, expected (N, 4, 4) or (N, 16)") + + N = len(reference_traj) + if N == 0: + return reference_traj.copy() + + # Get start and end poses from reference trajectory + ref_start = reference_traj[0].copy() + ref_end = reference_traj[-1].copy() + + # Compute new start and end poses + new_start_mat = start_trans @ ref_start + new_end_mat = end_trans @ ref_end + + # Convert to 7D format using mat_to_pose + ref_traj_7d = [] + for pose_mat in reference_traj: + pos, quat = mat_to_pose(pose_mat) + ref_traj_7d.append(np.concatenate([pos, quat])) + ref_traj_7d = np.array(ref_traj_7d) + + pos_start, quat_start = mat_to_pose(new_start_mat) + pos_end, quat_end = mat_to_pose(new_end_mat) + + # Initialize new trajectory + new_traj_7d = np.zeros_like(ref_traj_7d) + + # Normalize time steps + t = np.linspace(0, 1, N) + + # Split into position and rotation components + pos_orig = ref_traj_7d[:, :3] + quat_orig = ref_traj_7d[:, 3:7] # wxyz format (from transforms3d) + + ref_start_pos, ref_start_quat = mat_to_pose(ref_start) + ref_end_pos, ref_end_quat = mat_to_pose(ref_end) + + # Interpolate positions: preserve relative shape from reference trajectory + for axis in range(3): + ref_range = ref_end_pos[axis] - ref_start_pos[axis] + if np.abs(ref_range) > 1e-10: + # Normalize progress along reference trajectory + normalized_progress = (pos_orig[:, axis] - ref_start_pos[axis]) / ref_range + # Map to new range + new_traj_7d[:, axis] = pos_start[axis] + (pos_end[axis] - pos_start[axis]) * normalized_progress + else: + # If no change in reference, use direct transformation + new_traj_7d[:, axis] = pos_orig[:, axis] + (pos_start[axis] - ref_start_pos[axis]) + + # Interpolate rotations using SLERP + if interp_mode == 'slerp': + # Use scipy Slerp for spherical linear interpolation + # Convert wxyz to xyzw for scipy + quat_start_xyzw = np.array([quat_start[1], quat_start[2], quat_start[3], quat_start[0]]) + quat_end_xyzw = np.array([quat_end[1], quat_end[2], quat_end[3], quat_end[0]]) + + # Create Slerp interpolator + key_rots = R.from_quat([quat_start_xyzw, quat_end_xyzw]) + key_times = [0, 1] + slerp = Slerp(key_times, key_rots) + + # Interpolate for all time steps + interp_rots = slerp(t) + quat_interp_xyzw = interp_rots.as_quat() + + # Convert back to wxyz format + new_traj_7d[:, 3] = quat_interp_xyzw[:, 3] # w + new_traj_7d[:, 4] = quat_interp_xyzw[:, 0] # x + new_traj_7d[:, 5] = quat_interp_xyzw[:, 1] # y + new_traj_7d[:, 6] = quat_interp_xyzw[:, 2] # z + else: # linear + # Linear interpolation (needs normalization) + for i in range(N): + new_traj_7d[i, 3:7] = (1 - t[i]) * 
quat_start + t[i] * quat_end + new_traj_7d[i, 3:7] /= np.linalg.norm(new_traj_7d[i, 3:7]) + + # Convert back to SE(3) matrices using pose_to_mat + new_traj = [] + for pose_7d in new_traj_7d: + pose_mat = pose_to_mat(pose_7d[:3], pose_7d[3:7]) + new_traj.append(pose_mat) + new_traj = np.array(new_traj) + + return new_traj + + + def select_randomized_cfg(self, train_set_size: int): + return random.choice(self.task_cfg.generated_trajectories[train_set_size:]) + + + + diff --git a/openreal2sim/simulation/modules/envs/task_cfg.py b/openreal2sim/simulation/modules/envs/task_cfg.py new file mode 100755 index 0000000..74dad25 --- /dev/null +++ b/openreal2sim/simulation/modules/envs/task_cfg.py @@ -0,0 +1,86 @@ + +from enum import Enum, auto +from typing import List, Dict, Optional +from dataclasses import dataclass, field + +class SuccessMetricType(Enum): + SIMPLE_LIFT = auto() + TARGET_POINT = auto() + TARGET_PLANE = auto() + +class TaskType(Enum): + SIMPLE_PICK = auto() + SIMPLE_PICK_PLACE = auto() + TARGETTED_PICK_PLACE = auto() + +class RobotType(Enum): + FRANKA = auto() + UR5 = auto() + + +@dataclass +class ObjectCfg: + object_id: int + object_name: str + mesh_path: str + usd_path: str + + +@dataclass +class SuccessMetric: + success_metric_type: SuccessMetricType + final_gripper_close:bool + lift_height: Optional[float] = None + ground_value: Optional[float] = None + end_pose: Optional[List[float]] = None + + +@dataclass +class TrajectoryCfg: + robot_pose: List[float] # quat wxyz + object_poses: Dict[int, List[float]] # quat + object_trajectory: List[List[float]] # quat + final_gripper_close: bool + success_metric: SuccessMetric + pregrasp_pose: Optional[List[float]] = None ### This is eef in world frame. quat + grasp_pose: Optional[List[float]] = None ### This is eef in world frame. quat + robot_type: Optional[RobotType] = None + + + +@dataclass +class CameraInfo: + width: float + height: float + fx: float + fy: float + cx: float + cy: float + camera_opencv_to_world: List[List[float]] + camera_position: List[float] + camera_heading_wxyz: List[float] + +@dataclass +class TaskCfg: + task_key: str + task_id: int + task_desc: List[str] + task_type: TaskType + bg_rgb_path: str + background_path: str + background_usd_path: str + camera_info: CameraInfo + manipulated_oid: int + start_related: List[int] + end_related: List[int] + objects: List[ObjectCfg] + reference_trajectory: Optional[TrajectoryCfg] = None + generated_trajectories: List[TrajectoryCfg] = field(default_factory=list) + + + + + + + + diff --git a/openreal2sim/simulation/modules/envs/task_construct.py b/openreal2sim/simulation/modules/envs/task_construct.py new file mode 100644 index 0000000..cbca7cc --- /dev/null +++ b/openreal2sim/simulation/modules/envs/task_construct.py @@ -0,0 +1,227 @@ +### construct the task config from scene dict. 
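+### Illustrative example (field values made up): the dataclasses in task_cfg.py keep every pose
+### in 7-D [x, y, z, qw, qx, qy, qz] form, e.g.
+###   metric = SuccessMetric(success_metric_type=SuccessMetricType.TARGET_POINT,
+###                          final_gripper_close=False, end_pose=[0.4, 0.0, 0.02, 1, 0, 0, 0])
+###   traj = TrajectoryCfg(robot_pose=[0, 0, 0, 1, 0, 0, 0],
+###                        object_poses={1: [0.4, 0.1, 0.02, 1, 0, 0, 0]},
+###                        object_trajectory=[[0.4, 0.1, 0.02, 1, 0, 0, 0]],
+###                        final_gripper_close=False, success_metric=metric)
+### serialize_task_cfg() / load_task_cfg() below round-trip such configs through task.json.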
+ +import json +import numpy as np +from enum import Enum +import shutil +from pathlib import Path +import os +import sys +file_path = Path(__file__).resolve() +sys.path.append(str(file_path.parent)) +from .task_cfg import TaskCfg, TaskType, ObjectCfg, CameraInfo, TrajectoryCfg, SuccessMetric, SuccessMetricType, RobotType + +def get_next_id(folder: Path) -> int: + if not folder.exists(): + os.makedirs(folder, exist_ok=True) + return 0 + subfolders = [f for f in folder.iterdir() if f.is_dir()] + task_num = len(subfolders) + return task_num + + +def construct_task_config(key, scene_dict: dict, base_folder: Path): + task_key = key + task_id = get_next_id(base_folder) + task_desc = scene_dict["task_desc"] + base_folder = base_folder / key + if base_folder.exists(): + shutil.rmtree(base_folder) + base_folder.mkdir(parents=True, exist_ok=True) # Create directory before copying files + background_path = scene_dict["background"]["registered"] + background_usd_path = scene_dict["background"]["usd"] + shutil.copy(background_path, base_folder / "background.glb") + shutil.copy(background_usd_path, base_folder / "background.usd") + background_path = base_folder / "background.glb" + background_usd_path = base_folder / "background.usd" + + bg_rgb_path = scene_dict["bg_rgb_path"] + shutil.copy(bg_rgb_path, base_folder / "bg_rgb.jpg") + bg_rgb_path = base_folder / "bg_rgb.jpg" + width = scene_dict["camera"]["width"] + height = scene_dict["camera"]["height"] + fx = scene_dict["camera"]["fx"] + fy = scene_dict["camera"]["fy"] + cx = scene_dict["camera"]["cx"] + cy = scene_dict["camera"]["cy"] + camera_opencv_to_world = scene_dict["camera"]["camera_opencv_to_world"] + camera_position = scene_dict["camera"]["camera_position"] + camera_heading_wxyz = scene_dict["camera"]["camera_heading_wxyz"] + camera_info = CameraInfo(width, height, fx, fy, cx, cy, camera_opencv_to_world, camera_position, camera_heading_wxyz) + + objects = [] + for oid, obj in scene_dict["objects"].items(): + object_id = oid + object_name = obj["name"] + mesh_path = obj["optimized"] + shutil.copy(mesh_path, base_folder / f"object_{object_id}.glb") + mesh_path = base_folder / f"object_{object_id}.glb" + usd_path = obj['usd'] + + shutil.copy(usd_path, base_folder / Path(usd_path).name) + + cfg_path = Path(usd_path).parent / "config.yaml" + asset_hash_path = Path(usd_path).parent / ".asset_hash" + usd_path = base_folder / Path(usd_path).name + shutil.copy(cfg_path, base_folder / "config.yaml") + shutil.copy(asset_hash_path, base_folder / ".asset_hash") + object_cfg = ObjectCfg(object_id, object_name, str(mesh_path), str(usd_path)) + objects.append(object_cfg) + + + manipulated_oid = scene_dict["manipulated_oid"] + start_related = scene_dict["start_related"] + end_related = scene_dict["end_related"] + + if scene_dict["task_type"] == "targetted_pick_place": + task_type = TaskType.TARGETTED_PICK_PLACE + elif scene_dict["task_type"] == "simple_pick_place": + task_type = TaskType.SIMPLE_PICK_PLACE + elif scene_dict["task_type"] == "simple_pick": + task_type = TaskType.SIMPLE_PICK + else: + raise ValueError(f"Invalid task type: {scene_dict['info']['task_type']}") + task_config = TaskCfg(task_key, task_id, task_desc, task_type, str(bg_rgb_path), str(background_path), str(background_usd_path), camera_info, manipulated_oid, start_related, end_related, objects) + json_path = base_folder / "task.json" + with open(json_path, "w") as f: + json.dump(serialize_task_cfg(task_config), f) + return task_config, base_folder + + +def 
add_reference_trajectory(task_cfg: TaskCfg, reference_trajectory: TrajectoryCfg, base_folder: Path): + task_cfg.reference_trajectory = reference_trajectory + json_path = base_folder / "task.json" + with open(json_path, "w") as f: + json.dump(serialize_task_cfg(task_cfg), f) + return task_cfg + + + + +def serialize_task_cfg(task_cfg): + """ + Serialize TaskCfg and all nested fields (including numpy arrays) into pure Python dict/list/primitive types, + so it can be safely saved as JSON. + """ + + def serialize(obj): + # Handle None + if obj is None: + return None + # Handle basic types + elif isinstance(obj, (int, float, str, bool)): + return obj + # Handle numpy array + elif isinstance(obj, np.ndarray): + return obj.tolist() + # Handle enum + elif hasattr(obj, 'name') and isinstance(obj, (Enum,)): + return obj.name + # Handle dict + elif isinstance(obj, dict): + return {serialize(k): serialize(v) for k, v in obj.items()} + # Handle list/tuple + elif isinstance(obj, (list, tuple)): + return [serialize(i) for i in obj] + # Handle dataclass/object with __dict__ + elif hasattr(obj, '__dict__'): + data = {} + for key, value in obj.__dict__.items(): + data[key] = serialize(value) + return data + # Handle class with __slots__ + elif hasattr(obj, '__slots__'): + data = {} + for slot in obj.__slots__: + data[slot] = serialize(getattr(obj, slot)) + return data + # Fallback (e.g. Path objects) + elif hasattr(obj, '__str__'): + return str(obj) + else: + raise TypeError(f"Cannot serialize object of type {type(obj)}: {repr(obj)}") + + return serialize(task_cfg) + + + + + +def load_task_cfg(json_path: Path) -> TaskCfg: + """ + Load a TaskCfg from the given JSON file path and construct a TaskCfg instance. + """ + with open(json_path, "r") as f: + cfg_dict = json.load(f) + + # Handle all fields and reconstruct proper datatypes + # Helper to reconstruct enums + def parse_enum(enum_cls, val): + if isinstance(val, enum_cls): + return val + elif isinstance(val, str): + return enum_cls[val] + else: + raise ValueError(f"Unknown value {val} for enum {enum_cls}") + + # Parse SuccessMetric(s) + def parse_success_metric(metric_dict): + return SuccessMetric( + success_metric_type=parse_enum(SuccessMetricType, metric_dict["success_metric_type"]), + final_gripper_close=metric_dict["final_gripper_close"], + lift_height=metric_dict.get("lift_height", None), + ground_value=metric_dict.get("ground_value", None), + end_pose=metric_dict.get("end_pose", None) + ) + + # Parse TrajectoryCfg(s) + def parse_traj_cfg(traj_dict): + return TrajectoryCfg( + robot_pose=np.array(traj_dict["robot_pose"], dtype=np.float32).tolist(), + object_poses={oid: np.array(pose, dtype=np.float32).tolist() for oid, pose in traj_dict["object_poses"].items()}, + object_trajectory=[np.array(m, dtype=np.float32).tolist() for m in traj_dict["object_trajectory"]], + final_gripper_close=traj_dict.get("final_gripper_close", None), + success_metric=parse_success_metric(traj_dict.get("success_metric", None)), + pregrasp_pose=traj_dict.get("pregrasp_pose", None), + grasp_pose=traj_dict.get("grasp_pose", None), + robot_type=parse_enum(RobotType, traj_dict.get("robot_type", None)) + ) + + def parse_camera_info(camera_dict): + return CameraInfo( + width=camera_dict["width"], + height=camera_dict["height"], + fx=camera_dict["fx"], + fy=camera_dict["fy"], + cx=camera_dict["cx"], + cy=camera_dict["cy"], + camera_opencv_to_world=np.array(camera_dict["camera_opencv_to_world"], dtype=np.float32).tolist(), + camera_position=np.array(camera_dict["camera_position"], 
dtype=np.float32).tolist(), + camera_heading_wxyz=np.array(camera_dict["camera_heading_wxyz"], dtype=np.float32).tolist(), + ) + def parse_object_cfg(object_dict): + return ObjectCfg( + object_id=object_dict["object_id"], + object_name=object_dict["object_name"], + mesh_path=object_dict["mesh_path"], + usd_path=object_dict["usd_path"] + ) + # Compose TaskCfg + task_cfg = TaskCfg( + task_id=cfg_dict["task_id"], + task_desc=cfg_dict["task_desc"], + task_key=cfg_dict["task_key"], + task_type=parse_enum(TaskType, cfg_dict["task_type"]), + bg_rgb_path=cfg_dict["bg_rgb_path"], + background_path=cfg_dict["background_path"], + background_usd_path=cfg_dict["background_usd_path"], + camera_info=parse_camera_info(cfg_dict["camera_info"]), + manipulated_oid=cfg_dict["manipulated_oid"], + start_related=cfg_dict["start_related"], + end_related=cfg_dict["end_related"], + objects=[parse_object_cfg(obj) for obj in cfg_dict["objects"]], + reference_trajectory=parse_traj_cfg(cfg_dict.get("reference_trajectory", None)), + generated_trajectories=[parse_traj_cfg(traj) for traj in cfg_dict.get("generated_trajectories", [])] + ) + + return task_cfg \ No newline at end of file diff --git a/openreal2sim/simulation/isaaclab/sim_preprocess/grasp_generation.py b/openreal2sim/simulation/modules/grasp_preprocess/grasp_generation.py old mode 100644 new mode 100755 similarity index 94% rename from openreal2sim/simulation/isaaclab/sim_preprocess/grasp_generation.py rename to openreal2sim/simulation/modules/grasp_preprocess/grasp_generation.py index 9d20fae..895f4ef --- a/openreal2sim/simulation/isaaclab/sim_preprocess/grasp_generation.py +++ b/openreal2sim/simulation/modules/grasp_preprocess/grasp_generation.py @@ -153,7 +153,7 @@ def run_for_key(key: str, vis_pts_per_gripper: int): base_dir = Path.cwd() out_dir = base_dir / "outputs" - scene_json = out_dir / key / "scene" / "scene.json" + scene_json = out_dir / key / "simulation" / "scene.json" if not scene_json.exists(): raise FileNotFoundError(scene_json) @@ -191,6 +191,22 @@ def run_for_key(key: str, json.dump(scene_dict, f, indent=2) print(f"[OK][{key}] scene.json updated with 'grasps' for {obj['oid']}_{obj['name']}.") + +def grasp_generation(keys): + for key in keys: + print(f"\n========== [GraspDet] Processing key: {key} ==========") + run_for_key( + key=key, + n_points=args.n_points, + keep=args.keep, + nms=args.nms, + overwrite=args.overwrite, + vis_pts_per_gripper=args.vis_pts_per_gripper, + ) + + print('[Info] Grasp generation completed.') + + def main(): parser = argparse.ArgumentParser("Export grasp proposals for ALL objects (batch over keys) and write paths into scene.json") parser.add_argument("--n_points", type=int, default=100000) diff --git a/openreal2sim/simulation/isaaclab/sim_preprocess/grasp_utils.py b/openreal2sim/simulation/modules/grasp_preprocess/grasp_utils.py old mode 100644 new mode 100755 similarity index 100% rename from openreal2sim/simulation/isaaclab/sim_preprocess/grasp_utils.py rename to openreal2sim/simulation/modules/grasp_preprocess/grasp_utils.py diff --git a/openreal2sim/simulation/modules/hand_preprocess/affordance_extraction.py b/openreal2sim/simulation/modules/hand_preprocess/affordance_extraction.py new file mode 100755 index 0000000..e69de29 diff --git a/openreal2sim/simulation/modules/hand_preprocess/grasp_point_extraction.py b/openreal2sim/simulation/modules/hand_preprocess/grasp_point_extraction.py new file mode 100755 index 0000000..942088e --- /dev/null +++ 
b/openreal2sim/simulation/modules/hand_preprocess/grasp_point_extraction.py @@ -0,0 +1,331 @@ +### This part is used to generate the affordance of the object. +### Include: +### 1. Grasp point generation. +### 2. Affordance map generation. Mainly for articulated object. This might be further combined with a PartSLIP network. +### TODO: Human annotation. +### The logic we use for grasp point generation here is: we select the frist frame of object-hand contact, compute the contact point, overlay the object and extract the 3D coordinate of the point. +### Be careful that we transfer this point to the first frame using pose matrices and extrinsics. +### TODO: This might be helpful to object selection in the first step. +import pickle +import pathlib +from pathlib import Path +import logging +import yaml +import os +import sys +import torch +import numpy as np +import cv2 +import imageio +import pytorch3d +from pytorch3d.structures import Meshes +from pytorch3d.renderer import ( + look_at_view_transform, + PerspectiveCameras, + RasterizationSettings, + MeshRenderer, + MeshRasterizer, + SoftPhongShader, + PointLights, + TexturesVertex, + BlendParams, +) +import trimesh + + +def render_pytorch3d_rgbd(mesh, K, img_size, device="cuda:0"): + """ + Given a trimesh mesh (already in camera coordinate system), render using pytorch3d. + Args: + mesh: trimesh.Trimesh, should be in camera coordinate system. + K: (3,3) camera intrinsic matrix, fx, fy, cx, cy. + img_size: (H, W), output image (and depth) size. + device: device string + Returns: + mask_img: (H,W) uint8 binary mask (1 for foreground, 0 for background) + depth_img: (H,W) float32 (Z buffer, 0 for background) + """ + if not isinstance(mesh, trimesh.Trimesh): + if hasattr(mesh, 'geometry') and len(mesh.geometry) > 0: + mesh = list(mesh.geometry.values())[0] + else: + raise ValueError('mesh is not a valid trimesh.Trimesh!') + device = torch.device(device if torch.cuda.is_available() else "cpu") + + verts = torch.tensor(np.asarray(mesh.vertices), dtype=torch.float32, device=device) + faces = torch.tensor(np.asarray(mesh.faces), dtype=torch.int64, device=device) + verts_rgb = torch.ones_like(verts)[None] * torch.tensor([[0.7, 0.7, 0.7]], dtype=torch.float32, device=device) + from pytorch3d.renderer import TexturesVertex + textures = TexturesVertex(verts_features=verts_rgb) + + mesh_p3d = Meshes(verts=[verts], faces=[faces], textures=textures) + + # Get camera intrinsics + fx = float(K[0, 0]) + fy = float(K[1, 1]) + cx = float(K[0, 2]) + cy = float(K[1, 2]) + if isinstance(img_size, int): + image_size = [[img_size, img_size]] + H, W = img_size, img_size + else: + H, W = img_size + image_size = [[H, W]] + + # Use pytorch3d's PerspectiveCameras for custom intrinsics + # Note: R and T are camera-to-world transforms (identity for camera coordinate system) + cameras = PerspectiveCameras( + device=device, + focal_length=torch.tensor([[fx, fy]], dtype=torch.float32, device=device), + principal_point=torch.tensor([[cx, cy]], dtype=torch.float32, device=device), + image_size=torch.tensor(image_size, dtype=torch.float32, device=device), + in_ndc=False, + R=torch.eye(3, dtype=torch.float32, device=device).unsqueeze(0), + T=torch.zeros(1, 3, dtype=torch.float32, device=device), + ) + + blend_params = BlendParams(sigma=1e-4, gamma=1e-4) + raster_settings = RasterizationSettings( + image_size=(H, W), + blur_radius=0.0, + faces_per_pixel=1, + ) + lights = PointLights(device=device, location=[[0.0, 0.0, 5.0]]) + + renderer = MeshRenderer( + rasterizer=MeshRasterizer( 
+ cameras=cameras, + raster_settings=raster_settings + ), + shader=SoftPhongShader( + device=device, + cameras=cameras, + lights=lights, + blend_params=blend_params + ), + ) + # Render RGB (used only for mask) + img = renderer(mesh_p3d).detach().cpu().numpy()[0][:,:,:3].clip(0, 1) * 255 + img = img.astype(np.uint8) + # Depth renderer + class DepthShader(torch.nn.Module): + def __init__(self, device="cpu"): + super().__init__() + self.device = device + def forward(self, fragments, meshes, **kwargs): + return fragments.zbuf + depth_renderer = MeshRenderer( + rasterizer=MeshRasterizer( + cameras=cameras, + raster_settings=raster_settings + ), + shader=DepthShader(device=device) + ) + depth = depth_renderer(mesh_p3d)[0, ..., 0].detach().cpu().numpy() # (H, W) + mask = (depth > 0).astype(np.uint8) + return img, mask, depth + + +def find_nearest_point(point, object_mask): + """ + Given a point and an object binary mask (H,W), return the nearest point in the mask. + """ + H, W = object_mask.shape + ys, xs = np.where(object_mask) + if len(xs) == 0: + return None + dists = np.sqrt((xs - point[0]) ** 2 + (ys - point[1]) ** 2) + min_idx = np.argmin(dists) + return np.array([xs[min_idx], ys[min_idx]]) + +def get_bbox_mask_from_mask(mask): + ys, xs = np.where(mask) + if len(xs) == 0: + return None + x_min = np.min(xs) + x_max = np.max(xs) + y_min = np.min(ys) + y_max = np.max(ys) + return x_min, x_max, y_min, y_max + +def compute_contact_point(kpts_2d, object_mask): + """ + Given 2D keypoints (N,2) and an object binary mask (H,W), return the mean keypoint location [x, y] + of all keypoints that fall inside the mask. + """ + kpts_2d = np.asarray(kpts_2d) + H, W = object_mask.shape + inside = [] + for kp in kpts_2d: + x, y = int(round(kp[0])), int(round(kp[1])) + if 0 <= y < H and 0 <= x < W: + if object_mask[y, x]: + inside.append(kp) + if len(inside) > 0: + point = np.mean(np.stack(inside, axis=0), axis=0) + px, py = int(round(point[0])), int(round(point[1])) + if 0 <= py < H and 0 <= px < W and object_mask[py, px]: + return point, True, True + else: + return point, True, False + else: + return np.mean(kpts_2d, axis=0), False, False + +def bbox_to_mask(bbox): + x, y, w, h = bbox + mask = np.zeros((h, w)) + mask[y:y+h, x:x+w] = True + return mask + + +def visualize_grasp_points(image, kpts_2d, contact_point, output_path): + """ + Simple visualization of fingertip keypoints and contact point on image. 
+ + Args: + image: Input image (H, W, 3) uint8 or float + kpts_2d: Fingertip keypoints array (N, 2) + contact_point: Contact point (2,) + output_path: Path to save visualization + """ + # Prepare image + vis_image = image.copy() + if vis_image.dtype != np.uint8: + if vis_image.max() <= 1.0: + vis_image = (vis_image * 255).astype(np.uint8) + else: + vis_image = vis_image.astype(np.uint8) + + # Convert RGB to BGR for OpenCV + if vis_image.shape[-1] == 3: + vis_image = cv2.cvtColor(vis_image, cv2.COLOR_RGB2BGR) + + H, W = vis_image.shape[:2] + + # Draw fingertip keypoints (blue) + kpts_2d = np.asarray(kpts_2d) + for kp in kpts_2d: + kp_x, kp_y = int(round(kp[0])), int(round(kp[1])) + if 0 <= kp_y < H and 0 <= kp_x < W: + cv2.circle(vis_image, (kp_x, kp_y), 5, (255, 0, 0), -1) # Blue filled circle + + # Draw contact point (red) + cp_x, cp_y = int(round(contact_point[0])), int(round(contact_point[1])) + if 0 <= cp_y < H and 0 <= cp_x < W: + cv2.circle(vis_image, (cp_x, cp_y), 8, (0, 0, 255), -1) # Red filled circle + + # Convert back to RGB and save + vis_image_rgb = cv2.cvtColor(vis_image, cv2.COLOR_BGR2RGB) + imageio.imwrite(str(output_path), vis_image_rgb) + logging.info(f"Saved grasp point visualization to: {output_path}") + + +def grasp_point_generation(keys, key_scene_dicts, key_cfgs): + base_dir = Path.cwd() + for key in keys: + scene_dict = key_scene_dicts[key] + scene_dict["key"] = key # Store key in scene_dict for visualization + key_cfg = key_cfgs[key] + single_grasp_point_generation(scene_dict, key_cfg, key, base_dir) + key_scene_dicts[key] = scene_dict + with open(base_dir / f'outputs/{key}/scene/scene.pkl', 'wb') as f: + pickle.dump(scene_dict, f) + +def single_grasp_point_generation(scene_dict, key_cfg, key, base_dir): + gpu_id = key_cfg["gpu"] + device = f"cuda:{gpu_id}" + start_frame_idx = scene_dict["info"]["start_frame_idx"] + points_2d = None + i = start_frame_idx + manipulated_oid = scene_dict["info"]["manipulated_oid"] + if_close_list = [] + is_inside_list = [] + backup_list = [] + extrinsics = np.array(scene_dict["extrinsics"]).astype(np.float64).reshape(-1, 4, 4) + T_c2w = np.array(scene_dict["info"]["camera"]["camera_opencv_to_world"]).astype(np.float64).reshape(4,4) + T_w2c = np.linalg.inv(T_c2w) + model_path = scene_dict["info"]["objects"][manipulated_oid]["optimized"] + model = trimesh.load(model_path) + + K = np.array(scene_dict["intrinsics"]).astype(np.float32).reshape(3, 3) + img_size = scene_dict["height"], scene_dict["width"] + + traj_key = scene_dict["info"]["traj_key"] + traj_key = traj_key.replace("_recomputed", "") + traj_path = scene_dict["info"]["objects"][manipulated_oid][traj_key] + traj = np.load(traj_path) + traj = traj.reshape(-1, 4, 4) + start_pose = traj[start_frame_idx] + model = model.apply_transform(start_pose) + model = model.apply_transform(T_w2c) + + + images, mask_img, depth = render_pytorch3d_rgbd(model, K, img_size, device=device) + kpts_2d = scene_dict["simulation"]["hand_kpts"][i][[4,8,12,16,20]] + point_2d, is_close, is_inside = compute_contact_point(kpts_2d, mask_img) + + # Use PyTorch3D's unproject_points for accurate 2D-to-3D conversion + x, y = int(round(point_2d[0])), int(round(point_2d[1])) + image = scene_dict["images"][i] + + # Visualize keypoints and contact point + vis_dir = base_dir / f"outputs/{key}/simulation/debug" + vis_dir.mkdir(parents=True, exist_ok=True) + vis_path = vis_dir / f"grasp_point_visualization_frame_{i:06d}.png" + visualize_grasp_points(image, kpts_2d, point_2d, vis_path) + mask_save_path = vis_dir / 
f"mask_frame_{i:06d}.png" + cv2.imwrite(mask_save_path, mask_img * 255) + image_save_path = vis_dir / f"image_frame_{i:06d}.png" + imageio.imwrite(image_save_path, images) + + + + # Create camera object for unprojection (same as in render function) + fx, fy, cx, cy = float(K[0, 0]), float(K[1, 1]), float(K[0, 2]), float(K[1, 2]) + H, W = img_size + + camera = PerspectiveCameras( + device=device, + focal_length=torch.tensor([[fx, fy]], dtype=torch.float32, device=device), + principal_point=torch.tensor([[cx, cy]], dtype=torch.float32, device=device), + image_size=torch.tensor([[H, W]], dtype=torch.float32, device=device), + in_ndc=False, + R=torch.eye(3, dtype=torch.float32, device=device).unsqueeze(0), + T=torch.zeros(1, 3, dtype=torch.float32, device=device), + ) + + # Unproject 2D point to 3D using PyTorch3D + xy_coords = torch.tensor([[x, y]], dtype=torch.float32, device=device) # (1, 2) + depth_values = torch.tensor([[z]], dtype=torch.float32, device=device) # (1, 1) + + # Unproject to camera space (since mesh is already in camera coordinates) + points_camera = camera.unproject_points( + xy_depth=torch.cat([xy_coords, depth_values], dim=1), # (1, 3) [x, y, depth] + world_coordinates=False, # Output in camera coordinates + scaled_depth_input=False, # Depth is in world units + ) + + winner_point_3d = points_camera[0].cpu().numpy() # (3,) in camera coordinates + + winner_point_to_obj = winner_point_3d @ np.linalg.inv(start_pose) + return winner_point_to_obj + + + +if __name__ == "__main__": + base_dir = Path.cwd() + cfg_path = base_dir / "config" / "config.yaml" + cfg = yaml.safe_load(cfg_path.open("r")) + keys = cfg["keys"] + sys.path.append(str(base_dir / "openreal2sim" / "simulation")) + from utils.compose_config import compose_configs + key_cfgs = {key: compose_configs(key, cfg) for key in keys} + print(f"Key cfgs: {key_cfgs}") + key_scene_dicts = {} + for key in keys: + scene_pkl = base_dir / f'outputs/{key}/scene/scene.pkl' + with open(scene_pkl, 'rb') as f: + scene_dict = pickle.load(f) + key_scene_dicts[key] = scene_dict + grasp_point_generation(keys, key_scene_dicts, key_cfgs) + diff --git a/openreal2sim/simulation/modules/hand_preprocess/hand_extraction.py b/openreal2sim/simulation/modules/hand_preprocess/hand_extraction.py new file mode 100755 index 0000000..79b25ed --- /dev/null +++ b/openreal2sim/simulation/modules/hand_preprocess/hand_extraction.py @@ -0,0 +1,213 @@ +## In this file: we use hand-object detector to decide the starting and ending frame of contact. +## Then we use wilor to extract the hand info. 
+ + +import os +import sys +import cv2 +import numpy as np +import torch +import yaml +from typing import List, Tuple, Dict, Optional +import logging +from pathlib import Path +import pickle +import logging +import tqdm +base_dir = Path.cwd() +sys.path.append(str(base_dir / "openreal2sim" / "simulation")) +sys.path.append(str(base_dir / "third_party" / "WiLoR")) +sys.path.append(str(base_dir / "third_party" / "Grounded-SAM-2")) + + +from wilor.utils.renderer import cam_crop_to_full +from wilor.models import load_wilor +from wilor.utils import recursive_to +from wilor.datasets.vitdet_dataset import ViTDetDataset +from sam2.build_sam import build_sam2 +from sam2.sam2_image_predictor import SAM2ImagePredictor +from ultralytics import YOLO + +class WiLoRExtractor: + def __init__(self, + model_path: str, + cfg_path: str, + yolo_weights_path: str, + sam_weights_path: str, + sam_cfg_path: str, + device: str): + self._wilor_model, self._wilor_cfg = load_wilor(model_path, cfg_path) + self._wilor_model.eval() + self._yolo_detector = YOLO(yolo_weights_path) + self.device = torch.device(device) + self._sam_predictor = SAM2ImagePredictor(build_sam2(sam_cfg_path, sam_weights_path)) + + def process(self, images: np.ndarray, batch_size: int = 16, rescale_factor: float = 1.0): + boxes = [] + right = [] + masks = [] + self._wilor_model.to(self.device) + self._yolo_detector.to(self.device) + self._sam_predictor.model.to(self.device) + self._wilor_model.eval() + + all_global_orient = [] + all_kpts = [] + all_masks = [] + + for i in tqdm.tqdm(range(0, len(images)), desc="Detecting hands"): + batch = np.array(images)[i] + with torch.no_grad(): + detections = self._yolo_detector.predict(batch, conf=0.3, verbose=False, save=False, show=False)[0] + + if len(detections.boxes.cls.cpu().detach().numpy()) == 0: + all_global_orient.append(None) + all_kpts.append(None) + all_masks.append(None) + else: + det = max(detections, key=lambda d: d.boxes.conf.cpu().detach().item()) + Bbox = det.boxes.data.cpu().detach().squeeze().numpy() + cls_flag = det.boxes.cls.cpu().detach().squeeze().item() + + self._sam_predictor.set_image(batch) + hand_masks, scores, logits = self._sam_predictor.predict( + point_coords=None, + point_labels=None, + box=np.array([[Bbox[0], Bbox[1], Bbox[2], Bbox[3]]]), + multimask_output=False + ) + hand_masks = hand_masks[0] + hand_masks = hand_masks.astype(bool) + dataset = ViTDetDataset(self._wilor_cfg, batch, np.array([Bbox[:4]]), np.array([cls_flag]), rescale_factor=rescale_factor) + dataloader = torch.utils.data.DataLoader(dataset, batch_size=16, shuffle=False, num_workers=0) + sign = False + for batch in dataloader: + assert sign == False + batch = recursive_to(batch, torch.device(self.device)) + with torch.no_grad(): + out = self._wilor_model(batch) + multiplier = 2 * batch['right'][0].cpu().numpy() - 1 + pred_cam = out['pred_cam'][0] + pred_cam[1] = multiplier * pred_cam[1] + box_center = batch['box_center'][0].float() + box_size = batch['box_size'][0].float() + img_size = batch['img_size'][0].float() + scaled_focal_length = self._wilor_cfg.EXTRA.FOCAL_LENGTH / self._wilor_cfg.MODEL.IMAGE_SIZE * img_size.max() + #import pdb; pdb.set_trace() + pred_cam_t_full = cam_crop_to_full(pred_cam.reshape(1, 3), box_center.reshape(1, 2), box_size.reshape(1, 1), img_size.reshape(1, 2), scaled_focal_length).detach().cpu().numpy() + batch_size = batch['img'].shape[0] + joints = out['pred_keypoints_3d'][0].detach().cpu().numpy() + joints[:, 0] = multiplier * joints[:, 0] + cam_t = pred_cam_t_full[0] + kpts_2d = 
self.project_full_img(joints, cam_t, float(scaled_focal_length), img_size) + all_kpts.append(kpts_2d) + all_global_orient.append(out['pred_mano_params']['global_orient'][0,0].detach().cpu().numpy()) + all_masks.append(hand_masks) + sign = True + return all_kpts, all_global_orient, all_masks + + + + + + def project_full_img(self, points, cam_trans, focal_length, img_res): + ''' we use simple K here. It works.''' + if not isinstance(points, torch.Tensor): + points = torch.tensor(points, dtype=torch.float32) + if not isinstance(cam_trans, torch.Tensor): + cam_trans = torch.tensor(cam_trans, dtype=torch.float32) + # Ensure numeric image resolution + try: + img_w = float(img_res[0]) + img_h = float(img_res[1]) + except Exception: + # Fallback for unexpected types + img_w, img_h = float(img_res[0].item()), float(img_res[1].item()) + K = torch.eye(3, dtype=torch.float32) + K[0, 0] = float(focal_length) + K[1, 1] = float(focal_length) + K[0, 2] = img_w / 2.0 + K[1, 2] = img_h / 2.0 + pts = points + cam_trans + pts = pts / pts[..., -1:] + V_2d = (K @ pts.T).T + return V_2d[..., :-1].detach().cpu().numpy() + + + + + + +def hand_extraction(keys, key_scene_dicts, key_cfgs): + base_dir = Path.cwd() + model_path = base_dir / "third_party" / "WiLoR" / "pretrained_models" / "wilor_final.ckpt" + cfg_path = base_dir / "third_party" / "WiLoR" / "pretrained_models" / "model_config.yaml" + yolo_weights_path = base_dir / "third_party" / "WiLoR" / "pretrained_models" / "detector.pt" + sam_cfg_path = "configs/sam2.1/sam2.1_hiera_l.yaml" + sam_weights_path = "third_party/Grounded-SAM-2/checkpoints/sam2.1_hiera_large.pt" + + for key in keys: + + scene_dict = key_scene_dicts[key] + config = key_cfgs[key] + gpu_id = config['gpu'] + device = f"cuda:{gpu_id}" + wilor_extractor = WiLoRExtractor(model_path=model_path, cfg_path=cfg_path, yolo_weights_path=yolo_weights_path, sam_weights_path=sam_weights_path, sam_cfg_path=sam_cfg_path, device=device) + #images = np.transpose(scene_dict["images"], (0, 3, 1, 2)) + images = scene_dict["images"].astype(np.float32) + kpts, global_orient, masks = wilor_extractor.process(images) + if scene_dict.get("simulation") is None: + scene_dict["simulation"] = {} + scene_dict["simulation"]["hand_kpts"] = kpts + scene_dict["simulation"]["hand_global_orient"] = global_orient + scene_dict["simulation"]["hand_masks"] = masks + key_scene_dicts[key] = scene_dict + with open(base_dir / f'outputs/{key}/scene/scene.pkl', 'wb') as f: + pickle.dump(scene_dict, f) + + # import cv2 + # start_frame_idx = scene_dict["recon"]["start_frame_idx"] + + # first_img = scene_dict["images"][start_frame_idx] + # first_kpts = kpts[start_frame_idx] + # first_masks = masks[start_frame_idx] + # pts2d = np.asarray(first_kpts).astype(np.int32) + # overlay_img = first_img.copy() + # import pdb; pdb.set_trace() + # for x, y in pts2d: + # cv2.circle(overlay_img, (int(x), int(y)), 4, (0, 255, 0), -1) + # mask_vis = (first_masks.astype(np.uint8) * 255) + # if mask_vis.ndim == 2: + # mask_vis = cv2.cvtColor(mask_vis, cv2.COLOR_GRAY2BGR) + # overlay_alpha = 0.4 + # mask_color = np.zeros_like(overlay_img) + # mask_color[:, :, 2] = mask_vis[:, :, 0] + # overlay_img = cv2.addWeighted(overlay_img, 1.0, mask_color, overlay_alpha, 0) + # save_dir = base_dir / f'outputs/{key}/simulation/' + # save_dir.mkdir(parents=True, exist_ok=True) + # overlay_path = save_dir / "first_frame_hand_kpts_overlay.jpg" + # cv2.imwrite(str(overlay_path), overlay_img) + # print(f"Saved hand keypoints overlay for {key} to {overlay_path}") + return 
key_scene_dicts + + + + +if __name__ == "__main__": + base_dir = Path.cwd() + cfg_path = base_dir / "config" / "config.yaml" + cfg = yaml.safe_load(cfg_path.open("r")) + keys = cfg["keys"] + + from utils.compose_config import compose_configs + key_cfgs = {key: compose_configs(key, cfg) for key in keys} + print(f"Key cfgs: {key_cfgs}") + key_scene_dicts = {} + for key in keys: + scene_pkl = base_dir / f'outputs/{key}/scene/scene.pkl' + with open(scene_pkl, 'rb') as f: + scene_dict = pickle.load(f) + key_scene_dicts[key] = scene_dict + hand_extraction(keys, key_scene_dicts, key_cfgs) + + diff --git a/openreal2sim/simulation/sim_agent.py b/openreal2sim/simulation/sim_agent.py new file mode 100755 index 0000000..486654b --- /dev/null +++ b/openreal2sim/simulation/sim_agent.py @@ -0,0 +1,248 @@ +### isaaclab + +from pathlib import Path +import sys +import pickle +import yaml +import json +import argparse +import shutil +import numpy as np +from isaaclab.app import AppLauncher +import typing +from typing import Optional +sys.path.append(str(Path.cwd() / "IsaacLab")) +sys.path.append(str(Path.cwd() / "openreal2sim" / "simulation")) +from utils.compose_config import compose_configs +from utils.notification import notify_started, notify_failed, notify_success + +class ProprocessAgent: + def __init__(self, stage=None, key=None): + print('[Info] Initializing ProprocessAgent...') + self.base_dir = Path.cwd() + cfg_path = self.base_dir / "config" / "config.yaml" + cfg = yaml.safe_load(cfg_path.open("r")) + self.keys = [key] if key is not None else cfg["keys"] + self.key_cfgs = {key: compose_configs(key, cfg) for key in self.keys} + self.key_scene_dicts = {} + for key in self.keys: + scene_pkl = self.base_dir / f'outputs/{key}/scene/scene.pkl' + with open(scene_pkl, 'rb') as f: + scene_dict = pickle.load(f) + self.key_scene_dicts[key] = scene_dict + self.stages = [ + # "hand_extraction", + #"demo_motion_process", + # "grasp_point_generation", + # "grasp_generation" + + ] + if stage is not None: + if stage in self.stages: + start_idx = self.stages.index(stage) + self.stages = self.stages[start_idx:] + else: + print(f"[Warning] Stage '{stage}' not found. It must be one of the {self.stages}. 
Running all stages by default.") + print('[Info] ProprocessAgent initialized.') + + + + def hand_extraction(self): + from modules.hand_preprocess.hand_extraction import hand_extraction + self.key_scene_dicts = hand_extraction(self.keys, self.key_scene_dicts, self.key_cfgs) + print('[Info] Hand extraction completed.') + + def demo_motion_process(self): + from modules.demo_motion_process import demo_motion_process + self.key_scene_dicts = demo_motion_process(self.keys, self.key_scene_dicts, self.key_cfgs) + print('[Info] Demo motion process completed.') + + # def grasp_point_generation(self): + # from modules.grasp_point_generation import grasp_point_generation + # self.key_scene_dicts = grasp_point_generation(self.keys, self.key_scene_dicts, self.key_cfgs) + # print('[Info] Grasp point generation completed.') + + def grasp_generation(self): + from modules.grasp_preprocess.grasp_generation import grasp_generation + grasp_generation(self.keys) + print('[Info] Grasp proposal generation completed.') + + def run(self): + if "hand_extraction" in self.stages: + self.hand_extraction() + if "demo_motion_process" in self.stages: + self.demo_motion_process() + # if "grasp_point_generation" in self.stages: + # self.grasp_point_generation() + if "grasp_generation" in self.stages: + self.grasp_generation() + + print('[Info] ProprocessAgent run completed.') + return self.key_scene_dicts + + + + + +def create_args_from_config(config_path: Optional[str] = None, config_dict: Optional[dict] = None) -> argparse.Namespace: + """ + Create an argparse.Namespace from a config file (yaml/json) or dictionary. + + Args: + config_path: Path to yaml or json config file + config_dict: Dictionary with config parameters + + Returns: + argparse.Namespace with the config parameters + + Example config file (yaml): + key: "demo_video" + robot: "franka" + num_envs: 4 + num_trials: 10 + teleop_device: "keyboard" + sensitivity: 1.0 + device: "cuda:0" + enable_cameras: true + headless: true + """ + + if config_dict is None: + config_dict = { + 'robot': 'franka', + 'num_envs': 5, + 'num_trials': 10, + 'teleop_device': 'keyboard', + 'sensitivity': 1.0, + 'device': 'cuda:0', + 'enable_cameras': True, + 'headless': True, + } + else: + config_dict = { + 'robot': config_dict['robot'], + 'num_envs': config_dict['num_envs'], + 'num_trials': config_dict['num_trials'], + 'teleop_device': config_dict['teleop_device'], + 'sensitivity': config_dict['sensitivity'], + 'device': config_dict['device'], + 'enable_cameras': config_dict['enable_cameras'], + 'headless': config_dict['headless'], + } + + return argparse.Namespace(**config_dict) + + +class IsaacAgent: + def __init__(self, stage=None, key=None): + print('[Info] Initializing IsaacAgent...') + self.base_dir = Path.cwd() + cfg_path = self.base_dir / "config" / "config.yaml" + cfg = yaml.safe_load(cfg_path.open("r")) + self.keys = [key] if key is not None else cfg["keys"] + self.key_cfgs = {key: compose_configs(key, cfg) for key in self.keys} + self.key_scene_dicts = {} + for key in self.keys: + scene_pkl = self.base_dir / f'outputs/{key}/scene/scene.pkl' + with open(scene_pkl, 'rb') as f: + scene_dict = pickle.load(f) + self.key_scene_dicts[key] = scene_dict + self.stages = [ + # "usd_conversion", + #"grasp_generation", + #"sim_heuristic_manip", + "sim_randomize_rollout", + + ] + if stage is not None: + if stage in self.stages: + start_idx = self.stages.index(stage) + self.stages = self.stages[start_idx:] + else: + print(f"[Warning] Stage '{stage}' not found. 
It must be one of the {self.stages}. Running all stages by default.") + print('[Info] IsaacAgent initialized.') + + def launch_isaaclab(self): + + self.args_cli = create_args_from_config() + #AppLauncher.add_app_launcher_args(self.args_cli) + self.args_cli.enable_cameras = True + self.args_cli.headless = True + self.app_launcher = AppLauncher(vars(self.args_cli)) + self.simulation_app = self.app_launcher.app + return self.simulation_app + + def close_isaaclab(self): + self.simulation_app.close() + + def usd_conversion(self): + from isaaclab.sim_preprocess.usd_conversion import usd_conversion + usd_conversion(self.keys) + print('[Info] USD conversion completed.') + + def sim_heuristic_manip(self): + self.launch_isaaclab() + from isaaclab.sim_heuristic_manip import sim_heuristic_manip + sim_heuristic_manip(self.keys, args_cli=self.args_cli) + print('[Info] Heuristic manipulation simulation completed.') + self.close_isaaclab() + + + def sim_randomize_rollout(self): + self.launch_isaaclab() + from isaaclab.sim_randomize_rollout import sim_randomize_rollout + sim_randomize_rollout(self.keys, args_cli=self.args_cli) + print('[Info] Randomize rollout completed.') + self.close_isaaclab() + + def grasp_generation(self): + from modules.grasp_preprocess.grasp_generation import grasp_generation + grasp_generation(self.keys) + print('[Info] Grasp proposal generation completed.') + + def run(self): + if "usd_conversion" in self.stages: + self.usd_conversion() + if "grasp_generation" in self.stages: + self.grasp_generation() + if "launch_isaaclab" in self.stages: + self.launch_isaaclab() + if "sim_heuristic_manip" in self.stages: + self.sim_heuristic_manip() + + if "sim_randomize_rollout" in self.stages: + self.sim_randomize_rollout() + if "close_isaaclab" in self.stages: + self.close_isaaclab() + +if __name__ == '__main__': + args = argparse.ArgumentParser() + args.add_argument('--stage', type=str, default=None, help='Starting from a certain stage') + args.add_argument('--key', type=str, default=None, help='Process a single key instead of all keys from config') + args.add_argument('--label', type=str, default=None, help='Optional label for notifications') + args = args.parse_args() + + if args.label: + notify_started(args.label) + + try: + agent = ProprocessAgent(stage=args.stage, key=args.key) + scene_dicts = agent.run() + + if args.label: + notify_success(args.label) + except Exception as e: + if args.label: + notify_failed(args.label) + raise + + try: + agent = IsaacAgent(stage=args.stage, key=args.key) + scene_dicts = agent.run() + + if args.label: + notify_success(args.label) + except Exception as e: + if args.label: + notify_failed(args.label) + raise \ No newline at end of file diff --git a/openreal2sim/simulation/utils/compose_config.py b/openreal2sim/simulation/utils/compose_config.py new file mode 100755 index 0000000..89c4410 --- /dev/null +++ b/openreal2sim/simulation/utils/compose_config.py @@ -0,0 +1,17 @@ + +default_config = { + "gpu": "0", +} + + +def compose_configs(key_name: str, config: dict) -> dict: + ret_key_config = {} + local_config = config["local"].get(key_name, {}) + local_config = local_config.get("simulation", {}) + global_config = config.get("global", {}) + global_config = global_config.get("simulation", {}) + for param in default_config.keys(): + value = local_config.get(param, global_config.get(param, default_config[param])) + ret_key_config[param] = value + print(f"[Info] Config for {key_name}: {ret_key_config}") + return ret_key_config \ No newline at end of file diff 
--git a/openreal2sim/simulation/utils/notification.py b/openreal2sim/simulation/utils/notification.py new file mode 100755 index 0000000..a36c546 --- /dev/null +++ b/openreal2sim/simulation/utils/notification.py @@ -0,0 +1,42 @@ +import requests + +ntfy_url = "" + +def notify_success(pipeline_label): + try: + requests.post( + ntfy_url, + data=pipeline_label, + headers={ + "Title": f"Reconstruction completed", + "Tags": "white_check_mark" + } + ) + except: + pass + +def notify_started(pipeline_label): + try: + requests.post( + ntfy_url, + data=pipeline_label, + headers={ + "Title": f"Reconstruction started", + "Tags": "watch" + } + ) + except: + pass + +def notify_failed(pipeline_label): + try: + requests.post( + ntfy_url, + data=pipeline_label, + headers={ + "Title": f"Reconstruction failed", + "Tags": "x" + } + ) + except: + pass \ No newline at end of file From 81cce38cdcffdac0356b0c406d4fb583f3308cd0 Mon Sep 17 00:00:00 2001 From: DynamoDuan <2030524470@qq.com> Date: Sat, 8 Nov 2025 11:13:30 -0800 Subject: [PATCH 2/2] refine --- openreal2sim/simulation/isaaclab/sim_base.py | 11 +- .../isaaclab/sim_heuristic_manip.py | 44 ++--- .../isaaclab/sim_preprocess/usd_conversion.py | 16 +- .../isaaclab/sim_randomize_rollout.py | 50 +++--- openreal2sim/simulation/sim_agent.py | 162 ++++++++---------- openreal2sim/simulation/sim_agent.sh | 136 +++++++++++++++ 6 files changed, 281 insertions(+), 138 deletions(-) create mode 100644 openreal2sim/simulation/sim_agent.sh diff --git a/openreal2sim/simulation/isaaclab/sim_base.py b/openreal2sim/simulation/isaaclab/sim_base.py index fa8a3cd..dfe2906 100755 --- a/openreal2sim/simulation/isaaclab/sim_base.py +++ b/openreal2sim/simulation/isaaclab/sim_base.py @@ -8,7 +8,7 @@ from pathlib import Path from typing import Any, Optional, Dict, Sequence, Tuple from typing import List - +import copy import numpy as np import torch import imageio @@ -689,7 +689,8 @@ def save_data(self, ignore_keys: List[str] = [], env_ids: Optional[List[int]] = if env_ids is None: env_ids = self._all_env_ids.cpu().numpy() - composed_rgb = [] + composed_rgb = copy.deepcopy(stacked["rgb"]) + self.save_dict["composed_rgb"] = composed_rgb hdf5_names = [] for b in env_ids: if self.defined_demo_dir is None: @@ -735,10 +736,10 @@ def save_data(self, ignore_keys: List[str] = [], env_ids: Optional[List[int]] = self.bg_rgb = imageio.imread(bg_rgb_path) for t in range(rgb.shape[0]): #import ipdb; ipdb.set_trace() - composed = self.convert_real(mask[t, b], self.bg_rgb, rgb[t, b]) + composed = self.convert_real(mask[t, b], self.bg_rgb, rgb[t, b]) + self.save_dict["composed_rgb"][t, b] = composed writer.append_data(composed) writer.close() - composed_rgb.append(composed) self.export_batch_data_to_hdf5(hdf5_names) @@ -899,7 +900,7 @@ def export_batch_data_to_hdf5(self, hdf5_names: List[str]) -> int: cam_grp.attrs["resolution"] = resolution if "rgb" in stacked: - rgb_frames = stacked["rgb"][:, env_idx] + rgb_frames = stacked["composed_rgb"][:, env_idx] encoded_frames, max_len = self._encode_rgb_sequence(rgb_frames) dtype = f"S{max_len}" if max_len > 0 else "S1" cam_grp.create_dataset("rgb", data=np.asarray(encoded_frames, dtype=dtype)) diff --git a/openreal2sim/simulation/isaaclab/sim_heuristic_manip.py b/openreal2sim/simulation/isaaclab/sim_heuristic_manip.py index 8719d69..1ccfc3b 100755 --- a/openreal2sim/simulation/isaaclab/sim_heuristic_manip.py +++ b/openreal2sim/simulation/isaaclab/sim_heuristic_manip.py @@ -20,19 +20,19 @@ sys.path.append(str(file_path.parent.parent)) # ─────────── CLI 
─────────── -# parser = argparse.ArgumentParser("sim_policy") -# parser.add_argument("--key", type=str, default="demo_video", help="scene key (outputs/)") -# parser.add_argument("--robot", type=str, default="franka") -# parser.add_argument("--num_envs", type=int, default=1) -# parser.add_argument("--num_trials", type=int, default=1) -# parser.add_argument("--teleop_device", type=str, default="keyboard") -# parser.add_argument("--sensitivity", type=float, default=1.0) -# AppLauncher.add_app_launcher_args(parser) -# args_cli = parser.parse_args() -# args_cli.enable_cameras = True -# args_cli.headless = True # headless mode for batch execution -# app_launcher = AppLauncher(vars(args_cli)) -# simulation_app = app_launcher.app +parser = argparse.ArgumentParser("sim_policy") +parser.add_argument("--key", type=str, default="demo_video", help="scene key (outputs/)") +parser.add_argument("--robot", type=str, default="franka") +parser.add_argument("--num_envs", type=int, default=1) +parser.add_argument("--num_trials", type=int, default=1) +parser.add_argument("--teleop_device", type=str, default="keyboard") +parser.add_argument("--sensitivity", type=float, default=1.0) +AppLauncher.add_app_launcher_args(parser) +args_cli = parser.parse_args() +args_cli.enable_cameras = True +args_cli.headless = True # headless mode for batch execution +app_launcher = AppLauncher(vars(args_cli)) +simulation_app = app_launcher.app # ─────────── Runtime imports ─────────── import isaaclab.sim as sim_utils @@ -829,7 +829,8 @@ def sim_heuristic_manip(keys: list[str], args_cli: argparse.Namespace, config_pa success = True task_cfg = my_sim.from_data_to_task_cfg(key) my_sim.task_cfg = task_cfg - my_sim.trajectory_cfg_list = [task_cfg.reference_trajectory] + assert task_cfg.reference_trajectory is not None + my_sim.traj_cfg_list = [task_cfg.reference_trajectory] my_sim.save_data(ignore_keys=["segmask", "depth"], env_ids=success_ids[:1]) break # actions = np.load("outputs/lab16/demos/demo_5/env_000/actions.npy") @@ -838,11 +839,14 @@ def sim_heuristic_manip(keys: list[str], args_cli: argparse.Namespace, config_pa print("[ERR] no successful environments!") env.close() #simulation_app.close() - return True + return True +def main(): + base_dir = Path.cwd() + cfg = yaml.safe_load((base_dir / "config" / "config.yaml").open("r")) + keys = cfg["keys"] + sim_heuristic_manip(keys, args_cli) - -#if __name__ == "__main__": - #main() - #os.system("quit()") - #simulation_app.close() +if __name__ == "__main__": + main() + simulation_app.close() diff --git a/openreal2sim/simulation/isaaclab/sim_preprocess/usd_conversion.py b/openreal2sim/simulation/isaaclab/sim_preprocess/usd_conversion.py index 1056fcd..2e98438 100755 --- a/openreal2sim/simulation/isaaclab/sim_preprocess/usd_conversion.py +++ b/openreal2sim/simulation/isaaclab/sim_preprocess/usd_conversion.py @@ -9,7 +9,7 @@ AppLauncher.add_app_launcher_args(parser) args_isaaclab = parser.parse_args() # Force GUI unless you toggle via IsaacLab flags -args_isaaclab.headless = False +args_isaaclab.headless = True app_launcher = AppLauncher(args_isaaclab) simulation_app = app_launcher.app @@ -378,9 +378,11 @@ def count_rigid_api(usd_path): ######################################################## ## Per-key conversion (batch) ######################################################## -def run_conversion_for_key(key: str): +from typing import Optional +def run_conversion_for_key(key: str, args_cli:Optional[argparse.Namespace] = None): global args - + if args_cli is None: + args_cli = 
args_isaaclab scene_json = out_dir / key / "simulation" / "scene.json" assert scene_json.exists(), f"[{key}] scene.json not found: {scene_json}" scene_dict = json.load(open(scene_json, "r")) @@ -401,7 +403,7 @@ def run_conversion_for_key(key: str): args = Args(input=input_list, output=output_list, make_instanceable=False, collision_approximation="convexDecomposition", mass=1.0, disable_gravity=False, kinematic_enabled=False, - headless=args_isaaclab.headless, exit_on_finish=True) + headless=args_cli.headless, exit_on_finish=True) # Run the original main conversion logic for this key log.info(f"[{key}] converting {len(args.input)} assets...") @@ -553,11 +555,11 @@ def run_conversion(scene_dict: dict): log.info(f"Saved USD file to {os.path.abspath(output_path)}") -def usd_conversion(keys): +def usd_conversion(keys, args_cli): for key in keys: log.info(f"\n========== [USD Convert] key: {key} ==========") - run_conversion_for_key(key) - simulation_app.close() + run_conversion_for_key(key, args_cli) + #simulation_app.close() print('[Info] USD conversion completed.') ######################################################## diff --git a/openreal2sim/simulation/isaaclab/sim_randomize_rollout.py b/openreal2sim/simulation/isaaclab/sim_randomize_rollout.py index 530995a..4156ddd 100755 --- a/openreal2sim/simulation/isaaclab/sim_randomize_rollout.py +++ b/openreal2sim/simulation/isaaclab/sim_randomize_rollout.py @@ -13,8 +13,8 @@ import sys from isaaclab.app import AppLauncher from typing import Optional, List -from modules.envs.task_cfg import TaskCfg, TaskType, SuccessMetric, SuccessMetricType, TrajectoryCfg -from modules.envs.task_construct import construct_task_config, add_reference_trajectory, load_task_cfg +from modules.envs. task_cfg import TaskCfg, TaskType, SuccessMetric, SuccessMetricType, TrajectoryCfg +from modules.envs.task_construct import construct_task_config, add_reference_trajectory, load_task_cfg, add_generated_trajectories from modules.envs.randomizer import Randomizer file_path = Path(__file__).resolve() import imageio @@ -23,19 +23,19 @@ sys.path.append(str(file_path.parent.parent)) # ─────────── CLI ─────────── -# parser = argparse.ArgumentParser("sim_policy") -# parser.add_argument("--key", type=str, default="demo_video", help="scene key (outputs/)") -# parser.add_argument("--robot", type=str, default="franka") -# parser.add_argument("--num_envs", type=int, default=1) -# parser.add_argument("--num_trials", type=int, default=1) -# parser.add_argument("--teleop_device", type=str, default="keyboard") -# parser.add_argument("--sensitivity", type=float, default=1.0) -# AppLauncher.add_app_launcher_args(parser) -# args_cli = parser.parse_args() -# args_cli.enable_cameras = True -# args_cli.headless = False # headless mode for batch execution -# app_launcher = AppLauncher(vars(args_cli)) -# simulation_app = app_launcher.app +parser = argparse.ArgumentParser("sim_policy") +parser.add_argument("--key", type=str, default="demo_video", help="scene key (outputs/)") +parser.add_argument("--robot", type=str, default="franka") +parser.add_argument("--num_envs", type=int, default=1) +parser.add_argument("--num_trials", type=int, default=1) +parser.add_argument("--teleop_device", type=str, default="keyboard") +parser.add_argument("--sensitivity", type=float, default=1.0) +AppLauncher.add_app_launcher_args(parser) +args_cli = parser.parse_args() +args_cli.enable_cameras = True +args_cli.headless = False # headless mode for batch execution +app_launcher = AppLauncher(vars(args_cli)) 
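
# Not part of the patch: sim_randomize_rollout() further down reads args_cli.total_num (it
# replaces the previously hard-coded total_require_traj_num = 50) and args_cli.num_envs, but
# the CLI block above only defines --num_envs. A minimal sketch of the missing flag, assuming
# the old value of 50 as the default; it would need to be registered before parse_args():

parser.add_argument("--total_num", type=int, default=50,
                    help="number of successful randomized trajectories to collect per key")
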
+simulation_app = app_launcher.app # ─────────── Runtime imports ─────────── import isaaclab.sim as sim_utils @@ -551,7 +551,7 @@ def run_batch_trajectory(self, traj_cfg_list: List[TrajectoryCfg]): def sim_randomize_rollout(keys: list[str], args_cli: argparse.Namespace): for key in keys: success_trajectory_config_list = [] - total_require_traj_num = 50 + total_require_traj_num = args_cli.total_num task_json_path = BASE_DIR / "tasks" / key / "task.json" task_cfg = load_task_cfg(task_json_path) randomizer = Randomizer(task_cfg) @@ -564,7 +564,7 @@ def sim_randomize_rollout(keys: list[str], args_cli: argparse.Namespace): data_dir = BASE_DIR / "data" / key current_timestep = 0 env, _ = make_env( - cfgs=sim_cfgs, num_envs=3, + cfgs=sim_cfgs, num_envs=args_cli.num_envs, device=args_cli.device, bg_simplify=False, ) @@ -575,8 +575,8 @@ def sim_randomize_rollout(keys: list[str], args_cli: argparse.Namespace): my_sim = RandomizeExecution(sim, scene, sim_cfgs=sim_cfgs, demo_dir=demo_dir, data_dir=data_dir, record=True, args_cli=args_cli, bg_rgb=bg_rgb) my_sim.task_cfg = task_cfg while len(success_trajectory_config_list) < total_require_traj_num: - traj_cfg_list = random_task_cfg_list[current_timestep: current_timestep + 3] - current_timestep += 3 + traj_cfg_list = random_task_cfg_list[current_timestep: current_timestep + args_cli.num_envs] + current_timestep += args_cli.num_envs success_env_ids = my_sim.run_batch_trajectory(traj_cfg_list) @@ -594,7 +594,17 @@ def sim_randomize_rollout(keys: list[str], args_cli: argparse.Namespace): # success_env_ids = my_sim.inference() # del my_sim # torch.cuda.empty_cache() - + add_generated_trajectories(task_cfg, success_trajectory_config_list, task_json_path.parent) return success_trajectory_config_list +def main(): + base_dir = Path.cwd() + cfg = yaml.safe_load((base_dir / "config" / "config.yaml").open("r")) + keys = cfg["keys"] + sim_randomize_rollout(keys, args_cli) + + +if __name__ == "__main__": + main() + simulation_app.close() \ No newline at end of file diff --git a/openreal2sim/simulation/sim_agent.py b/openreal2sim/simulation/sim_agent.py index 486654b..fc5898b 100755 --- a/openreal2sim/simulation/sim_agent.py +++ b/openreal2sim/simulation/sim_agent.py @@ -34,7 +34,8 @@ def __init__(self, stage=None, key=None): # "hand_extraction", #"demo_motion_process", # "grasp_point_generation", - # "grasp_generation" + # "grasp_generation", + # "usd_generation" ] if stage is not None: @@ -133,87 +134,87 @@ def create_args_from_config(config_path: Optional[str] = None, config_dict: Opti return argparse.Namespace(**config_dict) -class IsaacAgent: - def __init__(self, stage=None, key=None): - print('[Info] Initializing IsaacAgent...') - self.base_dir = Path.cwd() - cfg_path = self.base_dir / "config" / "config.yaml" - cfg = yaml.safe_load(cfg_path.open("r")) - self.keys = [key] if key is not None else cfg["keys"] - self.key_cfgs = {key: compose_configs(key, cfg) for key in self.keys} - self.key_scene_dicts = {} - for key in self.keys: - scene_pkl = self.base_dir / f'outputs/{key}/scene/scene.pkl' - with open(scene_pkl, 'rb') as f: - scene_dict = pickle.load(f) - self.key_scene_dicts[key] = scene_dict - self.stages = [ - # "usd_conversion", - #"grasp_generation", - #"sim_heuristic_manip", - "sim_randomize_rollout", - - ] - if stage is not None: - if stage in self.stages: - start_idx = self.stages.index(stage) - self.stages = self.stages[start_idx:] - else: - print(f"[Warning] Stage '{stage}' not found. It must be one of the {self.stages}. 
Running all stages by default.") - print('[Info] IsaacAgent initialized.') +# class IsaacAgent: +# def __init__(self, stage=None, key=None): +# print('[Info] Initializing IsaacAgent...') +# self.base_dir = Path.cwd() +# cfg_path = self.base_dir / "config" / "config.yaml" +# cfg = yaml.safe_load(cfg_path.open("r")) +# self.keys = [key] if key is not None else cfg["keys"] +# self.key_cfgs = {key: compose_configs(key, cfg) for key in self.keys} +# self.key_scene_dicts = {} +# for key in self.keys: +# scene_pkl = self.base_dir / f'outputs/{key}/scene/scene.pkl' +# with open(scene_pkl, 'rb') as f: +# scene_dict = pickle.load(f) +# self.key_scene_dicts[key] = scene_dict +# self.stages = [ +# # "usd_conversion", +# #"grasp_generation", +# #"sim_heuristic_manip", +# "sim_randomize_rollout", + +# ] +# if stage is not None: +# if stage in self.stages: +# start_idx = self.stages.index(stage) +# self.stages = self.stages[start_idx:] +# else: +# print(f"[Warning] Stage '{stage}' not found. It must be one of the {self.stages}. Running all stages by default.") +# print('[Info] IsaacAgent initialized.') - def launch_isaaclab(self): - - self.args_cli = create_args_from_config() - #AppLauncher.add_app_launcher_args(self.args_cli) - self.args_cli.enable_cameras = True - self.args_cli.headless = True - self.app_launcher = AppLauncher(vars(self.args_cli)) - self.simulation_app = self.app_launcher.app - return self.simulation_app +# def launch_isaaclab(self): + +# self.args_cli = create_args_from_config() +# #AppLauncher.add_app_launcher_args(self.args_cli) +# self.args_cli.enable_cameras = True +# self.args_cli.headless = True +# self.app_launcher = AppLauncher(vars(self.args_cli)) +# self.simulation_app = self.app_launcher.app +# return self.simulation_app - def close_isaaclab(self): - self.simulation_app.close() +# def close_isaaclab(self): +# self.simulation_app.close() - def usd_conversion(self): - from isaaclab.sim_preprocess.usd_conversion import usd_conversion - usd_conversion(self.keys) - print('[Info] USD conversion completed.') +# def usd_conversion(self): +# from isaaclab.sim_preprocess.usd_conversion import usd_conversion +# usd_conversion(self.keys) +# print('[Info] USD conversion completed.') - def sim_heuristic_manip(self): - self.launch_isaaclab() - from isaaclab.sim_heuristic_manip import sim_heuristic_manip - sim_heuristic_manip(self.keys, args_cli=self.args_cli) - print('[Info] Heuristic manipulation simulation completed.') - self.close_isaaclab() +# def sim_heuristic_manip(self): +# self.launch_isaaclab() +# from isaaclab.sim_heuristic_manip import sim_heuristic_manip +# sim_heuristic_manip(self.keys, args_cli=self.args_cli) +# print('[Info] Heuristic manipulation simulation completed.') +# self.close_isaaclab() - def sim_randomize_rollout(self): - self.launch_isaaclab() - from isaaclab.sim_randomize_rollout import sim_randomize_rollout - sim_randomize_rollout(self.keys, args_cli=self.args_cli) - print('[Info] Randomize rollout completed.') - self.close_isaaclab() - - def grasp_generation(self): - from modules.grasp_preprocess.grasp_generation import grasp_generation - grasp_generation(self.keys) - print('[Info] Grasp proposal generation completed.') +# def sim_randomize_rollout(self): +# self.launch_isaaclab() +# from isaaclab.sim_randomize_rollout import sim_randomize_rollout +# sim_randomize_rollout(self.keys, args_cli=self.args_cli) +# print('[Info] Randomize rollout completed.') +# self.close_isaaclab() + +# def grasp_generation(self): +# from modules.grasp_preprocess.grasp_generation 
import grasp_generation +# grasp_generation(self.keys) +# print('[Info] Grasp proposal generation completed.') - def run(self): - if "usd_conversion" in self.stages: - self.usd_conversion() - if "grasp_generation" in self.stages: - self.grasp_generation() - if "launch_isaaclab" in self.stages: - self.launch_isaaclab() - if "sim_heuristic_manip" in self.stages: - self.sim_heuristic_manip() - - if "sim_randomize_rollout" in self.stages: - self.sim_randomize_rollout() - if "close_isaaclab" in self.stages: - self.close_isaaclab() +# def run(self): +# if "usd_conversion" in self.stages: +# self.usd_conversion() +# if "grasp_generation" in self.stages: +# self.grasp_generation() +# if "launch_isaaclab" in self.stages: +# self.launch_isaaclab() +# if "sim_heuristic_manip" in self.stages: +# self.sim_heuristic_manip() + +# if "sim_randomize_rollout" in self.stages: +# self.sim_randomize_rollout() +# if "close_isaaclab" in self.stages: +# self.close_isaaclab() if __name__ == '__main__': args = argparse.ArgumentParser() @@ -235,14 +236,3 @@ def run(self): if args.label: notify_failed(args.label) raise - - try: - agent = IsaacAgent(stage=args.stage, key=args.key) - scene_dicts = agent.run() - - if args.label: - notify_success(args.label) - except Exception as e: - if args.label: - notify_failed(args.label) - raise \ No newline at end of file diff --git a/openreal2sim/simulation/sim_agent.sh b/openreal2sim/simulation/sim_agent.sh new file mode 100644 index 0000000..ae79bae --- /dev/null +++ b/openreal2sim/simulation/sim_agent.sh @@ -0,0 +1,136 @@ +#!/usr/bin/env bash +set -euo pipefail + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +ROOT_DIR="$(cd "${SCRIPT_DIR}/.." && pwd)" + +# Default parameters +KEY="demo_video" +STAGE="" +NUM_ENVS=1 +NUM_TRIALS=1 +DEVICE="cuda:0" +HEADLESS="--headless" +TOTAL_NUM=10 + +usage() { + cat <<'EOF' +Usage: sim_agent.sh [options] + +Options: + --key Scene key to process (default: demo_video) + --stage Starting stage: usd_conversion | sim_heuristic_manip | sim_randomize_rollout + --num_envs Number of parallel environments for heuristic/randomize (default: 1) + --num_trials Number of trials for heuristic stage (default: 1) + --device Device for simulations, e.g. cuda:0 (default: cuda:0) + --no-headless Disable headless mode (passes through to AppLauncher) + -h, --help Show this message and exit + +The script runs the following stages in order (starting from --stage if provided): + 1. USD conversion (`isaaclab/sim_preprocess/usd_conversion.py`) + 2. Heuristic manipulation (`isaaclab/sim_heuristic_manip.py`) + 3. 
Randomized rollout (`isaaclab/sim_randomize_rollout.py`) +EOF +} + +# Parse arguments +while [[ $# -gt 0 ]]; do + case "$1" in + --key) + KEY="${2:?Missing value for --key}" + shift 2 + ;; + --stage) + STAGE="${2:?Missing value for --stage}" + shift 2 + ;; + --num_envs) + NUM_ENVS="${2:?Missing value for --num_envs}" + shift 2 + ;; + --num_trials) + NUM_TRIALS="${2:?Missing value for --num_trials}" + shift 2 + ;; + --device) + DEVICE="${2:?Missing value for --device}" + shift 2 + ;; + --no-headless) + HEADLESS="" + shift + ;; + -h|--help) + usage + exit 0 + ;; + *) + echo "[ERR] Unknown option: $1" >&2 + usage >&2 + exit 1 + ;; + esac +done + +# Determine which stages to run +declare -a PIPELINE=("usd_conversion" "sim_heuristic_manip" "sim_randomize_rollout") +declare -a RUN_STAGES=() +if [[ -z "${STAGE}" ]]; then + RUN_STAGES=("${PIPELINE[@]}") +else + found="" + for stage in "${PIPELINE[@]}"; do + if [[ -z "${found}" && "${stage}" == "${STAGE}" ]]; then + found="yes" + fi + if [[ -n "${found}" ]]; then + RUN_STAGES+=("${stage}") + fi + done + if [[ ${#RUN_STAGES[@]} -eq 0 ]]; then + echo "[ERR] Invalid --stage '${STAGE}'. Valid options: ${PIPELINE[*]}" >&2 + exit 1 + fi +fi + +export PYTHONPATH="${ROOT_DIR}:${PYTHONPATH:-}" +cd "${ROOT_DIR}" + +# Stage entry points live under the isaaclab/ directory next to this script. +run_stage() { + local stage="$1" + echo "==============================" + echo "[RUN] Stage: ${stage}" + echo "==============================" + case "${stage}" in + usd_conversion) + python "${SCRIPT_DIR}/isaaclab/sim_preprocess/usd_conversion.py" ${HEADLESS} + ;; + sim_heuristic_manip) + python "${SCRIPT_DIR}/isaaclab/sim_heuristic_manip.py" \ + --key "${KEY}" \ + --num_envs "${NUM_ENVS}" \ + --num_trials "${NUM_TRIALS}" \ + --device "${DEVICE}" \ + ${HEADLESS} + ;; + sim_randomize_rollout) + python "${SCRIPT_DIR}/isaaclab/sim_randomize_rollout.py" \ + --key "${KEY}" \ + --num_envs "${NUM_ENVS}" \ + --device "${DEVICE}" \ + --total_num "${TOTAL_NUM}" \ + ${HEADLESS} + ;; + *) + echo "[ERR] Unsupported stage '${stage}'" >&2 + exit 1 + ;; + esac +} + +for stage in "${RUN_STAGES[@]}"; do + run_stage "${stage}" +done + +echo "[DONE] Pipeline completed: ${RUN_STAGES[*]}" +
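
For reference, the per-key configuration resolution introduced in utils/compose_config.py (and used by every __main__ block in this patch) can be exercised on its own. The sketch below is not part of the patch: it uses a hypothetical in-memory config instead of config/config.yaml, assumes only the default "gpu" parameter defined in default_config, and assumes openreal2sim/simulation is on sys.path as sim_agent.py arranges.

    from utils.compose_config import compose_configs

    cfg = {
        "keys": ["demo_video", "other_key"],
        "global": {"simulation": {"gpu": "1"}},
        "local": {"demo_video": {"simulation": {"gpu": "2"}}},
    }

    # per-key value wins over the global one, which wins over default_config
    assert compose_configs("demo_video", cfg) == {"gpu": "2"}
    # keys without a local override fall back to the global simulation block
    assert compose_configs("other_key", cfg) == {"gpu": "1"}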