From 47860b0adf964f57765165374962e38e02dc8abf Mon Sep 17 00:00:00 2001
From: Sriram Krishna <sriramsk1999@gmail.com>
Date: Tue, 30 Dec 2025 04:49:38 -0500
Subject: [PATCH 1/2] some misc changes for working with droid

---
 src/lfd3d/datasets/droid/render_robotiq.py    | 18 +++++++++---------
 src/lfd3d/datasets/lerobot/lerobot_dataset.py |  5 +++++
 src/lfd3d/utils/script_utils.py               |  1 +
 3 files changed, 15 insertions(+), 9 deletions(-)

diff --git a/src/lfd3d/datasets/droid/render_robotiq.py b/src/lfd3d/datasets/droid/render_robotiq.py
index 5dbe0f8..91cc577 100644
--- a/src/lfd3d/datasets/droid/render_robotiq.py
+++ b/src/lfd3d/datasets/droid/render_robotiq.py
@@ -11,10 +11,11 @@
 import tensorflow_datasets as tfds
 import trimesh
 from lfd3d.utils.data_utils import combine_meshes
-from robot_descriptions.loaders.mujoco import load_robot_description
 from scipy.spatial.transform import Rotation
 from tqdm import tqdm
 
+from robot_descriptions.loaders.mujoco import load_robot_description
+
 """
 NOTE: MJCF needs to be modified to add free joint
 TODO: Dynamically add to MJCF to avoid this.
@@ -214,16 +215,15 @@ def process_item(
         # Sample point cloud from its surface for saving
         # NOTE: Keeping a seed is very important so that we maintain correspondences over samples.
         points_, _ = trimesh.sample.sample_surface(mesh_, sample_n_points, seed=42)
-
-        homogenous_append = np.ones((sample_n_points, 1))
-        gripper_urdf_3d_pos = np.concatenate([points_, homogenous_append], axis=-1)[
-            :, :, None
-        ]
-        urdf_cam3dcoords = (world_to_cam @ gripper_urdf_3d_pos)[:, :3].squeeze(2)
-
-        gripper_pcds.append(urdf_cam3dcoords)
+        gripper_pcds.append(points_)
 
         if idx % 100 == 0:
+            homogenous_append = np.ones((sample_n_points, 1))
+            gripper_urdf_3d_pos = np.concatenate([points_, homogenous_append], axis=-1)[
+                :, :, None
+            ]
+            urdf_cam3dcoords = (world_to_cam @ gripper_urdf_3d_pos)[:, :3].squeeze(2)
+
             images = np.array([i["observation"][cam_image_key] for i in steps])
             os.makedirs(f"{viz_dir}/{idx}", exist_ok=True)
             urdf_proj_hom = (K @ urdf_cam3dcoords.T).T
diff --git a/src/lfd3d/datasets/lerobot/lerobot_dataset.py b/src/lfd3d/datasets/lerobot/lerobot_dataset.py
index 9da89b8..7dfe85c 100644
--- a/src/lfd3d/datasets/lerobot/lerobot_dataset.py
+++ b/src/lfd3d/datasets/lerobot/lerobot_dataset.py
@@ -34,6 +34,10 @@ def source_of_data(item):
     """
     Return the source of the current demo i.e. where the data is from
     """
+    return item["embodiment"]
+    # Code below is deprecated and only kept for future reference.
+    # Embodiment is expected to be baked into the dataset
+    """
     # Currently identifying demo type by checking number of vertices
     # For human data, we have 778 vertices from the MANO mesh
     # For aloha data, we're sampling 500 points from the mesh.
@@ -50,6 +54,7 @@ def source_of_data(item):
         raise NotImplementedError
 
     return demo_type
+    """
 
 
 class RpadLeRobotDataset(BaseDataset):
diff --git a/src/lfd3d/utils/script_utils.py b/src/lfd3d/utils/script_utils.py
index 53d9c09..7c064e6 100644
--- a/src/lfd3d/utils/script_utils.py
+++ b/src/lfd3d/utils/script_utils.py
@@ -174,6 +174,7 @@ def load_checkpoint_config_from_wandb(
         "dataset.additional_img_dir",
         "dataset.use_intermediate_frames",
         "dataset.data_sources",
+        "dataset.val_episode_ratio",
     ],
 ):
     """

From d6105a235b4ae5a2b4f6ecf9c6b04fde5880db2a Mon Sep 17 00:00:00 2001
From: Sriram Krishna <sriramsk1999@gmail.com>
Date: Tue, 30 Dec 2025 06:39:01 -0500
Subject: [PATCH 2/2] update articubot to work with latest changes

---
 configs/model/articubot.yaml                  |  4 +-
 src/lfd3d/datasets/base_data.py               | 10 ++--
 src/lfd3d/datasets/lerobot/lerobot_dataset.py | 46 ++++++++++++++-----
 src/lfd3d/models/articubot.py                 | 41 ++++++++++++++---
 4 files changed, 76 insertions(+), 25 deletions(-)

diff --git a/configs/model/articubot.yaml b/configs/model/articubot.yaml
index e8d3b29..5a39090 100644
--- a/configs/model/articubot.yaml
+++ b/configs/model/articubot.yaml
@@ -2,12 +2,12 @@ name: articubot
 
 # Model settings
 type: cross_displacement # (flow, point) Type of model
-in_channels: 4 # Number of input channels (7 - xyz + one-hot mask for action_pcd + rgb)
+in_channels: 7 # Number of input channels (7 - xyz + one-hot mask for action_pcd + rgb)
 num_classes: 13 # xyz of 4 points and weight of points
 keep_gripper_in_fps: False
 add_action_pcd_masked: True # Add in action pcd with boolean mask (increment in_channels)
 use_text_embedding: True # If true, expects (siglip) text embedding to be provided as input
-use_rgb: False # If true, expects +3 in_chanels
+use_rgb: True # If true, expects +3 in_chanels
 use_dual_head: False # If true, maintain separate heads for human/robot inputs
 
 is_gmm: True # Train a GMM and minimize negative log likelihood instead
diff --git a/src/lfd3d/datasets/base_data.py b/src/lfd3d/datasets/base_data.py
index 1772ead..bafbf19 100644
--- a/src/lfd3d/datasets/base_data.py
+++ b/src/lfd3d/datasets/base_data.py
@@ -289,15 +289,15 @@ def augment_scene_pcd(self, scene_pcd, feat_flat, augment_cfg):
             raise NotImplementedError
 
         if augment_cfg["augment_transform"]:
-            # Apply random SO(2) rotation (around Y-axis as pcd is in camera frame) and translation
+            # Apply random SO(2) rotation (around Z-axis as pcd is in world frame) and translation
             theta = np.random.uniform(0, 2 * np.pi)  # Random angle
             R = np.array(
                 [
-                    [np.cos(theta), 0, np.sin(theta)],
-                    [0, 1, 0],
-                    [-np.sin(theta), 0, np.cos(theta)],
+                    [np.cos(theta), -np.sin(theta), 0],
+                    [np.sin(theta), np.cos(theta), 0],
+                    [0, 0, 1],
                 ]
-            )  # Rotation matrix around Y-axis
+            )  # Rotation matrix around Z-axis
             translation = np.random.uniform(
                 -0.2, 0.2, size=3
             )  # Random translation vector
diff --git a/src/lfd3d/datasets/lerobot/lerobot_dataset.py b/src/lfd3d/datasets/lerobot/lerobot_dataset.py
index 7dfe85c..a66babd 100644
--- a/src/lfd3d/datasets/lerobot/lerobot_dataset.py
+++ b/src/lfd3d/datasets/lerobot/lerobot_dataset.py
@@ -273,19 +273,41 @@ def __getitem__(self, index):
         primary_K = all_intrinsics_aug[0]
         primary_T = all_extrinsics[0]
 
-        # Compute scene PCD from primary camera
-        rgb_embed, text_embed = self.rgb_text_featurizer.compute_rgb_text_feat(
-            primary_rgbs[0], caption
-        )
-        start_scene_pcd, start_scene_feat_pcd, augment_tf = self.get_scene_pcd(
-            rgb_embed, primary_depths[0], primary_K, self.num_points, self.max_depth
-        )
+        # Compute scene PCD from all cameras and combine in world frame
+        all_scene_pcds_world = []
+        all_scene_feat_pcds = []
+        augment_tf = None
 
-        # Transform scene PCD from camera frame to world frame
-        # start_scene_pcd is (N, 3) in camera frame, transform using primary extrinsics
-        start_scene_pcd_world = self._transform_to_world_frame(
-            start_scene_pcd, primary_T
-        )
+        for i, (rgbs, depths, K, T) in enumerate(
+            zip(all_rgbs_aug, all_depths_aug, all_intrinsics_aug, all_extrinsics)
+        ):
+            # Compute RGB embedding for this camera
+            rgb_embed_cam, text_embed = self.rgb_text_featurizer.compute_rgb_text_feat(
+                rgbs[0], caption
+            )
+
+            # Compute scene PCD in camera frame with points_per_camera
+            scene_pcd_cam, scene_feat_pcd_cam, augment_tf_cam = self.get_scene_pcd(
+                rgb_embed_cam,
+                depths[0],
+                K,
+                self.num_points // len(all_rgbs_aug),
+                self.max_depth,
+            )
+
+            # Store augment_tf from primary camera (first iteration)
+            if augment_tf is None:
+                augment_tf = augment_tf_cam
+
+            # Transform to world frame
+            scene_pcd_world = self._transform_to_world_frame(scene_pcd_cam, T)
+
+            all_scene_pcds_world.append(scene_pcd_world)
+            all_scene_feat_pcds.append(scene_feat_pcd_cam)
+
+        # Concatenate point clouds from all cameras
+        start_scene_pcd_world = np.concatenate(all_scene_pcds_world, axis=0)
+        start_scene_feat_pcd = np.concatenate(all_scene_feat_pcds, axis=0)
 
         gripper_idx = self.GRIPPER_IDX[data_source]
 
diff --git a/src/lfd3d/models/articubot.py b/src/lfd3d/models/articubot.py
index efc1582..725a44f 100644
--- a/src/lfd3d/models/articubot.py
+++ b/src/lfd3d/models/articubot.py
@@ -1303,9 +1303,7 @@ def log_viz_to_wandb(self, batch, pred_dict, weighted_displacement, tag):
             gt_pcd, pcd_mean, pcd_std, R, t, scene_centroid
         )
 
-        # All points cloud are in the start image's coordinate frame
-        # We need to visualize the end image, therefore need to apply transform
-        # Transform the point clouds to align with end image coordinate frame
+        # Transform to end frame
         pcd_endframe = np.hstack((pcd, np.ones((pcd.shape[0], 1))))
         pcd_endframe = (end2start @ pcd_endframe.T).T[:, :3]
         all_pred_pcd_tmp = []
@@ -1317,6 +1315,37 @@ def log_viz_to_wandb(self, batch, pred_dict, weighted_displacement, tag):
         gt_pcd = np.hstack((gt_pcd, np.ones((gt_pcd.shape[0], 1))))
         gt_pcd = (end2start @ gt_pcd.T).T[:, :3]
 
+        # Transform from world frame to primary camera frame for projection
+        # Primary camera extrinsics: T_world_from_cam, we need T_cam_from_world
+        primary_extrinsics = batch["extrinsics"][viz_idx].cpu().numpy()  # (4, 4)
+        T_cam_from_world = np.linalg.inv(primary_extrinsics)
+
+        # Save world-frame versions for action_anchor_pcd visualization
+        pcd_world = pcd.copy()
+        anchor_pcd_world = anchor_pcd.copy()
+
+        # Transform points to primary camera frame
+        # Transform initial pcd (for init_rgb_proj)
+        pcd_cam = np.hstack((pcd, np.ones((pcd.shape[0], 1))))
+        pcd_cam = (T_cam_from_world @ pcd_cam.T).T[:, :3]
+
+        pcd_endframe = np.hstack((pcd_endframe, np.ones((pcd_endframe.shape[0], 1))))
+        pcd_endframe = (T_cam_from_world @ pcd_endframe.T).T[:, :3]
+
+        all_pred_pcd_tmp = []
+        for i in range(N):
+            tmp_pcd = np.hstack((all_pred_pcd[i], np.ones((all_pred_pcd.shape[1], 1))))
+            tmp_pcd = (T_cam_from_world @ tmp_pcd.T).T[:, :3]
+            all_pred_pcd_tmp.append(tmp_pcd)
+        all_pred_pcd = np.stack(all_pred_pcd_tmp)
+
+        gt_pcd = np.hstack((gt_pcd, np.ones((gt_pcd.shape[0], 1))))
+        gt_pcd = (T_cam_from_world @ gt_pcd.T).T[:, :3]
+
+        # Also transform anchor_pcd to camera frame
+        anchor_pcd = np.hstack((anchor_pcd, np.ones((anchor_pcd.shape[0], 1))))
+        anchor_pcd = (T_cam_from_world @ anchor_pcd.T).T[:, :3]
+
         K = batch["intrinsics"][viz_idx].cpu().numpy()
 
         rgb_init, rgb_end = (
@@ -1329,7 +1358,7 @@ def log_viz_to_wandb(self, batch, pred_dict, weighted_displacement, tag):
         )
 
         # Project tracks to image and save
-        init_rgb_proj = project_pcd_on_image(pcd, padding_mask, rgb_init, K, GREEN)
+        init_rgb_proj = project_pcd_on_image(pcd_cam, padding_mask, rgb_init, K, GREEN)
         end_rgb_proj = project_pcd_on_image(gt_pcd, padding_mask, rgb_end, K, RED)
         pred_rgb_proj = project_pcd_on_image(
             all_pred_pcd[-1], padding_mask, rgb_end, K, BLUE
@@ -1363,8 +1392,8 @@ def log_viz_to_wandb(self, batch, pred_dict, weighted_displacement, tag):
 
         # Visualize action/anchor point cloud
         action_anchor_pcd = get_action_anchor_pcd(
-            pcd,
-            anchor_pcd,
+            pcd_world,
+            anchor_pcd_world,
             GREEN,
             RED,
         )