From 47860b0adf964f57765165374962e38e02dc8abf Mon Sep 17 00:00:00 2001 From: Sriram Krishna Date: Tue, 30 Dec 2025 04:49:38 -0500 Subject: [PATCH 1/2] some misc changes for working with droid --- src/lfd3d/datasets/droid/render_robotiq.py | 18 +++++++++--------- src/lfd3d/datasets/lerobot/lerobot_dataset.py | 5 +++++ src/lfd3d/utils/script_utils.py | 1 + 3 files changed, 15 insertions(+), 9 deletions(-) diff --git a/src/lfd3d/datasets/droid/render_robotiq.py b/src/lfd3d/datasets/droid/render_robotiq.py index 5dbe0f8..91cc577 100644 --- a/src/lfd3d/datasets/droid/render_robotiq.py +++ b/src/lfd3d/datasets/droid/render_robotiq.py @@ -11,10 +11,11 @@ import tensorflow_datasets as tfds import trimesh from lfd3d.utils.data_utils import combine_meshes -from robot_descriptions.loaders.mujoco import load_robot_description from scipy.spatial.transform import Rotation from tqdm import tqdm +from robot_descriptions.loaders.mujoco import load_robot_description + """ NOTE: MJCF needs to be modified to add free joint TODO: Dynamically add to MJCF to avoid this. @@ -214,16 +215,15 @@ def process_item( # Sample point cloud from its surface for saving # NOTE: Keeping a seed is very important so that we maintain correspondences over samples. points_, _ = trimesh.sample.sample_surface(mesh_, sample_n_points, seed=42) - - homogenous_append = np.ones((sample_n_points, 1)) - gripper_urdf_3d_pos = np.concatenate([points_, homogenous_append], axis=-1)[ - :, :, None - ] - urdf_cam3dcoords = (world_to_cam @ gripper_urdf_3d_pos)[:, :3].squeeze(2) - - gripper_pcds.append(urdf_cam3dcoords) + gripper_pcds.append(points_) if idx % 100 == 0: + homogenous_append = np.ones((sample_n_points, 1)) + gripper_urdf_3d_pos = np.concatenate([points_, homogenous_append], axis=-1)[ + :, :, None + ] + urdf_cam3dcoords = (world_to_cam @ gripper_urdf_3d_pos)[:, :3].squeeze(2) + images = np.array([i["observation"][cam_image_key] for i in steps]) os.makedirs(f"{viz_dir}/{idx}", exist_ok=True) urdf_proj_hom = (K @ urdf_cam3dcoords.T).T diff --git a/src/lfd3d/datasets/lerobot/lerobot_dataset.py b/src/lfd3d/datasets/lerobot/lerobot_dataset.py index 9da89b8..7dfe85c 100644 --- a/src/lfd3d/datasets/lerobot/lerobot_dataset.py +++ b/src/lfd3d/datasets/lerobot/lerobot_dataset.py @@ -34,6 +34,10 @@ def source_of_data(item): """ Return the source of the current demo i.e. where the data is from """ + return item["embodiment"] + # Code below is deprecated and only kept for future reference. + # Embodiment is expected to be baked into the dataset + """ # Currently identifying demo type by checking number of vertices # For human data, we have 778 vertices from the MANO mesh # For aloha data, we're sampling 500 points from the mesh. @@ -50,6 +54,7 @@ def source_of_data(item): raise NotImplementedError return demo_type + """ class RpadLeRobotDataset(BaseDataset): diff --git a/src/lfd3d/utils/script_utils.py b/src/lfd3d/utils/script_utils.py index 53d9c09..7c064e6 100644 --- a/src/lfd3d/utils/script_utils.py +++ b/src/lfd3d/utils/script_utils.py @@ -174,6 +174,7 @@ def load_checkpoint_config_from_wandb( "dataset.additional_img_dir", "dataset.use_intermediate_frames", "dataset.data_sources", + "dataset.val_episode_ratio", ], ): """ From d6105a235b4ae5a2b4f6ecf9c6b04fde5880db2a Mon Sep 17 00:00:00 2001 From: Sriram Krishna Date: Tue, 30 Dec 2025 06:39:01 -0500 Subject: [PATCH 2/2] update articubot to work with latest changes --- configs/model/articubot.yaml | 4 +- src/lfd3d/datasets/base_data.py | 10 ++-- src/lfd3d/datasets/lerobot/lerobot_dataset.py | 46 ++++++++++++++----- src/lfd3d/models/articubot.py | 41 ++++++++++++++--- 4 files changed, 76 insertions(+), 25 deletions(-) diff --git a/configs/model/articubot.yaml b/configs/model/articubot.yaml index e8d3b29..5a39090 100644 --- a/configs/model/articubot.yaml +++ b/configs/model/articubot.yaml @@ -2,12 +2,12 @@ name: articubot # Model settings type: cross_displacement # (flow, point) Type of model -in_channels: 4 # Number of input channels (7 - xyz + one-hot mask for action_pcd + rgb) +in_channels: 7 # Number of input channels (7 - xyz + one-hot mask for action_pcd + rgb) num_classes: 13 # xyz of 4 points and weight of points keep_gripper_in_fps: False add_action_pcd_masked: True # Add in action pcd with boolean mask (increment in_channels) use_text_embedding: True # If true, expects (siglip) text embedding to be provided as input -use_rgb: False # If true, expects +3 in_chanels +use_rgb: True # If true, expects +3 in_chanels use_dual_head: False # If true, maintain separate heads for human/robot inputs is_gmm: True # Train a GMM and minimize negative log likelihood instead diff --git a/src/lfd3d/datasets/base_data.py b/src/lfd3d/datasets/base_data.py index 1772ead..bafbf19 100644 --- a/src/lfd3d/datasets/base_data.py +++ b/src/lfd3d/datasets/base_data.py @@ -289,15 +289,15 @@ def augment_scene_pcd(self, scene_pcd, feat_flat, augment_cfg): raise NotImplementedError if augment_cfg["augment_transform"]: - # Apply random SO(2) rotation (around Y-axis as pcd is in camera frame) and translation + # Apply random SO(2) rotation (around Z-axis as pcd is in world frame) and translation theta = np.random.uniform(0, 2 * np.pi) # Random angle R = np.array( [ - [np.cos(theta), 0, np.sin(theta)], - [0, 1, 0], - [-np.sin(theta), 0, np.cos(theta)], + [np.cos(theta), -np.sin(theta), 0], + [np.sin(theta), np.cos(theta), 0], + [0, 0, 1], ] - ) # Rotation matrix around Y-axis + ) # Rotation matrix around Z-axis translation = np.random.uniform( -0.2, 0.2, size=3 ) # Random translation vector diff --git a/src/lfd3d/datasets/lerobot/lerobot_dataset.py b/src/lfd3d/datasets/lerobot/lerobot_dataset.py index 7dfe85c..a66babd 100644 --- a/src/lfd3d/datasets/lerobot/lerobot_dataset.py +++ b/src/lfd3d/datasets/lerobot/lerobot_dataset.py @@ -273,19 +273,41 @@ def __getitem__(self, index): primary_K = all_intrinsics_aug[0] primary_T = all_extrinsics[0] - # Compute scene PCD from primary camera - rgb_embed, text_embed = self.rgb_text_featurizer.compute_rgb_text_feat( - primary_rgbs[0], caption - ) - start_scene_pcd, start_scene_feat_pcd, augment_tf = self.get_scene_pcd( - rgb_embed, primary_depths[0], primary_K, self.num_points, self.max_depth - ) + # Compute scene PCD from all cameras and combine in world frame + all_scene_pcds_world = [] + all_scene_feat_pcds = [] + augment_tf = None - # Transform scene PCD from camera frame to world frame - # start_scene_pcd is (N, 3) in camera frame, transform using primary extrinsics - start_scene_pcd_world = self._transform_to_world_frame( - start_scene_pcd, primary_T - ) + for i, (rgbs, depths, K, T) in enumerate( + zip(all_rgbs_aug, all_depths_aug, all_intrinsics_aug, all_extrinsics) + ): + # Compute RGB embedding for this camera + rgb_embed_cam, text_embed = self.rgb_text_featurizer.compute_rgb_text_feat( + rgbs[0], caption + ) + + # Compute scene PCD in camera frame with points_per_camera + scene_pcd_cam, scene_feat_pcd_cam, augment_tf_cam = self.get_scene_pcd( + rgb_embed_cam, + depths[0], + K, + self.num_points // len(all_rgbs_aug), + self.max_depth, + ) + + # Store augment_tf from primary camera (first iteration) + if augment_tf is None: + augment_tf = augment_tf_cam + + # Transform to world frame + scene_pcd_world = self._transform_to_world_frame(scene_pcd_cam, T) + + all_scene_pcds_world.append(scene_pcd_world) + all_scene_feat_pcds.append(scene_feat_pcd_cam) + + # Concatenate point clouds from all cameras + start_scene_pcd_world = np.concatenate(all_scene_pcds_world, axis=0) + start_scene_feat_pcd = np.concatenate(all_scene_feat_pcds, axis=0) gripper_idx = self.GRIPPER_IDX[data_source] diff --git a/src/lfd3d/models/articubot.py b/src/lfd3d/models/articubot.py index efc1582..725a44f 100644 --- a/src/lfd3d/models/articubot.py +++ b/src/lfd3d/models/articubot.py @@ -1303,9 +1303,7 @@ def log_viz_to_wandb(self, batch, pred_dict, weighted_displacement, tag): gt_pcd, pcd_mean, pcd_std, R, t, scene_centroid ) - # All points cloud are in the start image's coordinate frame - # We need to visualize the end image, therefore need to apply transform - # Transform the point clouds to align with end image coordinate frame + # Transform to end frame pcd_endframe = np.hstack((pcd, np.ones((pcd.shape[0], 1)))) pcd_endframe = (end2start @ pcd_endframe.T).T[:, :3] all_pred_pcd_tmp = [] @@ -1317,6 +1315,37 @@ def log_viz_to_wandb(self, batch, pred_dict, weighted_displacement, tag): gt_pcd = np.hstack((gt_pcd, np.ones((gt_pcd.shape[0], 1)))) gt_pcd = (end2start @ gt_pcd.T).T[:, :3] + # Transform from world frame to primary camera frame for projection + # Primary camera extrinsics: T_world_from_cam, we need T_cam_from_world + primary_extrinsics = batch["extrinsics"][viz_idx].cpu().numpy() # (4, 4) + T_cam_from_world = np.linalg.inv(primary_extrinsics) + + # Save world-frame versions for action_anchor_pcd visualization + pcd_world = pcd.copy() + anchor_pcd_world = anchor_pcd.copy() + + # Transform points to primary camera frame + # Transform initial pcd (for init_rgb_proj) + pcd_cam = np.hstack((pcd, np.ones((pcd.shape[0], 1)))) + pcd_cam = (T_cam_from_world @ pcd_cam.T).T[:, :3] + + pcd_endframe = np.hstack((pcd_endframe, np.ones((pcd_endframe.shape[0], 1)))) + pcd_endframe = (T_cam_from_world @ pcd_endframe.T).T[:, :3] + + all_pred_pcd_tmp = [] + for i in range(N): + tmp_pcd = np.hstack((all_pred_pcd[i], np.ones((all_pred_pcd.shape[1], 1)))) + tmp_pcd = (T_cam_from_world @ tmp_pcd.T).T[:, :3] + all_pred_pcd_tmp.append(tmp_pcd) + all_pred_pcd = np.stack(all_pred_pcd_tmp) + + gt_pcd = np.hstack((gt_pcd, np.ones((gt_pcd.shape[0], 1)))) + gt_pcd = (T_cam_from_world @ gt_pcd.T).T[:, :3] + + # Also transform anchor_pcd to camera frame + anchor_pcd = np.hstack((anchor_pcd, np.ones((anchor_pcd.shape[0], 1)))) + anchor_pcd = (T_cam_from_world @ anchor_pcd.T).T[:, :3] + K = batch["intrinsics"][viz_idx].cpu().numpy() rgb_init, rgb_end = ( @@ -1329,7 +1358,7 @@ def log_viz_to_wandb(self, batch, pred_dict, weighted_displacement, tag): ) # Project tracks to image and save - init_rgb_proj = project_pcd_on_image(pcd, padding_mask, rgb_init, K, GREEN) + init_rgb_proj = project_pcd_on_image(pcd_cam, padding_mask, rgb_init, K, GREEN) end_rgb_proj = project_pcd_on_image(gt_pcd, padding_mask, rgb_end, K, RED) pred_rgb_proj = project_pcd_on_image( all_pred_pcd[-1], padding_mask, rgb_end, K, BLUE @@ -1363,8 +1392,8 @@ def log_viz_to_wandb(self, batch, pred_dict, weighted_displacement, tag): # Visualize action/anchor point cloud action_anchor_pcd = get_action_anchor_pcd( - pcd, - anchor_pcd, + pcd_world, + anchor_pcd_world, GREEN, RED, )