Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions configs/model/articubot.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -2,12 +2,12 @@ name: articubot

# Model settings
type: cross_displacement # (flow, point) Type of model
in_channels: 4 # Number of input channels (7 - xyz + one-hot mask for action_pcd + rgb)
in_channels: 7 # Number of input channels (7 - xyz + one-hot mask for action_pcd + rgb)
num_classes: 13 # xyz of 4 points and weight of points
keep_gripper_in_fps: False
add_action_pcd_masked: True # Add in action pcd with boolean mask (increment in_channels)
use_text_embedding: True # If true, expects (siglip) text embedding to be provided as input
use_rgb: False # If true, expects +3 in_chanels
use_rgb: True # If true, expects +3 in_chanels
use_dual_head: False # If true, maintain separate heads for human/robot inputs

is_gmm: True # Train a GMM and minimize negative log likelihood instead
Expand Down
10 changes: 5 additions & 5 deletions src/lfd3d/datasets/base_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -289,15 +289,15 @@ def augment_scene_pcd(self, scene_pcd, feat_flat, augment_cfg):
raise NotImplementedError

if augment_cfg["augment_transform"]:
# Apply random SO(2) rotation (around Y-axis as pcd is in camera frame) and translation
# Apply random SO(2) rotation (around Z-axis as pcd is in world frame) and translation
theta = np.random.uniform(0, 2 * np.pi) # Random angle
R = np.array(
[
[np.cos(theta), 0, np.sin(theta)],
[0, 1, 0],
[-np.sin(theta), 0, np.cos(theta)],
[np.cos(theta), -np.sin(theta), 0],
[np.sin(theta), np.cos(theta), 0],
[0, 0, 1],
]
) # Rotation matrix around Y-axis
) # Rotation matrix around Z-axis
translation = np.random.uniform(
-0.2, 0.2, size=3
) # Random translation vector
Expand Down
18 changes: 9 additions & 9 deletions src/lfd3d/datasets/droid/render_robotiq.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,10 +11,11 @@
import tensorflow_datasets as tfds
import trimesh
from lfd3d.utils.data_utils import combine_meshes
from robot_descriptions.loaders.mujoco import load_robot_description
from scipy.spatial.transform import Rotation
from tqdm import tqdm

from robot_descriptions.loaders.mujoco import load_robot_description

"""
NOTE: MJCF needs to be modified to add free joint
TODO: Dynamically add to MJCF to avoid this.
Expand Down Expand Up @@ -214,16 +215,15 @@ def process_item(
# Sample point cloud from its surface for saving
# NOTE: Keeping a seed is very important so that we maintain correspondences over samples.
points_, _ = trimesh.sample.sample_surface(mesh_, sample_n_points, seed=42)

homogenous_append = np.ones((sample_n_points, 1))
gripper_urdf_3d_pos = np.concatenate([points_, homogenous_append], axis=-1)[
:, :, None
]
urdf_cam3dcoords = (world_to_cam @ gripper_urdf_3d_pos)[:, :3].squeeze(2)

gripper_pcds.append(urdf_cam3dcoords)
gripper_pcds.append(points_)

if idx % 100 == 0:
homogenous_append = np.ones((sample_n_points, 1))
gripper_urdf_3d_pos = np.concatenate([points_, homogenous_append], axis=-1)[
:, :, None
]
urdf_cam3dcoords = (world_to_cam @ gripper_urdf_3d_pos)[:, :3].squeeze(2)

images = np.array([i["observation"][cam_image_key] for i in steps])
os.makedirs(f"{viz_dir}/{idx}", exist_ok=True)
urdf_proj_hom = (K @ urdf_cam3dcoords.T).T
Expand Down
51 changes: 39 additions & 12 deletions src/lfd3d/datasets/lerobot/lerobot_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,10 @@ def source_of_data(item):
"""
Return the source of the current demo i.e. where the data is from
"""
return item["embodiment"]
# Code below is deprecated and only kept for future reference.
# Embodiment is expected to be baked into the dataset
"""
# Currently identifying demo type by checking number of vertices
# For human data, we have 778 vertices from the MANO mesh
# For aloha data, we're sampling 500 points from the mesh.
Expand All @@ -50,6 +54,7 @@ def source_of_data(item):
raise NotImplementedError

return demo_type
"""


class RpadLeRobotDataset(BaseDataset):
Expand Down Expand Up @@ -268,19 +273,41 @@ def __getitem__(self, index):
primary_K = all_intrinsics_aug[0]
primary_T = all_extrinsics[0]

# Compute scene PCD from primary camera
rgb_embed, text_embed = self.rgb_text_featurizer.compute_rgb_text_feat(
primary_rgbs[0], caption
)
start_scene_pcd, start_scene_feat_pcd, augment_tf = self.get_scene_pcd(
rgb_embed, primary_depths[0], primary_K, self.num_points, self.max_depth
)
# Compute scene PCD from all cameras and combine in world frame
all_scene_pcds_world = []
all_scene_feat_pcds = []
augment_tf = None

# Transform scene PCD from camera frame to world frame
# start_scene_pcd is (N, 3) in camera frame, transform using primary extrinsics
start_scene_pcd_world = self._transform_to_world_frame(
start_scene_pcd, primary_T
)
for i, (rgbs, depths, K, T) in enumerate(
zip(all_rgbs_aug, all_depths_aug, all_intrinsics_aug, all_extrinsics)
):
# Compute RGB embedding for this camera
rgb_embed_cam, text_embed = self.rgb_text_featurizer.compute_rgb_text_feat(
rgbs[0], caption
)

# Compute scene PCD in camera frame with points_per_camera
scene_pcd_cam, scene_feat_pcd_cam, augment_tf_cam = self.get_scene_pcd(
rgb_embed_cam,
depths[0],
K,
self.num_points // len(all_rgbs_aug),
self.max_depth,
)

# Store augment_tf from primary camera (first iteration)
if augment_tf is None:
augment_tf = augment_tf_cam

# Transform to world frame
scene_pcd_world = self._transform_to_world_frame(scene_pcd_cam, T)

all_scene_pcds_world.append(scene_pcd_world)
all_scene_feat_pcds.append(scene_feat_pcd_cam)

# Concatenate point clouds from all cameras
start_scene_pcd_world = np.concatenate(all_scene_pcds_world, axis=0)
start_scene_feat_pcd = np.concatenate(all_scene_feat_pcds, axis=0)

gripper_idx = self.GRIPPER_IDX[data_source]

Expand Down
41 changes: 35 additions & 6 deletions src/lfd3d/models/articubot.py
Original file line number Diff line number Diff line change
Expand Up @@ -1303,9 +1303,7 @@ def log_viz_to_wandb(self, batch, pred_dict, weighted_displacement, tag):
gt_pcd, pcd_mean, pcd_std, R, t, scene_centroid
)

# All points cloud are in the start image's coordinate frame
# We need to visualize the end image, therefore need to apply transform
# Transform the point clouds to align with end image coordinate frame
# Transform to end frame
pcd_endframe = np.hstack((pcd, np.ones((pcd.shape[0], 1))))
pcd_endframe = (end2start @ pcd_endframe.T).T[:, :3]
all_pred_pcd_tmp = []
Expand All @@ -1317,6 +1315,37 @@ def log_viz_to_wandb(self, batch, pred_dict, weighted_displacement, tag):
gt_pcd = np.hstack((gt_pcd, np.ones((gt_pcd.shape[0], 1))))
gt_pcd = (end2start @ gt_pcd.T).T[:, :3]

# Transform from world frame to primary camera frame for projection
# Primary camera extrinsics: T_world_from_cam, we need T_cam_from_world
primary_extrinsics = batch["extrinsics"][viz_idx].cpu().numpy() # (4, 4)
T_cam_from_world = np.linalg.inv(primary_extrinsics)

# Save world-frame versions for action_anchor_pcd visualization
pcd_world = pcd.copy()
anchor_pcd_world = anchor_pcd.copy()

# Transform points to primary camera frame
# Transform initial pcd (for init_rgb_proj)
pcd_cam = np.hstack((pcd, np.ones((pcd.shape[0], 1))))
pcd_cam = (T_cam_from_world @ pcd_cam.T).T[:, :3]

pcd_endframe = np.hstack((pcd_endframe, np.ones((pcd_endframe.shape[0], 1))))
pcd_endframe = (T_cam_from_world @ pcd_endframe.T).T[:, :3]

all_pred_pcd_tmp = []
for i in range(N):
tmp_pcd = np.hstack((all_pred_pcd[i], np.ones((all_pred_pcd.shape[1], 1))))
tmp_pcd = (T_cam_from_world @ tmp_pcd.T).T[:, :3]
all_pred_pcd_tmp.append(tmp_pcd)
all_pred_pcd = np.stack(all_pred_pcd_tmp)

gt_pcd = np.hstack((gt_pcd, np.ones((gt_pcd.shape[0], 1))))
gt_pcd = (T_cam_from_world @ gt_pcd.T).T[:, :3]

# Also transform anchor_pcd to camera frame
anchor_pcd = np.hstack((anchor_pcd, np.ones((anchor_pcd.shape[0], 1))))
anchor_pcd = (T_cam_from_world @ anchor_pcd.T).T[:, :3]

K = batch["intrinsics"][viz_idx].cpu().numpy()

rgb_init, rgb_end = (
Expand All @@ -1329,7 +1358,7 @@ def log_viz_to_wandb(self, batch, pred_dict, weighted_displacement, tag):
)

# Project tracks to image and save
init_rgb_proj = project_pcd_on_image(pcd, padding_mask, rgb_init, K, GREEN)
init_rgb_proj = project_pcd_on_image(pcd_cam, padding_mask, rgb_init, K, GREEN)
end_rgb_proj = project_pcd_on_image(gt_pcd, padding_mask, rgb_end, K, RED)
pred_rgb_proj = project_pcd_on_image(
all_pred_pcd[-1], padding_mask, rgb_end, K, BLUE
Expand Down Expand Up @@ -1363,8 +1392,8 @@ def log_viz_to_wandb(self, batch, pred_dict, weighted_displacement, tag):

# Visualize action/anchor point cloud
action_anchor_pcd = get_action_anchor_pcd(
pcd,
anchor_pcd,
pcd_world,
anchor_pcd_world,
GREEN,
RED,
)
Expand Down
1 change: 1 addition & 0 deletions src/lfd3d/utils/script_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -174,6 +174,7 @@ def load_checkpoint_config_from_wandb(
"dataset.additional_img_dir",
"dataset.use_intermediate_frames",
"dataset.data_sources",
"dataset.val_episode_ratio",
],
):
"""
Expand Down
Loading