Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions configs/inference/rpadLerobot_vit_3dgp.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
# Inference config for rpadLerobot + vit_3dgp: declares no overrides of its
# own, only the base_inference entry in the defaults list.
defaults:
  - base_inference
42 changes: 42 additions & 0 deletions configs/model/vit_3dgp.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
name: vit_3dgp

# Model settings
# Type of model.
type: cross_displacement
# When true, text is expected to be provided as an input.
use_text_embedding: true
# Adds one additional token for the gripper.
use_gripper_token: true
# When true, adds a learnable token for the data source (human vs. robot).
use_source_token: true
# Number of self-attention blocks.
num_transformer_layers: 4

# ViT backbone settings (trainable, from scratch)
# Sized to be comparable to ResNet-18 (~11M params)
img_size: 224
patch_size: 16  # 224 / 16 = 14 patches per side, assuming non-overlapping patches
# 384 is the ViT-S width; DINO ViT-B uses 768, so this lines up with DINO
# ViT-S rather than ViT-B.
# NOTE(review): confirm which DINO variant this dim must stay compatible with.
vit_embed_dim: 384
vit_depth: 6  # Number of transformer layers in ViT
vit_num_heads: 6  # Number of attention heads (384 / 6 = 64 dims per head)
vit_mlp_ratio: 4.0  # MLP hidden dim ratio

# NOTE(review): dropout rate; which layers apply it is not visible in this config.
dropout: 0.1

# Positional encoding settings
# true -> Fourier positional encoding; false -> MLP positional encoding.
use_fourier_pe: false
# Number of frequency bands used by the Fourier encoding.
fourier_num_frequencies: 21
# Whether the input coordinates themselves are included in the Fourier encoding.
fourier_include_input: true

# Train a GMM and minimize its negative log likelihood.
is_gmm: true
# Variance values used for the GMM.
fixed_variance: [0.01, 0.05, 0.1, 0.25, 0.5]
# Coefficient for the NLL loss term when uniform mixing weights are used.
uniform_weights_coeff: 0.1

# Optimal Transport loss settings (for domain adaptation)
# Enables the optimal transport loss that aligns human/robot latent distributions.
use_ot_loss: false
# Weight for combining the OT loss with the main loss.
ot_alpha: 0.05
# Discount factor for matching latents.
ot_lambda: 0.1
# Regularization parameter for the Sinkhorn algorithm.
ot_epsilon: 0.1
# Percentile threshold for determining the best matches.
ot_percentile: 0.1

# Model-specific training augmentations
# Enables image token dropout during training.
image_token_dropout: true
# Probability of applying the gripper noise augmentation.
gripper_noise_prob: 0.4
# Gripper noise magnitudes: translation (meters), rotation (degrees),
# gripper width (meters).
gripper_noise_translation: 0.01
gripper_noise_rotation: 3.0
gripper_noise_width: 0.005
18 changes: 18 additions & 0 deletions configs/training/rpadLerobot_vit_3dgp.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
# Training config for rpadLerobot + vit_3dgp; the defaults list pulls in the
# base_train entry, and the keys that follow override it.
defaults:
  - base_train

# vit 3dgp overrides augment_train with image_color_only (safe for multiview).
augment_train: 'image_color_only'

# Training length and batch sizes.
epochs: 100
batch_size: 128
val_batch_size: 128

# ModelCheckpoint configurations.
# Each child of `checkpoints` is one named checkpoint entry with its own
# monitor/mode pair. The nesting matters: written flat, the two monitor/mode
# pairs become duplicate keys and the entry names parse as null.
checkpoints:
  rmse:
    monitor: val/rmse
    mode: min
  rmse_and_std_combi:
    monitor: val/rmse_and_std_combi
    mode: min
Loading
Loading