From ded8d41fddf0d6632020c406eac9fa02f5c3d480 Mon Sep 17 00:00:00 2001 From: Wu Bingqian Date: Sat, 27 Jun 2026 00:04:31 +0800 Subject: [PATCH] Fix dataset asset download layout --- AGENTS.md | 4 +-- docs/docs/getting-started/download-assets.md | 4 +-- docs/docs/reference/assets.md | 4 +-- docs/docs/reference/dataset.md | 15 +++++---- docs/docs/tutorials/training.md | 19 +++++------ .../getting-started/download-assets.md | 4 +-- .../current/reference/assets.md | 4 +-- .../current/reference/dataset.md | 14 ++++---- .../current/tutorials/training.md | 18 +++++------ scripts/setup/download_assets.py | 17 ++++++++-- teleopit/runtime/external_assets.py | 8 ++++- tests/test_download_assets.py | 32 ++++++++++++++++--- tests/test_train_script.py | 2 +- train_mimic/scripts/benchmark.py | 2 +- train_mimic/scripts/play.py | 6 ++-- train_mimic/scripts/train.py | 10 +++--- .../tasks/tracking/config/constants.py | 2 +- 17 files changed, 105 insertions(+), 60 deletions(-) diff --git a/AGENTS.md b/AGENTS.md index b9293986..5024575d 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -225,8 +225,8 @@ Quick reference: python train_mimic/scripts/data/build_dataset.py --spec train_mimic/configs/datasets/twist2.yaml python scripts/run/record_pico_motion.py python train_mimic/scripts/data/build_dataset.py --spec data/pico_motion/pico_recorded.yaml --force -python train_mimic/scripts/data/precompute_dataset.py data/datasets/seed --outdir data/datasets/seed_precomputed --jobs 8 -python train_mimic/scripts/train.py --motion_file data/datasets/seed_precomputed +python train_mimic/scripts/data/precompute_dataset.py data/datasets --outdir data/datasets_precomputed --jobs 8 +python train_mimic/scripts/train.py --motion_file data/datasets_precomputed python train_mimic/scripts/data/precompute_dataset.py data/datasets/twist2 --outdir data/datasets/twist2_precomputed --jobs 8 --force python train_mimic/scripts/save_onnx.py --checkpoint logs/rsl_rl/g1_general_tracking//model_30000.pt --output policy.onnx --history_length 10 ``` diff --git a/docs/docs/getting-started/download-assets.md b/docs/docs/getting-started/download-assets.md index 994d9486..81854a30 100644 --- a/docs/docs/getting-started/download-assets.md +++ b/docs/docs/getting-started/download-assets.md @@ -31,7 +31,7 @@ Downloaded file sizes change as checkpoints, datasets, and asset bundles are upd |------------|---------| | `track.onnx` | ONNX inference model | | `track.pt` | PyTorch checkpoint for resume training | -| `data/datasets/seed/shard_*.h5` | Minimal motion dataset; run precompute before training | +| `data/datasets//shard_*.h5` | Minimal motion datasets; run precompute before training | | `data/sample_bvh/*.bvh` | Sample motion files | | `assets/robots/unitree_g1/` | Canonical G1 XML and meshes used by training, sim2sim, retargeting, and FK validation | | `teleopit/retargeting/gmr/assets/` | GMR retargeting assets, IK configs, and non-canonical robot descriptions | @@ -44,6 +44,6 @@ Downloaded file sizes change as checkpoints, datasets, and asset bundles are upd | `robots` | `BingqianWu/Teleopit-models` | Canonical robot XML/meshes | | `gmr` | `BingqianWu/Teleopit-models` | GMR retargeting assets | | `bvh` | `BingqianWu/Teleopit-models` | Sample BVH motion files | -| `data` | `BingqianWu/Teleopit-datasets` | Training/validation shards | +| `data` | `BingqianWu/Teleopit-datasets` | Minimal shards for `lafan1`, `pico_record`, `seed`, and `twist2` | For asset management details (uploading, versioning), see [Asset Management](../reference/assets). diff --git a/docs/docs/reference/assets.md b/docs/docs/reference/assets.md index 92322c10..daf811a3 100644 --- a/docs/docs/reference/assets.md +++ b/docs/docs/reference/assets.md @@ -37,7 +37,7 @@ Datasets, checkpoints, robot models, and demo media are not tracked in Git. They | `robots` | Teleopit-models | `archives/robot_assets.tar.gz` | | `gmr` | Teleopit-models | `archives/gmr_assets.tar.gz` | | `bvh` | Teleopit-models | `archives/sample_bvh.tar.gz` | -| `data` | Teleopit-datasets | `data/` | +| `data` | Teleopit-datasets | `data/datasets/*/*.h5` (`lafan1`, `pico_record`, `seed`, `twist2`) | ## Download @@ -66,7 +66,7 @@ Local paths after download: | `archives/robot_assets.tar.gz` | `assets/robots/` (extracted) | | `archives/gmr_assets.tar.gz` | `teleopit/retargeting/gmr/assets/` (extracted) | | `archives/sample_bvh.tar.gz` | `data/sample_bvh/` (extracted) | -| `data/` | `data/datasets/seed/` | +| `data/datasets/*/*.h5` | `data/datasets/` | ## Upload to ModelScope diff --git a/docs/docs/reference/dataset.md b/docs/docs/reference/dataset.md index b5580847..08e9178c 100644 --- a/docs/docs/reference/dataset.md +++ b/docs/docs/reference/dataset.md @@ -10,12 +10,13 @@ sidebar_position: 3 python scripts/setup/download_assets.py --only robots data ``` -Then precompute the training shard and train with the precomputed dataset root: +Then precompute all downloaded datasets and train with the combined precomputed +dataset root: ```bash python train_mimic/scripts/data/precompute_dataset.py \ - data/datasets/seed --outdir data/datasets/seed_precomputed --jobs 8 -python train_mimic/scripts/train.py --motion_file data/datasets/seed_precomputed + data/datasets --outdir data/datasets_precomputed --jobs 8 +python train_mimic/scripts/train.py --motion_file data/datasets_precomputed ``` For custom dataset construction, read on. @@ -63,7 +64,7 @@ python train_mimic/scripts/data/build_dataset.py \ data/datasets// └── shard_*.h5 -data/datasets/_precomputed/ +data/datasets_precomputed// └── shard_*.h5 ``` @@ -71,7 +72,7 @@ data/datasets/_precomputed/ - If the spec is all `pkl` or `seed_csv` sources, the builder takes a batch path producing shards directly - `build_dataset.py` only writes the minimal distributable dataset. It does not run FK precompute. - `precompute_dataset.py` writes a separate training dataset containing the minimal motion plus precomputed joint velocities and body FK/velocities. -- Training accepts only the precomputed dataset directory. It recursively discovers precomputed `*.h5` shards below the specified root, so precomputed datasets can be merged by placing multiple shard directories under one parent. +- Training accepts only the precomputed dataset directory. It recursively discovers precomputed `*.h5` shards below the specified root, so use `data/datasets_precomputed` to train on all downloaded datasets together. - Training loads all discovered precomputed motion windows into memory at startup. Joint velocities and body FK/velocities are not computed during training. ## YAML Spec Format @@ -143,9 +144,9 @@ python train_mimic/scripts/data/build_dataset.py \ python train_mimic/scripts/data/build_dataset.py \ --spec train_mimic/configs/datasets/twist2.yaml --json -# Generate a precomputed training dataset from an existing minimal dataset +# Generate one combined precomputed training dataset from all downloaded minimal datasets python train_mimic/scripts/data/precompute_dataset.py \ - data/datasets/twist2 --outdir data/datasets/twist2_precomputed --jobs 8 --force + data/datasets --outdir data/datasets_precomputed --jobs 8 --force # Inspect a dataset root python train_mimic/scripts/data/inspect_dataset.py data/datasets/twist2 diff --git a/docs/docs/tutorials/training.md b/docs/docs/tutorials/training.md index cdd0efb7..fcf5cc2a 100644 --- a/docs/docs/tutorials/training.md +++ b/docs/docs/tutorials/training.md @@ -23,12 +23,13 @@ Verify: python -c "import train_mimic.tasks; print('training OK')" ``` -Download the minimal seed dataset and generate the precomputed training shard: +Download the distributed minimal datasets and generate the combined precomputed +training dataset: ```bash python scripts/setup/download_assets.py --only robots data python train_mimic/scripts/data/precompute_dataset.py \ - data/datasets/seed --outdir data/datasets/seed_precomputed --jobs 8 + data/datasets --outdir data/datasets_precomputed --jobs 8 ``` ## Training @@ -39,7 +40,7 @@ python train_mimic/scripts/data/precompute_dataset.py \ python train_mimic/scripts/train.py \ --num_envs 64 \ --max_iterations 100 \ - --motion_file data/datasets/seed_precomputed + --motion_file data/datasets_precomputed ``` ### Full Training @@ -48,7 +49,7 @@ python train_mimic/scripts/train.py \ python train_mimic/scripts/train.py \ --num_envs 4096 \ --max_iterations 30000 \ - --motion_file data/datasets/seed_precomputed + --motion_file data/datasets_precomputed ``` ### Multi-GPU @@ -58,7 +59,7 @@ python train_mimic/scripts/train.py \ --gpu_ids 0 1 2 3 \ --num_envs 1024 \ --max_iterations 30000 \ - --motion_file data/datasets/seed_precomputed + --motion_file data/datasets_precomputed ``` ### Multi-Node Multi-GPU @@ -75,7 +76,7 @@ torchrun \ train_mimic/scripts/train.py \ --num_envs 1024 \ --max_iterations 1000 \ - --motion_file data/datasets/seed_precomputed + --motion_file data/datasets_precomputed ``` **Notes:** @@ -105,7 +106,7 @@ The exported model is a dual-input ONNX (`obs` + `obs_history`). The inference s ```bash python train_mimic/scripts/play.py \ --checkpoint logs/rsl_rl/g1_general_tracking//model_30000.pt \ - --motion_file data/datasets/seed_precomputed + --motion_file data/datasets_precomputed ``` ### Benchmark @@ -113,7 +114,7 @@ python train_mimic/scripts/play.py \ ```bash python train_mimic/scripts/benchmark.py \ --checkpoint logs/rsl_rl/g1_general_tracking//model_30000.pt \ - --motion_file data/datasets/seed_precomputed \ + --motion_file data/datasets_precomputed \ --num_envs 1 ``` @@ -122,7 +123,7 @@ python train_mimic/scripts/benchmark.py \ ```bash python train_mimic/scripts/benchmark.py \ --checkpoint logs/rsl_rl/g1_general_tracking//model_30000.pt \ - --motion_file data/datasets/seed_precomputed \ + --motion_file data/datasets_precomputed \ --num_envs 1 \ --video \ --video_length 600 diff --git a/docs/i18n/zh-Hans/docusaurus-plugin-content-docs/current/getting-started/download-assets.md b/docs/i18n/zh-Hans/docusaurus-plugin-content-docs/current/getting-started/download-assets.md index e8f17538..9c6463ac 100644 --- a/docs/i18n/zh-Hans/docusaurus-plugin-content-docs/current/getting-started/download-assets.md +++ b/docs/i18n/zh-Hans/docusaurus-plugin-content-docs/current/getting-started/download-assets.md @@ -31,7 +31,7 @@ checkpoint、数据集和资源包更新后,下载文件大小会变化。下 |----------|------| | `track.onnx` | ONNX 推理模型 | | `track.pt` | 用于恢复训练的 PyTorch checkpoint | -| `data/datasets/seed/shard_*.h5` | 最小运动数据集;训练前需先预计算 | +| `data/datasets//shard_*.h5` | 最小运动数据集;训练前需先预计算 | | `data/sample_bvh/*.bvh` | 示例动捕文件 | | `assets/robots/unitree_g1/` | 训练、sim2sim、重定向和 FK 校验共用的 G1 canonical XML 与 mesh | | `teleopit/retargeting/gmr/assets/` | GMR 重定向资源、IK 配置和非 canonical 机器人描述 | @@ -44,6 +44,6 @@ checkpoint、数据集和资源包更新后,下载文件大小会变化。下 | `robots` | `BingqianWu/Teleopit-models` | Canonical 机器人 XML/mesh | | `gmr` | `BingqianWu/Teleopit-models` | GMR 重定向资源 | | `bvh` | `BingqianWu/Teleopit-models` | 示例 BVH 动捕文件 | -| `data` | `BingqianWu/Teleopit-datasets` | 训练 / 验证数据分片 | +| `data` | `BingqianWu/Teleopit-datasets` | `lafan1`、`pico_record`、`seed`、`twist2` 的最小 shard | 资源管理的更多细节(上传、版本控制等)请参阅 [资源管理](../reference/assets)。 diff --git a/docs/i18n/zh-Hans/docusaurus-plugin-content-docs/current/reference/assets.md b/docs/i18n/zh-Hans/docusaurus-plugin-content-docs/current/reference/assets.md index 2451e7b7..98ba4a86 100644 --- a/docs/i18n/zh-Hans/docusaurus-plugin-content-docs/current/reference/assets.md +++ b/docs/i18n/zh-Hans/docusaurus-plugin-content-docs/current/reference/assets.md @@ -37,7 +37,7 @@ sidebar_position: 2 | `robots` | Teleopit-models | `archives/robot_assets.tar.gz` | | `gmr` | Teleopit-models | `archives/gmr_assets.tar.gz` | | `bvh` | Teleopit-models | `archives/sample_bvh.tar.gz` | -| `data` | Teleopit-datasets | `data/` | +| `data` | Teleopit-datasets | `data/datasets/*/*.h5`(`lafan1`、`pico_record`、`seed`、`twist2`) | ## 下载 @@ -66,7 +66,7 @@ python scripts/setup/download_assets.py --source huggingface | `archives/robot_assets.tar.gz` | `assets/robots/`(自动解压) | | `archives/gmr_assets.tar.gz` | `teleopit/retargeting/gmr/assets/`(自动解压) | | `archives/sample_bvh.tar.gz` | `data/sample_bvh/`(自动解压) | -| `data/` | `data/datasets/seed/` | +| `data/datasets/*/*.h5` | `data/datasets/` | ## 上传到 ModelScope diff --git a/docs/i18n/zh-Hans/docusaurus-plugin-content-docs/current/reference/dataset.md b/docs/i18n/zh-Hans/docusaurus-plugin-content-docs/current/reference/dataset.md index eb497301..93fb2f21 100644 --- a/docs/i18n/zh-Hans/docusaurus-plugin-content-docs/current/reference/dataset.md +++ b/docs/i18n/zh-Hans/docusaurus-plugin-content-docs/current/reference/dataset.md @@ -10,12 +10,12 @@ sidebar_position: 3 python scripts/setup/download_assets.py --only robots data ``` -下载后先生成预计算训练 shard,再把预计算数据集根目录用于训练: +下载后先预计算所有已下载数据集,再把合并后的预计算数据集根目录用于训练: ```bash python train_mimic/scripts/data/precompute_dataset.py \ - data/datasets/seed --outdir data/datasets/seed_precomputed --jobs 8 -python train_mimic/scripts/train.py --motion_file data/datasets/seed_precomputed + data/datasets --outdir data/datasets_precomputed --jobs 8 +python train_mimic/scripts/train.py --motion_file data/datasets_precomputed ``` 如需自定义构建,继续阅读下文。 @@ -61,7 +61,7 @@ python train_mimic/scripts/data/build_dataset.py \ data/datasets// └── shard_*.h5 -data/datasets/_precomputed/ +data/datasets_precomputed// └── shard_*.h5 ``` @@ -69,7 +69,7 @@ data/datasets/_precomputed/ - 若 spec 全部是 `pkl` 或 `seed_csv` source,builder 会直接并行产出 shard,默认不写中间 clip 文件 - `build_dataset.py` 只写最小分发数据集,不执行 FK 预计算。 - `precompute_dataset.py` 会写出独立的训练数据集,里面包含最小运动数据以及预计算的 joint velocity 和 body FK/velocity。 -- 训练只接受预计算后的数据集目录。它会递归发现指定根目录下的预计算 `*.h5` shard,因此可以把多个预计算数据集目录放到同一个父目录下完成合并。 +- 训练只接受预计算后的数据集目录。它会递归发现指定根目录下的预计算 `*.h5` shard,因此使用 `data/datasets_precomputed` 可以一起训练所有已下载数据集。 - 训练会在启动时把所有发现的预计算 motion window 全量加载到内存中。joint velocity 和 body FK/velocity 不会在训练时计算。 ## YAML spec @@ -139,9 +139,9 @@ python train_mimic/scripts/data/build_dataset.py \ python train_mimic/scripts/data/build_dataset.py \ --spec train_mimic/configs/datasets/twist2.yaml --json -# 从已有最小数据集生成预计算训练数据集 +# 从所有已下载最小数据集生成合并后的预计算训练数据集 python train_mimic/scripts/data/precompute_dataset.py \ - data/datasets/twist2 --outdir data/datasets/twist2_precomputed --jobs 8 --force + data/datasets --outdir data/datasets_precomputed --jobs 8 --force # 查看数据集统计 python train_mimic/scripts/data/inspect_dataset.py data/datasets/twist2 diff --git a/docs/i18n/zh-Hans/docusaurus-plugin-content-docs/current/tutorials/training.md b/docs/i18n/zh-Hans/docusaurus-plugin-content-docs/current/tutorials/training.md index 84a9b366..1de8d8e1 100644 --- a/docs/i18n/zh-Hans/docusaurus-plugin-content-docs/current/tutorials/training.md +++ b/docs/i18n/zh-Hans/docusaurus-plugin-content-docs/current/tutorials/training.md @@ -23,12 +23,12 @@ pip install -e '.[train]' python -c "import train_mimic.tasks; print('training OK')" ``` -下载最小 seed 数据集,并生成预计算训练 shard: +下载分发的最小数据集,并生成合并后的预计算训练数据集: ```bash python scripts/setup/download_assets.py --only robots data python train_mimic/scripts/data/precompute_dataset.py \ - data/datasets/seed --outdir data/datasets/seed_precomputed --jobs 8 + data/datasets --outdir data/datasets_precomputed --jobs 8 ``` ## 训练 @@ -39,7 +39,7 @@ python train_mimic/scripts/data/precompute_dataset.py \ python train_mimic/scripts/train.py \ --num_envs 64 \ --max_iterations 100 \ - --motion_file data/datasets/seed_precomputed + --motion_file data/datasets_precomputed ``` ### 完整训练 @@ -48,7 +48,7 @@ python train_mimic/scripts/train.py \ python train_mimic/scripts/train.py \ --num_envs 4096 \ --max_iterations 30000 \ - --motion_file data/datasets/seed_precomputed + --motion_file data/datasets_precomputed ``` ### 多卡训练 @@ -58,7 +58,7 @@ python train_mimic/scripts/train.py \ --gpu_ids 0 1 2 3 \ --num_envs 1024 \ --max_iterations 30000 \ - --motion_file data/datasets/seed_precomputed + --motion_file data/datasets_precomputed ``` ### 多机多卡训练 @@ -75,7 +75,7 @@ torchrun \ train_mimic/scripts/train.py \ --num_envs 1024 \ --max_iterations 1000 \ - --motion_file data/datasets/seed_precomputed + --motion_file data/datasets_precomputed ``` **注意事项:** @@ -105,7 +105,7 @@ python train_mimic/scripts/save_onnx.py \ ```bash python train_mimic/scripts/play.py \ --checkpoint logs/rsl_rl/g1_general_tracking//model_30000.pt \ - --motion_file data/datasets/seed_precomputed + --motion_file data/datasets_precomputed ``` ### 定量评估 @@ -113,7 +113,7 @@ python train_mimic/scripts/play.py \ ```bash python train_mimic/scripts/benchmark.py \ --checkpoint logs/rsl_rl/g1_general_tracking//model_30000.pt \ - --motion_file data/datasets/seed_precomputed \ + --motion_file data/datasets_precomputed \ --num_envs 1 ``` @@ -122,7 +122,7 @@ python train_mimic/scripts/benchmark.py \ ```bash python train_mimic/scripts/benchmark.py \ --checkpoint logs/rsl_rl/g1_general_tracking//model_30000.pt \ - --motion_file data/datasets/seed_precomputed \ + --motion_file data/datasets_precomputed \ --num_envs 1 \ --video \ --video_length 600 diff --git a/scripts/setup/download_assets.py b/scripts/setup/download_assets.py index 77d3dae3..674632e1 100755 --- a/scripts/setup/download_assets.py +++ b/scripts/setup/download_assets.py @@ -56,6 +56,17 @@ def _resolve_entry_source(repo_cache: Path, entry: AssetEntry) -> Path: return repo_cache / entry.remote_path +def _entry_allow_patterns(entry: AssetEntry) -> list[str]: + if entry.allow_patterns: + return list(entry.allow_patterns) + return [f"{entry.remote_path}*"] + + +def _clear_cached_entry_sources(repo_cache: Path, entries: list[AssetEntry]) -> None: + for entry in entries: + _remove_path(_resolve_entry_source(repo_cache, entry)) + + def _copy_path(src: Path, dst: Path) -> None: dst.parent.mkdir(parents=True, exist_ok=True) if src.is_dir(): @@ -113,8 +124,9 @@ def download_all(groups, cache_dir): if not repo_entries: continue repo_type = repo_type_map[repo_id] - allow_patterns = [f"{e.remote_path}*" for e in repo_entries] + allow_patterns = [pattern for entry in repo_entries for pattern in _entry_allow_patterns(entry)] repo_cache = cache_dir / repo_type / repo_id.split("/")[-1] + _clear_cached_entry_sources(repo_cache, repo_entries) print(f"\nDownloading {repo_id} ({repo_type}) to {repo_cache} ...") print(f"Fetching: {[e.remote_path for e in repo_entries]}") @@ -155,8 +167,9 @@ def download_all_hf(groups, cache_dir): if not repo_entries: continue repo_type = repo_type_map[repo_id] - allow_patterns = [f"{e.remote_path}*" for e in repo_entries] + allow_patterns = [pattern for entry in repo_entries for pattern in _entry_allow_patterns(entry)] repo_cache = cache_dir / repo_type / repo_id.split("/")[-1] + _clear_cached_entry_sources(repo_cache, repo_entries) print(f"\nDownloading {repo_id} ({repo_type}) from HuggingFace to {repo_cache} ...") print(f"Fetching: {[e.remote_path for e in repo_entries]}") diff --git a/teleopit/runtime/external_assets.py b/teleopit/runtime/external_assets.py index 6fdfb7c9..1f63c929 100644 --- a/teleopit/runtime/external_assets.py +++ b/teleopit/runtime/external_assets.py @@ -16,6 +16,7 @@ class AssetEntry: local_path: str repo: str = "model" # "model" or "dataset" mode: str = "copy" + allow_patterns: tuple[str, ...] = field(default_factory=tuple) ASSET_GROUPS: dict[str, list[AssetEntry]] = { @@ -48,6 +49,11 @@ class AssetEntry: ), ], "data": [ - AssetEntry("data", "data/datasets/seed", repo="dataset"), + AssetEntry( + "data/datasets", + "data/datasets", + repo="dataset", + allow_patterns=("data/datasets/*/*.h5",), + ), ], } diff --git a/tests/test_download_assets.py b/tests/test_download_assets.py index bdd383c1..9b7a0f26 100644 --- a/tests/test_download_assets.py +++ b/tests/test_download_assets.py @@ -7,9 +7,14 @@ if str(PROJECT_ROOT) not in sys.path: sys.path.insert(0, str(PROJECT_ROOT)) -from scripts.setup.download_assets import _resolve_entry_source, _safe_extract_tar +from scripts.setup.download_assets import ( + _clear_cached_entry_sources, + _entry_allow_patterns, + _resolve_entry_source, + _safe_extract_tar, +) from scripts.setup.prepare_modelscope_assets import _archive_directory -from teleopit.runtime.external_assets import AssetEntry +from teleopit.runtime.external_assets import ASSET_GROUPS, AssetEntry def test_archive_round_trip(tmp_path: Path) -> None: @@ -43,11 +48,30 @@ def test_resolve_entry_source_uses_only_current_remote_layout(tmp_path: Path) -> def test_robot_asset_group_uses_archive_layout() -> None: - from teleopit.runtime.external_assets import ASSET_GROUPS - entries = ASSET_GROUPS["robots"] assert len(entries) == 1 assert entries[0].remote_path == "archives/robot_assets.tar.gz" assert entries[0].local_path == "assets/robots" assert entries[0].mode == "extract" + + +def test_data_asset_group_downloads_only_hdf5_shards() -> None: + entries = ASSET_GROUPS["data"] + + assert len(entries) == 1 + assert entries[0].remote_path == "data/datasets" + assert entries[0].local_path == "data/datasets" + assert _entry_allow_patterns(entries[0]) == ["data/datasets/*/*.h5"] + + +def test_clear_cached_entry_sources_removes_stale_data_files(tmp_path: Path) -> None: + entry = ASSET_GROUPS["data"][0] + dataset_cache = tmp_path / "data" / "datasets" + dataset_cache.mkdir(parents=True) + (dataset_cache / "old_clip.npz").write_bytes(b"old") + (dataset_cache / "shard_000.h5").write_bytes(b"new") + + _clear_cached_entry_sources(tmp_path, [entry]) + + assert not dataset_cache.exists() diff --git a/tests/test_train_script.py b/tests/test_train_script.py index 53681c6f..f9fecdf5 100644 --- a/tests/test_train_script.py +++ b/tests/test_train_script.py @@ -35,7 +35,7 @@ def _args(**overrides: object) -> argparse.Namespace: "seed": 42, "logger": "tensorboard", "experiment_name": None, - "motion_file": "data/datasets/seed_precomputed", + "motion_file": "data/datasets_precomputed", "robot_xml": None, "resume": None, "sampling_mode": None, diff --git a/train_mimic/scripts/benchmark.py b/train_mimic/scripts/benchmark.py index 50c59f1a..7fdcb632 100644 --- a/train_mimic/scripts/benchmark.py +++ b/train_mimic/scripts/benchmark.py @@ -10,7 +10,7 @@ # Benchmark only (no video) python train_mimic/scripts/benchmark.py \ --checkpoint logs/rsl_rl/g1_tracking/.../model_30000.pt \ - --motion_file data/datasets/seed_precomputed \ + --motion_file data/datasets_precomputed \ --num_envs 1 # Single video (one continuous clip) diff --git a/train_mimic/scripts/play.py b/train_mimic/scripts/play.py index 8ad5a9d7..193add7a 100644 --- a/train_mimic/scripts/play.py +++ b/train_mimic/scripts/play.py @@ -9,18 +9,18 @@ # Native window python train_mimic/scripts/play.py \ --checkpoint logs/rsl_rl/g1_tracking/2026-.../model_30000.pt \ - --motion_file data/datasets/seed_precomputed + --motion_file data/datasets_precomputed # Browser viewer (no display required) python train_mimic/scripts/play.py \ --checkpoint logs/rsl_rl/g1_tracking/2026-.../model_30000.pt \ - --motion_file data/datasets/seed_precomputed \ + --motion_file data/datasets_precomputed \ --viewer viser # Record video instead of interactive viewer python train_mimic/scripts/play.py \ --checkpoint logs/rsl_rl/g1_tracking/2026-.../model_30000.pt \ - --motion_file data/datasets/seed_precomputed \ + --motion_file data/datasets_precomputed \ --video """ diff --git a/train_mimic/scripts/train.py b/train_mimic/scripts/train.py index 4f8bf7b3..361b261b 100644 --- a/train_mimic/scripts/train.py +++ b/train_mimic/scripts/train.py @@ -4,30 +4,30 @@ Usage: python train_mimic/scripts/train.py \ --num_envs 4096 --max_iterations 18000 \ - --motion_file data/datasets/seed_precomputed + --motion_file data/datasets_precomputed # Quick verification python train_mimic/scripts/train.py \ --num_envs 64 --max_iterations 100 \ - --motion_file data/datasets/seed_precomputed + --motion_file data/datasets_precomputed # With W&B logging python train_mimic/scripts/train.py \ --num_envs 4096 --max_iterations 30000 \ - --motion_file data/datasets/seed_precomputed \ + --motion_file data/datasets_precomputed \ --logger wandb # With SwanLab logging python train_mimic/scripts/train.py \ --num_envs 4096 --max_iterations 30000 \ - --motion_file data/datasets/seed_precomputed \ + --motion_file data/datasets_precomputed \ --logger swanlab # Resume for additional iterations python train_mimic/scripts/train.py \ --resume logs/rsl_rl/g1_general_tracking//model_12000.pt \ --max_iterations 18000 \ - --motion_file data/datasets/seed_precomputed + --motion_file data/datasets_precomputed """ from __future__ import annotations diff --git a/train_mimic/tasks/tracking/config/constants.py b/train_mimic/tasks/tracking/config/constants.py index 17560fa5..bdc71121 100644 --- a/train_mimic/tasks/tracking/config/constants.py +++ b/train_mimic/tasks/tracking/config/constants.py @@ -1,6 +1,6 @@ """Public constants for supported tracking tasks.""" -DEFAULT_TRAIN_MOTION_FILE = "data/datasets/seed_precomputed" +DEFAULT_TRAIN_MOTION_FILE = "data/datasets_precomputed" GENERAL_TRACKING_TASK = "General-Tracking-G1" GENERAL_TRACKING_EXPERIMENT_NAME = "g1_general_tracking"