syswonder · 1mujue · May 25, 2026 · May 25, 2026 · May 25, 2026 · Jun 1, 2026
diff --git a/system/scene/docker/Dockerfile b/system/scene/docker/Dockerfile
@@ -1,3 +1,4 @@
+# syntax=docker/dockerfile:1.7
 # SPDX-License-Identifier: MulanPSL-2.0
 # system/scene container — ROS Humble base + scene's Python deps.
 #
@@ -17,17 +18,45 @@
 #
 # The image is built on demand by `bash system/scene/scripts/build.sh`
 # the first time, then reused. Rebuild only when this Dockerfile or
-# requirements.txt change.
+# requirements/ change.
 
-# `ros:humble-ros-base` is the slim official image (no GUI / desktop
-# tooling). osrf/ros:* would also work but the user's registry mirror
-# can't always reach the osrf namespace.
 FROM ros:humble-ros-base
 
+# ── Build-time proxy helpers ──────────────────────────────────────
+# Usage:
+#   - apt / pip: use `without-proxy ...`
+#   - git / HuggingFace / OpenAI CDN: use `with-build-proxy ...`
+#
+# Note:
+#   with-build-proxy reads HTTP_PROXY_HOST / HTTPS_PROXY_HOST / NO_PROXY_HOST
+#   at run time (empty if unset), so it only proxies after their ARG lines;
+#   those are declared late so proxy changes keep earlier apt/pip layers cached.
+RUN cat > /usr/local/bin/with-build-proxy <<'EOF' \
+ && chmod +x /usr/local/bin/with-build-proxy \
+ && cat > /usr/local/bin/without-proxy <<'EOF2' \
+ && chmod +x /usr/local/bin/without-proxy
+#!/usr/bin/env sh
+set -eu
+
+export HTTP_PROXY="${HTTP_PROXY_HOST:-}"
+export HTTPS_PROXY="${HTTPS_PROXY_HOST:-}"
+export http_proxy="${HTTP_PROXY_HOST:-}"
+export https_proxy="${HTTPS_PROXY_HOST:-}"
+export NO_PROXY="${NO_PROXY_HOST:-}"
+export no_proxy="${NO_PROXY_HOST:-}"
+
+exec "$@"
+EOF
+#!/usr/bin/env sh
+set -eu
+
+unset HTTP_PROXY HTTPS_PROXY http_proxy https_proxy
+unset ALL_PROXY all_proxy NO_PROXY no_proxy
+
+exec "$@"
+EOF2
+
 # ── TUNA mirrors for GFW-bound networks ─────────────────────────────
-# Swap apt sources (Ubuntu/ports + ROS 2) to TUNA. Use `find` + sed
-# --follow-symlinks because /etc/apt/sources.list.d/ros2.sources is
-# a symlink into /usr/share/ros-apt-source/ in the upstream ros image.
 RUN set -eux; \
     find /etc/apt/sources.list.d/ -maxdepth 1 \( -name '*.list' -o -name '*.sources' \) -print0 \
       | xargs -0 -r sed -i --follow-symlinks \
@@ -40,119 +69,148 @@ RUN set -eux; \
             -e 's|http://ports\.ubuntu\.com/ubuntu-ports/\?|https://mirrors.tuna.tsinghua.edu.cn/ubuntu-ports/|g' \
             /etc/apt/sources.list; \
     fi
+
 ENV PIP_INDEX_URL=https://pypi.tuna.tsinghua.edu.cn/simple \
     UV_INDEX_URL=https://pypi.tuna.tsinghua.edu.cn/simple \
     HF_ENDPOINT=https://hf-mirror.com
 
 ENV DEBIAN_FRONTEND=noninteractive \
     PYTHONUNBUFFERED=1 \
-    PIP_NO_CACHE_DIR=1 \
     PIP_DISABLE_PIP_VERSION_CHECK=1
 
-# System packages we need at runtime:
-#   python3-pip          — install scene deps
-#   python3-cv-bridge    — sensor_msgs/Image ↔ numpy (apt has the
-#                          rclpy-aware cv_bridge that pip doesn't)
-#   python3-numpy        — pinned to 1.x by ROS Humble; matches host
-#   ros-humble-tf2-ros   — tf transforms
-#   ros-humble-sensor-msgs / nav-msgs already in -ros-base
-RUN apt-get update \
- && apt-get install -y --no-install-recommends \
-        python3-pip \
-        python3-cv-bridge \
-        python3-numpy \
-        ros-humble-tf2-ros \
-        ros-humble-tf-transformations \
-        ros-humble-rmw-zenoh-cpp \
-        ros-humble-zenoh-bridge-dds \
- && rm -rf /var/lib/apt/lists/*
+# System packages we need at runtime. apt: do NOT use proxy; we already switched to TUNA mirrors.
+# docker-clean is removed first: it purges /var/cache/apt after every install, which would leave the cache mount empty.
+RUN --mount=type=cache,target=/var/cache/apt,sharing=locked \
+    without-proxy sh -c ' \
+        rm -f /etc/apt/apt.conf.d/docker-clean && apt-get update && \
+        apt-get install -y --no-install-recommends \
+            python3-pip \
+            python3-cv-bridge \
+            python3-numpy \
+            ros-humble-tf2-ros \
+            ros-humble-tf-transformations \
+            ros-humble-rmw-zenoh-cpp \
+            ros-humble-zenoh-bridge-dds && \
+        rm -rf /var/lib/apt/lists/* \
+    '
+
+# pip: do NOT use proxy. This step still runs the apt-installed pip, which only accepts on/off for --progress-bar (`raw` needs pip >= 24.1).
+RUN --mount=type=cache,target=/root/.cache/pip,sharing=locked \
+    without-proxy python3 -m pip install \
+        --progress-bar off \
+        --timeout 60 \
+        --retries 3 \
+        --upgrade pip
 
 # torch + torchvision — sm_120 (Blackwell / RTX 5090) requires
 # torch 2.7+ with CUDA 12.8 kernels. cu124 wheels only cover up to
 # sm_90 and will fail with "no kernel image" on Blackwell.
-# Aliyun mirror carries cu128 wheels; fall back to upstream PyPI.
-RUN pip install --find-links https://mirrors.aliyun.com/pytorch-wheels/cu128/ \
+# Aliyun mirror carries cu128 wheels; fall back to upstream PyTorch index.
+# The fallback's with-build-proxy reads *_PROXY_HOST, so the ARGs must be in scope here (proxy changes rebuild from this layer on).
+ARG HTTP_PROXY_HOST
+ARG HTTPS_PROXY_HOST
+ARG NO_PROXY_HOST
+RUN --mount=type=cache,target=/root/.cache/pip,sharing=locked \
+    without-proxy python3 -m pip install \
+        --progress-bar raw --timeout 60 --retries 3 \
+        -i https://mirrors.aliyun.com/pypi/simple/ \
+        --find-links https://mirrors.aliyun.com/pytorch-wheels/cu128/ \
         torch==2.7.1 torchvision==0.22.1 \
- || pip install --index-url https://download.pytorch.org/whl/cu128 \
+    || with-build-proxy python3 -m pip install \
+        --progress-bar raw --timeout 60 --retries 3 \
+        --index-url https://download.pytorch.org/whl/cu128 \
         torch==2.7.1 torchvision==0.22.1
 
-# Optional: pass from host via `scripts/build.sh` as HTTP_PROXY_HOST /
-# HTTPS_PROXY_HOST / NO_PROXY_HOST (not HTTP_PROXY — avoids BuildKit treating
-# them as automatic global proxy args). Mapped to standard names only for the
-# network-heavy layers below (pip requirements, git, HF, etc.). Use
-# http://host.docker.internal:7890 — not 127.0.0.1 (that is the build container).
+# Generic scene deps + concept-graphs perception stack, one layer per requirements file.
+# pip: do NOT use proxy; packages come from PIP_INDEX_URL (TUNA).
+
+# base: generic scene deps (requirements/scene-base.txt).
+COPY requirements/scene-base.txt /tmp/requirements/scene-base.txt
+RUN --mount=type=cache,target=/root/.cache/pip,sharing=locked \
+    without-proxy pip install -r /tmp/requirements/scene-base.txt \
+        --progress-bar raw \
+        --timeout 60 \
+        --retries 3 \
+    && rm /tmp/requirements/scene-base.txt
+
+# core: concept-graphs perception stack (requirements/scene-perception-core.txt).
+COPY requirements/scene-perception-core.txt /tmp/requirements/scene-perception-core.txt
+RUN --mount=type=cache,target=/root/.cache/pip,sharing=locked \
+    without-proxy pip install -r /tmp/requirements/scene-perception-core.txt \
+        --progress-bar raw \
+        --timeout 60 \
+        --retries 3 \
+    && rm /tmp/requirements/scene-perception-core.txt
+
+# heavy: open3d + faiss-cpu (requirements/scene-perception-heavy.txt).
+COPY requirements/scene-perception-heavy.txt /tmp/requirements/scene-perception-heavy.txt
+RUN --mount=type=cache,target=/root/.cache/pip,sharing=locked \
+    without-proxy pip install -r /tmp/requirements/scene-perception-heavy.txt \
+        --progress-bar raw \
+        --timeout 60 \
+        --retries 3 \
+    && rm /tmp/requirements/scene-perception-heavy.txt
+
+# pytorch3d (optional, disabled): read requirements/scene-pytorch3d.txt before enabling this layer.
+# COPY requirements/scene-pytorch3d.txt /tmp/requirements/scene-pytorch3d.txt
+# RUN --mount=type=cache,target=/root/.cache/pip,sharing=locked \
+#     without-proxy pip install -r /tmp/requirements/scene-pytorch3d.txt \
+#         --progress-bar raw \
+#         --timeout 60 \
+#         --retries 3 \
+#     && rm /tmp/requirements/scene-pytorch3d.txt
+
+# ── Proxy-needed network layers start here ─────────────────────────
+# Declare build args as late as possible to avoid invalidating earlier
+# apt/pip layers when proxy values change.
 ARG HTTP_PROXY_HOST
 ARG HTTPS_PROXY_HOST
 ARG NO_PROXY_HOST
-ENV HTTP_PROXY=${HTTP_PROXY_HOST} \
-    HTTPS_PROXY=${HTTPS_PROXY_HOST} \
-    NO_PROXY=${NO_PROXY_HOST} \
-    http_proxy=${HTTP_PROXY_HOST} \
-    https_proxy=${HTTPS_PROXY_HOST} \
-    no_proxy=${NO_PROXY_HOST}
-
-# Generic scene deps + concept-graphs perception stack (no
-# pytorch3d / chamferdist / gradslam — those are needed only by
-# concept-graphs' full SLAM pipeline, not by detection + merge,
-# which is all we use).
-COPY requirements.txt /tmp/requirements.txt
-RUN pip install -r /tmp/requirements.txt \
- && rm /tmp/requirements.txt
-
-# concept-graphs source from the ali-dev branch (smaller, no
-# Grounded-SAM / pytorch3d dependency tangle). We need the
-# `conceptgraph.slam.utils` and `conceptgraph.slam.mapping` modules
-# (depth backproject + per-object pcd + DBSCAN-based merge). Install
-# editable so we can edit-and-test from inside the container without
-# rebuilding when iterating on their helpers.
-# git clone goes through ghfast.top mirror with fallback to direct.
-# --no-deps because concept-graphs's setup.py declares pytorch3d as
-# an install_requires which we explicitly skip on aarch64
-# (see requirements.txt).
-RUN ( git clone --depth 1 --branch ali-dev \
+
+# concept-graphs source (ali-dev branch): try the ghfast.top mirror first, then github.com directly.
+# git clone: both attempts use the proxy; /opt/concept-graphs is wiped before each attempt.
+RUN set -eux; \
+    rm -rf /opt/concept-graphs; \
+    with-build-proxy git clone --depth 1 --branch ali-dev \
         https://ghfast.top/https://github.com/concept-graphs/concept-graphs.git \
         /opt/concept-graphs \
-   || git clone --depth 1 --branch ali-dev \
-        https://github.com/concept-graphs/concept-graphs.git \
-        /opt/concept-graphs ) \
- && pip install --no-deps -e /opt/concept-graphs \
- || (echo "concept-graphs install failed — see comment in Dockerfile" \
-     && exit 1)
-
-# Bake CV model weights into the image — robonix invariant: a robot
-# booting on a customer network can't reach github releases /
-# huggingface CDN. ali-dev's recommended stack:
-#   yolov8l-world.pt — 91 MB, open-vocab YOLO via CLIP text encoder
-#   mobile_sam.pt    — 40 MB, MobileSAM for promptable masks
-# Plus open_clip ViT-B-32 LAION-2B for per-object CLIP features (~150 MB,
-# small enough for Jetson; 5090 build can swap to ViT-L/14 via env).
-#
-# YOLO + MobileSAM are pre-downloaded onto the host into docker/_weights/
-# (out-of-band, see scripts/build.sh's `pre_fetch_weights` block) and
-# COPY'd into the image. We tried `RUN curl` from inside buildx but
-# github CDN connections from CN drop mid-stream after ~10 minutes and
-# break a multi-hundred-MB build. Out-of-band download with retries is
-# more robust and lets buildx cache the COPY layer.
-#
-# open_clip ViT-B-32 LAION-2B comes from huggingface via HF_ENDPOINT
-# (hf-mirror.com); much smaller and reliable.
+    || ( \
+        rm -rf /opt/concept-graphs && \
+        with-build-proxy git clone --depth 1 --branch ali-dev \
+            https://github.com/concept-graphs/concept-graphs.git \
+            /opt/concept-graphs \
+    )
+
+# concept-graphs editable install; --no-deps because its setup.py declares pytorch3d, which this image skips.
+# pip: do NOT use proxy.
+RUN --mount=type=cache,target=/root/.cache/pip,sharing=locked \
+    without-proxy pip install --no-deps -e /opt/concept-graphs
+
+# Bake CV model weights into the image.
 ENV HF_HOME=/opt/models/hf
+
 RUN mkdir -p /opt/models /opt/models/hf /root/.cache/clip
+
 COPY _weights/yolov8l-world.pt /opt/models/yolov8l-world.pt
 COPY _weights/mobile_sam.pt    /opt/models/mobile_sam.pt
+
 # Two CLIP-flavored downloads bake in here:
-#   1. open_clip's ViT-B-32 LAION-2B (we use this for per-detection
-#      visual-similarity dedup in concept-graphs) — goes to HF_HOME.
-#   2. openai's `clip` ViT-B/32 (separate package; ultralytics'
-#      YOLO-World text encoder calls clip.load("ViT-B/32") during
-#      set_classes() and would otherwise pull 354 MB from openai's
-#      CDN at first inference — robonix invariant violation).
-RUN python3 -c "import open_clip; open_clip.create_model_and_transforms('ViT-B-32', pretrained='laion2b_s34b_b79k')" \
- && python3 -c "import clip; clip.load('ViT-B/32', device='cpu', download_root='/root/.cache/clip')" \
- && ls -la /opt/models/ /root/.cache/clip/
-
-# Drop build-time HTTP proxy after all network-heavy build steps (pip/github/git/HF/openai CDN).
-ENV http_proxy= https_proxy= no_proxy= HTTP_PROXY= HTTPS_PROXY= NO_PROXY=
+#   1. open_clip's ViT-B-32 LAION-2B — goes to HF_HOME.
+#   2. openai's `clip` ViT-B/32 — goes to /root/.cache/clip.
+# Non-pip model downloads: use proxy. HF_ENDPOINT (hf-mirror.com) is dropped only when
+# a proxy is actually configured, so builds without proxy args still use the mirror.
+RUN with-build-proxy sh <<'EOF'
+set -eux
+if [ -n "${HTTPS_PROXY:-}${HTTP_PROXY:-}" ]; then unset HF_ENDPOINT; fi
+export HF_HUB_DOWNLOAD_TIMEOUT=120
+
+python3 -c "import open_clip; open_clip.create_model_and_transforms('ViT-B-32', pretrained='laion2b_s34b_b79k')"
+python3 -c "import clip; clip.load('ViT-B/32', device='cpu', download_root='/root/.cache/clip')"
+
+ls -la /opt/models/ /root/.cache/clip/
+EOF
+
+# No global proxy ENV was set, so there is no build-time proxy to clear here.
 
 # Fixed env so perception_concept_graphs.py knows where the weights
 # live without each deployment having to override.
@@ -162,22 +220,16 @@ ENV SCENE_YOLO_WORLD_WEIGHTS=/opt/models/yolov8l-world.pt \
     SCENE_CLIP_PRETRAINED=laion2b_s34b_b79k
 
 WORKDIR /scene
+
 COPY entrypoint.sh /entrypoint.sh
 COPY no_shm_profile.xml /etc/fastrtps_no_shm.xml
+
 RUN chmod +x /entrypoint.sh
 
-# Force FastRTPS to skip SHM transport — see mapping_rbnx/docker for
-# full reasoning. UDP-only is the only path that data-flows reliably
-# across our containers (sim ↔ scene ↔ mapping) in this layout.
+# Force FastRTPS to skip SHM transport.
 ENV FASTRTPS_DEFAULT_PROFILES_FILE=/etc/fastrtps_no_shm.xml
 
-# Stick with FastRTPS (sim's default RMW). With --network host +
-# --ipc=host the SHM transport shares cleanly between sim and scene,
-# and FastRTPS UDP multicast discovery covers the rest. We tried
-# Zenoh first but `zenoh_bridge_dds` cannot see FastRTPS-only SHM
-# publishers reliably across containers, so this is the path that
-# actually delivers /amcl_pose, /odom, /scanner, /head_front_camera/*
-# into scene's rclpy hub.
+# Stick with FastRTPS.
 ENV RMW_IMPLEMENTATION=rmw_fastrtps_cpp
 
-ENTRYPOINT ["/entrypoint.sh"]
+ENTRYPOINT ["/entrypoint.sh"]
diff --git a/system/scene/docker/requirements.txt b/system/scene/docker/requirements.txt
@@ -1,3 +1,5 @@
+# NOTE: This file is only a backup copy; the Dockerfile installs from requirements/*.txt.
+
 # Scene-container Python deps. Subset of pyproject.toml's
 # [project.dependencies] minus what the base image already provides
 # (numpy / cv-bridge / rclpy / tf2_ros).

diff --git a/system/scene/docker/requirements/scene-base.txt b/system/scene/docker/requirements/scene-base.txt
@@ -0,0 +1,15 @@
+# ── Generic scene ────────────────────────────────────────────────
+scipy>=1.11
+mcp
+fastmcp
+grpcio
+grpcio-tools
+protobuf
+uvicorn
+httpx
+Pillow
+PyYAML
+
+setuptools==69.5.1
+wheel
+packaging
diff --git a/system/scene/docker/requirements/scene-perception-core.txt b/system/scene/docker/requirements/scene-perception-core.txt
@@ -0,0 +1,30 @@
+# ── concept-graphs perception stack ──────────────────────────────
+# torch itself is installed by the Dockerfile (2.7.1 / cu128, needed for sm_120 Blackwell), not from this file.
+# ROS Humble's python3-numpy is pinned to 1.x so we let pip pick a
+# compatible scipy build automatically.
+opencv-python-headless>=4.8
+ultralytics>=8.3            # YOLO-World v2 + the SAM wrapper that
+                            # accepts `.predict(image, bboxes=...)`
+# ultralytics' YOLOWorld imports openai's `clip` (NOT the same as
+# open_clip below) for its CLIP text encoder. Without it the model
+# load raises "No module named 'clip'". `clip-anytorch` is the fork
+# that supports modern torch (the original openai `clip-by-openai`
+# uses a TorchScript Node API that's broken on torch 2.5+).
+clip-anytorch>=2.6
+open_clip_torch>=2.20       # ViT-B-32 LAION-2B; small enough for
+                            # Jetson, x86-5090 happily uses ViT-L/14
+                            # by swapping the env var.
+supervision==0.14.0         # sv.Detections is the in-memory container
+                            # gobs_to_detection_list expects.
+omegaconf==2.3.0            # concept-graphs' cfg dict format.
+hydra-core==1.3.2           # for loading their cfslam yaml.
+transformers>=4.40          # tokenizers for CLIP text path
+# concept-graphs's general_utils → vlm imports openai unconditionally,
+# even though we don't call any VLM helpers from our merge path. Add
+# it as a stub install so the slam module imports cleanly.
+openai>=1.0
+# concept-graphs also imports wandb (optional) and rich; wandb is a
+# soft import via OptionalWandB, but rich is mandatory.
+rich
+matplotlib                 # used by DetectionList.color_by_instance
+seaborn                    # used by some logging utilities
diff --git a/system/scene/docker/requirements/scene-perception-heavy.txt b/system/scene/docker/requirements/scene-perception-heavy.txt
@@ -0,0 +1,5 @@
+# Open3D is concept-graphs' point-cloud rep. Pcd ops are CPU-bound
+# but we don't accumulate tens of thousands per frame; one per object
+# per tick is fine.
+open3d>=0.17,<0.19
+faiss-cpu==1.7.4 
diff --git a/system/scene/docker/requirements/scene-pytorch3d.txt b/system/scene/docker/requirements/scene-pytorch3d.txt
@@ -0,0 +1,12 @@
+# pytorch3d was originally pulled in for concept-graphs's
+# compute_3d_iou_accurate_batch (used by compute_overlap_matrix_general
+# → merge_overlap_objects), but we replaced that path with a voxel-set
+# fraction-overlap impl (see perception_concept_graphs.py compute_*_
+# overlap_*); the ali-dev branch of concept-graphs does NOT import
+# pytorch3d in slam.{utils,mapping,slam_classes} or utils.model_utils.
+#
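+# x86_64-only: miropsota's wheel index has no aarch64 binaries and a source
+# build would need nvcc, so we skip on Jetson. Runtime path does not exercise pytorch3d.
+# NOTE(review): the pin below targets torch 2.5.1/cu124, but the Dockerfile installs torch 2.7.1/cu128 — confirm before enabling.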
+--extra-index-url https://miropsota.github.io/torch_packages_builder
+pytorch3d==0.7.8+pt2.5.1cu124; platform_machine == "x86_64"