From 77f75389e9d6a5f37e24edaa9062370db7ad3471 Mon Sep 17 00:00:00 2001 From: ActivePeter <1020401660@qq.com> Date: Mon, 29 Jun 2026 11:44:02 +0800 Subject: [PATCH 1/2] a --- .github/workflows/all_test.yml | 28 +- .github/workflows/remote_testbed_bench.yml | 93 + .gitignore | 4 +- deployment/manual_dispatch_release.py | 2 +- ...13\350\257\225\346\265\201\347\250\213.md" | 49 +- fluxon_test_stack/ci_2_virt_node.py | 2 +- fluxon_test_stack/ci_remote_testbed.py | 1583 +++++++++++++++++ fluxon_test_stack/ci_scene_catalog.py | 13 + fluxon_test_stack/gitops/gitops.yaml | 2 +- fluxon_test_stack/test_bed_transport.py | 417 +++++ fluxon_test_stack/test_profile_adapter.py | 347 +--- fluxon_test_stack/test_runner.py | 317 +--- .../tests/test_ci_remote_testbed_contract.py | 405 +++++ .../tests/test_pack_test_stack_rsc_cli.py | 20 +- .../tests/test_runner_contract.py | 111 ++ .../tests/test_test_bed_transport_contract.py | 155 ++ scripts/source_selection_profiles.py | 2 +- .../test_doc_site_builder_image_workflow.py | 21 + .../tests/test_git_source_selection_utils.py | 2 +- "\351\234\200\346\261\202.md" | 27 + 20 files changed, 2946 insertions(+), 654 deletions(-) create mode 100644 .github/workflows/remote_testbed_bench.yml create mode 100644 fluxon_test_stack/ci_remote_testbed.py create mode 100644 fluxon_test_stack/ci_scene_catalog.py create mode 100644 fluxon_test_stack/test_bed_transport.py create mode 100644 fluxon_test_stack/tests/test_ci_remote_testbed_contract.py create mode 100644 fluxon_test_stack/tests/test_test_bed_transport_contract.py create mode 100644 "\351\234\200\346\261\202.md" diff --git a/.github/workflows/all_test.yml b/.github/workflows/all_test.yml index 7c001c4..da1fe0f 100644 --- a/.github/workflows/all_test.yml +++ b/.github/workflows/all_test.yml @@ -121,18 +121,18 @@ jobs: - name: Normalize ci_2_virt_node debug artifact permissions if: ${{ always() }} run: | - chmod -R a+rX .dever || true + chmod -R a+rX ci_2_virt_node_workdir || true chmod -R a+rX fluxon_release || true chmod -R a+rX setup_and_pack/nix/runs || true - find .dever -type f -name '*.yaml' -exec chmod a+r {} + || true - find .dever -type f -name '*.log' -exec chmod a+r {} + || true - find .dever -type f -name '*.json' -exec chmod a+r {} + || true - find .dever -type f -name '*.html' -exec chmod a+r {} + || true - find .dever -type f -name '*.txt' -exec chmod a+r {} + || true - find .dever -type f -name '*.sha256' -exec chmod a+r {} + || true - find .dever -type d -path '*/pack_release_runtime/*' -exec chmod a+rx {} + || true - find .dever -type d -path '*/pack_release_runtime/project-data/*' -exec chmod a+rx {} + || true - find .dever -path '*/pack_release_runtime/project-data/*' \ + find ci_2_virt_node_workdir -type f -name '*.yaml' -exec chmod a+r {} + || true + find ci_2_virt_node_workdir -type f -name '*.log' -exec chmod a+r {} + || true + find ci_2_virt_node_workdir -type f -name '*.json' -exec chmod a+r {} + || true + find ci_2_virt_node_workdir -type f -name '*.html' -exec chmod a+r {} + || true + find ci_2_virt_node_workdir -type f -name '*.txt' -exec chmod a+r {} + || true + find ci_2_virt_node_workdir -type f -name '*.sha256' -exec chmod a+r {} + || true + find ci_2_virt_node_workdir -type d -path '*/pack_release_runtime/*' -exec chmod a+rx {} + || true + find ci_2_virt_node_workdir -type d -path '*/pack_release_runtime/project-data/*' -exec chmod a+rx {} + || true + find ci_2_virt_node_workdir -path '*/pack_release_runtime/project-data/*' \ \( -path '*/instances/*/logs' -o -path '*/instances/*/release' -o -path '*/assemblies/*/profile' \) \ -exec chmod -R a+rX {} + || true @@ -144,9 +144,9 @@ jobs: if-no-files-found: warn compression-level: 1 path: | - .dever/** + ci_2_virt_node_workdir/** fluxon_release/** setup_and_pack/nix/runs/** - .dever/**/pack_release_runtime/project-data/**/instances/**/logs/** - .dever/**/pack_release_runtime/project-data/**/instances/**/release/** - .dever/**/pack_release_runtime/project-data/**/assemblies/**/profile/** + ci_2_virt_node_workdir/**/pack_release_runtime/project-data/**/instances/**/logs/** + ci_2_virt_node_workdir/**/pack_release_runtime/project-data/**/instances/**/release/** + ci_2_virt_node_workdir/**/pack_release_runtime/project-data/**/assemblies/**/profile/** diff --git a/.github/workflows/remote_testbed_bench.yml b/.github/workflows/remote_testbed_bench.yml new file mode 100644 index 0000000..428d241 --- /dev/null +++ b/.github/workflows/remote_testbed_bench.yml @@ -0,0 +1,93 @@ +name: remote_testbed_bench + +on: + workflow_dispatch: + inputs: + bootstrap_mode: + description: "Remote testbed bootstrap mode" + required: true + default: "bare_then_apply" + type: choice + options: + - bare_then_apply + - apply_only + - bare_only + +permissions: + contents: read + +jobs: + remote-testbed-bench: + runs-on: ubuntu-latest + steps: + - name: Check out repository + uses: actions/checkout@v6 + with: + fetch-depth: 0 + + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version: "3.10" + + - name: Install host dependencies + run: | + sudo apt-get update + sudo apt-get install -y --no-install-recommends \ + git \ + pigz \ + protobuf-compiler \ + python3-venv \ + rsync \ + sshpass + + - name: Install Python dependencies + run: python3 -m pip install PyYAML + + - name: Sync rather_no_git_submodule workspace inputs + run: python3 fluxon_rs/scripts/rather_no_git_submodule.py + + - name: Write local remote testbed config + env: + FLUXON_REMOTE_TESTBED_LOCAL_CONFIG_YAML: ${{ secrets.FLUXON_REMOTE_TESTBED_LOCAL_CONFIG_YAML }} + run: | + rm -f ci_remote_testbed.local.yaml + python3 - <<'PY' + import os + from pathlib import Path + import yaml + + raw = os.environ.get("FLUXON_REMOTE_TESTBED_LOCAL_CONFIG_YAML", "") + if not raw.strip(): + raise SystemExit("missing secret FLUXON_REMOTE_TESTBED_LOCAL_CONFIG_YAML") + payload = yaml.safe_load(raw) + if not isinstance(payload, dict): + raise SystemExit("FLUXON_REMOTE_TESTBED_LOCAL_CONFIG_YAML must decode to a YAML mapping") + Path("ci_remote_testbed.local.yaml").write_text( + yaml.safe_dump(payload, sort_keys=False, allow_unicode=False), + encoding="utf-8", + ) + PY + + - name: Run remote shared-testbed benchmark flow + run: | + python3 fluxon_test_stack/ci_remote_testbed.py \ + --bootstrap-mode "${{ inputs.bootstrap_mode }}" \ + --print-generated + + - name: Normalize remote testbed debug artifact permissions + if: ${{ always() }} + run: | + chmod -R a+rX ci_remote_testbed_workdir || true + chmod -R a+rX fluxon_release || true + + - name: Upload remote testbed debug artifacts + if: ${{ always() }} + uses: actions/upload-artifact@v4 + with: + name: remote-testbed-bench-debug-${{ github.sha }} + if-no-files-found: warn + compression-level: 1 + path: | + ci_remote_testbed_workdir/** + fluxon_release/** diff --git a/.gitignore b/.gitignore index d198634..b6d4a32 100644 --- a/.gitignore +++ b/.gitignore @@ -52,7 +52,6 @@ node_modules fluxon_test_stack/bench_runner/ fluxon_test_stack/start_test_bed/ fluxon_test_stack/.manual_dispatch_release_tmp/ -.dever fluxon_release_* *.exit bench_suite.lock @@ -60,3 +59,6 @@ bench_suite.lock deployment/local/ setup_and_pack/pack_fluxonkv_pylib_env.yaml fluxon_rs/moka/ +/ci_remote_testbed.local.yaml +/ci_remote_testbed_workdir/ +/ci_2_virt_node_workdir/ diff --git a/deployment/manual_dispatch_release.py b/deployment/manual_dispatch_release.py index 9f24380..e9e9954 100644 --- a/deployment/manual_dispatch_release.py +++ b/deployment/manual_dispatch_release.py @@ -509,7 +509,7 @@ def _test_rsc_manifest_relpaths(*, src_release_dir: Path, dispatch_release_scope def _dispatch_tmp_root(*, deployconf_path: Path) -> Path: # English note: - # - Do not inherit TMPDIR from outer automation (it may point into a tool-managed .dever namespace). + # - Do not inherit TMPDIR from outer automation (it may point into a tool-managed workspace namespace). # - Keep temp artifacts next to the deployconf so the path is deterministic and discoverable. p = deployconf_path.resolve().parent / ".manual_dispatch_release_tmp" p.mkdir(parents=True, exist_ok=True) diff --git "a/fluxon_doc_cn/design/teststack_1_\345\275\223\345\211\215\346\236\266\346\236\204\344\270\216CI\346\265\213\350\257\225\346\265\201\347\250\213.md" "b/fluxon_doc_cn/design/teststack_1_\345\275\223\345\211\215\346\236\266\346\236\204\344\270\216CI\346\265\213\350\257\225\346\265\201\347\250\213.md" index 7134b00..e5f0b66 100644 --- "a/fluxon_doc_cn/design/teststack_1_\345\275\223\345\211\215\346\236\266\346\236\204\344\270\216CI\346\265\213\350\257\225\346\265\201\347\250\213.md" +++ "b/fluxon_doc_cn/design/teststack_1_\345\275\223\345\211\215\346\236\266\346\236\204\344\270\216CI\346\265\213\350\257\225\346\265\201\347\250\213.md" @@ -695,6 +695,53 @@ GitHub Actions 主窗口中的许多日志并非本地直接打印,而是由 ` 因此,GitHub Actions 现在覆盖的是“由单一 `ci_2_virt_node.py` 入口启动,并通过 top-attention CI scene 执行 workload”这条真实 CI 路径,而不是在 suite 里再并存一层旧 scene。 +### 9.2 GitHub Actions 远端集群扩展:`ci_remote_testbed.py` + +**稳定结论:** + +- `ci_remote_testbed.py` 不是另一套 runner,它是 `ci_2_virt_node.py` 的远端共享 testbed 扩展,最终仍然复用 `test_runner.py`。 +- 它把一次 GitHub Actions 触发固定拆成两个 phase:`ci` 和 `benchmark`。 +- `ci` phase 直接继承仓库里的 canonical CI scene catalog;`benchmark` phase 只保留远端集群 `supported_topologies > 1` 的多机拓扑。 +- 本地远端配置只走 `ci_remote_testbed.local.yaml`,且必须是 YAML mapping;敏感 SSH / bastion / controller exec 信息只进入 `remote_auth.yaml`,不进入 manifest。 + +| phase | 输入来源 | 选择规则 | 产物 | +| --- | --- | --- | --- | +| `ci` | `ci_test_list.yaml` | scene id 复用 canonical CI catalog,profile id 直接沿用 suite 声明 | `generated/ci.yaml` | +| `benchmark` | `benchmark_full_matrix.yaml` | 只保留远端集群 `supported_topologies > 1` 的 multi-machine topology,对应的 scene / scale 才进入执行计划 | `generated/benchmark.yaml` | + +固定执行链路如下: + +```text +GitHub Actions workflow_dispatch + -> write ci_remote_testbed.local.yaml from secret YAML + -> ci_remote_testbed.py + -> generate ci.yaml + benchmark.yaml + -> pack release once + -> dispatch once + -> start shared testbed once + -> test_runner.py once for ci + -> test_runner.py once for benchmark +``` + +- `phase_runs` 是这两个 runner 调用之间的稳定连接面,记录 `phase_name`、`suite_path`、`runner_workdir`、`scene_ids`、`profile_ids`、`allowed_scale_topologies`。 +- workflow 只负责触发和落地本地 YAML,不承载实际测试语义;测试语义仍由 `ci_remote_testbed.py` 和 `test_runner.py` 共同决定。 + +### 9.3 远端触发的实际链路 + +`ci_remote_testbed.py` 的远端执行不是“GitHub 每次都在远端直接跑一整套脚本再退出”,而是固定为: + +1. 在本地生成 `ci_remote_testbed.local.yaml` 和派生 bundle。 +2. 通过一次 SSH 触发 `controller_exec_host` 上的远端 launcher。 +3. 远端 launcher 在 `controller_exec_host` 上后台启动 `remote_runner.py`。 +4. GitHub Actions 继续通过同一个 `controller_exec_host` 轮询 `.remote_runner_exit_code` 和 `remote_runner.launch.log`。 +5. `remote_runner.py` 在远端按 phase 顺序调用 `test_runner.py`,先跑 `ci`,再跑 `benchmark`。 + +这里的关键边界是: + +- SSH 触发只发生一次; +- 后续状态收敛依赖轮询,而不是重复 SSH 启动; +- `test_runner.py` 始终运行在远端机器上,不在 GitHub runner 本地执行。 + ## 10. GitOps 与 UI 的归属 GitOps 挂在 test_runner UI 服务下。这里的约束是不额外拆出第二个独立控制面服务,不是要求 UI 随某一次测试 run 一起退出。 @@ -767,5 +814,5 @@ GitOps 挂在 test_runner UI 服务下。这里的约束是不额外拆出第二 - 先准备 / 启动 testbed; - 再由 `test_runner.py` 执行 suite。 - 对 `CI` 实现来说,远端 `ci_runner.sh` 负责执行命令,`test_runner.py` 持有 case 执行 authority。 -- `ci_2_virt_node.py` 只是把“本地双逻辑节点环境下的标准 CI 流程”封装出来,不改变 runner 的核心分层。 +- `ci_2_virt_node.py` 只是把“本地双逻辑节点环境下的标准 CI 流程”封装出来;`ci_remote_testbed.py` 则把同一套 runner 扩展到 GitHub Actions 触发的远端共享 testbed,不改变 runner 的核心分层。 - UI 和 GitOps 都属于 `test_runner` 服务面;其中 UI 应作为常驻服务运行,不构成额外的测试执行框架。 diff --git a/fluxon_test_stack/ci_2_virt_node.py b/fluxon_test_stack/ci_2_virt_node.py index 405c9a2..f62a69a 100644 --- a/fluxon_test_stack/ci_2_virt_node.py +++ b/fluxon_test_stack/ci_2_virt_node.py @@ -26,7 +26,7 @@ DEFAULT_RATHER_NO_GIT_SUBMODULE_SCRIPT = ( REPO_ROOT / "fluxon_rs" / "scripts" / "rather_no_git_submodule.py" ) -DEFAULT_CI_2_VIRT_NODE_WORKDIR = REPO_ROOT / ".dever" / "ci_2_virt_node" +DEFAULT_CI_2_VIRT_NODE_WORKDIR = REPO_ROOT / "ci_2_virt_node_workdir" DEFAULT_RELEASE_DIR = REPO_ROOT / "fluxon_release" PUBLIC_PROFILE_ID = "fluxon_tcp_thread" PUBLIC_ARTIFACT_SET_ID = "fluxon_tcp_thread" diff --git a/fluxon_test_stack/ci_remote_testbed.py b/fluxon_test_stack/ci_remote_testbed.py new file mode 100644 index 0000000..216d0f0 --- /dev/null +++ b/fluxon_test_stack/ci_remote_testbed.py @@ -0,0 +1,1583 @@ +#!/usr/bin/env python3 + +from __future__ import annotations + +import argparse +import copy +import json +import os +import shutil +import subprocess +import sys +import tempfile +import time +import uuid +from pathlib import Path +from typing import Any + +import yaml + + +REPO_ROOT = Path(__file__).resolve().parent.parent +RUNNER_STACK_DIR = REPO_ROOT / "fluxon_test_stack" +if str(RUNNER_STACK_DIR) not in sys.path: + sys.path.insert(0, str(RUNNER_STACK_DIR)) +DEPLOYMENT_DIR = REPO_ROOT / "deployment" +if str(DEPLOYMENT_DIR) not in sys.path: + sys.path.insert(0, str(DEPLOYMENT_DIR)) + +from ci_scene_catalog import canonical_ci_scene_ids +import manual_dispatch_release + + +DEFAULT_CI_SUITE_PATH = REPO_ROOT / "fluxon_test_stack" / "ci_test_list.yaml" +DEFAULT_BENCHMARK_SUITE_PATH = REPO_ROOT / "fluxon_test_stack" / "benchmark_full_matrix.yaml" +DEFAULT_SUITE_PATH = DEFAULT_BENCHMARK_SUITE_PATH +DEFAULT_DEPLOYCONF_TEMPLATE = REPO_ROOT / "fluxon_test_stack" / "deployconf_testbed.yml" +DEFAULT_START_TEST_BED_TEMPLATE = REPO_ROOT / "fluxon_test_stack" / "start_test_bed.yaml" +DEFAULT_WORKDIR = REPO_ROOT / "ci_remote_testbed_workdir" +DEFAULT_LOCAL_CONFIG_PATH = REPO_ROOT / "ci_remote_testbed.local.yaml" +DEFAULT_RELEASE_DIR = REPO_ROOT / "fluxon_release" +DEFAULT_REMOTE_WORKDIR_ROOT_NAME = "ci_remote_testbed_remote" +TEST_STACK_START_TEST_BED_CONFIG_ENV = "FLUXON_TEST_STACK_START_TEST_BED_CONFIG" +DEFAULT_REMOTE_CONTROLLER_REQUEST_MODE = "ssh_exec_per_request" +PLACEHOLDER_WHEEL_NAME = "fluxon-0.0.0-ci-placeholder-cp38-abi3-manylinux_2_28_x86_64.whl" +TESTBED_BUNDLE_DIRNAME = "testbed_bundle" +TESTBED_GENERATED_DIRNAME = "generated" +TESTBED_START_WORKDIR_DIRNAME = "start_test_bed" +TESTBED_RUNNER_WORKDIR_DIRNAME = "runner_run" +REMOTE_RUNNER_SCRIPT_FILENAME = "remote_runner.py" +REMOTE_RUNNER_EXIT_CODE_FILENAME = ".remote_runner_exit_code" +REMOTE_RUNNER_LAUNCH_LOG_FILENAME = "remote_runner.launch.log" + + +def _parse_args() -> argparse.Namespace: + parser = argparse.ArgumentParser( + description=( + "Canonical GitHub-triggered remote shared-testbed entrypoint. It packages release/test resources, " + "dispatches them to a bounded remote testbed cluster, and starts test_runner on the remote host." + ) + ) + parser.add_argument( + "--workdir", + type=Path, + default=DEFAULT_WORKDIR, + help="State root for generated configs, testbed bundle, and runner outputs.", + ) + parser.add_argument( + "--release-dir", + type=Path, + default=DEFAULT_RELEASE_DIR, + help="Release artifact root used for dispatch and runner reuse.", + ) + parser.add_argument( + "--skip-pack", + action="store_true", + help="Skip release/test_rsc packaging and assume artifacts already exist.", + ) + parser.add_argument( + "--skip-dispatch", + action="store_true", + help="Skip deployment/manual_dispatch_release.py.", + ) + parser.add_argument( + "--runner-workdir", + type=Path, + default=None, + help="Optional explicit test_runner workdir. Defaults to /runner_run.", + ) + parser.add_argument( + "--bootstrap-mode", + choices=("bare_then_apply", "apply_only", "bare_only"), + default="bare_then_apply", + help="Bootstrap mode recorded in the generated manifest.", + ) + parser.add_argument( + "--print-generated", + action="store_true", + help="Print generated config and bundle paths before executing commands.", + ) + return parser.parse_args() + + +def _resolve_repo_root_cli_path(raw_path: Path) -> Path: + if raw_path.is_absolute(): + return raw_path.resolve() + return (REPO_ROOT / raw_path).resolve() + + +def _load_yaml_mapping(path: Path, *, ctx: str) -> dict[str, Any]: + raw = yaml.safe_load(path.read_text(encoding="utf-8")) + if not isinstance(raw, dict): + raise ValueError(f"{ctx} must be a YAML mapping: {path}") + return raw + + +def _load_remote_testbed_local_config() -> dict[str, Any]: + config_path = DEFAULT_LOCAL_CONFIG_PATH + if not config_path.exists(): + raise ValueError(f"remote testbed local config not found: {config_path}") + if not config_path.is_file(): + raise ValueError(f"remote testbed local config must be a YAML file: {config_path}") + return _load_yaml_mapping(config_path, ctx="remote testbed local config") + + +# Expected local YAML shape: +# testbed_cluster_id: