From 2f58ef06555e6c46e2b1cf7adab4eb442cd9e44c Mon Sep 17 00:00:00 2001 From: Hanwen Cheng Date: Sun, 31 May 2026 18:03:12 +0800 Subject: [PATCH 1/4] fix(setup-cloud): self-heal the SSM precondition for step 15 (prod + test brokers) setup-cloud.sh step 15 (SSM SendCommand to bring up the MCP server) assumed the broker EC2 was already a registered SSM managed instance but never ensured it. Operators hit `SendCommand -> InvalidInstanceId` because the broker-host role was created WITHOUT AmazonSSMManagedInstanceCore, so the on-host amazon-ssm-agent can't register. (And separately, a caller lacking ssm:SendCommand got a misleading "does the instance have the agent?" message.) - ensure_ssm_managed(): runs before SendCommand. Resolves the role from the INSTANCE's attached profile (naming-agnostic, so the SAME code fixes BOTH the prod `agentkeys-broker-host` and the test broker's own profile), idempotently attaches AmazonSSMManagedInstanceCore if missing, then polls describe-instance-information until PingStatus=Online. If the agent never registers (role now correct => the agent itself isn't running), it dies with the exact restart remediation (ssh-broker.sh + setup-broker-host.sh --upgrade, or reboot). Idempotent: a re-run with the policy already attached skips. - SendCommand now captures stderr and distinguishes a CALLER ssm:SendCommand AccessDenied (identity-based policy gap) from a real instance problem, with a precise remediation (put-user-policy; see provision-ci-deploy-role.sh for the policy shape) instead of the misleading instance-agent message. - aws iam calls are global (no --region); ec2/ssm reads pass --region "$REGION" per the agentkeys-admin-defaults-to-us-west-2 trap (CLAUDE.md). Env-agnostic + idempotent: works for both broker envs and converges on re-run. 
--- scripts/setup-cloud.sh | 88 ++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 84 insertions(+), 4 deletions(-) diff --git a/scripts/setup-cloud.sh b/scripts/setup-cloud.sh index 37cb4546..04fd2777 100755 --- a/scripts/setup-cloud.sh +++ b/scripts/setup-cloud.sh @@ -697,10 +697,81 @@ do_step_14() { ok "mail bucket policy applied" } +# Step-15 precondition: the broker EC2 must be a REGISTERED SSM managed instance +# before SendCommand, or AWS returns InvalidInstanceId / "not in a valid state". +# The on-host amazon-ssm-agent only registers if the instance's role carries +# AmazonSSMManagedInstanceCore — and operators repeatedly hit this because the +# broker-host role (prod `agentkeys-broker-host`; the test broker uses its own +# profile) was created WITHOUT it. We self-heal idempotently, deriving the role +# from the INSTANCE's actual attached profile so the SAME code fixes BOTH the +# prod and test brokers regardless of role naming, then poll until the agent +# registers. `aws iam` is global (no --region); ec2/ssm reads pass --region +# "$REGION" per the agentkeys-admin-defaults-to-us-west-2 trap (CLAUDE.md). +ensure_ssm_managed() { + local ssm_core="arn:aws:iam::aws:policy/AmazonSSMManagedInstanceCore" + + # Resolve the role behind THIS instance's profile (naming-agnostic). + local prof_arn + prof_arn="$(aws --region "$REGION" ec2 describe-instances --instance-ids "$INSTANCE_ID" \ + --query 'Reservations[0].Instances[0].IamInstanceProfile.Arn' --output text 2>/dev/null || true)" + if [ -z "$prof_arn" ] || [ "$prof_arn" = "None" ]; then + die "SSM precondition: $INSTANCE_ID has NO instance profile — it can never register with SSM. Attach one carrying AmazonSSMManagedInstanceCore (docs/cloud-bootstrap.md §6), then re-run." 
+ fi + local prof_name role + prof_name="${prof_arn##*/}" + role="$(aws iam get-instance-profile --instance-profile-name "$prof_name" \ + --query 'InstanceProfile.Roles[0].RoleName' --output text 2>/dev/null || true)" + if [ -z "$role" ] || [ "$role" = "None" ]; then + die "SSM precondition: instance profile $prof_name has no role attached — re-create it (docs/cloud-bootstrap.md §6)." + fi + + # Idempotently ensure AmazonSSMManagedInstanceCore on that role. + local have + have="$(aws iam list-attached-role-policies --role-name "$role" \ + --query "length(AttachedPolicies[?PolicyArn=='$ssm_core'])" --output text 2>/dev/null || echo 0)" + if [ "$have" = "1" ]; then + ok "SSM: AmazonSSMManagedInstanceCore already on $role" + elif [ "$DRY_RUN" = "1" ]; then + warn "DRY: would attach AmazonSSMManagedInstanceCore to $role (instance profile $prof_name)" + else + if aws iam attach-role-policy --role-name "$role" --policy-arn "$ssm_core" >/dev/null 2>&1; then + ok "SSM: attached AmazonSSMManagedInstanceCore to $role (idempotent)" + else + die "SSM: could not attach AmazonSSMManagedInstanceCore to $role — your caller needs iam:AttachRolePolicy (\`awsp agentkeys-admin\`)." + fi + fi + if [ "$DRY_RUN" = "1" ]; then return 0; fi + + # Poll until the on-host agent registers (it refreshes IMDS creds, ~30s cadence; + # a freshly-attached policy is usually picked up within 1–2 min). 
+ local ping="" i + for i in $(seq 1 18); do + ping="$(aws --region "$REGION" ssm describe-instance-information \ + --filters "Key=InstanceIds,Values=$INSTANCE_ID" \ + --query 'InstanceInformationList[0].PingStatus' --output text 2>/dev/null || true)" + if [ "$ping" = "Online" ]; then + ok "SSM: $INSTANCE_ID registered (PingStatus=Online)" + return 0 + fi + if [ "$i" = "1" ]; then + printf " waiting up to ~3min for %s to register with SSM (PingStatus=%s)…\n" "$INSTANCE_ID" "${ping:-none}" >&2 + fi + sleep 10 + done + die "SSM: $INSTANCE_ID never reached PingStatus=Online (last: ${ping:-none}) after ~3min. The role now carries AmazonSSMManagedInstanceCore, so the on-host amazon-ssm-agent likely isn't running. (Re)start it via the single entry point, then re-run this step: + bash scripts/ssh-broker.sh $([ "$TEST_MODE" = "1" ] && echo test || echo prod) + sudo bash /opt/agentkeys-src/scripts/setup-broker-host.sh --upgrade # installs+enables the agent + or reboot (broker auto-restarts via systemd): aws --region $REGION ec2 reboot-instances --instance-ids $INSTANCE_ID" +} + do_step_15() { CUR_STEP=15; step "Bring up agentkeys-mcp-server on broker (via SSM)" : "${INSTANCE_ID:?INSTANCE_ID missing — broker EC2 needs to exist (re-run step 4 first)}" + # Make the broker EC2 SSM-ready BEFORE SendCommand (self-heals the missing + # AmazonSSMManagedInstanceCore that bit both prod + test brokers). Idempotent. + ensure_ssm_managed + REPO_URL_FOR_MCP="${AGENTKEYS_REPO_URL:-https://github.com/litentry/agentKeys.git}" REV_FOR_MCP="${AGENTKEYS_REV:-main}" MCP_HOST_FLAGS="" @@ -744,15 +815,24 @@ sudo -E AGENTKEYS_REPO_URL=${REPO_URL_FOR_MCP} AGENTKEYS_REV=${REV_FOR_MCP} \\ EOSH ) - local cmd_id - cmd_id=$(aws ssm send-command \ + local cmd_id _send_err + _send_err="$(mktemp)" + if ! 
cmd_id=$(aws ssm send-command \ --region "$REGION" \ --instance-ids "$INSTANCE_ID" \ --document-name "AWS-RunShellScript" \ --comment "agentkeys-mcp-server bring-up ($([ "$TEST_MODE" = "1" ] && echo test || echo prod))" \ --parameters "{\"commands\": $(jq -Rs . <<<"$mcp_bring_up_script" | jq -s .)}" \ - --query "Command.CommandId" --output text) \ - || die "aws ssm send-command failed — does $INSTANCE_ID have amazon-ssm-agent + the SSM instance profile?" + --query "Command.CommandId" --output text 2>"$_send_err"); then + local _err; _err="$(cat "$_send_err" 2>/dev/null || true)"; rm -f "$_send_err" + # ensure_ssm_managed already verified the instance is registered, so a + # SendCommand AccessDenied here is a CALLER policy gap, not an instance one. + if printf '%s' "$_err" | grep -qiE 'accessdenied|not authorized|ssm:sendcommand'; then + die "ssm:SendCommand DENIED for your CALLER (identity-based policy gap — $INSTANCE_ID is SSM-registered, verified just above, so this is NOT an instance problem). Your operator IAM user needs ssm:SendCommand on the instance + the AWS-RunShellScript document, plus ssm:GetCommandInvocation/ListCommandInvocations. Grant it with \`aws iam put-user-policy\` on your user (see scripts/provision-ci-deploy-role.sh for the exact policy shape it grants the CI deploy role), then re-run. Detail: $_err" + fi + die "aws ssm send-command failed — does $INSTANCE_ID have amazon-ssm-agent + the SSM instance profile? Detail: $_err" + fi + rm -f "$_send_err" ok "SSM command $cmd_id queued on $INSTANCE_ID; polling for completion (max 10 min)" # Poll every 10s for up to 10 min. 
setup-mcp-host.sh is normally <3 min; From a7014e709f35b6ce97775661123788f9c51771de Mon Sep 17 00:00:00 2001 From: Hanwen Cheng Date: Sun, 31 May 2026 18:10:32 +0800 Subject: [PATCH 2/4] fix(setup-cloud,docs): drop no-op --upgrade from setup-broker-host.sh refs + add CLAUDE.md rule setup-broker-host.sh treats --upgrade (and --skip-pull) as back-compat NO-OPS (it is idempotent + auto-detects bootstrap vs upgrade), so emitting it is misleading. Replace active-path references with --ref main (the canonical idempotent deploy invocation per CLAUDE.md) and codify the rule: - setup-cloud.sh: ensure_ssm_managed remediation suggests --ref main. - docs/ci-setup.md: prod-broker manual deploy uses --ref main. - CLAUDE.md (Remote broker host): explicit never-pass---upgrade rule. --- CLAUDE.md | 2 ++ docs/ci-setup.md | 2 +- scripts/setup-cloud.sh | 2 +- 3 files changed, 4 insertions(+), 2 deletions(-) diff --git a/CLAUDE.md b/CLAUDE.md index 2f76ec6d..3d99fae8 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -90,6 +90,8 @@ Also: never gloss over a partial implementation in a demo doc or runbook. If the ## Remote broker host (single entry point) All remote-host changes (binary upgrades, systemd edits, nginx/certbot, env tweaks, mock-server redeploys) MUST go through `bash scripts/setup-broker-host.sh` — it's idempotent and auto-detects bootstrap vs upgrade. No ad-hoc `systemctl` edits or hand-built `scp`. +**NEVER pass `--upgrade` (or `--skip-pull`) to `setup-broker-host.sh`.** They are back-compat **no-ops** — the script is idempotent and auto-detects bootstrap vs upgrade; there is no "upgrade mode" to opt into. Invoke it **plain** (`bash scripts/setup-broker-host.sh`, optionally with `--test`/`--yes`/`--clean`), or pass **`--ref main`** when you also want it to fetch + checkout + redeploy `main`. 
Do not add an `--upgrade` flag to any new script, runbook, doc, or CLI guidance; if you find an existing `--upgrade` reference in an active (non-archived) operator path, replace it with `--ref main` (deploy) or plain (ensure) in the same change. + ### SSH access to the remote broker host On the operator machine, **SSH into the prod broker with the zsh alias `ssh-agentkeys`** (= `bash $AGENTKEYS_REPO/scripts/ssh-broker.sh prod`, which uses EC2 Instance Connect under AWS profile `agentkeys-broker`). Use it for read-only diagnostics (worker logs, env, status) — it is the sanctioned remote-shell entry point; do not hand-roll `aws ec2-instance-connect ssh` or raw `ssh`. Pass a trailing command to run non-interactively: `ssh-agentkeys 'systemctl status agentkeys-worker-memory'`. The login user is `agentkey` (uid 1001); it is in the `sudo` group but sudo **requires a password and a TTY**, so `journalctl`/reading `/etc/agentkeys/*.env` (owned `agentkeys:agentkeys 0600`) need an interactive session — non-interactive `ssh-agentkeys '<cmd>'` can only run unprivileged commands. For privileged log reads, open an interactive `ssh-agentkeys` shell and run `sudo` there. (`ssh-broker.sh test` / `--fallback` reach the test stack / use the `.pem` when EC2-IC is down.) diff --git a/docs/ci-setup.md b/docs/ci-setup.md index 0e270bc4..ed8bd943 100644 --- a/docs/ci-setup.md +++ b/docs/ci-setup.md @@ -477,7 +477,7 @@ The IAM role can stay provisioned indefinitely — without the secret it can't b Per [issue #101](https://github.com/litentry/agentKeys/issues/101) "Out of scope": -- **Prod broker auto-deploy** — never. The prod broker EC2 stays manual via `bash scripts/setup-broker-host.sh --upgrade` from the operator laptop, per CLAUDE.md "Remote broker host (single entry point)". +- **Prod broker auto-deploy** — never. The prod broker EC2 stays manual via `bash scripts/setup-broker-host.sh --ref main` from the operator laptop, per CLAUDE.md "Remote broker host (single entry point)".
- **Auto-deploy of test Heima EVM contracts** — deferred to a follow-up PR (issue #101 rollout plan step 7). Contract redeploys mint new addresses and require the `SECRETS_REWRITE_PAT` token to update six `TEST_*_ADDRESS_HEIMA` secrets — more risk than the broker deploy, so it ships separately. - **Mainnet prod contract redeploy** — never automatic. Manual via `bash scripts/setup-heima.sh` only. diff --git a/scripts/setup-cloud.sh b/scripts/setup-cloud.sh index 04fd2777..7003a72b 100755 --- a/scripts/setup-cloud.sh +++ b/scripts/setup-cloud.sh @@ -760,7 +760,7 @@ ensure_ssm_managed() { done die "SSM: $INSTANCE_ID never reached PingStatus=Online (last: ${ping:-none}) after ~3min. The role now carries AmazonSSMManagedInstanceCore, so the on-host amazon-ssm-agent likely isn't running. (Re)start it via the single entry point, then re-run this step: bash scripts/ssh-broker.sh $([ "$TEST_MODE" = "1" ] && echo test || echo prod) - sudo bash /opt/agentkeys-src/scripts/setup-broker-host.sh --upgrade # installs+enables the agent + sudo bash /opt/agentkeys-src/scripts/setup-broker-host.sh --ref main # idempotent; installs+enables the agent or reboot (broker auto-restarts via systemd): aws --region $REGION ec2 reboot-instances --instance-ids $INSTANCE_ID" } From c3e7989504fe50d4a79b239aec06fe0a6e715060 Mon Sep 17 00:00:00 2001 From: Hanwen Cheng Date: Sun, 31 May 2026 18:29:39 +0800 Subject: [PATCH 3/4] fix(setup-cloud): default HOME in the step-15 SSM remote script (set -u unbound-var) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit SSM RunShellScript executes the step-15 mcp-bring-up script as root with a MINIMAL env (no HOME). Under set -euo pipefail the first $HOME use (export PATH=$HOME/.cargo/bin) aborted with 'HOME: unbound variable' — a latent bug that only surfaced once SSM delivery started working (previously it failed at send-command). Default HOME to /root before any $HOME use. 
Also document --only-step 15 in the script's re-run examples. --- scripts/setup-cloud.sh | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/scripts/setup-cloud.sh b/scripts/setup-cloud.sh index 7003a72b..7c98d194 100755 --- a/scripts/setup-cloud.sh +++ b/scripts/setup-cloud.sh @@ -793,6 +793,10 @@ do_step_15() { mcp_bring_up_script=$(cat <<EOSH set -euo pipefail +# SSM RunShellScript runs this script as root with a MINIMAL env (no HOME); +# under set -u the first \$HOME use aborts with "HOME: unbound variable". +# Default it before any \$HOME use (unset HOME -> /root). +export HOME="\${HOME:-/root}" export PATH="\$HOME/.cargo/bin:\$PATH" if ! command -v cargo >/dev/null 2>&1; then @@ -901,6 +905,7 @@ do_step_16() { printf " bash scripts/setup-cloud.sh --only-step 6 # re-UPSERT DNS\n" >&2 printf " bash scripts/setup-cloud.sh --only-step 12 # re-create SSH user (e.g. after EC2 replace)\n" >&2 printf " bash scripts/setup-cloud.sh --only-step 13 # re-run per-data-class provisioning\n" >&2 + printf " bash scripts/setup-cloud.sh --only-step 15 # re-run the MCP bring-up on the broker via SSM (add --test for the test broker)\n" >&2 printf " bash scripts/setup-cloud.sh --only-step 15 # re-deploy agentkeys-mcp-server on broker (cargo install --git)\n" >&2 printf " bash scripts/setup-cloud.sh --only-step 15 --test # same for test-mcp.\${ZONE}\n\n" >&2 } From 41bc0d584cbd9df226de055d1ffd67fe7ce76e8e Mon Sep 17 00:00:00 2001 From: Hanwen Cheng Date: Sun, 31 May 2026 18:35:14 +0800 Subject: [PATCH 4/4] docs(CLAUDE): generalize the never---upgrade rule to any idempotent setup script Broaden the rule from setup-broker-host.sh to all idempotent setup scripts (setup-cloud.sh, setup-heima.sh, heima-* helpers) and restore the actionable guidance (invoke plain / --only-step N, or --ref main for a broker redeploy; replace existing active-path --upgrade references). --- CLAUDE.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CLAUDE.md b/CLAUDE.md index 3d99fae8..408c7183 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -90,7 +90,7 @@ Also: never gloss over a partial implementation in a demo doc or runbook.
If the ## Remote broker host (single entry point) All remote-host changes (binary upgrades, systemd edits, nginx/certbot, env tweaks, mock-server redeploys) MUST go through `bash scripts/setup-broker-host.sh` — it's idempotent and auto-detects bootstrap vs upgrade. No ad-hoc `systemctl` edits or hand-built `scp`. -**NEVER pass `--upgrade` (or `--skip-pull`) to `setup-broker-host.sh`.** They are back-compat **no-ops** — the script is idempotent and auto-detects bootstrap vs upgrade; there is no "upgrade mode" to opt into. Invoke it **plain** (`bash scripts/setup-broker-host.sh`, optionally with `--test`/`--yes`/`--clean`), or pass **`--ref main`** when you also want it to fetch + checkout + redeploy `main`. Do not add an `--upgrade` flag to any new script, runbook, doc, or CLI guidance; if you find an existing `--upgrade` reference in an active (non-archived) operator path, replace it with `--ref main` (deploy) or plain (ensure) in the same change. +**NEVER pass `--upgrade` (or `--skip-pull`) to any idempotent setup script** (`setup-broker-host.sh`, `setup-cloud.sh`, the `heima-*` / `setup-heima.sh` helpers, etc.). They are back-compat **no-ops** — these scripts are idempotent and auto-detect bootstrap vs upgrade; there is no "upgrade mode" to opt into. Invoke them **plain** (optionally with `--test` / `--yes` / `--clean` / `--only-step N`), or pass **`--ref main`** to `setup-broker-host.sh` when you also want it to fetch + checkout + redeploy `main`. Do not add an `--upgrade` flag to any new script, runbook, doc, or CLI guidance; if you find an existing `--upgrade` reference in an active (non-archived) operator path, replace it with the idempotent invocation (`--ref main` for deploy, plain for ensure) in the same change. 
### SSH access to the remote broker host On the operator machine, **SSH into the prod broker with the zsh alias `ssh-agentkeys`** (= `bash $AGENTKEYS_REPO/scripts/ssh-broker.sh prod`, which uses EC2 Instance Connect under AWS profile `agentkeys-broker`). Use it for read-only diagnostics (worker logs, env, status) — it is the sanctioned remote-shell entry point; do not hand-roll `aws ec2-instance-connect ssh` or raw `ssh`. Pass a trailing command to run non-interactively: `ssh-agentkeys 'systemctl status agentkeys-worker-memory'`. The login user is `agentkey` (uid 1001); it is in the `sudo` group but sudo **requires a password and a TTY**, so `journalctl`/reading `/etc/agentkeys/*.env` (owned `agentkeys:agentkeys 0600`) need an interactive session — non-interactive `ssh-agentkeys '<cmd>'` can only run unprivileged commands. For privileged log reads, open an interactive `ssh-agentkeys` shell and run `sudo` there. (`ssh-broker.sh test` / `--fallback` reach the test stack / use the `.pem` when EC2-IC is down.)