From 2f58ef06555e6c46e2b1cf7adab4eb442cd9e44c Mon Sep 17 00:00:00 2001
From: Hanwen Cheng <heawen.cheng@gmail.com>
Date: Sun, 31 May 2026 18:03:12 +0800
Subject: [PATCH 1/4] fix(setup-cloud): self-heal the SSM precondition for step
 15 (prod + test brokers)

setup-cloud.sh step 15 (SSM SendCommand to bring up the MCP server) assumed the
broker EC2 was already a registered SSM managed instance but never ensured it.
Operators hit `SendCommand -> InvalidInstanceId` because the broker-host role was
created WITHOUT AmazonSSMManagedInstanceCore, so the on-host amazon-ssm-agent
can't register. (And separately, a caller lacking ssm:SendCommand got a
misleading "does the instance have the agent?" message.)

- ensure_ssm_managed(): runs before SendCommand. Resolves the role from the
  INSTANCE's attached profile (naming-agnostic, so the SAME code fixes BOTH the
  prod `agentkeys-broker-host` and the test broker's own profile), idempotently
  attaches AmazonSSMManagedInstanceCore if missing, then polls
  describe-instance-information until PingStatus=Online. If the agent never
  registers (role now correct => the agent itself isn't running), it dies with
  the exact restart remediation (ssh-broker.sh + setup-broker-host.sh --upgrade,
  or reboot). Idempotent: a re-run with the policy already attached skips.
- SendCommand now captures stderr and distinguishes a CALLER ssm:SendCommand
  AccessDenied (identity-based policy gap) from a real instance problem, with a
  precise remediation (put-user-policy; see provision-ci-deploy-role.sh for the
  policy shape) instead of the misleading instance-agent message.
- aws iam calls are global (no --region); ec2/ssm reads pass --region "$REGION"
  per the agentkeys-admin-defaults-to-us-west-2 trap (CLAUDE.md).

Env-agnostic + idempotent: works for both broker envs and converges on re-run.
---
 scripts/setup-cloud.sh | 88 ++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 84 insertions(+), 4 deletions(-)

diff --git a/scripts/setup-cloud.sh b/scripts/setup-cloud.sh
index 37cb4546..04fd2777 100755
--- a/scripts/setup-cloud.sh
+++ b/scripts/setup-cloud.sh
@@ -697,10 +697,81 @@ do_step_14() {
   ok "mail bucket policy applied"
 }
 
+# Step-15 precondition: the broker EC2 must be a REGISTERED SSM managed instance
+# before SendCommand, or AWS returns InvalidInstanceId / "not in a valid state".
+# The on-host amazon-ssm-agent only registers if the instance's role carries
+# AmazonSSMManagedInstanceCore — and operators repeatedly hit this because the
+# broker-host role (prod `agentkeys-broker-host`; the test broker uses its own
+# profile) was created WITHOUT it. We self-heal idempotently, deriving the role
+# from the INSTANCE's actual attached profile so the SAME code fixes BOTH the
+# prod and test brokers regardless of role naming, then poll until the agent
+# registers. `aws iam` is global (no --region); ec2/ssm reads pass --region
+# "$REGION" per the agentkeys-admin-defaults-to-us-west-2 trap (CLAUDE.md).
+ensure_ssm_managed() {
+  local ssm_core="arn:aws:iam::aws:policy/AmazonSSMManagedInstanceCore"
+  local prof_arn prof_name role
+  # Resolve the role behind THIS instance's profile (naming-agnostic); a failed API call dies as such.
+  prof_arn="$(aws --region "$REGION" ec2 describe-instances --instance-ids "$INSTANCE_ID" \
+    --query 'Reservations[0].Instances[0].IamInstanceProfile.Arn' --output text)" \
+    || die "SSM precondition: ec2 describe-instances failed for $INSTANCE_ID in $REGION (AWS error above) — check the instance id/region and that your caller has ec2:DescribeInstances, then re-run."
+  if [ -z "$prof_arn" ] || [ "$prof_arn" = "None" ]; then
+    die "SSM precondition: $INSTANCE_ID has NO instance profile — it can never register with SSM. Attach one carrying AmazonSSMManagedInstanceCore (docs/cloud-bootstrap.md §6), then re-run."
+  fi
+  prof_name="${prof_arn##*/}"
+  role="$(aws iam get-instance-profile --instance-profile-name "$prof_name" \
+    --query 'InstanceProfile.Roles[0].RoleName' --output text)" \
+    || die "SSM precondition: iam get-instance-profile failed for $prof_name (AWS error above) — your caller needs iam:GetInstanceProfile (\`awsp agentkeys-admin\`), then re-run."
+  if [ -z "$role" ] || [ "$role" = "None" ]; then
+    die "SSM precondition: instance profile $prof_name has no role attached — re-create it (docs/cloud-bootstrap.md §6)."
+  fi
+
+  # Idempotently ensure AmazonSSMManagedInstanceCore on that role.
+  local have
+  have="$(aws iam list-attached-role-policies --role-name "$role" \
+    --query "length(AttachedPolicies[?PolicyArn=='$ssm_core'])" --output text 2>/dev/null || echo 0)"
+  if [ "$have" = "1" ]; then
+    ok "SSM: AmazonSSMManagedInstanceCore already on $role"
+  elif [ "$DRY_RUN" = "1" ]; then
+    warn "DRY: would attach AmazonSSMManagedInstanceCore to $role (instance profile $prof_name)"
+  else
+    if aws iam attach-role-policy --role-name "$role" --policy-arn "$ssm_core" >/dev/null 2>&1; then
+      ok "SSM: attached AmazonSSMManagedInstanceCore to $role (idempotent)"
+    else
+      die "SSM: could not attach AmazonSSMManagedInstanceCore to $role — your caller needs iam:AttachRolePolicy (\`awsp agentkeys-admin\`)."
+    fi
+  fi
+  if [ "$DRY_RUN" = "1" ]; then return 0; fi
+
+  # Poll until the on-host agent registers (it refreshes IMDS creds, ~30s cadence;
+  # a freshly-attached policy is usually picked up within 1–2 min).
+  local ping="" i
+  for i in $(seq 1 18); do
+    ping="$(aws --region "$REGION" ssm describe-instance-information \
+      --filters "Key=InstanceIds,Values=$INSTANCE_ID" \
+      --query 'InstanceInformationList[0].PingStatus' --output text 2>/dev/null || true)"
+    if [ "$ping" = "Online" ]; then
+      ok "SSM: $INSTANCE_ID registered (PingStatus=Online)"
+      return 0
+    fi
+    if [ "$i" = "1" ]; then
+      printf "    waiting up to ~3min for %s to register with SSM (PingStatus=%s)…\n" "$INSTANCE_ID" "${ping:-none}" >&2
+    fi
+    sleep 10
+  done
+  die "SSM: $INSTANCE_ID never reached PingStatus=Online (last: ${ping:-none}) after ~3min. The role now carries AmazonSSMManagedInstanceCore, so the on-host amazon-ssm-agent likely isn't running. (Re)start it via the single entry point, then re-run this step:
+     bash scripts/ssh-broker.sh $([ "$TEST_MODE" = "1" ] && echo test || echo prod)
+     sudo bash /opt/agentkeys-src/scripts/setup-broker-host.sh --upgrade   # installs+enables the agent
+   or reboot (broker auto-restarts via systemd): aws --region $REGION ec2 reboot-instances --instance-ids $INSTANCE_ID"
+}
+
 do_step_15() {
   CUR_STEP=15; step "Bring up agentkeys-mcp-server on broker (via SSM)"
   : "${INSTANCE_ID:?INSTANCE_ID missing — broker EC2 needs to exist (re-run step 4 first)}"
 
+  # Make the broker EC2 SSM-ready BEFORE SendCommand (self-heals the missing
+  # AmazonSSMManagedInstanceCore that bit both prod + test brokers). Idempotent.
+  ensure_ssm_managed
+
   REPO_URL_FOR_MCP="${AGENTKEYS_REPO_URL:-https://github.com/litentry/agentKeys.git}"
   REV_FOR_MCP="${AGENTKEYS_REV:-main}"
   MCP_HOST_FLAGS=""
@@ -744,15 +815,24 @@ sudo -E AGENTKEYS_REPO_URL=${REPO_URL_FOR_MCP} AGENTKEYS_REV=${REV_FOR_MCP} \\
 EOSH
 )
 
-  local cmd_id
-  cmd_id=$(aws ssm send-command \
+  local cmd_id _send_err
+  _send_err="$(mktemp)"
+  if ! cmd_id=$(aws ssm send-command \
     --region "$REGION" \
     --instance-ids "$INSTANCE_ID" \
     --document-name "AWS-RunShellScript" \
     --comment "agentkeys-mcp-server bring-up ($([ "$TEST_MODE" = "1" ] && echo test || echo prod))" \
     --parameters "{\"commands\": $(jq -Rs . <<<"$mcp_bring_up_script" | jq -s .)}" \
-    --query "Command.CommandId" --output text) \
-    || die "aws ssm send-command failed — does $INSTANCE_ID have amazon-ssm-agent + the SSM instance profile?"
+    --query "Command.CommandId" --output text 2>"$_send_err"); then
+    local _err; _err="$(cat "$_send_err" 2>/dev/null || true)"; rm -f "$_send_err"
+    # ensure_ssm_managed already verified the instance is registered, so a
+    # SendCommand AccessDenied here is a CALLER policy gap, not an instance one.
+    if printf '%s' "$_err" | grep -qiE 'accessdenied|not authorized|ssm:sendcommand'; then
+      die "ssm:SendCommand DENIED for your CALLER (identity-based policy gap — $INSTANCE_ID is SSM-registered, verified just above, so this is NOT an instance problem). Your operator IAM user needs ssm:SendCommand on the instance + the AWS-RunShellScript document, plus ssm:GetCommandInvocation/ListCommandInvocations. Grant it with \`aws iam put-user-policy\` on your user (see scripts/provision-ci-deploy-role.sh for the exact policy shape it grants the CI deploy role), then re-run. Detail: $_err"
+    fi
+    die "aws ssm send-command failed — does $INSTANCE_ID have amazon-ssm-agent + the SSM instance profile? Detail: $_err"
+  fi
+  rm -f "$_send_err"
   ok "SSM command $cmd_id queued on $INSTANCE_ID; polling for completion (max 10 min)"
 
   # Poll every 10s for up to 10 min. setup-mcp-host.sh is normally <3 min;

From a7014e709f35b6ce97775661123788f9c51771de Mon Sep 17 00:00:00 2001
From: Hanwen Cheng <heawen.cheng@gmail.com>
Date: Sun, 31 May 2026 18:10:32 +0800
Subject: [PATCH 2/4] fix(setup-cloud,docs): drop no-op --upgrade from
 setup-broker-host.sh refs + add CLAUDE.md rule

setup-broker-host.sh treats --upgrade (and --skip-pull) as back-compat NO-OPS
(it is idempotent + auto-detects bootstrap vs upgrade), so emitting it is
misleading. Replace active-path references with --ref main (the canonical
idempotent deploy invocation per CLAUDE.md) and codify the rule:
- setup-cloud.sh: ensure_ssm_managed remediation suggests --ref main.
- docs/ci-setup.md: prod-broker manual deploy uses --ref main.
- CLAUDE.md (Remote broker host): explicit "never pass --upgrade" rule.
---
 CLAUDE.md              | 2 ++
 docs/ci-setup.md       | 2 +-
 scripts/setup-cloud.sh | 2 +-
 3 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/CLAUDE.md b/CLAUDE.md
index 2f76ec6d..3d99fae8 100644
--- a/CLAUDE.md
+++ b/CLAUDE.md
@@ -90,6 +90,8 @@ Also: never gloss over a partial implementation in a demo doc or runbook. If the
 ## Remote broker host (single entry point)
 All remote-host changes (binary upgrades, systemd edits, nginx/certbot, env tweaks, mock-server redeploys) MUST go through `bash scripts/setup-broker-host.sh` — it's idempotent and auto-detects bootstrap vs upgrade. No ad-hoc `systemctl` edits or hand-built `scp`.
 
+**NEVER pass `--upgrade` (or `--skip-pull`) to `setup-broker-host.sh`.** They are back-compat **no-ops** — the script is idempotent and auto-detects bootstrap vs upgrade; there is no "upgrade mode" to opt into. Invoke it **plain** (`bash scripts/setup-broker-host.sh`, optionally with `--test`/`--yes`/`--clean`), or pass **`--ref main`** when you also want it to fetch + checkout + redeploy `main`. Do not add an `--upgrade` flag to any new script, runbook, doc, or CLI guidance; if you find an existing `--upgrade` reference in an active (non-archived) operator path, replace it with `--ref main` (deploy) or plain (ensure) in the same change.
+
 ### SSH access to the remote broker host
 On the operator machine, **SSH into the prod broker with the zsh alias `ssh-agentkeys`** (= `bash $AGENTKEYS_REPO/scripts/ssh-broker.sh prod`, which uses EC2 Instance Connect under AWS profile `agentkeys-broker`). Use it for read-only diagnostics (worker logs, env, status) — it is the sanctioned remote-shell entry point; do not hand-roll `aws ec2-instance-connect ssh` or raw `ssh`. Pass a trailing command to run non-interactively: `ssh-agentkeys 'systemctl status agentkeys-worker-memory'`. The login user is `agentkey` (uid 1001); it is in the `sudo` group but sudo **requires a password and a TTY**, so `journalctl`/reading `/etc/agentkeys/*.env` (owned `agentkeys:agentkeys 0600`) need an interactive session — non-interactive `ssh-agentkeys '<cmd>'` can only run unprivileged commands. For privileged log reads, open an interactive `ssh-agentkeys` shell and run `sudo` there. (`ssh-broker.sh test` / `--fallback` reach the test stack / use the `.pem` when EC2-IC is down.)
 
diff --git a/docs/ci-setup.md b/docs/ci-setup.md
index 0e270bc4..ed8bd943 100644
--- a/docs/ci-setup.md
+++ b/docs/ci-setup.md
@@ -477,7 +477,7 @@ The IAM role can stay provisioned indefinitely — without the secret it can't b
 
 Per [issue #101](https://github.com/litentry/agentKeys/issues/101) "Out of scope":
 
-- **Prod broker auto-deploy** — never. The prod broker EC2 stays manual via `bash scripts/setup-broker-host.sh --upgrade` from the operator laptop, per CLAUDE.md "Remote broker host (single entry point)".
+- **Prod broker auto-deploy** — never. The prod broker EC2 stays manual via `bash scripts/setup-broker-host.sh --ref main` from the operator laptop, per CLAUDE.md "Remote broker host (single entry point)".
 - **Auto-deploy of test Heima EVM contracts** — deferred to a follow-up PR (issue #101 rollout plan step 7). Contract redeploys mint new addresses and require the `SECRETS_REWRITE_PAT` token to update six `TEST_*_ADDRESS_HEIMA` secrets — more risk than the broker deploy, so it ships separately.
 - **Mainnet prod contract redeploy** — never automatic. Manual via `bash scripts/setup-heima.sh` only.
 
diff --git a/scripts/setup-cloud.sh b/scripts/setup-cloud.sh
index 04fd2777..7003a72b 100755
--- a/scripts/setup-cloud.sh
+++ b/scripts/setup-cloud.sh
@@ -760,7 +760,7 @@ ensure_ssm_managed() {
   done
   die "SSM: $INSTANCE_ID never reached PingStatus=Online (last: ${ping:-none}) after ~3min. The role now carries AmazonSSMManagedInstanceCore, so the on-host amazon-ssm-agent likely isn't running. (Re)start it via the single entry point, then re-run this step:
      bash scripts/ssh-broker.sh $([ "$TEST_MODE" = "1" ] && echo test || echo prod)
-     sudo bash /opt/agentkeys-src/scripts/setup-broker-host.sh --upgrade   # installs+enables the agent
+     sudo bash /opt/agentkeys-src/scripts/setup-broker-host.sh --ref main   # idempotent; installs+enables the agent
    or reboot (broker auto-restarts via systemd): aws --region $REGION ec2 reboot-instances --instance-ids $INSTANCE_ID"
 }
 

From c3e7989504fe50d4a79b239aec06fe0a6e715060 Mon Sep 17 00:00:00 2001
From: Hanwen Cheng <heawen.cheng@gmail.com>
Date: Sun, 31 May 2026 18:29:39 +0800
Subject: [PATCH 3/4] fix(setup-cloud): default HOME in the step-15 SSM remote
 script (set -u unbound-var)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

SSM RunShellScript executes the step-15 mcp-bring-up script as root with a
MINIMAL env (no HOME). Under set -euo pipefail the first $HOME use
(export PATH=$HOME/.cargo/bin) aborted with 'HOME: unbound variable' — a latent
bug that only surfaced once SSM delivery started working (previously it failed at
send-command). Default HOME to /root before any $HOME use. Also document
--only-step 15 in the script's re-run examples.
---
 scripts/setup-cloud.sh | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/scripts/setup-cloud.sh b/scripts/setup-cloud.sh
index 7003a72b..7c98d194 100755
--- a/scripts/setup-cloud.sh
+++ b/scripts/setup-cloud.sh
@@ -793,6 +793,10 @@ do_step_15() {
   mcp_bring_up_script=$(cat <<EOSH
 #!/usr/bin/env bash
 set -euo pipefail
+# SSM RunShellScript runs this as root with a MINIMAL env (no HOME). Under set -u
+# an unset \$HOME is a fatal "unbound variable", which broke step 15 the moment
+# SSM delivery started working. Default it before any \$HOME use (root => /root).
+export HOME="\${HOME:-/root}"
 export PATH="\$HOME/.cargo/bin:\$PATH"
 
 if ! command -v cargo >/dev/null 2>&1; then
@@ -901,6 +905,7 @@ do_step_16() {
   printf "    bash scripts/setup-cloud.sh --only-step 6   # re-UPSERT DNS\n" >&2
   printf "    bash scripts/setup-cloud.sh --only-step 12  # re-create SSH user (e.g. after EC2 replace)\n" >&2
   printf "    bash scripts/setup-cloud.sh --only-step 13  # re-run per-data-class provisioning\n" >&2
+  printf "    bash scripts/setup-cloud.sh --only-step 15  # re-run the MCP bring-up on the broker via SSM (add --test for the test broker)\n" >&2
   printf "    bash scripts/setup-cloud.sh --only-step 15  # re-deploy agentkeys-mcp-server on broker (cargo install --git)\n" >&2
   printf "    bash scripts/setup-cloud.sh --only-step 15 --test  # same for test-mcp.\${ZONE}\n\n" >&2
 }

From 41bc0d584cbd9df226de055d1ffd67fe7ce76e8e Mon Sep 17 00:00:00 2001
From: Hanwen Cheng <heawen.cheng@gmail.com>
Date: Sun, 31 May 2026 18:35:14 +0800
Subject: [PATCH 4/4] docs(CLAUDE): generalize the "never pass --upgrade" rule
 to any idempotent setup script

Broaden the rule from setup-broker-host.sh to all idempotent setup scripts
(setup-cloud.sh, setup-heima.sh, heima-* helpers) and restore the actionable
guidance (invoke plain / --only-step N, or --ref main for a broker redeploy;
replace existing active-path --upgrade references).
---
 CLAUDE.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/CLAUDE.md b/CLAUDE.md
index 3d99fae8..408c7183 100644
--- a/CLAUDE.md
+++ b/CLAUDE.md
@@ -90,7 +90,7 @@ Also: never gloss over a partial implementation in a demo doc or runbook. If the
 ## Remote broker host (single entry point)
 All remote-host changes (binary upgrades, systemd edits, nginx/certbot, env tweaks, mock-server redeploys) MUST go through `bash scripts/setup-broker-host.sh` — it's idempotent and auto-detects bootstrap vs upgrade. No ad-hoc `systemctl` edits or hand-built `scp`.
 
-**NEVER pass `--upgrade` (or `--skip-pull`) to `setup-broker-host.sh`.** They are back-compat **no-ops** — the script is idempotent and auto-detects bootstrap vs upgrade; there is no "upgrade mode" to opt into. Invoke it **plain** (`bash scripts/setup-broker-host.sh`, optionally with `--test`/`--yes`/`--clean`), or pass **`--ref main`** when you also want it to fetch + checkout + redeploy `main`. Do not add an `--upgrade` flag to any new script, runbook, doc, or CLI guidance; if you find an existing `--upgrade` reference in an active (non-archived) operator path, replace it with `--ref main` (deploy) or plain (ensure) in the same change.
+**NEVER pass `--upgrade` (or `--skip-pull`) to any idempotent setup script** (`setup-broker-host.sh`, `setup-cloud.sh`, the `heima-*` / `setup-heima.sh` helpers, etc.). They are back-compat **no-ops** — these scripts are idempotent and auto-detect bootstrap vs upgrade; there is no "upgrade mode" to opt into. Invoke them **plain** (optionally with `--test` / `--yes` / `--clean` / `--only-step N`), or pass **`--ref main`** to `setup-broker-host.sh` when you also want it to fetch + checkout + redeploy `main`. Do not add an `--upgrade` flag to any new script, runbook, doc, or CLI guidance; if you find an existing `--upgrade` reference in an active (non-archived) operator path, replace it with the idempotent invocation (`--ref main` for deploy, plain for ensure) in the same change.
 
 ### SSH access to the remote broker host
 On the operator machine, **SSH into the prod broker with the zsh alias `ssh-agentkeys`** (= `bash $AGENTKEYS_REPO/scripts/ssh-broker.sh prod`, which uses EC2 Instance Connect under AWS profile `agentkeys-broker`). Use it for read-only diagnostics (worker logs, env, status) — it is the sanctioned remote-shell entry point; do not hand-roll `aws ec2-instance-connect ssh` or raw `ssh`. Pass a trailing command to run non-interactively: `ssh-agentkeys 'systemctl status agentkeys-worker-memory'`. The login user is `agentkey` (uid 1001); it is in the `sudo` group but sudo **requires a password and a TTY**, so `journalctl`/reading `/etc/agentkeys/*.env` (owned `agentkeys:agentkeys 0600`) need an interactive session — non-interactive `ssh-agentkeys '<cmd>'` can only run unprivileged commands. For privileged log reads, open an interactive `ssh-agentkeys` shell and run `sudo` there. (`ssh-broker.sh test` / `--fallback` reach the test stack / use the `.pem` when EC2-IC is down.)