diff --git a/CLAUDE.md b/CLAUDE.md index 2f76ec6..408c718 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -90,6 +90,8 @@ Also: never gloss over a partial implementation in a demo doc or runbook. If the ## Remote broker host (single entry point) All remote-host changes (binary upgrades, systemd edits, nginx/certbot, env tweaks, mock-server redeploys) MUST go through `bash scripts/setup-broker-host.sh` — it's idempotent and auto-detects bootstrap vs upgrade. No ad-hoc `systemctl` edits or hand-built `scp`. +**NEVER pass `--upgrade` (or `--skip-pull`) to any idempotent setup script** (`setup-broker-host.sh`, `setup-cloud.sh`, the `heima-*` / `setup-heima.sh` helpers, etc.). Both flags are back-compat **no-ops** — these scripts are idempotent and auto-detect bootstrap vs upgrade; there is no "upgrade mode" to opt into. Invoke the scripts **plain** (optionally with `--test` / `--yes` / `--clean` / `--only-step N`), or pass **`--ref main`** to `setup-broker-host.sh` when you also want it to fetch + checkout + redeploy `main`. Do not add an `--upgrade` flag to any new script, runbook, doc, or CLI guidance; if you find an existing `--upgrade` reference in an active (non-archived) operator path, replace it with the idempotent invocation (`--ref main` for deploy, plain for ensure) in the same change. + ### SSH access to the remote broker host On the operator machine, **SSH into the prod broker with the zsh alias `ssh-agentkeys`** (= `bash $AGENTKEYS_REPO/scripts/ssh-broker.sh prod`, which uses EC2 Instance Connect under AWS profile `agentkeys-broker`). Use it for read-only diagnostics (worker logs, env, status) — it is the sanctioned remote-shell entry point; do not hand-roll `aws ec2-instance-connect ssh` or raw `ssh`. Pass a trailing command to run non-interactively: `ssh-agentkeys 'systemctl status agentkeys-worker-memory'`. 
The login user is `agentkey` (uid 1001); it is in the `sudo` group but sudo **requires a password and a TTY**, so `journalctl`/reading `/etc/agentkeys/*.env` (owned `agentkeys:agentkeys 0600`) need an interactive session — non-interactive `ssh-agentkeys '<cmd>'` can only run unprivileged commands. For privileged log reads, open an interactive `ssh-agentkeys` shell and run `sudo` there. (`ssh-broker.sh test` / `--fallback` reach the test stack / use the `.pem` when EC2-IC is down.) diff --git a/docs/ci-setup.md b/docs/ci-setup.md index 0e270bc..ed8bd94 100644 --- a/docs/ci-setup.md +++ b/docs/ci-setup.md @@ -477,7 +477,7 @@ The IAM role can stay provisioned indefinitely — without the secret it can't b Per [issue #101](https://github.com/litentry/agentKeys/issues/101) "Out of scope": -- **Prod broker auto-deploy** — never. The prod broker EC2 stays manual via `bash scripts/setup-broker-host.sh --upgrade` from the operator laptop, per CLAUDE.md "Remote broker host (single entry point)". +- **Prod broker auto-deploy** — never. The prod broker EC2 stays manual via `bash scripts/setup-broker-host.sh --ref main` from the operator laptop, per CLAUDE.md "Remote broker host (single entry point)". - **Auto-deploy of test Heima EVM contracts** — deferred to a follow-up PR (issue #101 rollout plan step 7). Contract redeploys mint new addresses and require the `SECRETS_REWRITE_PAT` token to update six `TEST_*_ADDRESS_HEIMA` secrets — more risk than the broker deploy, so it ships separately. - **Mainnet prod contract redeploy** — never automatic. Manual via `bash scripts/setup-heima.sh` only. diff --git a/scripts/setup-cloud.sh b/scripts/setup-cloud.sh index 37cb454..7c98d19 100755 --- a/scripts/setup-cloud.sh +++ b/scripts/setup-cloud.sh @@ -697,10 +697,81 @@ do_step_14() { ok "mail bucket policy applied" } +# Step-15 precondition: the broker EC2 must be a REGISTERED SSM managed instance +# before SendCommand, or AWS returns InvalidInstanceId / "not in a valid state". 
+# The on-host amazon-ssm-agent only registers if the instance's role carries +# AmazonSSMManagedInstanceCore — and operators repeatedly hit this because the +# broker-host role (prod `agentkeys-broker-host`; the test broker uses its own +# profile) was created WITHOUT it. We self-heal idempotently, deriving the role +# from the INSTANCE's actual attached profile so the SAME code fixes BOTH the +# prod and test brokers regardless of role naming, then poll until the agent +# registers. `aws iam` is global (no --region); ec2/ssm reads pass --region +# "$REGION" per the agentkeys-admin-defaults-to-us-west-2 trap (CLAUDE.md). +ensure_ssm_managed() { + local ssm_core="arn:aws:iam::aws:policy/AmazonSSMManagedInstanceCore" + + # Resolve the role behind THIS instance's profile (naming-agnostic). + local prof_arn + prof_arn="$(aws --region "$REGION" ec2 describe-instances --instance-ids "$INSTANCE_ID" \ + --query 'Reservations[0].Instances[0].IamInstanceProfile.Arn' --output text 2>/dev/null || true)" + if [ -z "$prof_arn" ] || [ "$prof_arn" = "None" ]; then + die "SSM precondition: $INSTANCE_ID has NO instance profile — it can never register with SSM. Attach one carrying AmazonSSMManagedInstanceCore (docs/cloud-bootstrap.md §6), then re-run." + fi + local prof_name role + prof_name="${prof_arn##*/}" + role="$(aws iam get-instance-profile --instance-profile-name "$prof_name" \ + --query 'InstanceProfile.Roles[0].RoleName' --output text 2>/dev/null || true)" + if [ -z "$role" ] || [ "$role" = "None" ]; then + die "SSM precondition: instance profile $prof_name has no role attached — re-create it (docs/cloud-bootstrap.md §6)." + fi + + # Idempotently ensure AmazonSSMManagedInstanceCore on that role. 
+ local have + have="$(aws iam list-attached-role-policies --role-name "$role" \ + --query "length(AttachedPolicies[?PolicyArn=='$ssm_core'])" --output text 2>/dev/null || echo 0)" + if [ "$have" = "1" ]; then + ok "SSM: AmazonSSMManagedInstanceCore already on $role" + elif [ "$DRY_RUN" = "1" ]; then + warn "DRY: would attach AmazonSSMManagedInstanceCore to $role (instance profile $prof_name)" + else + if aws iam attach-role-policy --role-name "$role" --policy-arn "$ssm_core" >/dev/null 2>&1; then + ok "SSM: attached AmazonSSMManagedInstanceCore to $role (idempotent)" + else + die "SSM: could not attach AmazonSSMManagedInstanceCore to $role — your caller needs iam:AttachRolePolicy (\`awsp agentkeys-admin\`)." + fi + fi + if [ "$DRY_RUN" = "1" ]; then return 0; fi + + # Poll until the on-host agent registers (it refreshes IMDS creds, ~30s cadence; + # a freshly-attached policy is usually picked up within 1–2 min). + local ping="" i + for i in $(seq 1 18); do + ping="$(aws --region "$REGION" ssm describe-instance-information \ + --filters "Key=InstanceIds,Values=$INSTANCE_ID" \ + --query 'InstanceInformationList[0].PingStatus' --output text 2>/dev/null || true)" + if [ "$ping" = "Online" ]; then + ok "SSM: $INSTANCE_ID registered (PingStatus=Online)" + return 0 + fi + if [ "$i" = "1" ]; then + printf " waiting up to ~3min for %s to register with SSM (PingStatus=%s)…\n" "$INSTANCE_ID" "${ping:-none}" >&2 + fi + sleep 10 + done + die "SSM: $INSTANCE_ID never reached PingStatus=Online (last: ${ping:-none}) after ~3min. The role now carries AmazonSSMManagedInstanceCore, so the on-host amazon-ssm-agent likely isn't running. 
(Re)start it via the single entry point, then re-run this step: + bash scripts/ssh-broker.sh $([ "$TEST_MODE" = "1" ] && echo test || echo prod) + sudo bash /opt/agentkeys-src/scripts/setup-broker-host.sh --ref main # idempotent; installs+enables the agent + or reboot (broker auto-restarts via systemd): aws --region $REGION ec2 reboot-instances --instance-ids $INSTANCE_ID" +} + do_step_15() { CUR_STEP=15; step "Bring up agentkeys-mcp-server on broker (via SSM)" : "${INSTANCE_ID:?INSTANCE_ID missing — broker EC2 needs to exist (re-run step 4 first)}" + # Make the broker EC2 SSM-ready BEFORE SendCommand (self-heals the missing + # AmazonSSMManagedInstanceCore that bit both prod + test brokers). Idempotent. + ensure_ssm_managed + REPO_URL_FOR_MCP="${AGENTKEYS_REPO_URL:-https://github.com/litentry/agentKeys.git}" REV_FOR_MCP="${AGENTKEYS_REV:-main}" MCP_HOST_FLAGS="" @@ -722,6 +793,10 @@ do_step_15() { mcp_bring_up_script=$(cat < /root). +export HOME="\${HOME:-/root}" export PATH="\$HOME/.cargo/bin:\$PATH" if ! command -v cargo >/dev/null 2>&1; then @@ -744,15 +819,24 @@ sudo -E AGENTKEYS_REPO_URL=${REPO_URL_FOR_MCP} AGENTKEYS_REV=${REV_FOR_MCP} \\ EOSH ) - local cmd_id - cmd_id=$(aws ssm send-command \ + local cmd_id _send_err + _send_err="$(mktemp)" + if ! cmd_id=$(aws ssm send-command \ --region "$REGION" \ --instance-ids "$INSTANCE_ID" \ --document-name "AWS-RunShellScript" \ --comment "agentkeys-mcp-server bring-up ($([ "$TEST_MODE" = "1" ] && echo test || echo prod))" \ --parameters "{\"commands\": $(jq -Rs . <<<"$mcp_bring_up_script" | jq -s .)}" \ - --query "Command.CommandId" --output text) \ - || die "aws ssm send-command failed — does $INSTANCE_ID have amazon-ssm-agent + the SSM instance profile?" 
+ --query "Command.CommandId" --output text 2>"$_send_err"); then + local _err; _err="$(cat "$_send_err" 2>/dev/null || true)"; rm -f "$_send_err" + # ensure_ssm_managed already verified the instance is registered, so a + # SendCommand AccessDenied here is a CALLER policy gap, not an instance one. + if printf '%s' "$_err" | grep -qiE 'accessdenied|not authorized|ssm:sendcommand'; then + die "ssm:SendCommand DENIED for your CALLER (identity-based policy gap — $INSTANCE_ID is SSM-registered, verified just above, so this is NOT an instance problem). Your operator IAM user needs ssm:SendCommand on the instance + the AWS-RunShellScript document, plus ssm:GetCommandInvocation/ListCommandInvocations. Grant it with \`aws iam put-user-policy\` on your user (see scripts/provision-ci-deploy-role.sh for the exact policy shape it grants the CI deploy role), then re-run. Detail: $_err" + fi + die "aws ssm send-command failed — does $INSTANCE_ID have amazon-ssm-agent + the SSM instance profile? Detail: $_err" + fi + rm -f "$_send_err" ok "SSM command $cmd_id queued on $INSTANCE_ID; polling for completion (max 10 min)" # Poll every 10s for up to 10 min. setup-mcp-host.sh is normally <3 min; @@ -821,6 +905,7 @@ do_step_16() { printf " bash scripts/setup-cloud.sh --only-step 6 # re-UPSERT DNS\n" >&2 printf " bash scripts/setup-cloud.sh --only-step 12 # re-create SSH user (e.g. after EC2 replace)\n" >&2 printf " bash scripts/setup-cloud.sh --only-step 13 # re-run per-data-class provisioning\n" >&2 + printf " bash scripts/setup-cloud.sh --only-step 15 # re-run the MCP bring-up on the broker via SSM (add --test for the test broker)\n" >&2 printf " bash scripts/setup-cloud.sh --only-step 15 # re-deploy agentkeys-mcp-server on broker (cargo install --git)\n" >&2 printf " bash scripts/setup-cloud.sh --only-step 15 --test # same for test-mcp.\${ZONE}\n\n" >&2 }