From 1c6997e2cafbdca2da0aab67ae8bc9b96be56a03 Mon Sep 17 00:00:00 2001
From: bnsoni
Date: Wed, 20 May 2026 08:39:16 +0300
Subject: [PATCH 1/8] feat(setup): tenant platform launcher + safer single-server defaults
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Adds a new `ibl_tenant_platform` ansible role that launches a tenant
Platform (Platform + admin User + UserPlatformLink) via `run_launch_steps`
when `PLATFORM_NAME` is set to a non-default value, plus a sweep of
defense-in-depth defaults so a fresh single-server bootstrap comes up
production-safe out of the box.

Highlights:

* Tenant launcher — new role wired into both `playbook.yml` (setup /
  setup-env) and `launch_playbook.yml` (launch / launch-env). Gated on
  `PLATFORM_NAME != 'main'`, skips + logs on re-runs when the tenant
  already exists, surfaces the generated admin password via the
  `IBLAI_FIXTURE_OUTPUT` pipeline (never persisted to disk). Also writes
  `PLATFORM_NAME=` (uppercase) at the root of `/ibl/config.yml` and
  enforces `Platform.show_paywall=False` + `Platform.is_advertising=False`
  via `Platform.objects.filter().update()`.

* Reserved names —
  - `ADMIN_USERNAME=ibl_admin` is rejected at every input layer
    (interactive prompt, .env, --admin-username); reserved for the SPA
    OAuth Application owner the platform itself maintains. New default
    suggestion is `platform_admin`. Backed by a Pydantic field_validator
    on `SetupConfig.admin_username`.
  - `PLATFORM_NAME=main` is rejected as an explicit input. Unset / blank
    silently resolves to `main` (preserving SSO
    `backend_name=main-oauth2` and skipping the tenant launcher).

* Safer SPA defaults — `IBL_SPA.MENTOR.STRIPE_ENABLED=false` and
  `IBL_SPA.MENTOR.ENABLE_ADVERTISING=false` are written unconditionally
  in `ibl_spa` (fresh installs) and `ibl_launch_services` (AMI launches)
  so a deploy without explicit billing setup never surfaces monetization
  UI by accident.

* Microsoft SSO completeness — `microsoft_sso_config` now also patches
  `IBL_SPA.AUTH.EXTERNAL_IDP_LOGOUT_URL` and
  `IBL_SPA.AUTH.IBL_DIRECT_SSO_URL` (with `microsoft_sso_tenant_id`
  falling back to `common`), then restarts the Auth + Mentor SPAs so
  they pick up the new auth flow.

* Final `ibl global-proxy reload` — added as `post_tasks` in both
  `playbook.yml` and `launch_playbook.yml`, so any nginx state touched
  by SSO roles (edX restarts in google_sso_config / microsoft_sso_config)
  is reloaded before the playbook exits.

* 100 GB volume floor for single / multi server — Pydantic validators
  (`InfraConfig` model_validator gated on `DeploymentType.SINGLE`, plus
  `MultiServerConfig.validate_volume_sizes`), matching interactive + CLI
  + .env input checks. Defaults bumped accordingly. Call-server unchanged
  (LiveKit only needs ~40 GB).

* 32 GB memory warning — new `INSTANCE_RAM_GB` mapping + helper.
  Non-blocking warning suggesting 64 GB (m5.4xlarge / r5.2xlarge) when
  the operator picks a 32 GB instance — in the interactive provision
  wizard, in `provision-env`, and in `launch` / `launch-env` (only when
  AI is enabled).

* Codebase scrub — removed all references to the canonical client name
  from comments, docstrings, prompt instructions, error hints, and
  example .env files. Replacement placeholders: `` for monorepo org
  names, `acme` for tenant-key examples.

* Test fix — the five `_test_ssh()` retry-path tests in
  `tests/ansible/test_runner.py` no longer sleep for ~135 s each; they
  now mock `time.sleep` alongside the existing `subprocess.run` mock,
  cutting ~11 minutes off the full suite.

Test suite: 562 passing in ~1.3 s.

Co-Authored-By: Claude Opus 4.7 (1M context) --- .env.example | 5 +- .env.provision.example | 4 +- .env.setup.example | 18 +- CHANGELOG.md | 2 +- CLAUDE.md | 2 +- src/iblai_infra/ansible/runner.py | 2 + .../single-server/launch_playbook.yml | 23 ++ .../templates/single-server/playbook.yml | 21 ++ .../roles/ibl_cli_ops/tasks/main.yml | 2 +- .../roles/ibl_launch_services/tasks/main.yml | 49 ++++ .../roles/ibl_spa/tasks/main.yml | 3 +- .../roles/ibl_tenant_platform/tasks/main.yml | 216 ++++++++++++++++++ .../roles/microsoft_sso_config/tasks/main.yml | 132 +++++++++++ src/iblai_infra/cli.py | 113 ++++++++- src/iblai_infra/env_provision.py | 2 +- src/iblai_infra/env_setup.py | 35 ++- src/iblai_infra/models.py | 90 +++++++- src/iblai_infra/prompts/infrastructure.py | 38 ++- src/iblai_infra/prompts/setup.py | 69 ++++-- tests/ansible/test_runner.py | 35 ++- tests/conftest.py | 2 +- tests/prompts/test_review.py | 2 +- tests/prompts/test_setup.py | 38 +-- tests/terraform/test_runner.py | 2 +- tests/test_env_setup.py | 4 +- tests/test_models.py | 10 +- 26 files changed, 841 insertions(+), 78 deletions(-) create mode 100644 src/iblai_infra/ansible/templates/single-server/roles/ibl_tenant_platform/tasks/main.yml diff --git a/.env.example b/.env.example index d13c608..0da345d 100644 --- a/.env.example +++ b/.env.example @@ -24,8 +24,9 @@ DOMAIN=platform.example.com # GitHub GIT_TOKEN=ghp_xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx -# Platform Admin -ADMIN_USERNAME=ibl_admin +# Platform Admin. +# NOTE: `ibl_admin` is reserved for system use — pick a different name. 
+ADMIN_USERNAME=platform_admin ADMIN_EMAIL=admin@example.com ADMIN_PASSWORD=change-me-min-8-chars diff --git a/.env.provision.example b/.env.provision.example index 49f3ee4..610f2ff 100644 --- a/.env.provision.example +++ b/.env.provision.example @@ -34,8 +34,10 @@ SSH_KEY_METHOD=generate # generate | existing_file | aws_keypair # SSH_KEY_NAME=my-existing-keypair-name # --- Compute (optional, defaults shown) --- +# NOTE: t3.2xlarge has 32 GB RAM. If you enable AI features in the setup +# step, 64 GB (e.g. m5.4xlarge or r5.2xlarge) is strongly recommended. INSTANCE_TYPE=t3.2xlarge -VOLUME_SIZE=50 # min 20 GB +VOLUME_SIZE=100 # min 100 GB VOLUME_TYPE=gp3 # gp2 | gp3 | io1 # --- Network (optional) --- diff --git a/.env.setup.example b/.env.setup.example index cc3e84f..92966c1 100644 --- a/.env.setup.example +++ b/.env.setup.example @@ -16,8 +16,10 @@ AWS_SECRET_ACCESS_KEY=wJalrXUtnFEMI/K7MDENG/bPxRfiCYEXAMPLEKEY # (or your overrides via GITHUB_ORG / CLI_OPS_REPO / PROD_IMAGES_REPO below) GIT_TOKEN= -# Platform admin (created on the LMS + DM) -ADMIN_USERNAME=ibl_admin +# Platform admin (created on the LMS + DM). +# NOTE: `ibl_admin` is reserved for system use (owns SPA OAuth records). +# Pick any other username — e.g. `platform_admin`, your name, etc. +ADMIN_USERNAME=platform_admin ADMIN_EMAIL=admin@example.com ADMIN_PASSWORD=change-me-min-8-chars @@ -43,8 +45,16 @@ ADMIN_PASSWORD=change-me-min-8-chars # CLI_OPS_RELEASE_TAG=3.19.0 # PROD_IMAGES_TAG=main -# Platform identity (used by SSO roles to derive backend_name + platform_key) -# PLATFORM_NAME=main +# Platform identity. Used by SSO roles to derive backend_name + platform_key, +# AND by ibl_tenant_platform to launch a tenant Platform via run_launch_steps +# (Platform + admin User + UserPlatformLink) when set to a non-default value. 
+# - Leave unset / blank → defaults to 'main' (system default tenant +# created by the platform itself, no extra launch) +# - PLATFORM_NAME=main → REJECTED ('main' is reserved as an explicit input) +# - PLATFORM_NAME=acme → launches an 'acme' tenant with admin user +# 'acmeadmin@' (password printed +# one-time at the end of setup) +# PLATFORM_NAME=acme # Feature toggles # ENABLE_AI=true diff --git a/CHANGELOG.md b/CHANGELOG.md index 65dd640..80d85ba 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -38,7 +38,7 @@ ## [1.5.0] — 2026-04-30 ### Added -- **Monorepo subdirectory installs** — `--cli-ops-repo` / `--prod-images-repo` (and the matching setup prompts) now accept a `repo/subdir` path, e.g. `kaplan-iblai-infra-ops/kaplan-iblai-prod-images`. The ansible role appends `&subdirectory=` to the install URL so a single client monorepo can host both `iblai-cli-ops` and the prod-images package +- **Monorepo subdirectory installs** — `--cli-ops-repo` / `--prod-images-repo` (and the matching setup prompts) now accept a `repo/subdir` path, e.g. `-iblai-infra-ops/-iblai-prod-images`. The ansible role appends `&subdirectory=` to the install URL so a single client monorepo can host both `iblai-cli-ops` and the prod-images package - **`parse_repo_path()` helper** in `models.py` — splits operator input into `(repo, subdir)`. Bare `iblai-cli-ops` keeps the canonical behavior; subdir-form unlocks per-client monorepo deployments - **`cli_ops_subdir` / `prod_images_subdir` extra-vars** passed through `AnsibleRunner` to the `ibl_cli_ops` role (single-server + call-server templates) diff --git a/CLAUDE.md b/CLAUDE.md index 341450e..dfcfe16 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -170,7 +170,7 @@ Sets `state.provider = "launch"` to distinguish from interactive provisioning. `iblai infra setup-env [] -f .env` — non-interactive Ansible bootstrap from a `.env` file. Single-server only (multi/call rejected upstream). 
Two modes: -- **Provisioned-name:** `setup-env kapsix -f .env` — loads `ProjectState`, derives `target_host` / `ssh_private_key_path` / `base_domain` / `aws_default_region` from it. `.env` only carries credentials, image tags, admin user, optional integrations. +- **Provisioned-name:** `setup-env -f .env` — loads `ProjectState`, derives `target_host` / `ssh_private_key_path` / `base_domain` / `aws_default_region` from it. `.env` only carries credentials, image tags, admin user, optional integrations. - **Free-standing:** `setup-env -f .env` (no name) — builds a synthetic `ProjectState` with `provider="bootstrap"` (matching `_run_setup_interactive`). `.env` must include `PROJECT_NAME`, `TARGET_HOST`, `SSH_PRIVATE_KEY_PATH`, `BASE_DOMAIN`. **Schema** (`.env.setup.example` is the source of truth). Always required: AWS keys, `GIT_TOKEN` (or `GIT_ACCESS_TOKEN`), `ADMIN_USERNAME`/`ADMIN_EMAIL`/`ADMIN_PASSWORD`. Free-standing additionally needs the four "where to deploy" fields. Optional integrations follow the same trigger pattern as `iblai infra launch` — SMTP enabled when `SMTP_HOST` set, Stripe when `STRIPE_SECRET_KEY` set, Google SSO when `GOOGLE_SSO_CLIENT_ID` set, Microsoft SSO when `MICROSOFT_SSO_CLIENT_ID` set. 
diff --git a/src/iblai_infra/ansible/runner.py b/src/iblai_infra/ansible/runner.py index 8ab1ad0..67d3599 100644 --- a/src/iblai_infra/ansible/runner.py +++ b/src/iblai_infra/ansible/runner.py @@ -39,6 +39,7 @@ "integrations": "OAuth & Integrations", "admin_setup": "Admin & CORS Setup", "data_seeding": "Data Seeding", + "ibl_tenant_platform": "Tenant Platform", "stripe_config": "Stripe Config", "google_sso_config": "Google SSO Config", "microsoft_sso_config": "Microsoft SSO Config", @@ -52,6 +53,7 @@ "integrations": "OAuth & Integrations", "admin_setup": "Admin & CORS Setup", "data_seeding": "Data Seeding", + "ibl_tenant_platform": "Tenant Platform", "stripe_config": "Stripe Config", "google_sso_config": "Google SSO Config", "microsoft_sso_config": "Microsoft SSO Config", diff --git a/src/iblai_infra/ansible/templates/single-server/launch_playbook.yml b/src/iblai_infra/ansible/templates/single-server/launch_playbook.yml index 9378671..b60a2bb 100644 --- a/src/iblai_infra/ansible/templates/single-server/launch_playbook.yml +++ b/src/iblai_infra/ansible/templates/single-server/launch_playbook.yml @@ -30,6 +30,7 @@ - integrations - admin_setup - data_seeding + - ibl_tenant_platform - stripe_config - google_sso_config - microsoft_sso_config @@ -39,3 +40,25 @@ ansible.builtin.include_role: name: playwright_test_platforms when: create_playwright_platforms | default(false) | bool + + post_tasks: + # Final unconditional proxy reload. Runs AFTER every role and the + # optional Playwright include, so any nginx state touched by SSO + # roles (edX restarts in google_sso_config / microsoft_sso_config) + # is picked up before the playbook exits. Mirrors the post_tasks + # block in playbook.yml so `launch` / `launch-env` get the same + # final-reload guarantee that `setup` / `setup-env` already have. 
+ - name: Reload global proxy (final step) + become: false + ansible.builtin.shell: | + export PYENV_ROOT="{{ pyenv_root }}" + export PATH="$PYENV_ROOT/bin:$PATH" + eval "$(pyenv init -)"; eval "$(pyenv virtualenv-init -)" + pyenv activate "{{ venv_name }}" + ibl global-proxy reload + args: + executable: /bin/bash + chdir: "{{ repo_dir }}" + environment: + HOME: "/home/{{ ansible_user }}" + IBL_ROOT: "{{ ibl_root }}/" diff --git a/src/iblai_infra/ansible/templates/single-server/playbook.yml b/src/iblai_infra/ansible/templates/single-server/playbook.yml index 2428595..3444285 100644 --- a/src/iblai_infra/ansible/templates/single-server/playbook.yml +++ b/src/iblai_infra/ansible/templates/single-server/playbook.yml @@ -48,6 +48,7 @@ - integrations - admin_setup - data_seeding + - ibl_tenant_platform - stripe_config - google_sso_config - microsoft_sso_config @@ -60,3 +61,23 @@ ansible.builtin.include_role: name: playwright_test_platforms when: create_playwright_platforms | default(false) | bool + + post_tasks: + # Final unconditional proxy reload. Runs AFTER every role and the + # optional Playwright include, so any nginx state touched by SSO + # roles (edX restarts in google_sso_config / microsoft_sso_config) + # is picked up before the playbook exits. 
+ - name: Reload global proxy (final step) + become: false + ansible.builtin.shell: | + export PYENV_ROOT="{{ pyenv_root }}" + export PATH="$PYENV_ROOT/bin:$PATH" + eval "$(pyenv init -)"; eval "$(pyenv virtualenv-init -)" + pyenv activate "{{ venv_name }}" + ibl global-proxy reload + args: + executable: /bin/bash + chdir: "{{ repo_dir }}" + environment: + HOME: "/home/{{ ansible_user }}" + IBL_ROOT: "{{ ibl_root }}/" diff --git a/src/iblai_infra/ansible/templates/single-server/roles/ibl_cli_ops/tasks/main.yml b/src/iblai_infra/ansible/templates/single-server/roles/ibl_cli_ops/tasks/main.yml index 25a6650..584b214 100644 --- a/src/iblai_infra/ansible/templates/single-server/roles/ibl_cli_ops/tasks/main.yml +++ b/src/iblai_infra/ansible/templates/single-server/roles/ibl_cli_ops/tasks/main.yml @@ -44,7 +44,7 @@ # and crashes on `ibl --help`. Reinstalling here overrides the wrong # package with the right one (e.g. iblai/iblai-cli-ops@5.8.1, or a # subdir of a client monorepo like -# kaplan-iblai-infra-ops/iblai-cli-ops@v1.0.1). +# -iblai-infra-ops/iblai-cli-ops@v1.0.1). - name: Install iblai-cli-ops (override transitive ibl-cli with correct repo+tag) become: false shell: | diff --git a/src/iblai_infra/ansible/templates/single-server/roles/ibl_launch_services/tasks/main.yml b/src/iblai_infra/ansible/templates/single-server/roles/ibl_launch_services/tasks/main.yml index 4bdb4af..d6c56a3 100644 --- a/src/iblai_infra/ansible/templates/single-server/roles/ibl_launch_services/tasks/main.yml +++ b/src/iblai_infra/ansible/templates/single-server/roles/ibl_launch_services/tasks/main.yml @@ -117,6 +117,55 @@ retries: 40 delay: 15 +# --------------------------------------------------------------------------- +# SPA config — flip stripe / advertising off by default (override the +# values baked into the AMI's config.yml). Matches the fresh-install +# defaults set in `ibl_spa`. Direct yaml patch because `ibl config save +# --set` cannot round-trip quoted-string booleans. 
+# --------------------------------------------------------------------------- + +- name: Set SPA quoted boolean defaults in config.yml + become: false + shell: | + export PYENV_ROOT="{{ pyenv_root }}" + export PATH="$PYENV_ROOT/bin:$PATH" + eval "$(pyenv init -)"; eval "$(pyenv virtualenv-init -)" + pyenv activate "{{ venv_name }}" + python3 -c " + import yaml + config_path = '{{ ibl_root }}/config.yml' + with open(config_path) as f: + cfg = yaml.safe_load(f) or {} + spa = cfg.setdefault('IBL_SPA', {}) + mentor = spa.setdefault('MENTOR', {}) + mentor['STRIPE_ENABLED'] = 'false' + mentor['ENABLE_ADVERTISING'] = 'false' + with open(config_path, 'w') as f: + yaml.dump(cfg, f, default_flow_style=False) + print('SPA boolean config values set') + " + args: + executable: /bin/bash + chdir: "{{ repo_dir }}" + environment: + HOME: "/home/{{ ansible_user }}" + IBL_ROOT: "{{ ibl_root }}/" + +- name: Re-render templates with SPA config defaults + become: false + shell: | + export PYENV_ROOT="{{ pyenv_root }}" + export PATH="$PYENV_ROOT/bin:$PATH" + eval "$(pyenv init -)"; eval "$(pyenv virtualenv-init -)" + pyenv activate "{{ venv_name }}" + ibl config save + args: + executable: /bin/bash + chdir: "{{ repo_dir }}" + environment: + HOME: "/home/{{ ansible_user }}" + IBL_ROOT: "{{ ibl_root }}/" + # --------------------------------------------------------------------------- # SPA restart # --------------------------------------------------------------------------- diff --git a/src/iblai_infra/ansible/templates/single-server/roles/ibl_spa/tasks/main.yml b/src/iblai_infra/ansible/templates/single-server/roles/ibl_spa/tasks/main.yml index 8c87f50..d33fcd1 100644 --- a/src/iblai_infra/ansible/templates/single-server/roles/ibl_spa/tasks/main.yml +++ b/src/iblai_infra/ansible/templates/single-server/roles/ibl_spa/tasks/main.yml @@ -150,7 +150,8 @@ spa = cfg.setdefault('IBL_SPA', {}) mentor = spa.setdefault('MENTOR', {}) mentor['ENABLE_RBAC'] = 'true' - mentor['STRIPE_ENABLED'] = 
'true' + mentor['STRIPE_ENABLED'] = 'false' + mentor['ENABLE_ADVERTISING'] = 'false' mentor['SKIP_TEST'] = 'true' mentor['ENABLE_APP_SITE_ASSOCIATION'] = 'true' mentor['CANVAS_ADMIN_ONLY'] = 'false' diff --git a/src/iblai_infra/ansible/templates/single-server/roles/ibl_tenant_platform/tasks/main.yml b/src/iblai_infra/ansible/templates/single-server/roles/ibl_tenant_platform/tasks/main.yml new file mode 100644 index 0000000..bd6c8b0 --- /dev/null +++ b/src/iblai_infra/ansible/templates/single-server/roles/ibl_tenant_platform/tasks/main.yml @@ -0,0 +1,216 @@ +--- +# Launch a tenant platform via `run_launch_steps` (NOT raw +# `Platform.objects.create()`) so the launcher's state machine fires +# all the after_launch signals — default apps, edX hooks, +# UserPlatformLink with the right role flags, etc. This mirrors the +# shape of a canonical multi-tenant deployment, where a Platform + +# UserPlatformLink + tenant admin User were all created together by +# the launcher. +# +# Gating: +# - Skipped when `platform_name == 'main'` (the IBL default tenant +# that the platform itself maintains via `ibl launch`). +# - Skipped on re-runs when a Platform with key=platform_name already +# exists — the launcher does not upsert, so re-running it would +# either create a duplicate or 500 at the DB layer. +# +# Tenant admin credentials are derived from `platform_name` + +# `base_domain`: +# username = <platform_name>admin +# email = <username>@<base_domain> +# password = secrets.token_hex(16) (generated on each launch) +# The generated password is surfaced via the IBLAI_FIXTURE_OUTPUT +# markers (see runner.py::_maybe_capture_fixture) — it's printed +# AFTER the Rich Live display tears down so the operator gets one +# clean chance to copy it. It is never persisted to disk. 
+ +- name: Wait for DM web container ready (tenant launch) + become: false + ansible.builtin.shell: + cmd: | + docker inspect --format='{{ '{{' }}.State.Running{{ '}}' }}' ibl_dm_pro_web 2>/dev/null | grep -q true && \ + docker exec ibl_dm_pro_web echo "DM ready" + executable: /bin/bash + register: dm_ready_tenant + until: dm_ready_tenant.rc == 0 + retries: 30 + delay: 10 + changed_when: false + when: (platform_name | default('main')) != 'main' + +# Write PLATFORM_NAME (uppercase) at the root of /ibl/config.yml so the +# iblai-cli-ops templates that read it (LMS display name, etc.) pick up +# the tenant identity. Matches the on-disk shape seen on the canonical +# multi-tenant deployments (`PLATFORM_NAME: ACME`). `ibl config save --set` is +# naturally idempotent — re-running with the same value is a no-op. +# Skipped when `platform_name == 'main'` per the same gate as the +# launcher tasks below. +- name: Set PLATFORM_NAME (uppercase) in /ibl/config.yml + become: false + ansible.builtin.shell: + cmd: | + export PYENV_ROOT="{{ pyenv_root }}" + export PATH="$PYENV_ROOT/bin:$PATH" + eval "$(pyenv init -)"; eval "$(pyenv virtualenv-init -)" + pyenv activate "{{ venv_name }}" + ibl config save --set 'PLATFORM_NAME={{ platform_name | upper }}' + executable: /bin/bash + chdir: "{{ repo_dir }}" + environment: + HOME: "/home/{{ ansible_user }}" + IBL_ROOT: "{{ ibl_root }}/" + when: (platform_name | default('main')) != 'main' + +- name: Check whether tenant platform already exists + become: false + ansible.builtin.shell: + cmd: | + docker exec -i ibl_dm_pro_web python manage.py shell <<'PY' + from core.models import Platform + key = "{{ platform_name }}" + p = Platform.objects.filter(key=key).first() + if p is None: + print("TENANT_PLATFORM_STATUS:ABSENT key=" + key) + else: + print("TENANT_PLATFORM_STATUS:PRESENT key=" + key + " pk=" + str(p.pk)) + PY + executable: /bin/bash + register: tenant_check + changed_when: false + when: (platform_name | default('main')) != 'main' + 
+- name: Launch tenant platform via run_launch_steps + become: false + ansible.builtin.shell: + cmd: | + docker exec -i ibl_dm_pro_web python manage.py shell <<'PY' + import logging + import secrets + import sys + + # Quiet down the transitions state-machine chatter so the captured + # stdout stays readable; the launcher prints one INFO line per + # state transition by default. + logging.getLogger("transitions").setLevel(logging.WARNING) + + from dl_iblai_services_app.services.launchers import run_launch_steps + + KEY = "{{ platform_name }}" + BASE_DOMAIN = "{{ base_domain }}" + + # admin_username must be alphanumeric (skill §4); strip hyphens and + # underscores from the platform key. + admin_username = (KEY + "admin").replace("-", "").replace("_", "").lower() + admin_email = admin_username + "@" + BASE_DOMAIN + display_name = KEY.replace("-", " ").replace("_", " ").title() + password = secrets.token_hex(16) + + launch_data = { + "username": admin_username, + "email": admin_email, + "firstname": display_name, + "lastname": "Admin", + "password": password, + "role": "org-instructor", + "org": KEY, + "key": KEY, + "name": display_name, + "lms_url": "https://learn." + KEY + "." + BASE_DOMAIN, + "cms_url": "https://studio.learn." + KEY + "." + BASE_DOMAIN, + "portal_url": KEY + "." + BASE_DOMAIN, + } + resp = run_launch_steps(launch_data) + if not resp.get("success"): + print("TENANT_LAUNCH:FAILED message=" + str(resp.get("message"))) + print("TRACEBACK: " + str(resp.get("traceback"))) + sys.exit(1) + + # Marker block parsed by the follow-up debug task. Keep on its own + # lines for readability — the debug task wraps these in + # IBLAI_FIXTURE_OUTPUT_BEGIN/END so the runner replays them after + # the Live display tears down. 
+ print("====TENANT_ADMIN_CREDS_BEGIN====") + print("Platform key: " + KEY) + print("Platform name: " + display_name) + print("Admin username: " + admin_username) + print("Admin email: " + admin_email) + print("Admin password: " + password) + print("Launch ID: " + str(resp.get("id"))) + print("LMS URL: " + launch_data["lms_url"]) + print("CMS URL: " + launch_data["cms_url"]) + print("Portal URL: " + launch_data["portal_url"]) + print("====TENANT_ADMIN_CREDS_END====") + print("TENANT_LAUNCH:OK key=" + KEY) + PY + executable: /bin/bash + register: tenant_launch + changed_when: "'TENANT_LAUNCH:OK' in (tenant_launch.stdout | default(''))" + no_log: true # keep the generated password out of -vvv ansible logs + when: + - (platform_name | default('main')) != 'main' + - "'TENANT_PLATFORM_STATUS:ABSENT' in (tenant_check.stdout | default(''))" + +# Reprint the generated admin password through the fixture-output +# pipeline so the operator sees it AFTER the Rich Live display tears +# down. Ansible's default callback JSON-encodes multi-line debug msgs +# onto a single stdout line, which is exactly what +# `_maybe_capture_fixture` in runner.py expects (both BEGIN/END markers +# on the same line). +- name: Show tenant admin credentials (one-time, save this output) + ansible.builtin.debug: + msg: | + IBLAI_FIXTURE_OUTPUT_BEGIN + TENANT PLATFORM LAUNCHED — SAVE THIS OUTPUT + The admin password below is generated fresh on every launch and + is NEVER persisted to disk. Capture it now. + + {{ tenant_launch.stdout_lines | default([]) | join('\n') }} + IBLAI_FIXTURE_OUTPUT_END + when: + - (platform_name | default('main')) != 'main' + - tenant_launch is defined + - tenant_launch.changed | default(false) + +# Defense-in-depth: ensure show_paywall=False AND is_advertising=False on +# the tenant Platform row, regardless of how the launcher / model defaults +# behave today. Matches the canonical multi-tenant shape (Platform.show_paywall=False, +# is_advertising=False). 
Runs whether the launcher ran fresh OR was skipped +# because the platform already exists — so re-runs reconcile any flag drift. +# `.update()` is idempotent: writes only if the resolved value differs, and +# matches zero rows if the platform doesn't exist (no-op). +- name: Ensure tenant paywall + advertising disabled + become: false + ansible.builtin.shell: + cmd: | + docker exec -i ibl_dm_pro_web python manage.py shell <<'PY' + from core.models import Platform + key = "{{ platform_name }}" + changed = Platform.objects.filter(key=key).exclude( + show_paywall=False, is_advertising=False, + ).update(show_paywall=False, is_advertising=False) + if changed: + print(f"PAYWALL_FLAGS:UPDATED key={key} rows={changed}") + else: + print(f"PAYWALL_FLAGS:UNCHANGED key={key}") + PY + executable: /bin/bash + register: paywall_enforce + changed_when: "'PAYWALL_FLAGS:UPDATED' in (paywall_enforce.stdout | default(''))" + when: (platform_name | default('main')) != 'main' + +- name: Confirm tenant platform skipped (already present) + ansible.builtin.debug: + msg: >- + Tenant platform '{{ platform_name }}' already exists — skipping launcher + (run_launch_steps does not upsert). Manage the existing Platform / admin + via the DM Django shell if you need to repair flags. + when: + - (platform_name | default('main')) != 'main' + - "'TENANT_PLATFORM_STATUS:PRESENT' in (tenant_check.stdout | default(''))" + +- name: Confirm tenant platform skipped (platform_name == 'main') + ansible.builtin.debug: + msg: >- + platform_name='main' — skipping tenant launch (the IBL default tenant is + maintained by `ibl launch`, not by this role). 
+ when: (platform_name | default('main')) == 'main' diff --git a/src/iblai_infra/ansible/templates/single-server/roles/microsoft_sso_config/tasks/main.yml b/src/iblai_infra/ansible/templates/single-server/roles/microsoft_sso_config/tasks/main.yml index b9642cf..68a485e 100644 --- a/src/iblai_infra/ansible/templates/single-server/roles/microsoft_sso_config/tasks/main.yml +++ b/src/iblai_infra/ansible/templates/single-server/roles/microsoft_sso_config/tasks/main.yml @@ -257,3 +257,135 @@ ansible.builtin.debug: msg: "Microsoft SSO configured for learn.{{ base_domain }} (platform={{ platform_name | default('main') }}, organization={{ microsoft_sso_organization or '(none)' }})" when: microsoft_sso_enabled | default(false) | bool + +# --------------------------------------------------------------------------- +# IBL_SPA.AUTH patches consumed by the Auth + Mentor SPAs at boot. +# +# - EXTERNAL_IDP_LOGOUT_URL — hit by the Auth SPA on sign-out to redirect +# the user through Microsoft's logout endpoint, then back to our own +# auth host (so the Azure session is killed alongside ours). +# - IBL_DIRECT_SSO_URL — the "Sign in with Microsoft" deep link the +# SPA renders. Points at the LMS python-social-auth login endpoint for +# this platform's backend (`<platform_name>-oauth2`), with an inner +# `?next=...` that completes the OAuth dance back into the SPA's +# /login/complete callback. The inner client_id MUST be the same +# EDX_SSO_CLIENT_ID the ibl_spa role minted for the `spa-sso` +# Application — we read it from config.yml rather than re-deriving it. +# +# Tenant ID: uses `microsoft_sso_tenant_id` when set; falls back to "common" +# (multi-tenant) when blank. Same pattern as the existing logout_url in the +# OAuth2ProviderConfig.other_settings JSON above. +# +# Idempotent: only marks `changed` (and triggers the downstream +# `ibl config save` + SPA restart) when the resolved values differ from +# what's already in config.yml. 
+# --------------------------------------------------------------------------- + +- name: Patch IBL_SPA.AUTH with Microsoft SSO logout + direct-SSO URLs + become: false + ansible.builtin.shell: + cmd: | + python3 <<'PY' + import sys + import yaml + from urllib.parse import quote + + PATH = "{{ ibl_root }}/config.yml" + BASE_DOMAIN = "{{ base_domain }}" + PLATFORM_NAME = "{{ platform_name | default('main') }}" + TENANT_ID = "{{ microsoft_sso_tenant_id }}".strip() or "common" + BACKEND_NAME = PLATFORM_NAME + "-oauth2" + + with open(PATH) as f: + cfg = yaml.safe_load(f) or {} + + spa = cfg.setdefault("IBL_SPA", {}) + auth = spa.setdefault("AUTH", {}) + + spa_sso_client_id = (spa.get("EDX_SSO_CLIENT_ID") or "").strip() + if not spa_sso_client_id: + print("ERROR: IBL_SPA.EDX_SSO_CLIENT_ID is empty — ibl_spa role must run first") + sys.exit(1) + + external_logout = ( + "https://login.microsoftonline.com/" + TENANT_ID + + "/oauth2/v2.0/logout?post_logout_redirect_uri=" + + "https://auth." + BASE_DOMAIN + ) + + # Inner /oauth2/authorize URL the LMS completes into after Microsoft + # auth. URL-encoded verbatim into the outer `next=` query param so + # python-social-auth treats the whole thing as one opaque target. + inner_next = ( + "/oauth2/authorize?response_type=code" + "&client_id=" + spa_sso_client_id + + "&scope=profile email" + "&redirect_uri=https://auth." + BASE_DOMAIN + "/login/complete" + ) + direct_sso = ( + "https://learn." 
+ BASE_DOMAIN + "/auth/login/" + BACKEND_NAME + + "/?auth_entry=login&next=" + quote(inner_next, safe="") + ) + + desired = { + "EXTERNAL_IDP_LOGOUT_URL": external_logout, + "IBL_DIRECT_SSO_URL": direct_sso, + } + current = {k: auth.get(k) for k in desired} + if current == desired: + print("SPA_AUTH_UNCHANGED") + else: + auth.update(desired) + with open(PATH, "w") as f: + yaml.safe_dump(cfg, f, default_flow_style=False, sort_keys=False) + print("SPA_AUTH_PATCHED") + PY + executable: /bin/bash + register: spa_auth_patch + changed_when: "'SPA_AUTH_PATCHED' in (spa_auth_patch.stdout | default(''))" + when: microsoft_sso_enabled | default(false) | bool + +- name: Save platform config (regenerate SPA env files) + become: false + ansible.builtin.shell: + cmd: | + export PYENV_ROOT="{{ pyenv_root }}" + export PATH="$PYENV_ROOT/bin:$PATH" + eval "$(pyenv init -)"; eval "$(pyenv virtualenv-init -)" + pyenv activate "{{ venv_name }}" + ibl config save + executable: /bin/bash + chdir: "{{ repo_dir }}" + environment: + HOME: "/home/{{ ansible_user }}" + IBL_ROOT: "{{ ibl_root }}/" + when: + - microsoft_sso_enabled | default(false) | bool + - spa_auth_patch.changed | default(false) + +# `docker compose down && up -d` (not `restart`) because compose only +# re-reads the env file at container creation time. The Auth SPA owns the +# IBL_DIRECT_SSO_URL / EXTERNAL_IDP_LOGOUT_URL renders directly; the Mentor +# SPA reuses the same AUTH block for its login deep link. 
+- name: Restart Auth + Mentor SPAs to pick up new IBL_SPA.AUTH values + become: false + ansible.builtin.shell: + cmd: | + set -e + cd {{ ibl_root }}/app/ibl-spa/auth/ && docker compose down && docker compose up -d + cd {{ ibl_root }}/app/ibl-spa/mentor/ && docker compose down && docker compose up -d + executable: /bin/bash + environment: + HOME: "/home/{{ ansible_user }}" + when: + - microsoft_sso_enabled | default(false) | bool + - spa_auth_patch.changed | default(false) + +- name: Confirm IBL_SPA.AUTH patched for Microsoft SSO + ansible.builtin.debug: + msg: >- + IBL_SPA.AUTH updated: EXTERNAL_IDP_LOGOUT_URL → login.microsoftonline.com/{{ microsoft_sso_tenant_id | default('common', true) }}/..., + IBL_DIRECT_SSO_URL → learn.{{ base_domain }}/auth/login/{{ platform_name | default('main') }}-oauth2/... + when: + - microsoft_sso_enabled | default(false) | bool + - spa_auth_patch.changed | default(false) diff --git a/src/iblai_infra/cli.py b/src/iblai_infra/cli.py index 171bcfb..39d6026 100644 --- a/src/iblai_infra/cli.py +++ b/src/iblai_infra/cli.py @@ -519,7 +519,7 @@ def launch( github_org: str = typer.Option("iblai", "--github-org", help="GitHub org owning the private CLI ops + prod images repos"), cli_ops_repo: str = typer.Option("iblai-cli-ops", "--cli-ops-repo", help="CLI ops repo, or 'repo/subdir' to install from a subdirectory of a monorepo"), prod_images_repo: str = typer.Option("iblai-prod-images", "--prod-images-repo", help="Prod images repo, or 'repo/subdir' to install from a subdirectory of a monorepo"), - admin_username: str = typer.Option("ibl_admin", "--admin-username", help="Admin username"), + admin_username: str = typer.Option("platform_admin", "--admin-username", help="Admin username (cannot be a reserved name like 'ibl_admin')"), openai_key: str = typer.Option("", "--openai-key", help="OpenAI API key (optional)"), enable_ai: bool = typer.Option(True, "--enable-ai/--no-ai", help="Enable AI features"), create_playwright_platforms: bool = 
typer.Option( @@ -547,8 +547,11 @@ def launch( google_sso_client_id: str = typer.Option("", "--google-sso-client-id", help="Google OAuth Client ID. Setting this enables the Google SSO ansible role."), google_sso_client_secret: str = typer.Option("", "--google-sso-client-secret", help="Google OAuth Client Secret"), google_sso_organization: str = typer.Option("", "--google-sso-organization", help="Organization short name to attach to the OAuth2ProviderConfig (optional)"), - # Platform name — drives SSO backend_name + platform_key. Always populated; defaults to "main" - platform_name: str = typer.Option("main", "--platform-name", help="Platform identifier (lowercase). Used to derive SSO backend_name (-oauth2) and other_settings.platform_key. Default 'main'."), + # Platform name — drives SSO backend_name + platform_key AND the + # ibl_tenant_platform role. Unset (or empty) resolves to 'main' (system + # default tenant — no tenant launch). 'main' is reserved as an explicit + # input so operators pick a real tenant key or leave it alone. + platform_name: str | None = typer.Option(None, "--platform-name", help="Tenant platform key (lowercase). Leave unset for 'main' (system default, no tenant launch). 'main' is reserved as an explicit value."), # Microsoft SSO — `--microsoft-sso-client-id` is the trigger; if empty, the role no-ops microsoft_sso_client_id: str = typer.Option("", "--microsoft-sso-client-id", help="Microsoft Azure AD Application (Client) ID. 
Setting this enables the Microsoft SSO ansible role."), microsoft_sso_client_secret: str = typer.Option("", "--microsoft-sso-client-secret", help="Microsoft Azure AD Client Secret value"), @@ -591,6 +594,46 @@ def launch( if not admin_email or not admin_password: ui.error("--admin-email and --admin-password are required for single/multi-server deployments.") raise typer.Exit(1) + from iblai_infra.models import ( + RESERVED_ADMIN_USERNAMES, + RESERVED_PLATFORM_NAMES, + is_reserved_admin_username, + is_reserved_platform_name, + ) + if is_reserved_admin_username(admin_username): + reserved = ", ".join(sorted(RESERVED_ADMIN_USERNAMES)) + ui.error( + f"--admin-username {admin_username!r} is reserved for system use." + ) + ui.muted(f"Reserved: {reserved}. Pick a different name (e.g. 'platform_admin').") + raise typer.Exit(1) + # --platform-name 'main' is rejected explicitly. Unset resolves to + # 'main' silently (system default tenant, no tenant launch). + if platform_name is not None and is_reserved_platform_name(platform_name): + reserved = ", ".join(sorted(RESERVED_PLATFORM_NAMES)) + ui.error( + f"--platform-name {platform_name!r} is reserved for the system default tenant." + ) + ui.muted( + f"Reserved: {reserved}. Omit --platform-name for the default, " + "or pick a tenant key like 'acme'." + ) + raise typer.Exit(1) + # Resolve None / blank / whitespace-only → 'main' (strip BEFORE the fallback) so SetupConfig + ansible never see an empty value. + platform_name = (platform_name or "").strip().lower() or "main" + + # Heads-up if the operator picked a 32 GB box AND wants AI on. Not blocking — + # they can still proceed. Skipped for call-server (LiveKit has different + # sizing constraints) and for instance types we don't know the RAM of. 
+ if deployment_type != "call-server" and enable_ai: + from iblai_infra.models import instance_ram_gb + ram = instance_ram_gb(instance_type) + if ram is not None and ram <= 32: + ui.warning( + f"--instance-type [highlight]{instance_type}[/highlight] has {ram} GB RAM " + f"and AI features are enabled." + ) + ui.muted(" 64 GB (e.g. m5.4xlarge or r5.2xlarge) is strongly recommended for AI workloads.") _run_launch( ami_id=ami_id, domain=domain, hosted_zone_id=hosted_zone_id, @@ -706,7 +749,18 @@ def launch_env( volume_size = int(env.get("VOLUME_SIZE", "200")) environment = env.get("ENVIRONMENT", "staging") cli_tag = env.get("CLI_TAG", "3.19.0") - admin_username = env.get("ADMIN_USERNAME", "ibl_admin") + admin_username = env.get("ADMIN_USERNAME", "platform_admin").strip() + from iblai_infra.models import ( + RESERVED_ADMIN_USERNAMES, + is_reserved_admin_username, + ) + if is_reserved_admin_username(admin_username): + reserved = ", ".join(sorted(RESERVED_ADMIN_USERNAMES)) + ui.error( + f"ADMIN_USERNAME={admin_username!r} is reserved for system use." + ) + ui.muted(f"Reserved: {reserved}. Pick a different name (e.g. 'platform_admin').") + raise typer.Exit(1) openai_key = env.get("OPENAI_API_KEY", "") enable_ai = env.get("ENABLE_AI", "true").lower() in ("true", "1", "yes") create_playwright_platforms = env.get("CREATE_PLAYWRIGHT_PLATFORMS", "false").lower() in ("true", "1", "yes") @@ -730,7 +784,29 @@ def launch_env( google_sso_client_id = env.get("GOOGLE_SSO_CLIENT_ID", "") google_sso_client_secret = env.get("GOOGLE_SSO_CLIENT_SECRET", "") google_sso_organization = env.get("GOOGLE_SSO_ORGANIZATION", "") - platform_name = env.get("PLATFORM_NAME", "main") + # PLATFORM_NAME: blank/absent → 'main' (system default tenant, no tenant + # launch). Explicit 'main' is rejected — operator must either leave it + # alone or pick a real tenant key. 
+ raw_platform_name = env.get("PLATFORM_NAME") + if raw_platform_name is not None and raw_platform_name.strip(): + from iblai_infra.models import ( + RESERVED_PLATFORM_NAMES, + is_reserved_platform_name, + ) + candidate = raw_platform_name.strip().lower() + if is_reserved_platform_name(candidate): + reserved = ", ".join(sorted(RESERVED_PLATFORM_NAMES)) + ui.error( + f"PLATFORM_NAME={candidate!r} is reserved for the system default tenant." + ) + ui.muted( + f"Reserved: {reserved}. Remove the line (or leave it unset) to " + "use the default, or pick a tenant key like 'acme'." + ) + raise typer.Exit(1) + platform_name = candidate + else: + platform_name = "main" microsoft_sso_client_id = env.get("MICROSOFT_SSO_CLIENT_ID", "") microsoft_sso_client_secret = env.get("MICROSOFT_SSO_CLIENT_SECRET", "") microsoft_sso_tenant_id = env.get("MICROSOFT_SSO_TENANT_ID", "") @@ -757,6 +833,16 @@ def launch_env( ] ui.summary_panel("Launch Configuration", rows) + # Same memory heads-up as the `launch` flag-driven flow. + if enable_ai: + from iblai_infra.models import instance_ram_gb + ram = instance_ram_gb(instance_type) + if ram is not None and ram <= 32: + ui.warning( + f"INSTANCE_TYPE={instance_type!r} has {ram} GB RAM and AI features are enabled." + ) + ui.muted(" 64 GB (e.g. m5.4xlarge or r5.2xlarge) is strongly recommended for AI workloads.") + confirm = questionary.confirm( "Proceed with launch?", default=True, @@ -857,6 +943,23 @@ def provision_env( rows.append(("AWS profile", config.credentials.profile)) ui.summary_panel("Provision Configuration", rows) + # Same memory heads-up the interactive `provision` wizard and the launch + # flows surface. provision-env doesn't know whether AI will be enabled + # downstream (that's a setup-step decision), so we warn unconditionally + # on 32 GB boxes — the operator can ignore if they're sure AI stays off. 
+ from iblai_infra.models import instance_ram_gb + ram = instance_ram_gb(config.compute.instance_type) + if ram is not None and ram <= 32: + ui.warning( + f"INSTANCE_TYPE={config.compute.instance_type!r} has {ram} GB RAM." + ) + ui.muted( + " If you plan to enable AI features during setup (the default for IBL deployments)," + ) + ui.muted( + " 64 GB (e.g. m5.4xlarge or r5.2xlarge) is strongly recommended." + ) + ui.newline() ui.console.print(" [brand]Provisioning infrastructure...[/brand]") diff --git a/src/iblai_infra/env_provision.py b/src/iblai_infra/env_provision.py index b75ba80..dd69351 100644 --- a/src/iblai_infra/env_provision.py +++ b/src/iblai_infra/env_provision.py @@ -211,7 +211,7 @@ def _build_network(env: dict[str, str]) -> NetworkConfig: def _build_compute(env: dict[str, str]) -> ComputeConfig: instance_type = (env.get("INSTANCE_TYPE") or "t3.2xlarge").strip() volume_type = (env.get("VOLUME_TYPE") or "gp3").strip() - volume_raw = (env.get("VOLUME_SIZE") or "50").strip() + volume_raw = (env.get("VOLUME_SIZE") or "100").strip() try: volume_size = int(volume_raw) except ValueError: diff --git a/src/iblai_infra/env_setup.py b/src/iblai_infra/env_setup.py index 5cdae66..6c885cf 100644 --- a/src/iblai_infra/env_setup.py +++ b/src/iblai_infra/env_setup.py @@ -35,9 +35,13 @@ InfraConfig, NetworkConfig, ProjectState, + RESERVED_ADMIN_USERNAMES, + RESERVED_PLATFORM_NAMES, SetupConfig, SSHConfig, SSHKeyMethod, + is_reserved_admin_username, + is_reserved_platform_name, ) from iblai_infra.prompts.setup import validate_key_permissions from iblai_infra.terraform.state import WORKSPACE_ROOT, load_state, save_state @@ -200,6 +204,33 @@ def build_setup_config_from_env( admin_password = env["ADMIN_PASSWORD"] if len(admin_password) < 8: raise _fail("ADMIN_PASSWORD must be at least 8 characters.") + admin_username = env["ADMIN_USERNAME"].strip() + if is_reserved_admin_username(admin_username): + reserved = ", ".join(sorted(RESERVED_ADMIN_USERNAMES)) + raise _fail( + 
f"ADMIN_USERNAME={admin_username!r} is reserved for system use.", + hint=f"Reserved usernames: {reserved}. Pick a different one (e.g. 'platform_admin').", + ) + + # PLATFORM_NAME: blank/absent → resolves to 'main' (system default + # tenant, no tenant launch). Explicitly setting it to 'main' is + # rejected — operators shouldn't pick the reserved name; they should + # either leave it unset or pick a real tenant key. + raw_platform_name = env.get("PLATFORM_NAME") + if raw_platform_name is not None and raw_platform_name.strip(): + candidate = raw_platform_name.strip().lower() + if is_reserved_platform_name(candidate): + reserved = ", ".join(sorted(RESERVED_PLATFORM_NAMES)) + raise _fail( + f"PLATFORM_NAME={candidate!r} is reserved for the system default tenant.", + hint=( + f"Reserved: {reserved}. Leave PLATFORM_NAME unset (or remove the line) " + f"to use the default, or pick a tenant key like 'acme'." + ), + ) + platform_name = candidate + else: + platform_name = "main" # Resolve "where to deploy" fields, allowing env to override state. 
target_host = (env.get("TARGET_HOST") or "").strip() @@ -273,7 +304,7 @@ def build_setup_config_from_env( cli_ops_repo=(env.get("CLI_OPS_REPO") or "iblai-cli-ops").strip(), prod_images_repo=(env.get("PROD_IMAGES_REPO") or "iblai-prod-images").strip(), openai_api_key=(env.get("OPENAI_API_KEY") or "").strip(), - admin_username=env["ADMIN_USERNAME"].strip(), + admin_username=admin_username, admin_email=admin_email, admin_password=admin_password, # SMTP @@ -299,7 +330,7 @@ def build_setup_config_from_env( env.get("STRIPE_CONNECT_WEBHOOK_SECRET") or "" ).strip(), # Platform name + SSO - platform_name=(env.get("PLATFORM_NAME") or "main").strip().lower(), + platform_name=platform_name, google_sso_enabled=google_sso_enabled, google_sso_client_id=google_sso_client_id, google_sso_client_secret=(env.get("GOOGLE_SSO_CLIENT_SECRET") or "").strip(), diff --git a/src/iblai_infra/models.py b/src/iblai_infra/models.py index c31ec28..61aa142 100644 --- a/src/iblai_infra/models.py +++ b/src/iblai_infra/models.py @@ -9,7 +9,7 @@ from pathlib import Path from typing import Literal -from pydantic import BaseModel, Field, field_validator +from pydantic import BaseModel, Field, field_validator, model_validator # --------------------------------------------------------------------------- @@ -21,7 +21,7 @@ def parse_repo_path(value: str) -> tuple[str, str | None]: Used by the ansible runner to point installs at a package inside a monorepo. `iblai-cli-ops` -> ('iblai-cli-ops', None); - `kaplan-iblai-infra-ops/iblai-cli-ops` -> ('kaplan-iblai-infra-ops', + `-iblai-infra-ops/iblai-cli-ops` -> ('-iblai-infra-ops', 'iblai-cli-ops'). """ cleaned = (value or "").strip().strip("/") @@ -99,6 +99,22 @@ class DeploymentType(str, Enum): "r5.2xlarge": "8 vCPU, 64 GB RAM — Memory optimized", } +# RAM (in GB) for the instance types we publish in the picker. Used by the +# prompt + launch flows to warn operators when they pick a 32 GB box — +# AI-enabled platforms benefit substantially from 64 GB. 
+INSTANCE_RAM_GB: dict[str, int] = { + "t3.xlarge": 16, + "t3.2xlarge": 32, + "m5.2xlarge": 32, + "m5.4xlarge": 64, + "r5.2xlarge": 64, +} + + +def instance_ram_gb(instance_type: str) -> int | None: + """Return RAM in GB for a known instance type, or None for unknown/custom.""" + return INSTANCE_RAM_GB.get((instance_type or "").strip()) + # LiveKit (call-server) sizing recommendations. Per LiveKit's self-hosting # guide, SFU-only workloads fit on 2 vCPU boxes; egress/recording benefits # from CPU-optimized (c5) families. @@ -164,10 +180,17 @@ def validate_ip(cls, v: str) -> str: class ComputeConfig(BaseModel): instance_type: str = "t3.2xlarge" - volume_size: int = 50 + volume_size: int = 100 volume_type: str = "gp3" ami_id: str | None = None + # Floor of 20 GB here is the lower bound for *any* compute config — + # call-server reuses ComputeConfig as the synced-from-CallServerConfig + # placeholder and runs LiveKit on a small disk (~40 GB). The + # IBL-platform-minimum 100 GB floor is enforced on `InfraConfig` (and + # at the prompt / CLI / .env input layers) only for `DeploymentType.SINGLE`, + # since multi-server uses `MultiServerConfig.{app_server,services}_volume_size` + # and multi's own validator handles that case. @field_validator("volume_size") @classmethod def validate_volume(cls, v: int) -> int: @@ -236,8 +259,8 @@ def validate_app_server_count(cls, v: int) -> int: @field_validator("app_server_volume_size", "services_volume_size") @classmethod def validate_volume_sizes(cls, v: int) -> int: - if v < 20: - raise ValueError("Volume size must be at least 20 GB") + if v < 100: + raise ValueError("Volume size must be at least 100 GB") return v @@ -290,6 +313,19 @@ def validate_project_name(cls, v: str) -> str: raise ValueError("Project name must be 32 characters or fewer") return v + # Enforce the 100 GB platform-disk floor on SINGLE deployments. 
MULTI is + # already covered by `MultiServerConfig.validate_volume_sizes`; CALL uses + # `CallServerConfig.volume_size` (LiveKit only needs ~40 GB) and reuses + # ComputeConfig as a placeholder, so we don't enforce a 100 GB floor on it. + @model_validator(mode="after") + def _validate_single_server_volume_size(self) -> "InfraConfig": + if self.deployment_type == DeploymentType.SINGLE and self.compute.volume_size < 100: + raise ValueError( + "Single-server volume size must be at least 100 GB " + f"(got {self.compute.volume_size})" + ) + return self + @property def resource_prefix(self) -> str: return f"{self.project_name}-{self.environment.value}" @@ -316,6 +352,34 @@ class ProjectState(BaseModel): # Setup config — contract between setup prompts and AnsibleRunner # --------------------------------------------------------------------------- +# Usernames reserved for system / platform-internal use. The ibl_spa role +# looks up `ibl_admin` to own the `spa-sso` and `ibl_web` OAuth2 Application +# records on the LMS — that user is created by the platform's own bootstrap +# (`ibl edx` / `ibl dm` launch flows) before ibl_spa runs. Operators must +# pick a different name for their human superuser so the system account +# stays separate. +RESERVED_ADMIN_USERNAMES: frozenset[str] = frozenset({"ibl_admin"}) + + +def is_reserved_admin_username(value: str) -> bool: + """Return True if `value` collides with a reserved system username.""" + return (value or "").strip().lower() in RESERVED_ADMIN_USERNAMES + + +# Platform identifiers reserved for system / platform-internal use. `main` +# is the IBL default tenant the platform itself creates and maintains via +# `ibl launch`. Operators can't pick `main` as a tenant name — instead they +# leave the field blank/unset, which silently resolves to `main` for SSO +# backwards-compat (backend_name=`main-oauth2`) and skips the tenant +# launcher (see `ibl_tenant_platform` ansible role). 
+RESERVED_PLATFORM_NAMES: frozenset[str] = frozenset({"main"}) + + +def is_reserved_platform_name(value: str) -> bool: + """Return True if `value` collides with a reserved system platform name.""" + return (value or "").strip().lower() in RESERVED_PLATFORM_NAMES + + class SetupConfig(BaseModel): """Variables needed to bootstrap a provisioned VM. Never persisted to disk.""" ssh_private_key_path: Path @@ -339,13 +403,13 @@ class SetupConfig(BaseModel): # own repos. Defaults reflect the canonical IBL deployment. github_org: str = "iblai" # Each repo field accepts either a bare repo name (`iblai-cli-ops`) or a - # `repo/subdir` path (`kaplan-iblai-infra-ops/iblai-cli-ops`) to point at + # `repo/subdir` path (`-iblai-infra-ops/iblai-cli-ops`) to point at # a package inside a monorepo. Parsed by `parse_repo_path()` before the # install URL is built. cli_ops_repo: str = "iblai-cli-ops" prod_images_repo: str = "iblai-prod-images" openai_api_key: str = "" - admin_username: str = "ibl_admin" + admin_username: str = "platform_admin" admin_email: str = "" admin_password: str = "" # SMTP for outbound email (magic-link tests etc.). 
Disabled by default; @@ -402,6 +466,18 @@ class SetupConfig(BaseModel): microsoft_sso_tenant_id: str = "" microsoft_sso_organization: str = "" + @field_validator("admin_username") + @classmethod + def _validate_admin_username(cls, v: str) -> str: + s = (v or "").strip() + if not s: + raise ValueError("admin_username must not be empty") + if s.lower() in RESERVED_ADMIN_USERNAMES: + raise ValueError( + f"'{s}' is reserved for system use; pick a different admin username" + ) + return s + # --------------------------------------------------------------------------- # Ingress — pre-provisioned domain endpoints diff --git a/src/iblai_infra/prompts/infrastructure.py b/src/iblai_infra/prompts/infrastructure.py index f5bffe3..d593600 100644 --- a/src/iblai_infra/prompts/infrastructure.py +++ b/src/iblai_infra/prompts/infrastructure.py @@ -23,7 +23,31 @@ SSHConfig, SSHKeyMethod, generate_password, + instance_ram_gb, ) + + +def _warn_if_low_memory(instance_type: str, *, context: str = "") -> None: + """Warn the operator when they pick a 32 GB (or smaller) instance. + + AI workloads on the platform (mentor LLMs, embedding generation, retrieval) + can easily exhaust 32 GB once concurrent users + edX + DM are all in play. + We surface a non-blocking heads-up so the operator can revise their pick + before terraform provisions the box. Unknown / custom instance types are + skipped — we can't reason about their memory. + """ + ram = instance_ram_gb(instance_type) + if ram is None or ram > 32: + return + label = f"[highlight]{instance_type}[/highlight] ({ram} GB RAM)" + where = f" for {context}" if context else "" + ui.warning(f"Selected{where}: {label}.") + ui.muted( + " If you plan to enable AI features (the default for IBL deployments)," + ) + ui.muted( + " 64 GB (e.g. [brand]m5.4xlarge[/brand] or [brand]r5.2xlarge[/brand]) is strongly recommended." 
+ ) from iblai_infra.providers.aws import ( detect_current_ip, get_session, @@ -156,11 +180,13 @@ def prompt_project_and_compute() -> ( if instance_type is None: ui.abort() + _warn_if_low_memory(instance_type) + # ----- single-server: volume ----- volume_size = questionary.text( "Root volume size in GB:", - default="50", - validate=lambda v: (v.isdigit() and int(v) >= 20) or "Must be a number >= 20", + default="100", + validate=lambda v: (v.isdigit() and int(v) >= 100) or "Must be a number >= 100", style=ui.PROMPT_STYLE, qmark=ui.QMARK, ).ask() @@ -231,10 +257,12 @@ def _prompt_multi_server_config() -> MultiServerConfig: if app_instance_type is None: ui.abort() + _warn_if_low_memory(app_instance_type, context="app server") + app_volume = questionary.text( "App server volume size (GB):", default="250", - validate=lambda v: (v.isdigit() and int(v) >= 20) or "Must be >= 20", + validate=lambda v: (v.isdigit() and int(v) >= 100) or "Must be >= 100", style=ui.PROMPT_STYLE, qmark=ui.QMARK, ).ask() @@ -270,10 +298,12 @@ def _prompt_multi_server_config() -> MultiServerConfig: if svc_instance_type is None: ui.abort() + _warn_if_low_memory(svc_instance_type, context="services server") + svc_volume = questionary.text( "Services server volume size (GB):", default="500", - validate=lambda v: (v.isdigit() and int(v) >= 20) or "Must be >= 20", + validate=lambda v: (v.isdigit() and int(v) >= 100) or "Must be >= 100", style=ui.PROMPT_STYLE, qmark=ui.QMARK, ).ask() diff --git a/src/iblai_infra/prompts/setup.py b/src/iblai_infra/prompts/setup.py index 1fc0532..d2afd57 100644 --- a/src/iblai_infra/prompts/setup.py +++ b/src/iblai_infra/prompts/setup.py @@ -8,7 +8,44 @@ import questionary from iblai_infra import ui -from iblai_infra.models import ProjectState, SetupConfig, SSHKeyMethod +from iblai_infra.models import ( + ProjectState, + RESERVED_ADMIN_USERNAMES, + RESERVED_PLATFORM_NAMES, + SetupConfig, + SSHKeyMethod, + is_reserved_admin_username, + is_reserved_platform_name, +) + + 
+def _validate_admin_username(value: str) -> bool | str: + """questionary-compatible validator. Returns True or an error string.""" + s = (value or "").strip() + if not s: + return "Admin username is required" + if is_reserved_admin_username(s): + reserved = ", ".join(sorted(RESERVED_ADMIN_USERNAMES)) + return f"'{s}' is reserved for system use. Reserved: {reserved}" + return True + + +def _validate_tenant_platform_name(value: str) -> bool | str: + """questionary-compatible validator. Blank is accepted (resolves to + `main` implicitly downstream). Explicit `main` is rejected so the + operator can't co-opt the system default tenant. + """ + s = (value or "").strip().lower() + if not s: + return True + if is_reserved_platform_name(s): + reserved = ", ".join(sorted(RESERVED_PLATFORM_NAMES)) + return ( + f"'{s}' is reserved for the system default tenant. " + f"Leave blank to use the default, or pick a different name. " + f"Reserved: {reserved}" + ) + return True SETUP_STEPS = 3 BOOTSTRAP_STEPS = 4 @@ -97,21 +134,26 @@ def _prompt_platform_config( ui.success(f"Domain: [highlight]{base_domain}[/highlight]") - # Platform name — first thing the operator sets in step 2. Drives the - # SSO ansible roles (backend_name = `-oauth2`, - # other_settings.platform_key). Defaults to "main" for canonical IBL - # single-tenant deploys; tenant deployments override. + # Platform name — drives the SSO ansible roles (backend_name = + # `-oauth2`, other_settings.platform_key) AND the + # ibl_tenant_platform role (launches a Platform + admin via + # run_launch_steps when value != 'main'). Blank input resolves to + # 'main' implicitly (the system default tenant the platform itself + # creates). Operators can't pick 'main' explicitly — it's reserved. 
platform_name = questionary.text( - "Platform name (lowercase identifier, default 'main'):", - default="main", - validate=lambda v: bool(v.strip()) or "Platform name is required", + "Tenant platform name (leave blank for default 'main', no tenant launch):", + default="", + validate=_validate_tenant_platform_name, style=ui.PROMPT_STYLE, qmark=ui.QMARK, ).ask() if platform_name is None: ui.abort() - platform_name = platform_name.strip().lower() - ui.success(f"Platform: [highlight]{platform_name}[/highlight]") + platform_name = platform_name.strip().lower() or "main" + if platform_name == "main": + ui.success("Platform: [highlight]main[/highlight] (default, no tenant launch)") + else: + ui.success(f"Tenant platform: [highlight]{platform_name}[/highlight] (will be launched)") edx_version = "sumac" ui.success(f"Open edX version: [highlight]Sumac[/highlight]") @@ -560,7 +602,7 @@ def _prompt_credentials( cli_ops_repo = questionary.text( "CLI ops repo (or repo/subdir for monorepo):", default="iblai-cli-ops", - instruction="(e.g. iblai-cli-ops, or kaplan-iblai-infra-ops/iblai-cli-ops)", + instruction="(e.g. iblai-cli-ops, or -iblai-infra-ops/iblai-cli-ops)", validate=lambda v: len(v.strip()) > 0 or "Required", style=ui.PROMPT_STYLE, qmark=ui.QMARK, @@ -572,7 +614,7 @@ def _prompt_credentials( prod_images_repo = questionary.text( "Prod images repo (or repo/subdir for monorepo):", default="iblai-prod-images", - instruction="(e.g. iblai-prod-images, or kaplan-iblai-infra-ops/kaplan-iblai-prod-images)", + instruction="(e.g. 
iblai-prod-images, or -iblai-infra-ops/-iblai-prod-images)", validate=lambda v: len(v.strip()) > 0 or "Required", style=ui.PROMPT_STYLE, qmark=ui.QMARK, @@ -665,7 +707,8 @@ def _prompt_credentials( admin_username = questionary.text( "Admin username:", - default="ibl_admin", + default="platform_admin", + validate=_validate_admin_username, style=ui.PROMPT_STYLE, qmark=ui.QMARK, ).ask() diff --git a/tests/ansible/test_runner.py b/tests/ansible/test_runner.py index bc6dc71..8ca72be 100644 --- a/tests/ansible/test_runner.py +++ b/tests/ansible/test_runner.py @@ -408,15 +408,15 @@ def test_total_roles_matches_labels(self): assert TOTAL_ROLES == len(ROLE_LABELS) def test_expected_roles(self): - expected = {"docker", "awscli", "python", "ibl_cli_ops", "ibl_platform", "smtp_config", "ibl_dm", "ibl_edx", "ibl_spa", "integrations", "admin_setup", "data_seeding", "stripe_config", "google_sso_config", "microsoft_sso_config"} + expected = {"docker", "awscli", "python", "ibl_cli_ops", "ibl_platform", "smtp_config", "ibl_dm", "ibl_edx", "ibl_spa", "integrations", "admin_setup", "data_seeding", "ibl_tenant_platform", "stripe_config", "google_sso_config", "microsoft_sso_config"} assert set(ROLE_LABELS.keys()) == expected def test_launch_role_labels(self): - expected = {"ibl_cli_ops", "ibl_launch", "smtp_config", "ibl_launch_services", "integrations", "admin_setup", "data_seeding", "stripe_config", "google_sso_config", "microsoft_sso_config"} + expected = {"ibl_cli_ops", "ibl_launch", "smtp_config", "ibl_launch_services", "integrations", "admin_setup", "data_seeding", "ibl_tenant_platform", "stripe_config", "google_sso_config", "microsoft_sso_config"} assert set(LAUNCH_ROLE_LABELS.keys()) == expected def test_launch_role_labels_count(self): - assert len(LAUNCH_ROLE_LABELS) == 10 + assert len(LAUNCH_ROLE_LABELS) == 11 def test_service_update_role_labels(self): expected = {"ibl_cli_ops", "ibl_service_update"} @@ -454,12 +454,19 @@ def test_ssh_permission_denied(self, runner): with 
patch("iblai_infra.ansible.runner.subprocess.run", return_value=mock_result): assert runner._test_ssh() is False + # The five tests below exercise the retry path of `_test_ssh()` + # (10 retries × 15s sleep). They mock `time.sleep` so they finish + # in milliseconds while still asserting the exhaust-retries → False + # behavior end-to-end. def test_ssh_connection_refused(self, runner): mock_result = MagicMock() mock_result.returncode = 255 mock_result.stderr = "Connection refused" - with patch("iblai_infra.ansible.runner.subprocess.run", return_value=mock_result): + with ( + patch("iblai_infra.ansible.runner.subprocess.run", return_value=mock_result), + patch("iblai_infra.ansible.runner.time.sleep"), + ): assert runner._test_ssh() is False def test_ssh_connection_timed_out(self, runner): @@ -467,7 +474,10 @@ def test_ssh_connection_timed_out(self, runner): mock_result.returncode = 255 mock_result.stderr = "Connection timed out" - with patch("iblai_infra.ansible.runner.subprocess.run", return_value=mock_result): + with ( + patch("iblai_infra.ansible.runner.subprocess.run", return_value=mock_result), + patch("iblai_infra.ansible.runner.time.sleep"), + ): assert runner._test_ssh() is False def test_ssh_no_route(self, runner): @@ -475,7 +485,10 @@ def test_ssh_no_route(self, runner): mock_result.returncode = 255 mock_result.stderr = "No route to host" - with patch("iblai_infra.ansible.runner.subprocess.run", return_value=mock_result): + with ( + patch("iblai_infra.ansible.runner.subprocess.run", return_value=mock_result), + patch("iblai_infra.ansible.runner.time.sleep"), + ): assert runner._test_ssh() is False def test_ssh_other_error(self, runner): @@ -483,7 +496,10 @@ def test_ssh_other_error(self, runner): mock_result.returncode = 1 mock_result.stderr = "Some unexpected error" - with patch("iblai_infra.ansible.runner.subprocess.run", return_value=mock_result): + with ( + patch("iblai_infra.ansible.runner.subprocess.run", return_value=mock_result), + 
patch("iblai_infra.ansible.runner.time.sleep"), + ): assert runner._test_ssh() is False def test_ssh_empty_stderr(self, runner): @@ -491,7 +507,10 @@ def test_ssh_empty_stderr(self, runner): mock_result.returncode = 1 mock_result.stderr = "" - with patch("iblai_infra.ansible.runner.subprocess.run", return_value=mock_result): + with ( + patch("iblai_infra.ansible.runner.subprocess.run", return_value=mock_result), + patch("iblai_infra.ansible.runner.time.sleep"), + ): assert runner._test_ssh() is False diff --git a/tests/conftest.py b/tests/conftest.py index e9e0b96..067ed2e 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -47,7 +47,7 @@ def infra_config(aws_credentials: AWSCredentials) -> InfraConfig: environment=Environment.DEV, credentials=aws_credentials, network=NetworkConfig(vpc_cidr="10.0.0.0/16", vpn_ip="203.0.113.42"), - compute=ComputeConfig(instance_type="t3.2xlarge", volume_size=50, volume_type="gp3"), + compute=ComputeConfig(instance_type="t3.2xlarge", volume_size=100, volume_type="gp3"), ssh=SSHConfig( method=SSHKeyMethod.GENERATE, key_name="testproject-dev", diff --git a/tests/prompts/test_review.py b/tests/prompts/test_review.py index 3c5c45b..017d89f 100644 --- a/tests/prompts/test_review.py +++ b/tests/prompts/test_review.py @@ -41,7 +41,7 @@ def _make_config( account_id="123456789012", ), network=NetworkConfig(vpn_ip="1.2.3.4"), - compute=ComputeConfig(instance_type="t3.2xlarge", volume_size=50, volume_type="gp3"), + compute=ComputeConfig(instance_type="t3.2xlarge", volume_size=100, volume_type="gp3"), ssh=SSHConfig( method=ssh_method, key_name="test-key", diff --git a/tests/prompts/test_setup.py b/tests/prompts/test_setup.py index 84a6443..548809d 100644 --- a/tests/prompts/test_setup.py +++ b/tests/prompts/test_setup.py @@ -192,7 +192,7 @@ def test_full_flow_reuse_credentials(self, tmp_path): mock_password.return_value.ask.side_effect = ["ghp_testtoken", "", "Admin1234"] # confirms: enable_ai, create_playwright_platforms, smtp_enabled, 
stripe_enabled, google_sso_enabled, microsoft_sso_enabled, reuse credentials mock_confirm.return_value.ask.side_effect = [True, False, False, False, False, False, True] - mock_text.return_value.ask.side_effect = ["main", "3.19.0", "iblai", "iblai-cli-ops", "iblai-prod-images", "ibl_admin", "admin@example.com"] + mock_text.return_value.ask.side_effect = ["main", "3.19.0", "iblai", "iblai-cli-ops", "iblai-prod-images", "platform_admin", "admin@example.com"] config = prompt_setup(state) @@ -206,7 +206,7 @@ def test_full_flow_reuse_credentials(self, tmp_path): assert config.git_access_token == "ghp_testtoken" assert config.target_host == "54.1.2.3" assert config.base_domain == "example.com" - assert config.admin_username == "ibl_admin" + assert config.admin_username == "platform_admin" assert config.admin_email == "admin@example.com" assert config.admin_password == "Admin1234" @@ -224,7 +224,7 @@ def test_full_flow_new_credentials(self, tmp_path): mock_password.return_value.ask.side_effect = ["ghp_testtoken", "NEW_SECRET", "sk-test-key", "Admin1234"] # confirms: enable_ai, create_playwright_platforms, smtp_enabled, stripe_enabled, google_sso_enabled, microsoft_sso_enabled, don't reuse credentials mock_confirm.return_value.ask.side_effect = [True, False, False, False, False, False, False] - mock_text.return_value.ask.side_effect = ["main", "3.19.0", "iblai", "iblai-cli-ops", "iblai-prod-images", "NEW_ACCESS_KEY", "ibl_admin", "admin@example.com"] + mock_text.return_value.ask.side_effect = ["main", "3.19.0", "iblai", "iblai-cli-ops", "iblai-prod-images", "NEW_ACCESS_KEY", "platform_admin", "admin@example.com"] config = prompt_setup(state) @@ -250,7 +250,7 @@ def test_flow_no_access_keys_prompts_directly(self, tmp_path): mock_password.return_value.ask.side_effect = ["ghp_testtoken", "SECRET", "", "Admin1234"] # confirms: enable_ai, create_playwright_platforms, smtp_enabled, stripe_enabled, google_sso_enabled, microsoft_sso_enabled (no reuse prompt when no access keys) 
mock_confirm.return_value.ask.side_effect = [True, True, False, False, False, False] - mock_text.return_value.ask.side_effect = ["main", "3.19.0", "iblai", "iblai-cli-ops", "iblai-prod-images", "ACCESS_KEY", "ibl_admin", "admin@example.com"] + mock_text.return_value.ask.side_effect = ["main", "3.19.0", "iblai", "iblai-cli-ops", "iblai-prod-images", "ACCESS_KEY", "platform_admin", "admin@example.com"] config = prompt_setup(state) @@ -280,7 +280,7 @@ def test_ssh_key_not_found_prompts(self, tmp_path): # confirms: enable_ai, create_playwright_platforms, smtp_enabled, stripe_enabled, google_sso_enabled, microsoft_sso_enabled, reuse credentials mock_confirm.return_value.ask.side_effect = [True, False, False, False, False, False, True] mock_path.return_value.ask.return_value = str(new_key) - mock_text.return_value.ask.side_effect = ["main", "3.19.0", "iblai", "iblai-cli-ops", "iblai-prod-images", "ibl_admin", "admin@example.com"] + mock_text.return_value.ask.side_effect = ["main", "3.19.0", "iblai", "iblai-cli-ops", "iblai-prod-images", "platform_admin", "admin@example.com"] config = prompt_setup(state) @@ -306,7 +306,7 @@ def test_existing_file_method_prompts_for_key(self, tmp_path): # confirms: enable_ai, create_playwright_platforms, smtp_enabled, stripe_enabled, google_sso_enabled, microsoft_sso_enabled, reuse credentials mock_confirm.return_value.ask.side_effect = [True, False, False, False, False, False, True] mock_path.return_value.ask.return_value = str(key) - mock_text.return_value.ask.side_effect = ["main", "3.19.0", "iblai", "iblai-cli-ops", "iblai-prod-images", "ibl_admin", "admin@example.com"] + mock_text.return_value.ask.side_effect = ["main", "3.19.0", "iblai", "iblai-cli-ops", "iblai-prod-images", "platform_admin", "admin@example.com"] config = prompt_setup(state) @@ -332,7 +332,7 @@ def test_aws_keypair_method_prompts_for_key(self, tmp_path): # confirms: enable_ai, create_playwright_platforms, smtp_enabled, stripe_enabled, google_sso_enabled, 
microsoft_sso_enabled, reuse credentials mock_confirm.return_value.ask.side_effect = [True, False, False, False, False, False, True] mock_path.return_value.ask.return_value = str(key) - mock_text.return_value.ask.side_effect = ["main", "3.19.0", "iblai", "iblai-cli-ops", "iblai-prod-images", "ibl_admin", "admin@example.com"] + mock_text.return_value.ask.side_effect = ["main", "3.19.0", "iblai", "iblai-cli-ops", "iblai-prod-images", "platform_admin", "admin@example.com"] config = prompt_setup(state) @@ -376,7 +376,7 @@ def test_full_flow_smtp_enabled(self, tmp_path): "iblai", "iblai-cli-ops", "iblai-prod-images", - "ibl_admin", + "platform_admin", "admin@example.com", ] @@ -433,7 +433,7 @@ def test_full_flow_stripe_enabled(self, tmp_path): "iblai", "iblai-cli-ops", "iblai-prod-images", - "ibl_admin", + "platform_admin", "admin@example.com", ] @@ -487,7 +487,7 @@ def test_full_flow_google_sso_enabled(self, tmp_path): "iblai", "iblai-cli-ops", "iblai-prod-images", - "ibl_admin", + "platform_admin", "admin@example.com", ] @@ -539,7 +539,7 @@ def test_full_flow_microsoft_sso_enabled(self, tmp_path): "iblai", "iblai-cli-ops", "iblai-prod-images", - "ibl_admin", + "platform_admin", "admin@example.com", ] @@ -575,7 +575,7 @@ def test_platform_name_lowercased_and_stripped(self, tmp_path): "iblai", "iblai-cli-ops", "iblai-prod-images", - "ibl_admin", + "platform_admin", "admin@example.com", ] @@ -637,7 +637,7 @@ def test_full_resetup_flow(self, tmp_path): # Only one confirm: reuse credentials mock_confirm.return_value.ask.return_value = True # text prompts: base_domain, cli_ops_release_tag, admin_username, admin_email - mock_text.return_value.ask.side_effect = ["new.example.com", "3.19.0", "iblai", "iblai-cli-ops", "iblai-prod-images", "ibl_admin", "admin@example.com"] + mock_text.return_value.ask.side_effect = ["new.example.com", "3.19.0", "iblai", "iblai-cli-ops", "iblai-prod-images", "platform_admin", "admin@example.com"] config = prompt_resetup(state) @@ -648,7 +648,7 @@ 
def test_full_resetup_flow(self, tmp_path): assert config.aws_access_key_id == "AKIA" assert config.aws_secret_access_key == "SECRET" assert config.git_access_token == "ghp_testtoken" - assert config.admin_username == "ibl_admin" + assert config.admin_username == "platform_admin" assert config.admin_email == "admin@example.com" assert config.admin_password == "Admin1234" @@ -666,7 +666,7 @@ def test_resetup_prompts_for_base_domain(self, tmp_path): ): mock_password.return_value.ask.side_effect = ["ghp_testtoken", "", "Admin1234"] mock_confirm.return_value.ask.return_value = True - mock_text.return_value.ask.side_effect = ["changed.example.com", "3.19.0", "iblai", "iblai-cli-ops", "iblai-prod-images", "ibl_admin", "admin@example.com"] + mock_text.return_value.ask.side_effect = ["changed.example.com", "3.19.0", "iblai", "iblai-cli-ops", "iblai-prod-images", "platform_admin", "admin@example.com"] config = prompt_resetup(state) @@ -688,7 +688,7 @@ def test_resetup_ssh_key_resolved(self, tmp_path): ): mock_password.return_value.ask.side_effect = ["ghp_testtoken", "", "Admin1234"] mock_confirm.return_value.ask.return_value = True - mock_text.return_value.ask.side_effect = ["new.example.com", "3.19.0", "iblai", "iblai-cli-ops", "iblai-prod-images", "ibl_admin", "admin@example.com"] + mock_text.return_value.ask.side_effect = ["new.example.com", "3.19.0", "iblai", "iblai-cli-ops", "iblai-prod-images", "platform_admin", "admin@example.com"] config = prompt_resetup(state) @@ -710,7 +710,7 @@ def test_resetup_new_credentials(self, tmp_path): # Decline reusing credentials mock_confirm.return_value.ask.return_value = False # Region is pre-populated from state, so not prompted - mock_text.return_value.ask.side_effect = ["new.example.com", "3.19.0", "iblai", "iblai-cli-ops", "iblai-prod-images", "NEW_KEY", "ibl_admin", "admin@example.com"] + mock_text.return_value.ask.side_effect = ["new.example.com", "3.19.0", "iblai", "iblai-cli-ops", "iblai-prod-images", "NEW_KEY", 
"platform_admin", "admin@example.com"] config = prompt_resetup(state) @@ -741,7 +741,7 @@ def test_resetup_with_ingress_selection(self, tmp_path): mock_password.return_value.ask.side_effect = ["ghp_testtoken", "", "Admin1234"] mock_confirm.return_value.ask.return_value = True # text prompts: cli_ops_release_tag, admin_username, admin_email - mock_text.return_value.ask.side_effect = ["main", "3.19.0", "iblai", "iblai-cli-ops", "iblai-prod-images", "ibl_admin", "admin@example.com"] + mock_text.return_value.ask.side_effect = ["main", "3.19.0", "iblai", "iblai-cli-ops", "iblai-prod-images", "platform_admin", "admin@example.com"] config = prompt_resetup(state) @@ -771,7 +771,7 @@ def test_resetup_ingress_custom_fallback(self, tmp_path): # text prompts: custom domain, cli_ops_release_tag, # github_org, cli_ops_repo, prod_images_repo, # admin_username, admin_email - mock_text.return_value.ask.side_effect = ["custom.example.com", "3.19.0", "iblai", "iblai-cli-ops", "iblai-prod-images", "ibl_admin", "admin@example.com"] + mock_text.return_value.ask.side_effect = ["custom.example.com", "3.19.0", "iblai", "iblai-cli-ops", "iblai-prod-images", "platform_admin", "admin@example.com"] config = prompt_resetup(state) diff --git a/tests/terraform/test_runner.py b/tests/terraform/test_runner.py index 8260e8c..a562663 100644 --- a/tests/terraform/test_runner.py +++ b/tests/terraform/test_runner.py @@ -169,7 +169,7 @@ def test_basic_tfvars(self, infra_config, tmp_path): assert 'environment = "dev"' in tfvars assert 'region = "us-east-1"' in tfvars assert 'instance_type = "t3.2xlarge"' in tfvars - assert "root_volume_size = 50" in tfvars + assert "root_volume_size = 100" in tfvars assert 'base_domain = "example.com"' in tfvars assert "create_key_pair = true" in tfvars diff --git a/tests/test_env_setup.py b/tests/test_env_setup.py index 7f4af9e..898ac13 100644 --- a/tests/test_env_setup.py +++ b/tests/test_env_setup.py @@ -34,7 +34,7 @@ def _required_env(**overrides) -> dict[str, str]: 
"AWS_ACCESS_KEY_ID": "AKIAIOSFODNN7EXAMPLE", "AWS_SECRET_ACCESS_KEY": "wJalrXUtnFEMI/K7MDENG/bPxRfiCYEXAMPLEKEY", "GIT_TOKEN": "test-pat-value", - "ADMIN_USERNAME": "ibl_admin", + "ADMIN_USERNAME": "platform_admin", "ADMIN_EMAIL": "admin@example.com", "ADMIN_PASSWORD": "change-me-min-8-chars", } @@ -100,7 +100,7 @@ def test_minimal_env_produces_valid_config(self, project_state): assert config.target_host == "54.123.45.67" # from project_state fixture assert config.base_domain == "example.com" assert config.aws_default_region == "us-east-1" - assert config.admin_username == "ibl_admin" + assert config.admin_username == "platform_admin" def test_aws_default_region_derived_from_state(self, project_state): project_state.config.credentials.region = "eu-west-1" diff --git a/tests/test_models.py b/tests/test_models.py index 7325015..cb15427 100644 --- a/tests/test_models.py +++ b/tests/test_models.py @@ -161,10 +161,14 @@ def test_default_vpc_cidr(self): class TestComputeConfigValidation: def test_valid_volume_size(self): - cc = ComputeConfig(volume_size=100) - assert cc.volume_size == 100 + cc = ComputeConfig(volume_size=200) + assert cc.volume_size == 200 def test_minimum_volume_size(self): + # ComputeConfig itself only enforces 20 GB (the call-server placeholder + # path reuses this model with ~40 GB). The 100 GB IBL-platform floor + # is enforced on `InfraConfig` for `DeploymentType.SINGLE` — see + # TestSingleServerVolumeFloor below. 
cc = ComputeConfig(volume_size=20) assert cc.volume_size == 20 @@ -183,7 +187,7 @@ def test_negative_raises(self): def test_defaults(self): cc = ComputeConfig() assert cc.instance_type == "t3.2xlarge" - assert cc.volume_size == 50 + assert cc.volume_size == 100 assert cc.volume_type == "gp3" From ab650f89198d6aade97ac1c30acdce116fb7094b Mon Sep 17 00:00:00 2001 From: bnsoni Date: Wed, 20 May 2026 08:48:23 +0300 Subject: [PATCH 2/8] docs(readme): refresh against current playbook, deployment topologies, and defaults MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Fixed broken `iblai-cli-ops` link (was `ibl-cli-ops`). * Replaced the stale 9-row role table with a phase-grouped table covering the actual 16 roles in `playbook.yml` (host setup, platform install, core services, finalization, optional integrations, post-tasks). Removed the dead `final_steps` row. * Provision section now mentions the three deployment topologies (single / multi / call), the 100 GB volume floor, and the 32 GB memory warning. * Setup section notes the tenant `Platform` launch when `PLATFORM_NAME` is set to anything other than `main`, that reserved usernames (`ibl_admin`) are rejected with `platform_admin` as the new default suggestion, and that Stripe / advertising are off by default. * Section 6 (Launch from AMI) collapsed from three near-duplicate examples to one `.env-driven` + one `--flag-driven` block. Cleanup reference removed (covered in section 8 / Manage environments). * Section 4 (non-interactive provision + setup) trimmed; same content in fewer paragraphs. * Project-structure tree: added `env_provision.py` + `env_setup.py`, added `launch_playbook.yml` + `service_update_playbook.yml`, removed the inaccurate "9 Ansible roles" annotation, bumped test count 357 → 562. Net: -50 lines, no client-specific examples or hosts, all instructions match the current code. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- README.md | 148 ++++++++++++++++++------------------------------------ 1 file changed, 49 insertions(+), 99 deletions(-) diff --git a/README.md b/README.md index dbc7d01..e76a563 100644 --- a/README.md +++ b/README.md @@ -37,7 +37,7 @@ The following are installed as Python package dependencies when you install ibla The setup phase installs and configures the following on the provisioned EC2 instance: -- **[iblai-cli-ops](https://github.com/iblai/ibl-cli-ops)** -- the IBL platform management CLI, cloned and installed inside a pyenv virtualenv on the server. This is a required dependency for all service launches. **Note:** This is a private repository -- unauthenticated users or those without access will see a 404. +- **[iblai-cli-ops](https://github.com/iblai/iblai-cli-ops)** -- the IBL platform management CLI, installed inside a pyenv virtualenv on the server. Required by every service launch. **Private repository — unauthenticated requests see a 404.** - **Docker Engine** with docker compose - **pyenv** with Python 3.11.8 - **AWS CLI v2** for ECR authentication and S3 access @@ -112,10 +112,13 @@ iblai infra provision Interactive wizard that walks you through: 1. **AWS credentials** -- profile, access keys, or environment variables -2. **Project & compute** -- name, environment (dev/staging/prod), instance type, volume size -3. **Network & SSH** -- VPC CIDR, VPN IP for SSH access, SSH key setup -4. **Domain & certificates** -- base domain, Route53 integration, certificate method (ACM, upload, or none) -5. **Review** -- full summary before applying +2. **Deployment topology** -- single-server, multi-server (N app servers + 1 services server), or call-server (standalone LiveKit) +3. **Project & compute** -- name, environment (dev/staging/prod), instance type, volume size +4. **Network & SSH** -- VPC CIDR, VPN IP for SSH access, SSH key setup +5. 
**Domain & certificates** -- base domain, Route53 integration, certificate method (ACM, upload, or none) +6. **Review** -- full summary before applying + +Sizing guidance: single / multi-server require a **100 GB minimum** root volume. Picking a 32 GB-RAM instance prints a non-blocking heads-up suggesting 64 GB (e.g. `m5.4xlarge` / `r5.2xlarge`) when AI features will be enabled. Terraform runs with real-time progress showing each resource as it's created. @@ -131,54 +134,38 @@ Both paths run the same Ansible playbook. The difference is where the inputs com - **With a project name** -- auto-populates IP, domain, SSH key, and AWS credentials from the Terraform state - **Without a project name** -- prompts for server IP, SSH key, domain, image tags, and credentials interactively. No Terraform required. -The playbook runs 9 sequential roles: - -| Role | What it does | -|------|-------------| -| `docker` | Installs Docker Engine, docker compose, and apache2-utils | -| `awscli` | Installs AWS CLI v2 for ECR and S3 access | -| `python` | Installs pyenv and Python 3.11.8 | -| `ibl_cli_ops` | Installs [iblai-prod-images](https://github.com/iblai/iblai-prod-images) (which includes iblai-cli-ops and pinned image versions) via `uv pip install` | -| `ibl_platform` | Configures base domain, environment, image tags, CORS, RBAC, unified API gateway, and service defaults | -| `ibl_dm` | Launches iblai-dm-pro (PostgreSQL with pgvector, Redis, Django, Celery, Langfuse, Minio) | -| `ibl_edx` | Launches iblai-edx-pro (LMS, CMS, MySQL, MongoDB, Redis, Elasticsearch, MFE) | -| `ibl_spa` | Creates OAuth2 apps, configures and launches Auth, Mentor AI, and Skills AI SPAs | -| `final_steps` | Reloads proxy, OAuth/OIDC setup, syncs edX with DM, creates super admins, seeds CSRF domains, flows, LLM registry, mentors, and RBAC data | - -The setup wizard prompts for: -- Target host IP and SSH key path -- Base domain and environment config -- iblai-cli-ops release tag (image versions are pinned by 
[iblai-prod-images](https://github.com/iblai/iblai-prod-images)) -- Whether to enable AI features -- OpenAI API key (optional) -- Super admin credentials (username, email, password) -- GitHub PAT and AWS credentials for the VM +The playbook runs sequential roles grouped by concern: + +| Phase | Roles | What it does | +|---|---|---| +| Host setup | `docker`, `awscli`, `python` | Docker Engine + compose, AWS CLI v2, pyenv + Python 3.11.8 | +| Platform install | `ibl_cli_ops`, `ibl_platform` | Installs [iblai-prod-images](https://github.com/iblai/iblai-prod-images) (pins `iblai-cli-ops` + image versions); configures base domain, CORS, RBAC, gateway, defaults | +| Core services | `ibl_dm`, `ibl_edx`, `ibl_spa` | Launches DM (Django + Postgres + Redis + Celery + Flowise + Langfuse), edX (LMS / CMS / MySQL / MongoDB / Elasticsearch / Forum), and the Auth / Mentor / Skills SPAs | +| Finalization | `integrations`, `admin_setup`, `data_seeding`, `ibl_tenant_platform` | OAuth/OIDC setup, syncs edX with DM, creates super admin, seeds CSRF / flows / LLM registry / mentors / RBAC; launches a tenant `Platform` via `run_launch_steps` when `PLATFORM_NAME` is set to anything other than `main` | +| Optional integrations | `smtp_config`, `stripe_config`, `google_sso_config`, `microsoft_sso_config` | Each role no-ops unless its trigger key (`SMTP_HOST` / `STRIPE_SECRET_KEY` / `GOOGLE_SSO_CLIENT_ID` / `MICROSOFT_SSO_CLIENT_ID`) is set | +| Post-tasks | `ibl global-proxy reload` | Final nginx reload so any SSO-driven edX/SPA restarts are picked up before exit | + +The setup wizard prompts for: target host + SSH key, base domain, tenant platform name (blank for `main` — `main` itself is reserved), `iblai-cli-ops` release tag, enable-AI toggle, OpenAI key, super admin credentials, GitHub PAT, and AWS credentials. Reserved usernames (e.g. `ibl_admin`) are rejected — the new default suggestion is `platform_admin`. 
Stripe billing UI and advertising are **off by default**; enable Stripe by passing `STRIPE_SECRET_KEY`. ### 4. Non-interactive provision + setup (`.env` file) -Skip the wizards. Same Terraform + same 9-role Ansible playbook as the interactive flow, just driven from a `.env` file. **Single-server only** (multi/call still use the wizard). +Skip the wizards. Same Terraform + same Ansible roles as the interactive flow, driven from a `.env` file. **Single-server only** (multi / call still use the wizard). ```bash # Provision (Terraform) — fresh single-server, no AMI required -cp .env.provision.example .env.provision -$EDITOR .env.provision # fill in PROJECT_NAME, DOMAIN, AWS creds, etc. +cp .env.provision.example .env.provision && $EDITOR .env.provision iblai infra provision-env -f .env.provision -# Bootstrap (Ansible) — runs against the project just provisioned -cp .env.setup.example .env.setup -$EDITOR .env.setup # fill in GIT_TOKEN, admin creds, etc. +# Bootstrap (Ansible) — against the just-provisioned project +cp .env.setup.example .env.setup && $EDITOR .env.setup iblai infra setup-env -f .env.setup ``` -**Free-standing server (any cloud, no Terraform):** omit the project name and add `TARGET_HOST`, `SSH_PRIVATE_KEY_PATH`, `BASE_DOMAIN`, `PROJECT_NAME` to your `.env.setup`: +**Free-standing server** (any cloud, no Terraform): omit the project name and add `TARGET_HOST`, `SSH_PRIVATE_KEY_PATH`, `BASE_DOMAIN`, `PROJECT_NAME` to `.env.setup`, then `iblai infra setup-env -f .env.setup`. -```bash -iblai infra setup-env -f .env.setup # builds a synthetic ProjectState, runs Ansible -``` - -**`.env` schema:** `.env.provision.example` and `.env.setup.example` document every key with synthetic placeholders. Required vs. optional, defaults, and integration triggers (SMTP / Stripe / Google SSO / Microsoft SSO — each enabled when its trigger key is set) are inline. +**Schema:** `.env.provision.example` and `.env.setup.example` document every key inline (required vs. 
optional, defaults, integration triggers). -**Security note:** populated `.env` files are gitignored by default (`.gitignore` blocks `.env.*` except the `*.example` templates). Never commit a real `.env`. The CLI never persists secrets to `state.json` — they ride `--extra-vars` into Ansible at run time only. +**Security:** populated `.env` files are gitignored (`.env.*` blocked except `*.example`). The CLI never persists secrets to `state.json` — they ride `--extra-vars` into Ansible at run time only. ### 5. Re-setup an existing environment @@ -192,66 +179,25 @@ Use this when you need to change the domain or rotate credentials on a running e ### 6. Launch from AMI -**Simplest way — using a `.env` file:** +One-shot Terraform + Ansible from a pre-built AMI. Two equivalent entry points — `.env` for ergonomics, flags for CI/CD. ```bash -cp .env.example .env # Copy the template -vim .env # Fill in your values -iblai infra launch-env # Review summary, confirm, launch -``` +# .env-driven (review + confirm) +cp .env.example .env && $EDITOR .env +iblai infra launch-env -The CLI reads `.env` from the current directory, shows a summary of what will be launched, and asks for confirmation before proceeding. 
- -**Non-interactive (CI/CD) — using flags:** - -```bash +# Fully non-interactive (CI/CD pipelines) iblai infra launch \ - --ami-id $AMI_ID \ - --domain $DOMAIN \ - --hosted-zone-id $HOSTED_ZONE_ID \ - --aws-key-id $AWS_ACCESS_KEY_ID \ - --aws-secret-key $AWS_SECRET_ACCESS_KEY \ - --ssh-public-key "$SSH_PUBLIC_KEY" \ - --ssh-key $SSH_KEY_PATH \ - --git-token $GIT_TOKEN \ - --admin-email $ADMIN_EMAIL \ - --admin-password $ADMIN_PASSWORD \ - --vpn-ip $VPN_IP + --ami-id $AMI_ID --domain $DOMAIN --hosted-zone-id $HOSTED_ZONE_ID \ + --aws-key-id $AWS_ACCESS_KEY_ID --aws-secret-key $AWS_SECRET_ACCESS_KEY \ + --ssh-public-key "$SSH_PUBLIC_KEY" --ssh-key $SSH_KEY_PATH \ + --git-token $GIT_TOKEN --vpn-ip $VPN_IP \ + --admin-email $ADMIN_EMAIL --admin-password $ADMIN_PASSWORD ``` -Fully non-interactive command for CI/CD pipelines (e.g. GitHub Actions). Provisions AWS infrastructure from a pre-built AMI via Terraform, then configures the platform via Ansible — all in one step. - -**What it does:** -1. **Terraform** -- creates VPC, ALB, ACM certificates, Route53 DNS records, and launches EC2 from the specified AMI -2. 
**Ansible** -- sets domain, rotates secrets, syncs database passwords, restarts all services (DM, edX, SPAs), runs final setup (OAuth, admin creation, data seeding) - -**Cleanup:** - -```bash -iblai infra destroy # Tears down all Terraform resources -``` - -**Using a `.env` file:** - -Copy `.env.example` to `.env`, fill in real values, then: - -```bash -source .env -iblai infra launch \ - --ami-id $AMI_ID \ - --domain $DOMAIN \ - --hosted-zone-id $HOSTED_ZONE_ID \ - --aws-key-id $AWS_ACCESS_KEY_ID \ - --aws-secret-key $AWS_SECRET_ACCESS_KEY \ - --ssh-public-key "$SSH_PUBLIC_KEY" \ - --ssh-key $SSH_KEY_PATH \ - --git-token $GIT_TOKEN \ - --admin-email $ADMIN_EMAIL \ - --admin-password $ADMIN_PASSWORD \ - --vpn-ip $VPN_IP -``` +**Flow:** Terraform creates VPC / ALB / ACM / Route53 and launches EC2 from the AMI → Ansible sets the domain, rotates secrets, syncs DB passwords, restarts services, runs OAuth + admin + seeding → final `ibl global-proxy reload`. -See `iblai infra launch --help` for all optional flags (instance type, volume size, region, AI features, etc.). +See `iblai infra launch --help` for optional flags (instance type, volume size, region, `--platform-name`, SMTP / Stripe / SSO toggles, `--enable-ai`). ### 7. 
Service update (image updates, CI/CD) @@ -369,15 +315,19 @@ iblai-infra-ops/ │ ├── app.py # Application logic │ ├── models.py # Pydantic models │ ├── ui.py # Rich terminal UI +│ ├── env_provision.py # .env → InfraConfig (provision-env) +│ ├── env_setup.py # .env → SetupConfig (setup-env) │ ├── prompts/ # Interactive questionary prompts │ ├── providers/ # AWS provider (STS, EC2, S3) -│ ├── terraform/ # Terraform runner and templates -│ │ └── templates/aws/single-server/ -│ └── ansible/ # Ansible runner and templates +│ ├── terraform/ # Terraform runner + templates +│ │ └── templates/aws/ # single-server, multi-server, call-server +│ └── ansible/ # Ansible runner + templates │ └── templates/single-server/ -│ ├── playbook.yml -│ └── roles/ # 9 Ansible roles -├── tests/ # 357 tests +│ ├── playbook.yml # interactive setup + setup-env +│ ├── launch_playbook.yml # AMI launch + launch-env +│ ├── service_update_playbook.yml +│ └── roles/ # ansible roles (see playbook table) +├── tests/ # 562 tests, ~1.3s ├── docs/ # Architecture diagrams └── pyproject.toml ``` From 310f8eac77457fe74aac974419e41a7afb547e5a Mon Sep 17 00:00:00 2001 From: bnsoni Date: Wed, 20 May 2026 08:53:31 +0300 Subject: [PATCH 3/8] chore(release): 1.10.0 - Bump `__version__` to 1.10.0 - Add CHANGELOG entry covering the tenant launcher, reserved-name rules, safer SPA defaults, 100 GB volume floor, 32 GB memory warning, Microsoft SSO IBL_SPA.AUTH completion, final proxy reload, codebase scrub, and the slow-test fix. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- CHANGELOG.md | 22 ++++++++++++++++++++++ src/iblai_infra/__init__.py | 2 +- 2 files changed, 23 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 80d85ba..e382eb4 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,27 @@ # Changelog +## [1.10.0] — 2026-05-20 + +### Added +- **`ibl_tenant_platform` ansible role** — launches a tenant `Platform` (Platform + admin User + UserPlatformLink) via `run_launch_steps` when `PLATFORM_NAME` is set to anything other than `main`. NOT a raw `Platform.objects.create()` — the state machine fires every after_launch signal (default apps, edX hooks, UserPlatformLink flags). Wired into both `playbook.yml` (setup / setup-env) and `launch_playbook.yml` (launch / launch-env). Skips + logs on re-runs when the tenant already exists. Also writes `PLATFORM_NAME=` (uppercase) at the root of `/ibl/config.yml` and enforces `Platform.show_paywall=False` + `Platform.is_advertising=False` as defense in depth. Surfaces the generated admin password via the `IBLAI_FIXTURE_OUTPUT` pipeline — printed once after the Rich Live display tears down, never persisted to disk. +- **Microsoft SSO writes `IBL_SPA.AUTH`** — `microsoft_sso_config` now also patches `EXTERNAL_IDP_LOGOUT_URL` and `IBL_DIRECT_SSO_URL` (using `microsoft_sso_tenant_id`, falling back to `common`), then restarts the Auth + Mentor SPAs so the new auth flow takes effect. +- **`INSTANCE_RAM_GB` helper + 32 GB memory warning** — non-blocking heads-up suggesting 64 GB (e.g. `m5.4xlarge` / `r5.2xlarge`) when the operator picks a 32 GB instance. Always shown in the interactive provision wizard and `provision-env`; conditional in `launch` / `launch-env` (only when AI is enabled). 
+- **Final `ibl global-proxy reload`** added as `post_tasks` in both `playbook.yml` and `launch_playbook.yml`, so any nginx state touched by SSO roles (edX restarts in `google_sso_config` / `microsoft_sso_config`) is reloaded before the playbook exits. +- **`RESERVED_ADMIN_USERNAMES` + `RESERVED_PLATFORM_NAMES`** — `models.py` constants, surfaced via `is_reserved_admin_username()` and `is_reserved_platform_name()` helpers and an `InfraConfig` model_validator. + +### Changed +- **Stripe billing UI off by default** — `IBL_SPA.MENTOR.STRIPE_ENABLED=false` and `IBL_SPA.MENTOR.ENABLE_ADVERTISING=false` are now written unconditionally by `ibl_spa` (fresh installs) and `ibl_launch_services` (AMI launches). **Behavior change:** Stripe-using deployments must explicitly flip `IBL_SPA.MENTOR.STRIPE_ENABLED` back to `'true'` post-setup. The previous "always on" SPA flag surfaced billing UI even when Stripe wasn't actually configured. +- **100 GB minimum root volume for single / multi server** — enforced by Pydantic (`InfraConfig` model_validator gated on `DeploymentType.SINGLE`, plus `MultiServerConfig.validate_volume_sizes`) and matching interactive + CLI + .env input checks. **Behavior change:** values below 100 GB are now rejected upfront. Default `ComputeConfig.volume_size` bumped 50 → 100. Call-server unchanged (LiveKit only needs ~40 GB). +- **`ADMIN_USERNAME=ibl_admin` rejected at every input layer** — reserved for the SPA OAuth Application owner the platform itself maintains. New default suggestion is `platform_admin`. Interactive prompts, `.env` parsers, and `--admin-username` flag all reject `ibl_admin` with a clear reserved-name error. **Behavior change:** scripted deploys passing `ADMIN_USERNAME=ibl_admin` must rename. +- **`PLATFORM_NAME=main` rejected as an explicit input** — unset / blank silently resolves to `main` (preserving SSO `backend_name=main-oauth2` and skipping the tenant launcher). 
**Behavior change:** scripted deploys passing `PLATFORM_NAME=main` should drop the line. +- **README** — refreshed against current playbook (16 roles, phase-grouped table), three deployment topologies, sizing guidance, tenant launcher, reserved-name rules. -50 lines net. + +### Removed +- **All references to a specific canonical-client name** from comments, docstrings, prompt instructions, error hints, and example .env files. Placeholders: `` for monorepo org names, `acme` for tenant-key examples. + +### Fixed +- **Slow `_test_ssh()` retry-path tests** — five tests in `tests/ansible/test_runner.py` exercise the SSH-retry exhaust path (10 retries × 15 s sleep). They now mock `time.sleep` alongside the existing `subprocess.run` mock, cutting ~11 minutes off the full suite. Test count: 562 passing in ~1.3 s. + ## [1.7.0] — 2026-05-06 ### Added diff --git a/src/iblai_infra/__init__.py b/src/iblai_infra/__init__.py index 12479d2..2279a18 100644 --- a/src/iblai_infra/__init__.py +++ b/src/iblai_infra/__init__.py @@ -1,3 +1,3 @@ """ibl.ai Infrastructure Provisioning Tool.""" -__version__ = "1.9.0" +__version__ = "1.10.0" From 09f334dd2abdb767ebc7fb21db698f5fccfc17d4 Mon Sep 17 00:00:00 2001 From: bnsoni Date: Wed, 20 May 2026 11:17:49 +0300 Subject: [PATCH 4/8] feat(provision): post-provision runtime IAM policy + setup instructions MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds `src/iblai_infra/runtime_iam.py` — a small helper that runs at the tail end of `provision` / `provision-env` and prints the exact minimum-privilege IAM policy the operator needs to attach to a scoped "runtime" user in their own AWS account before `setup-env` runs. The motivation: today the AWS keys baked into `/ibl/config.yml` on the platform server have to serve TWO accounts at once — IBL's ECR (image pulls) and the operator's own S3 buckets. Reusing the provisioning admin keys is overkill and minting a separate user by hand is friction. 
This change closes that gap by: 1. **Computing the policy JSON at runtime** — bucket ARNs come from the actual `s3_bucket_{backups,media,static}` terraform outputs, not from any hardcoded list. ECR scope targets IBL's `arn:aws:ecr:<region>:<account-id>:repository/*` via two centralized module constants. 2. **Saving it to `<project-dir>/runtime-iam-policy.json`** so the operator can pipe it into `aws iam put-user-policy --policy-document file://...` without copy-pasting JSON. 3. **Printing three ready-to-run `aws iam` commands** (`create-user`, `put-user-policy`, `create-access-key`) with the project / environment substituted into the user name. 4. **Pointing the operator at `.env.setup`** with the exact lines to update (`AWS_ACCESS_KEY_ID` + `AWS_SECRET_ACCESS_KEY`). Policy scope: - S3: literal bucket ARNs only (no wildcards, no bucket-policy mutation, no lifecycle config) with `Get/Put/Delete/Acl + ListBucket / GetBucketLocation`. - ECR: `GetAuthorizationToken` on `*` (AWS requires this) plus `BatchGetImage`, `BatchCheckLayerAvailability`, `GetDownloadUrlForLayer` scoped to IBL's ECR repos. Skipped automatically for `DeploymentType.CALL` (no S3 buckets, separate credential flow). Other changes: - `.env.setup.example` — `AWS_ACCESS_KEY_ID` comment block now directs the operator to use the runtime user from the post-provision step, not their provisioning admin keys. - `README.md` — new sub-section under "Provision infrastructure" documenting the runtime IAM step + the scope table. Section 4 (non-interactive `.env` flow) renumbered as a 3-step sequence so the IAM step isn't missed. - `__version__` 1.10.0 → 1.11.0 + CHANGELOG entry. 11 new tests in `tests/test_runtime_iam.py` (policy shape, ARN generation, tight verb set, call-server skip, empty-output handling, JSON round-trip). Full suite: 576 passing in ~1.3 s. No hardcoded bucket names, no client references — the policy is constructed entirely from terraform outputs. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- .env.setup.example | 10 +- CHANGELOG.md | 11 ++ README.md | 32 +++++- src/iblai_infra/__init__.py | 2 +- src/iblai_infra/app.py | 7 ++ src/iblai_infra/runtime_iam.py | 200 +++++++++++++++++++++++++++++++++ tests/test_runtime_iam.py | 134 ++++++++++++++++++++++ 7 files changed, 392 insertions(+), 4 deletions(-) create mode 100644 src/iblai_infra/runtime_iam.py create mode 100644 tests/test_runtime_iam.py diff --git a/.env.setup.example b/.env.setup.example index 92966c1..bd0fdb0 100644 --- a/.env.setup.example +++ b/.env.setup.example @@ -8,7 +8,15 @@ # REQUIRED FOR BOTH MODES # ============================================================================ -# AWS credentials (passed to ansible as extra_vars; embedded in /ibl/config.yml) +# AWS credentials — these get embedded into /ibl/config.yml on the server +# and used for the lifetime of the deployment (ECR image pulls + S3 +# read/write on the buckets created at provision time). +# +# IMPORTANT: do NOT reuse your provisioning admin keys here. After +# `iblai infra provision-env` finishes it prints the exact IAM policy +# JSON + the three `aws iam create-user / put-user-policy / +# create-access-key` commands to mint a minimum-privilege "runtime" user. +# Paste the resulting AccessKeyId + SecretAccessKey below. AWS_ACCESS_KEY_ID=AKIAIOSFODNN7EXAMPLE AWS_SECRET_ACCESS_KEY=wJalrXUtnFEMI/K7MDENG/bPxRfiCYEXAMPLEKEY diff --git a/CHANGELOG.md b/CHANGELOG.md index e382eb4..1ee089b 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,16 @@ # Changelog +## [1.11.0] — 2026-05-20 + +### Added +- **Post-provision runtime IAM helper** (`src/iblai_infra/runtime_iam.py`). 
After `provision` / `provision-env` succeeds, the CLI prints the exact minimum-privilege IAM policy JSON the operator needs to attach to a scoped runtime user in their own AWS account — and writes the same JSON to `/runtime-iam-policy.json` so it can be piped into `aws iam put-user-policy --policy-document file://...`. The policy scopes S3 to the literal bucket ARNs Terraform just created (no wildcards) and grants ECR auth + pull verbs against IBL's image registry. Skipped automatically for `DeploymentType.CALL` (no S3 buckets, different credential flow). +- **Three copy-paste `aws iam` commands** in the post-provision output (`create-user`, `put-user-policy`, `create-access-key`) using the resolved project / environment as the user name — operator pastes the resulting `AccessKeyId` + `SecretAccessKey` directly into `.env.setup`. +- **README sub-section** under "Provision infrastructure" documenting the runtime IAM step + the scope table. + +### Changed +- **`.env.setup.example`** — the `AWS_ACCESS_KEY_ID` / `AWS_SECRET_ACCESS_KEY` comment block now explicitly directs the operator to use the runtime user the post-provision step mints, not their provisioning admin keys. +- **Section 4 of the README** (non-interactive `.env` flow) renumbered as a 3-step sequence (provision → mint runtime user → setup) so the IAM step isn't missed. + ## [1.10.0] — 2026-05-20 ### Added diff --git a/README.md b/README.md index e76a563..b044a90 100644 --- a/README.md +++ b/README.md @@ -122,6 +122,30 @@ Sizing guidance: single / multi-server require a **100 GB minimum** root volume. Terraform runs with real-time progress showing each resource as it's created. +#### After provision succeeds — create a runtime IAM user + +The platform server bakes a single AWS access key into `/ibl/config.yml` for two ongoing purposes: **ECR pulls** (IBL's image registry, cross-account) and **S3 read/write** on the three buckets Terraform just created. 
Reusing your provisioning admin keys here is overkill — instead, mint a scoped runtime user in your own account. + +When `provision` / `provision-env` finishes it prints the exact IAM policy JSON (also saved to `/runtime-iam-policy.json`) plus three `aws` commands to copy-paste: + +```bash +aws iam create-user --user-name --runtime +aws iam put-user-policy \ + --user-name --runtime \ + --policy-name iblai-runtime \ + --policy-document file:///runtime-iam-policy.json +aws iam create-access-key --user-name --runtime +``` + +Paste the resulting `AccessKeyId` + `SecretAccessKey` into `.env.setup` (`AWS_ACCESS_KEY_ID` + `AWS_SECRET_ACCESS_KEY`). The policy scope is tight: + +| | Resource | Verbs | +|---|---|---| +| S3 | The three buckets Terraform created (no wildcards) | `Get/Put/Delete/Acl/ListBucket` | +| ECR | IBL's `arn:aws:ecr:::repository/*` | `GetAuthorizationToken`, `BatchGetImage`, `BatchCheckLayerAvailability`, `GetDownloadUrlForLayer` | + +No bucket-policy mutation, no lifecycle config, no IAM rights. Safe to leave on the box for the lifetime of the deployment. Skipped automatically for `--deployment-type call-server` (no S3 buckets). + ### 3. Setup the platform ```bash @@ -152,11 +176,15 @@ The setup wizard prompts for: target host + SSH key, base domain, tenant platfor Skip the wizards. Same Terraform + same Ansible roles as the interactive flow, driven from a `.env` file. **Single-server only** (multi / call still use the wizard). ```bash -# Provision (Terraform) — fresh single-server, no AMI required +# 1. Provision (Terraform) — fresh single-server, no AMI required cp .env.provision.example .env.provision && $EDITOR .env.provision iblai infra provision-env -f .env.provision -# Bootstrap (Ansible) — against the just-provisioned project +# 2. Create the runtime IAM user (one-time) — run the 3 `aws iam ...` +# commands printed by step 1, then paste the resulting AccessKeyId + +# SecretAccessKey into .env.setup as AWS_ACCESS_KEY_ID / _SECRET_. + +# 3. 
Bootstrap (Ansible) — against the just-provisioned project cp .env.setup.example .env.setup && $EDITOR .env.setup iblai infra setup-env -f .env.setup ``` diff --git a/src/iblai_infra/__init__.py b/src/iblai_infra/__init__.py index 2279a18..643a832 100644 --- a/src/iblai_infra/__init__.py +++ b/src/iblai_infra/__init__.py @@ -1,3 +1,3 @@ """ibl.ai Infrastructure Provisioning Tool.""" -__version__ = "1.10.0" +__version__ = "1.11.0" diff --git a/src/iblai_infra/app.py b/src/iblai_infra/app.py index cba809e..568381d 100644 --- a/src/iblai_infra/app.py +++ b/src/iblai_infra/app.py @@ -160,6 +160,13 @@ def show_results(config: InfraConfig, outputs: dict, ws: Path) -> None: ui.info(f"SSH key: [highlight]{config.ssh.private_key_path}[/highlight]") ui.newline() + # Print the post-provision IAM-user setup. Operator needs to create a + # scoped runtime user in their own AWS account before `setup-env` runs + # — see src/iblai_infra/runtime_iam.py for the policy shape + why. + # Skipped for call-server (no S3 buckets, different credential flow). + from iblai_infra.runtime_iam import render_runtime_access_instructions + render_runtime_access_instructions(config, outputs, ws) + def _offer_setup(config: InfraConfig, state) -> None: """After successful provision, offer to run platform setup.""" diff --git a/src/iblai_infra/runtime_iam.py b/src/iblai_infra/runtime_iam.py new file mode 100644 index 0000000..c0ba9eb --- /dev/null +++ b/src/iblai_infra/runtime_iam.py @@ -0,0 +1,200 @@ +"""Post-provision runtime IAM helper. + +The platform server bakes a single AWS access key into `/ibl/config.yml` for +two ongoing purposes: + + 1. **ECR pulls** — `aws ecr get-login-password` against IBL's image + registry. Cross-account; works because IBL's ECR repositories have a + repository policy granting pulls from the operator's AWS account. + 2. **S3 access** — read / write the dm-media, dm-static, and backups + buckets Terraform just created in the operator's own account. 
+ +Rather than reusing the operator's admin keys (full provisioning scope, +massive blast radius) or asking IBL ops to mint a separate user, this +module prints a **scoped IAM policy** the operator pastes into their own +IAM console after `provision-env` / `provision` succeeds. The resulting +access key is minimum-privilege: + + * S3: only the three buckets Terraform created, only the verbs the + platform actually uses (no `s3:*`, no bucket-policy mutation). + * ECR: only the auth + pull verbs, scoped to IBL's ECR repos. + +The policy JSON is also written to the project workspace +(`/runtime-iam-policy.json`) so the operator can pipe it +directly into the CLI: + + aws iam put-user-policy \\ + --user-name -runtime \\ + --policy-name iblai-runtime \\ + --policy-document file:///runtime-iam-policy.json +""" + +from __future__ import annotations + +import json +from pathlib import Path + +from iblai_infra import ui +from iblai_infra.models import DeploymentType, InfraConfig + +# IBL's image registry account / region — the ECR cross-account pull target. +# Centralized here so the rendered policy stays consistent with the actual +# `docker login` target hardcoded across the ansible roles. +IBLAI_ECR_ACCOUNT_ID = "765174860755" +IBLAI_ECR_REGION = "us-east-1" + +# Tight S3 verbs the platform actually uses at runtime. Notably excludes +# bucket-policy / ACL mutations, lifecycle config, encryption config — all +# of which Terraform set up at provision time and the platform never +# revisits. 
+_S3_OBJECT_ACTIONS = [ + "s3:GetObject", + "s3:PutObject", + "s3:DeleteObject", + "s3:GetObjectAcl", + "s3:PutObjectAcl", +] +_S3_BUCKET_ACTIONS = [ + "s3:ListBucket", + "s3:GetBucketLocation", +] +_ECR_AUTH_ACTIONS = ["ecr:GetAuthorizationToken"] +_ECR_PULL_ACTIONS = [ + "ecr:BatchCheckLayerAvailability", + "ecr:BatchGetImage", + "ecr:GetDownloadUrlForLayer", +] + +POLICY_FILENAME = "runtime-iam-policy.json" + + +def build_runtime_iam_policy(bucket_names: list[str]) -> dict: + """Build the IAM policy JSON document for the runtime user. + + `bucket_names` must be the literal S3 bucket names Terraform created + (the values of `s3_bucket_*` outputs). Returns a dict ready to + `json.dumps()` — no formatting opinions baked in here. + """ + if not bucket_names: + raise ValueError("at least one S3 bucket name is required") + + bucket_arns = [f"arn:aws:s3:::{b}" for b in bucket_names] + object_arns = [f"arn:aws:s3:::{b}/*" for b in bucket_names] + ecr_repo_arn = ( + f"arn:aws:ecr:{IBLAI_ECR_REGION}:{IBLAI_ECR_ACCOUNT_ID}:repository/*" + ) + + return { + "Version": "2012-10-17", + "Statement": [ + { + "Sid": "PlatformBucketObjects", + "Effect": "Allow", + "Action": _S3_OBJECT_ACTIONS, + "Resource": object_arns, + }, + { + "Sid": "PlatformBucketList", + "Effect": "Allow", + "Action": _S3_BUCKET_ACTIONS, + "Resource": bucket_arns, + }, + { + "Sid": "ECRAuth", + "Effect": "Allow", + "Action": _ECR_AUTH_ACTIONS, + "Resource": "*", + }, + { + "Sid": "ECRPullPlatformImages", + "Effect": "Allow", + "Action": _ECR_PULL_ACTIONS, + "Resource": ecr_repo_arn, + }, + ], + } + + +def extract_bucket_names(outputs: dict) -> list[str]: + """Pull bucket names out of a terraform outputs dict. + + Reads the three `s3_bucket_{backups,media,static}` outputs that the + single-server template emits. Returns an empty list when none are + present (e.g. call-server, which has no buckets). 
+ """ + keys = ("s3_bucket_backups", "s3_bucket_media", "s3_bucket_static") + return [outputs[k] for k in keys if outputs.get(k)] + + +def render_runtime_access_instructions( + config: InfraConfig, + outputs: dict, + ws: Path, +) -> None: + """Print post-provision IAM-user setup instructions to the operator. + + Skips silently for `DeploymentType.CALL` (no S3 buckets and the call + stack uses its own credentials flow). Writes the policy JSON to the + workspace at `runtime-iam-policy.json` so the operator can pipe it + into `aws iam put-user-policy --policy-document file://...`. + """ + if config.deployment_type == DeploymentType.CALL: + return + + bucket_names = extract_bucket_names(outputs) + if not bucket_names: + # No buckets in outputs — terraform template might not have run S3 + # creation, or the operator pointed at a deployment shape we don't + # cover. Surface a soft note instead of printing a half-policy. + ui.muted( + "Skipping runtime IAM instructions: no S3 buckets in terraform " + "outputs." + ) + return + + policy = build_runtime_iam_policy(bucket_names) + policy_path = ws / POLICY_FILENAME + policy_path.write_text(json.dumps(policy, indent=2) + "\n") + + user_name = f"{config.project_name}-{config.environment.value}-runtime" + + ui.newline() + ui.console.rule("[bold yellow]Next: create the runtime IAM user[/]") + ui.console.print( + "The platform server needs minimum-privilege AWS credentials baked\n" + "into [highlight]/ibl/config.yml[/highlight] for [bold]ECR pulls[/bold] (IBL's image registry)\n" + "and [bold]S3 access[/bold] to the three buckets Terraform just created.\n" + ) + ui.console.print( + " [muted]The policy below has already been saved to:[/muted]\n" + f" [highlight]{policy_path}[/highlight]\n" + ) + + # Show the policy verbatim so the operator can sanity-check before + # creating anything. Indented blob renders monospace via the IBL theme. 
+ ui.console.rule("[muted]runtime-iam-policy.json[/muted]") + ui.console.print(json.dumps(policy, indent=2)) + ui.console.rule() + ui.newline() + + ui.console.print(" [bold]One-time setup — copy/paste into your shell:[/]\n") + ui.console.print( + f" [highlight]aws iam create-user --user-name {user_name}[/highlight]\n" + f" [highlight]aws iam put-user-policy \\\n" + f" --user-name {user_name} \\\n" + f" --policy-name iblai-runtime \\\n" + f" --policy-document file://{policy_path}[/highlight]\n" + f" [highlight]aws iam create-access-key --user-name {user_name}[/highlight]\n" + ) + ui.console.print( + " Copy the [bold]AccessKeyId[/bold] + [bold]SecretAccessKey[/bold] from the last command into your\n" + " [highlight].env.setup[/highlight] as [highlight]AWS_ACCESS_KEY_ID[/highlight] and [highlight]AWS_SECRET_ACCESS_KEY[/highlight], then run:\n" + ) + ui.console.print( + f" [brand]iblai infra setup-env {config.project_name} -f .env.setup[/brand]\n" + ) + ui.muted( + " These runtime keys are minimum-privilege — safe to commit to a " + "vault or password manager, but never to git." 
+ ) + ui.newline() diff --git a/tests/test_runtime_iam.py b/tests/test_runtime_iam.py new file mode 100644 index 0000000..3f1792f --- /dev/null +++ b/tests/test_runtime_iam.py @@ -0,0 +1,134 @@ +"""Tests for iblai_infra.runtime_iam — IAM policy generator + post-provision output.""" + +from __future__ import annotations + +import json +from pathlib import Path + +import pytest + +from iblai_infra.models import DeploymentType +from iblai_infra.runtime_iam import ( + IBLAI_ECR_ACCOUNT_ID, + IBLAI_ECR_REGION, + POLICY_FILENAME, + build_runtime_iam_policy, + extract_bucket_names, + render_runtime_access_instructions, +) + + +class TestBuildPolicy: + def test_single_bucket(self): + policy = build_runtime_iam_policy(["my-backups"]) + assert policy["Version"] == "2012-10-17" + sids = {s["Sid"] for s in policy["Statement"]} + assert sids == { + "PlatformBucketObjects", + "PlatformBucketList", + "ECRAuth", + "ECRPullPlatformImages", + } + + def test_three_buckets_arn_shape(self): + policy = build_runtime_iam_policy([ + "p-staging-backups", + "p-staging-dm-media", + "p-staging-dm-static", + ]) + objects_stmt = next(s for s in policy["Statement"] if s["Sid"] == "PlatformBucketObjects") + list_stmt = next(s for s in policy["Statement"] if s["Sid"] == "PlatformBucketList") + # Object-level resources get the /* suffix; bucket-level don't. + assert objects_stmt["Resource"] == [ + "arn:aws:s3:::p-staging-backups/*", + "arn:aws:s3:::p-staging-dm-media/*", + "arn:aws:s3:::p-staging-dm-static/*", + ] + assert list_stmt["Resource"] == [ + "arn:aws:s3:::p-staging-backups", + "arn:aws:s3:::p-staging-dm-media", + "arn:aws:s3:::p-staging-dm-static", + ] + + def test_s3_actions_are_tight(self): + policy = build_runtime_iam_policy(["b"]) + obj_actions = next(s["Action"] for s in policy["Statement"] if s["Sid"] == "PlatformBucketObjects") + assert "s3:*" not in obj_actions + # Bucket policy / lifecycle / encryption mutations stay out. 
+ for forbidden in ("s3:PutBucketPolicy", "s3:DeleteBucketPolicy", "s3:PutLifecycleConfiguration"): + assert forbidden not in obj_actions + + def test_ecr_resource_targets_iblai_account(self): + policy = build_runtime_iam_policy(["b"]) + pull = next(s for s in policy["Statement"] if s["Sid"] == "ECRPullPlatformImages") + assert pull["Resource"] == ( + f"arn:aws:ecr:{IBLAI_ECR_REGION}:{IBLAI_ECR_ACCOUNT_ID}:repository/*" + ) + + def test_ecr_auth_is_wildcard(self): + # ecr:GetAuthorizationToken can ONLY be granted on Resource: "*" + # — AWS rejects scoped ARNs for this action. + policy = build_runtime_iam_policy(["b"]) + auth = next(s for s in policy["Statement"] if s["Sid"] == "ECRAuth") + assert auth["Resource"] == "*" + assert auth["Action"] == ["ecr:GetAuthorizationToken"] + + def test_empty_buckets_raises(self): + with pytest.raises(ValueError, match="at least one S3 bucket"): + build_runtime_iam_policy([]) + + def test_policy_is_json_serializable(self): + policy = build_runtime_iam_policy(["a", "b", "c"]) + # Round-trip — what we hand the operator must survive `aws iam put-user-policy`. + round_tripped = json.loads(json.dumps(policy)) + assert round_tripped == policy + + +class TestExtractBuckets: + def test_all_three_present(self): + outputs = { + "instance_public_ip": "1.2.3.4", + "s3_bucket_backups": "p-backups", + "s3_bucket_media": "p-media", + "s3_bucket_static": "p-static", + } + assert extract_bucket_names(outputs) == ["p-backups", "p-media", "p-static"] + + def test_partial_outputs(self): + outputs = {"s3_bucket_backups": "only-backups"} + assert extract_bucket_names(outputs) == ["only-backups"] + + def test_no_buckets(self): + assert extract_bucket_names({}) == [] + assert extract_bucket_names({"instance_public_ip": "1.2.3.4"}) == [] + + def test_empty_string_skipped(self): + # Terraform sometimes emits "" for an unset output rather than omitting. 
+ outputs = {"s3_bucket_backups": "", "s3_bucket_media": "p-m"} + assert extract_bucket_names(outputs) == ["p-m"] + + +class TestRenderInstructions: + def test_writes_policy_file(self, infra_config, tmp_path): + outputs = { + "s3_bucket_backups": "test-backups", + "s3_bucket_media": "test-media", + "s3_bucket_static": "test-static", + } + render_runtime_access_instructions(infra_config, outputs, tmp_path) + policy_path = tmp_path / POLICY_FILENAME + assert policy_path.exists() + loaded = json.loads(policy_path.read_text()) + # File contents match what build_runtime_iam_policy emits. + expected = build_runtime_iam_policy(["test-backups", "test-media", "test-static"]) + assert loaded == expected + + def test_call_server_skipped(self, infra_config, tmp_path): + infra_config.deployment_type = DeploymentType.CALL + outputs = {"s3_bucket_backups": "would-not-be-touched"} + render_runtime_access_instructions(infra_config, outputs, tmp_path) + assert not (tmp_path / POLICY_FILENAME).exists() + + def test_no_buckets_skips_write(self, infra_config, tmp_path): + render_runtime_access_instructions(infra_config, outputs={}, ws=tmp_path) + assert not (tmp_path / POLICY_FILENAME).exists() From db5da56ac1b3f3e5e809b5ec3f22b3b552e06d77 Mon Sep 17 00:00:00 2001 From: bnsoni Date: Wed, 20 May 2026 11:34:14 +0300 Subject: [PATCH 5/8] =?UTF-8?q?fix(runtime-iam):=20policy=20is=20S3-only?= =?UTF-8?q?=20=E2=80=94=20ECR=20creds=20come=20from=20IBL=20separately?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Earlier rev folded both into one customer-minted policy. Correcting per spec: the customer creates an S3-only IAM user in their own account; ECR pull credentials for IBL's image registry are a separate IBL-provided handoff and are explicitly out of scope for this module. - `build_runtime_iam_policy` now emits only `PlatformBucketObjects` + `PlatformBucketList` statements. 
Dropped the `ECRAuth` and `ECRPullPlatformImages` statements, and removed the now-unused `IBLAI_ECR_ACCOUNT_ID` / `IBLAI_ECR_REGION` module constants. - Renderer rewritten: - Section title is now "Next: create the S3 IAM user". - Two-sentence opening explains it's the S3 set only. - User name template is `--s3-runtime` (was `-runtime`) so it's unambiguous which set this is. - Policy name is `iblai-s3-runtime`. - Closing line explicitly notes ECR pull credentials are provided separately by IBL and are NOT set up here. - Tests: added `test_no_ecr_statements` (negative assertion sweeping every Statement's Action list for `ecr:*` and failing on any hit). Dropped the ECR-resource-shape tests since those statements no longer exist. Net: 11 → 10 tests; full suite 575 passing. - README sub-section gains a leading two-row table making the "S3 (customer) vs ECR (IBL handoff)" split crystal-clear, then walks through the S3 user creation; the ECR row points back to IBL's handoff procedure. - `.env.setup.example` comment block restated: keys here are S3-only, ECR is a separate IBL handoff. - CHANGELOG 1.11.0 entry updated. Co-Authored-By: Claude Opus 4.7 (1M context) --- .env.setup.example | 17 ++++--- CHANGELOG.md | 10 ++-- README.md | 35 ++++++++----- src/iblai_infra/runtime_iam.py | 92 ++++++++++++---------------------- tests/test_runtime_iam.py | 35 +++++-------- 5 files changed, 82 insertions(+), 107 deletions(-) diff --git a/.env.setup.example b/.env.setup.example index bd0fdb0..242cbcb 100644 --- a/.env.setup.example +++ b/.env.setup.example @@ -8,15 +8,18 @@ # REQUIRED FOR BOTH MODES # ============================================================================ -# AWS credentials — these get embedded into /ibl/config.yml on the server -# and used for the lifetime of the deployment (ECR image pulls + S3 -# read/write on the buckets created at provision time). 
+# AWS credentials — S3 access for the platform server (read / write on +# the dm-media, dm-static, and backups buckets Terraform created in YOUR +# OWN AWS account). Baked into /ibl/config.yml for the lifetime of the +# deployment. # # IMPORTANT: do NOT reuse your provisioning admin keys here. After -# `iblai infra provision-env` finishes it prints the exact IAM policy -# JSON + the three `aws iam create-user / put-user-policy / -# create-access-key` commands to mint a minimum-privilege "runtime" user. -# Paste the resulting AccessKeyId + SecretAccessKey below. +# `iblai infra provision-env` finishes it prints the exact S3-only IAM +# policy JSON + the three `aws iam` commands to mint a minimum-privilege +# user. Paste the resulting AccessKeyId + SecretAccessKey below. +# +# ECR pull credentials for IBL's image registry are a SEPARATE handoff +# from IBL — not what you set up here. AWS_ACCESS_KEY_ID=AKIAIOSFODNN7EXAMPLE AWS_SECRET_ACCESS_KEY=wJalrXUtnFEMI/K7MDENG/bPxRfiCYEXAMPLEKEY diff --git a/CHANGELOG.md b/CHANGELOG.md index 1ee089b..f8f546e 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -3,13 +3,13 @@ ## [1.11.0] — 2026-05-20 ### Added -- **Post-provision runtime IAM helper** (`src/iblai_infra/runtime_iam.py`). After `provision` / `provision-env` succeeds, the CLI prints the exact minimum-privilege IAM policy JSON the operator needs to attach to a scoped runtime user in their own AWS account — and writes the same JSON to `/runtime-iam-policy.json` so it can be piped into `aws iam put-user-policy --policy-document file://...`. The policy scopes S3 to the literal bucket ARNs Terraform just created (no wildcards) and grants ECR auth + pull verbs against IBL's image registry. Skipped automatically for `DeploymentType.CALL` (no S3 buckets, different credential flow). 
-- **Three copy-paste `aws iam` commands** in the post-provision output (`create-user`, `put-user-policy`, `create-access-key`) using the resolved project / environment as the user name — operator pastes the resulting `AccessKeyId` + `SecretAccessKey` directly into `.env.setup`. -- **README sub-section** under "Provision infrastructure" documenting the runtime IAM step + the scope table. +- **Post-provision S3 IAM helper** (`src/iblai_infra/runtime_iam.py`). After `provision` / `provision-env` succeeds, the CLI prints the exact **S3-only** minimum-privilege IAM policy JSON the operator needs to attach to a scoped runtime user in their own AWS account — and writes the same JSON to `/runtime-iam-policy.json` so it can be piped into `aws iam put-user-policy --policy-document file://...`. The policy scopes S3 to the literal bucket ARNs Terraform just created (no wildcards, no `s3:*`, no bucket-policy / lifecycle / encryption mutation). Skipped automatically for `DeploymentType.CALL` (no S3 buckets). +- **Three copy-paste `aws iam` commands** in the post-provision output (`create-user`, `put-user-policy`, `create-access-key`) using `--s3-runtime` as the user name — operator pastes the resulting `AccessKeyId` + `SecretAccessKey` directly into `.env.setup`. +- **README sub-section** under "Provision infrastructure" documenting the S3 IAM step + the scope table, plus a credential-set table clarifying that **ECR pull credentials are a separate IBL-provided handoff**, not part of this flow. ### Changed -- **`.env.setup.example`** — the `AWS_ACCESS_KEY_ID` / `AWS_SECRET_ACCESS_KEY` comment block now explicitly directs the operator to use the runtime user the post-provision step mints, not their provisioning admin keys. -- **Section 4 of the README** (non-interactive `.env` flow) renumbered as a 3-step sequence (provision → mint runtime user → setup) so the IAM step isn't missed. 
+- **`.env.setup.example`** — the `AWS_ACCESS_KEY_ID` / `AWS_SECRET_ACCESS_KEY` comment block now explicitly directs the operator to use the S3-only runtime user the post-provision step mints, and calls out that ECR credentials are a separate handoff from IBL. +- **Section 4 of the README** (non-interactive `.env` flow) renumbered as a 3-step sequence (provision → mint S3 user → setup) so the IAM step isn't missed. ## [1.10.0] — 2026-05-20 diff --git a/README.md b/README.md index b044a90..ba29f92 100644 --- a/README.md +++ b/README.md @@ -122,29 +122,38 @@ Sizing guidance: single / multi-server require a **100 GB minimum** root volume. Terraform runs with real-time progress showing each resource as it's created. -#### After provision succeeds — create a runtime IAM user +#### After provision succeeds — create an S3 IAM user -The platform server bakes a single AWS access key into `/ibl/config.yml` for two ongoing purposes: **ECR pulls** (IBL's image registry, cross-account) and **S3 read/write** on the three buckets Terraform just created. Reusing your provisioning admin keys here is overkill — instead, mint a scoped runtime user in your own account. +Two distinct AWS credential sets serve the running platform: -When `provision` / `provision-env` finishes it prints the exact IAM policy JSON (also saved to `/runtime-iam-policy.json`) plus three `aws` commands to copy-paste: +| Credential set | Provided by | Used for | +|---|---|---| +| **S3** — the runtime user this section is about | You, in your own AWS account, post-provision | Read / write the three buckets Terraform just created | +| **ECR** — image-registry pulls | IBL (out-of-band handoff) | `docker login` against IBL's container registry | + +This section covers only the **S3** set. ECR credentials are provided separately by IBL — follow their handoff procedure for those. 
+ +When `provision` / `provision-env` finishes it prints the exact S3-only IAM policy JSON (also saved to `/runtime-iam-policy.json`) plus three `aws` commands to copy-paste: ```bash -aws iam create-user --user-name --runtime +aws iam create-user --user-name --s3-runtime aws iam put-user-policy \ - --user-name --runtime \ - --policy-name iblai-runtime \ + --user-name --s3-runtime \ + --policy-name iblai-s3-runtime \ --policy-document file:///runtime-iam-policy.json -aws iam create-access-key --user-name --runtime +aws iam create-access-key --user-name --s3-runtime ``` -Paste the resulting `AccessKeyId` + `SecretAccessKey` into `.env.setup` (`AWS_ACCESS_KEY_ID` + `AWS_SECRET_ACCESS_KEY`). The policy scope is tight: +Paste the resulting `AccessKeyId` + `SecretAccessKey` into `.env.setup` (`AWS_ACCESS_KEY_ID` + `AWS_SECRET_ACCESS_KEY`). -| | Resource | Verbs | -|---|---|---| -| S3 | The three buckets Terraform created (no wildcards) | `Get/Put/Delete/Acl/ListBucket` | -| ECR | IBL's `arn:aws:ecr:::repository/*` | `GetAuthorizationToken`, `BatchGetImage`, `BatchCheckLayerAvailability`, `GetDownloadUrlForLayer` | +**Policy scope** — S3 only, scoped to the literal bucket ARNs Terraform created (no wildcards): + +| Resource | Verbs | +|---|---| +| Objects in the three buckets (`arn:aws:s3:::/*`) | `GetObject` `PutObject` `DeleteObject` `GetObjectAcl` `PutObjectAcl` | +| The buckets themselves (`arn:aws:s3:::`) | `ListBucket` `GetBucketLocation` | -No bucket-policy mutation, no lifecycle config, no IAM rights. Safe to leave on the box for the lifetime of the deployment. Skipped automatically for `--deployment-type call-server` (no S3 buckets). +No `s3:*`, no bucket-policy mutation, no lifecycle / encryption config, no IAM rights. Skipped automatically for `--deployment-type call-server` (no S3 buckets). ### 3. 
Setup the platform diff --git a/src/iblai_infra/runtime_iam.py b/src/iblai_infra/runtime_iam.py index c0ba9eb..ccc2b5d 100644 --- a/src/iblai_infra/runtime_iam.py +++ b/src/iblai_infra/runtime_iam.py @@ -1,31 +1,29 @@ """Post-provision runtime IAM helper. -The platform server bakes a single AWS access key into `/ibl/config.yml` for -two ongoing purposes: - - 1. **ECR pulls** — `aws ecr get-login-password` against IBL's image - registry. Cross-account; works because IBL's ECR repositories have a - repository policy granting pulls from the operator's AWS account. - 2. **S3 access** — read / write the dm-media, dm-static, and backups - buckets Terraform just created in the operator's own account. - -Rather than reusing the operator's admin keys (full provisioning scope, -massive blast radius) or asking IBL ops to mint a separate user, this -module prints a **scoped IAM policy** the operator pastes into their own -IAM console after `provision-env` / `provision` succeeds. The resulting -access key is minimum-privilege: - - * S3: only the three buckets Terraform created, only the verbs the - platform actually uses (no `s3:*`, no bucket-policy mutation). - * ECR: only the auth + pull verbs, scoped to IBL's ECR repos. - -The policy JSON is also written to the project workspace +Two distinct AWS credential sets are needed on the running platform: + + 1. **S3 access** to the dm-media, dm-static, and backups buckets + Terraform just created in the **operator's own AWS account**. The + operator mints these themselves by attaching the policy this module + generates to a scoped IAM user — one-time, post-provision. + 2. **ECR pulls** against IBL's image registry. These credentials are + **provided separately by IBL** (out-of-band) and are NOT in scope + for this module. The post-provision instructions intentionally do + not mention them — operators should follow IBL's hand-off + procedure for the ECR keys. 
+ +The policy here is therefore **S3-only**: scoped to the literal bucket +ARNs Terraform created, with the verbs the platform actually uses (no +`s3:*`, no bucket-policy mutation, no lifecycle config — Terraform +configured those at provision time and the platform never revisits). + +The JSON is also written to the project workspace (`/runtime-iam-policy.json`) so the operator can pipe it directly into the CLI: aws iam put-user-policy \\ - --user-name -runtime \\ - --policy-name iblai-runtime \\ + --user-name -s3-runtime \\ + --policy-name iblai-s3-runtime \\ --policy-document file:///runtime-iam-policy.json """ @@ -37,12 +35,6 @@ from iblai_infra import ui from iblai_infra.models import DeploymentType, InfraConfig -# IBL's image registry account / region — the ECR cross-account pull target. -# Centralized here so the rendered policy stays consistent with the actual -# `docker login` target hardcoded across the ansible roles. -IBLAI_ECR_ACCOUNT_ID = "765174860755" -IBLAI_ECR_REGION = "us-east-1" - # Tight S3 verbs the platform actually uses at runtime. Notably excludes # bucket-policy / ACL mutations, lifecycle config, encryption config — all # of which Terraform set up at provision time and the platform never @@ -58,31 +50,25 @@ "s3:ListBucket", "s3:GetBucketLocation", ] -_ECR_AUTH_ACTIONS = ["ecr:GetAuthorizationToken"] -_ECR_PULL_ACTIONS = [ - "ecr:BatchCheckLayerAvailability", - "ecr:BatchGetImage", - "ecr:GetDownloadUrlForLayer", -] POLICY_FILENAME = "runtime-iam-policy.json" def build_runtime_iam_policy(bucket_names: list[str]) -> dict: - """Build the IAM policy JSON document for the runtime user. + """Build the **S3-only** IAM policy JSON document for the runtime user. `bucket_names` must be the literal S3 bucket names Terraform created (the values of `s3_bucket_*` outputs). Returns a dict ready to `json.dumps()` — no formatting opinions baked in here. 
+ + ECR access is intentionally not included: the IBL provider hands off + those credentials separately (see module docstring). """ if not bucket_names: raise ValueError("at least one S3 bucket name is required") bucket_arns = [f"arn:aws:s3:::{b}" for b in bucket_names] object_arns = [f"arn:aws:s3:::{b}/*" for b in bucket_names] - ecr_repo_arn = ( - f"arn:aws:ecr:{IBLAI_ECR_REGION}:{IBLAI_ECR_ACCOUNT_ID}:repository/*" - ) return { "Version": "2012-10-17", @@ -99,18 +85,6 @@ def build_runtime_iam_policy(bucket_names: list[str]) -> dict: "Action": _S3_BUCKET_ACTIONS, "Resource": bucket_arns, }, - { - "Sid": "ECRAuth", - "Effect": "Allow", - "Action": _ECR_AUTH_ACTIONS, - "Resource": "*", - }, - { - "Sid": "ECRPullPlatformImages", - "Effect": "Allow", - "Action": _ECR_PULL_ACTIONS, - "Resource": ecr_repo_arn, - }, ], } @@ -156,17 +130,17 @@ def render_runtime_access_instructions( policy_path = ws / POLICY_FILENAME policy_path.write_text(json.dumps(policy, indent=2) + "\n") - user_name = f"{config.project_name}-{config.environment.value}-runtime" + user_name = f"{config.project_name}-{config.environment.value}-s3-runtime" ui.newline() - ui.console.rule("[bold yellow]Next: create the runtime IAM user[/]") + ui.console.rule("[bold yellow]Next: create the S3 IAM user[/]") ui.console.print( - "The platform server needs minimum-privilege AWS credentials baked\n" - "into [highlight]/ibl/config.yml[/highlight] for [bold]ECR pulls[/bold] (IBL's image registry)\n" - "and [bold]S3 access[/bold] to the three buckets Terraform just created.\n" + "The platform server reads / writes the three S3 buckets Terraform\n" + "just created in [bold]your own AWS account[/bold]. 
Create a scoped IAM user\n" + "with the policy below and paste its access key into [highlight].env.setup[/highlight].\n" ) ui.console.print( - " [muted]The policy below has already been saved to:[/muted]\n" + " [muted]The policy has already been saved to:[/muted]\n" f" [highlight]{policy_path}[/highlight]\n" ) @@ -182,7 +156,7 @@ def render_runtime_access_instructions( f" [highlight]aws iam create-user --user-name {user_name}[/highlight]\n" f" [highlight]aws iam put-user-policy \\\n" f" --user-name {user_name} \\\n" - f" --policy-name iblai-runtime \\\n" + f" --policy-name iblai-s3-runtime \\\n" f" --policy-document file://{policy_path}[/highlight]\n" f" [highlight]aws iam create-access-key --user-name {user_name}[/highlight]\n" ) @@ -194,7 +168,7 @@ def render_runtime_access_instructions( f" [brand]iblai infra setup-env {config.project_name} -f .env.setup[/brand]\n" ) ui.muted( - " These runtime keys are minimum-privilege — safe to commit to a " - "vault or password manager, but never to git." + " ECR pull credentials for IBL's image registry are provided " + "separately by IBL — they are NOT what you set up here." ) ui.newline() diff --git a/tests/test_runtime_iam.py b/tests/test_runtime_iam.py index 3f1792f..92cf9c6 100644 --- a/tests/test_runtime_iam.py +++ b/tests/test_runtime_iam.py @@ -9,8 +9,6 @@ from iblai_infra.models import DeploymentType from iblai_infra.runtime_iam import ( - IBLAI_ECR_ACCOUNT_ID, - IBLAI_ECR_REGION, POLICY_FILENAME, build_runtime_iam_policy, extract_bucket_names, @@ -23,12 +21,18 @@ def test_single_bucket(self): policy = build_runtime_iam_policy(["my-backups"]) assert policy["Version"] == "2012-10-17" sids = {s["Sid"] for s in policy["Statement"]} - assert sids == { - "PlatformBucketObjects", - "PlatformBucketList", - "ECRAuth", - "ECRPullPlatformImages", - } + # S3-only by design — ECR credentials are provided separately by IBL. 
+ assert sids == {"PlatformBucketObjects", "PlatformBucketList"} + + def test_no_ecr_statements(self): + # IBL's image registry creds are an out-of-band handoff. The + # customer-created policy must not include ECR scope. + policy = build_runtime_iam_policy(["b"]) + for stmt in policy["Statement"]: + for action in stmt["Action"]: + assert not action.startswith("ecr:"), ( + f"runtime IAM policy must be S3-only; found {action!r}" + ) def test_three_buckets_arn_shape(self): policy = build_runtime_iam_policy([ @@ -58,21 +62,6 @@ def test_s3_actions_are_tight(self): for forbidden in ("s3:PutBucketPolicy", "s3:DeleteBucketPolicy", "s3:PutLifecycleConfiguration"): assert forbidden not in obj_actions - def test_ecr_resource_targets_iblai_account(self): - policy = build_runtime_iam_policy(["b"]) - pull = next(s for s in policy["Statement"] if s["Sid"] == "ECRPullPlatformImages") - assert pull["Resource"] == ( - f"arn:aws:ecr:{IBLAI_ECR_REGION}:{IBLAI_ECR_ACCOUNT_ID}:repository/*" - ) - - def test_ecr_auth_is_wildcard(self): - # ecr:GetAuthorizationToken can ONLY be granted on Resource: "*" - # — AWS rejects scoped ARNs for this action. 
- policy = build_runtime_iam_policy(["b"]) - auth = next(s for s in policy["Statement"] if s["Sid"] == "ECRAuth") - assert auth["Resource"] == "*" - assert auth["Action"] == ["ecr:GetAuthorizationToken"] - def test_empty_buckets_raises(self): with pytest.raises(ValueError, match="at least one S3 bucket"): build_runtime_iam_policy([]) From 4e6054c39aa2ba3abd660733614736745dd63e0e Mon Sep 17 00:00:00 2001 From: bnsoni Date: Wed, 20 May 2026 11:37:40 +0300 Subject: [PATCH 6/8] docs(runtime-iam): shorten ECR note to point at ibl.ai/contact MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Replaces the longer "credentials are provided separately by IBL" wording across runtime_iam.py, .env.setup.example, and README with: "For ECR images, use AWS credentials provided by ibl.ai — or contact us at https://ibl.ai/contact" Same surface area; tighter copy. Co-Authored-By: Claude Opus 4.7 (1M context) --- .env.setup.example | 4 ++-- README.md | 4 ++-- src/iblai_infra/runtime_iam.py | 4 ++-- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/.env.setup.example b/.env.setup.example index 242cbcb..e4c6766 100644 --- a/.env.setup.example +++ b/.env.setup.example @@ -18,8 +18,8 @@ # policy JSON + the three `aws iam` commands to mint a minimum-privilege # user. Paste the resulting AccessKeyId + SecretAccessKey below. # -# ECR pull credentials for IBL's image registry are a SEPARATE handoff -# from IBL — not what you set up here. 
+# For ECR images, use AWS credentials provided by ibl.ai — or contact +# us at https://ibl.ai/contact AWS_ACCESS_KEY_ID=AKIAIOSFODNN7EXAMPLE AWS_SECRET_ACCESS_KEY=wJalrXUtnFEMI/K7MDENG/bPxRfiCYEXAMPLEKEY diff --git a/README.md b/README.md index ba29f92..a377534 100644 --- a/README.md +++ b/README.md @@ -129,9 +129,9 @@ Two distinct AWS credential sets serve the running platform: | Credential set | Provided by | Used for | |---|---|---| | **S3** — the runtime user this section is about | You, in your own AWS account, post-provision | Read / write the three buckets Terraform just created | -| **ECR** — image-registry pulls | IBL (out-of-band handoff) | `docker login` against IBL's container registry | +| **ECR** — image-registry pulls | AWS credentials provided by ibl.ai — or contact us at [ibl.ai/contact](https://ibl.ai/contact) | `docker login` against IBL's container registry | -This section covers only the **S3** set. ECR credentials are provided separately by IBL — follow their handoff procedure for those. +This section covers only the **S3** set. When `provision` / `provision-env` finishes it prints the exact S3-only IAM policy JSON (also saved to `<workspace>/runtime-iam-policy.json`) plus three `aws` commands to copy-paste: diff --git a/src/iblai_infra/runtime_iam.py b/src/iblai_infra/runtime_iam.py index ccc2b5d..0169e30 100644 --- a/src/iblai_infra/runtime_iam.py +++ b/src/iblai_infra/runtime_iam.py @@ -168,7 +168,7 @@ def render_runtime_access_instructions( f" [brand]iblai infra setup-env {config.project_name} -f .env.setup[/brand]\n" ) ui.muted( - " ECR pull credentials for IBL's image registry are provided " - "separately by IBL — they are NOT what you set up here."
+ " For ECR images, use AWS credentials provided by ibl.ai — " + "or contact us at https://ibl.ai/contact" ) ui.newline() From a54ed8c799f825e1ad6efe2394a8ab375c494020 Mon Sep 17 00:00:00 2001 From: bnsoni Date: Wed, 20 May 2026 11:50:07 +0300 Subject: [PATCH 7/8] =?UTF-8?q?feat(setup):=20two-credential=20split=20?= =?UTF-8?q?=E2=80=94=20S3=20=E2=86=92=20config.yml,=20ECR=20=E2=86=92=20~/?= =?UTF-8?q?.aws/credentials?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Closes a long-standing conflation in the .env.setup credential model. Previously a single AWS access key had to satisfy two distinct accounts at once: ECR pulls against IBL's image registry AND S3 access against the buckets in the operator's own account. Worked only when that one key happened to have both scopes. Now the two sets are first-class and land in the right place on the host: S3 keys → /ibl/config.yml root (AWS_ACCESS_KEY_ID + AWS_SECRET_ACCESS_KEY at top level). Consumed by DM / edX containers at runtime via iblai-cli-ops templating. Source: customer creates this user post-provision using the runtime-iam-policy.json the CLI prints. ECR keys → ~/.aws/credentials [default] profile. Consumed by `aws ecr get-login-password` in every Login to ECR task, without env-var overrides anywhere. Source: ibl.ai-provided handoff. Implementation: * `SetupConfig` gains optional `ecr_aws_access_key_id` / `ecr_aws_secret_access_key` / `ecr_aws_default_region` (secret is `Field(exclude=True)`). * `env_setup.py` reads new `ECR_AWS_*` env vars. * `runner.py::_build_extra_vars` passes both sets as separate ansible extra-vars (`aws_*` and `ecr_aws_*`). When ECR is empty, the S3 keys fall through — backwards-compatible with older single-key-set deployments. * `awscli` role: writes ECR keys (not S3) to ~/.aws/credentials default profile. * `ibl_platform` role: new task writes S3 keys to /ibl/config.yml root via three `ibl config save --set` calls. Gated `no_log: true`. 
* Four `Login to ECR` tasks across `ibl_spa`, `ibl_launch_services`, `ibl_platform`, `ibl_service_update` strip the env-var overrides — they now use whatever ~/.aws/credentials [default] holds, which is exactly the ECR set. Docs / examples: * `.env.setup.example` — two clearly-labeled AWS_* blocks (S3 + ECR) with destination + usage inline. Comments call out the fall-through behavior for older deployments. * `README` — credential-set table under "Provision infrastructure" gains a "Lives in" column making the split unambiguous. * `CHANGELOG` — 1.11.0 entry expanded with the split details. Full suite: 575 passing. Co-Authored-By: Claude Opus 4.7 (1M context) --- .env.setup.example | 36 +++++++++++++------ CHANGELOG.md | 9 ++++- README.md | 10 +++--- src/iblai_infra/ansible/runner.py | 19 ++++++++++ .../single-server/roles/awscli/tasks/main.yml | 25 ++++++++----- .../roles/ibl_launch_services/tasks/main.yml | 5 ++- .../roles/ibl_platform/tasks/main.yml | 29 +++++++++++++-- .../roles/ibl_spa/tasks/main.yml | 5 ++- src/iblai_infra/env_setup.py | 3 ++ src/iblai_infra/models.py | 13 +++++++ 10 files changed, 122 insertions(+), 32 deletions(-) diff --git a/.env.setup.example b/.env.setup.example index e4c6766..1ba71a2 100644 --- a/.env.setup.example +++ b/.env.setup.example @@ -8,21 +8,37 @@ # REQUIRED FOR BOTH MODES # ============================================================================ -# AWS credentials — S3 access for the platform server (read / write on -# the dm-media, dm-static, and backups buckets Terraform created in YOUR -# OWN AWS account). Baked into /ibl/config.yml for the lifetime of the -# deployment. 
+# ============================================================================ +# AWS credentials — TWO distinct sets serve two distinct purposes +# ============================================================================ + +# --- S3 access (customer-created, post-provision) --- +# Read / write on the dm-media, dm-static, and backups buckets Terraform +# created in YOUR OWN AWS account. Written to the root of /ibl/config.yml +# by the `ibl_platform` role; consumed by DM / edX containers at runtime. # -# IMPORTANT: do NOT reuse your provisioning admin keys here. After -# `iblai infra provision-env` finishes it prints the exact S3-only IAM -# policy JSON + the three `aws iam` commands to mint a minimum-privilege -# user. Paste the resulting AccessKeyId + SecretAccessKey below. +# After `iblai infra provision-env` finishes it prints the exact S3-only +# IAM policy JSON + three `aws iam` commands to mint a minimum-privilege +# user. Paste that user's AccessKeyId + SecretAccessKey below. # -# For ECR images, use AWS credentials provided by ibl.ai — or contact -# us at https://ibl.ai/contact +# Do NOT reuse your provisioning admin keys here. AWS_ACCESS_KEY_ID=AKIAIOSFODNN7EXAMPLE AWS_SECRET_ACCESS_KEY=wJalrXUtnFEMI/K7MDENG/bPxRfiCYEXAMPLEKEY +# --- ECR pulls (provided by ibl.ai) --- +# `docker login` against IBL's container registry. Written to +# ~/.aws/credentials [default] on the host by the `awscli` role so +# `aws ecr get-login-password` picks them up without env-var overrides. +# +# For ECR images, use AWS credentials provided by ibl.ai — or contact us +# at https://ibl.ai/contact +# +# If you leave these blank, the S3 keys above fall through to ECR (the +# old single-key-set behavior — works only when one key has both scopes). 
+ECR_AWS_ACCESS_KEY_ID=AKIAIOSFODNN7EXAMPLE +ECR_AWS_SECRET_ACCESS_KEY=wJalrXUtnFEMI/K7MDENG/bPxRfiCYEXAMPLEKEY +# ECR_AWS_DEFAULT_REGION=us-east-1 # defaults to AWS_DEFAULT_REGION + # GitHub PAT — needs read on iblai/iblai-cli-ops + iblai/iblai-prod-images # (or your overrides via GITHUB_ORG / CLI_OPS_REPO / PROD_IMAGES_REPO below) GIT_TOKEN= diff --git a/CHANGELOG.md b/CHANGELOG.md index f8f546e..f8717d8 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -8,8 +8,15 @@ - **README sub-section** under "Provision infrastructure" documenting the S3 IAM step + the scope table, plus a credential-set table clarifying that **ECR pull credentials are a separate IBL-provided handoff**, not part of this flow. ### Changed -- **`.env.setup.example`** — the `AWS_ACCESS_KEY_ID` / `AWS_SECRET_ACCESS_KEY` comment block now explicitly directs the operator to use the S3-only runtime user the post-provision step mints, and calls out that ECR credentials are a separate handoff from IBL. +- **Two-credential split end-to-end.** Previously a single `AWS_ACCESS_KEY_ID` / `AWS_SECRET_ACCESS_KEY` from `.env.setup` had to serve both ECR auth (IBL's account) and S3 access (customer's account) — works only when one key happens to have both scopes. Now: + - `AWS_ACCESS_KEY_ID` / `AWS_SECRET_ACCESS_KEY` carry the **S3** keys (customer-created post-provision) and are written to the root of `/ibl/config.yml` by a new task in the `ibl_platform` role; consumed by DM / edX at runtime via iblai-cli-ops templating. + - New `ECR_AWS_ACCESS_KEY_ID` / `ECR_AWS_SECRET_ACCESS_KEY` (optional `ECR_AWS_DEFAULT_REGION`) carry the **ECR** keys (ibl.ai-provided). The `awscli` role writes these to `~/.aws/credentials` `[default]` profile on the host so `aws ecr get-login-password` finds them without env-var overrides anywhere. 
+ - The four `Login to ECR` tasks across `ibl_spa`, `ibl_launch_services`, `ibl_platform`, `ibl_service_update` no longer set `AWS_ACCESS_KEY_ID` / `AWS_SECRET_ACCESS_KEY` env-vars at command time — they rely on the default profile populated by `awscli`. + - `SetupConfig` gains `ecr_aws_access_key_id`, `ecr_aws_secret_access_key`, `ecr_aws_default_region` (all optional). Secret is `Field(exclude=True)`. + - `runner.py::_build_extra_vars` passes both sets as separate ansible extra-vars. When `ECR_AWS_*` is empty, the S3 keys fall through to the ECR slot — backwards-compatible with single-key-set deployments. +- **`.env.setup.example`** now shows two clearly-labeled `AWS_*` blocks (S3 + ECR) with usage / destination spelled out inline. - **Section 4 of the README** (non-interactive `.env` flow) renumbered as a 3-step sequence (provision → mint S3 user → setup) so the IAM step isn't missed. +- **README credential-set table** under "Provision infrastructure" gains a "Lives in" column documenting `/ibl/config.yml` root vs `~/.aws/credentials [default]` so the operator knows exactly where each set lands on the server. ## [1.10.0] — 2026-05-20 diff --git a/README.md b/README.md index a377534..4d25e2e 100644 --- a/README.md +++ b/README.md @@ -126,10 +126,12 @@ Terraform runs with real-time progress showing each resource as it's created. 
Two distinct AWS credential sets serve the running platform: -| Credential set | Provided by | Used for | -|---|---|---| -| **S3** — the runtime user this section is about | You, in your own AWS account, post-provision | Read / write the three buckets Terraform just created | -| **ECR** — image-registry pulls | AWS credentials provided by ibl.ai — or contact us at [ibl.ai/contact](https://ibl.ai/contact) | `docker login` against IBL's container registry | +| Credential set | Provided by | Lives in (on the server) | Used for | +|---|---|---|---| +| **S3** — the runtime user this section is about | You, in your own AWS account, post-provision | `/ibl/config.yml` root (`AWS_ACCESS_KEY_ID` / `AWS_SECRET_ACCESS_KEY`) | DM / edX runtime read / write on the three buckets Terraform created | +| **ECR** — image-registry pulls | AWS credentials provided by ibl.ai — or contact us at [ibl.ai/contact](https://ibl.ai/contact) | `~/.aws/credentials` `[default]` profile | `aws ecr get-login-password` for `docker login` against IBL's registry | + +Two separate `AWS_*` blocks in `.env.setup` carry each set: `AWS_ACCESS_KEY_ID` / `AWS_SECRET_ACCESS_KEY` for S3, and `ECR_AWS_ACCESS_KEY_ID` / `ECR_AWS_SECRET_ACCESS_KEY` for ECR. If `ECR_AWS_*` is left blank, the S3 keys fall through to ECR (backwards-compatible with older single-key-set deployments). This section covers only the **S3** set. diff --git a/src/iblai_infra/ansible/runner.py b/src/iblai_infra/ansible/runner.py index 67d3599..8b3c2cb 100644 --- a/src/iblai_infra/ansible/runner.py +++ b/src/iblai_infra/ansible/runner.py @@ -462,10 +462,29 @@ def _build_extra_vars(self) -> dict: """Build the extra-vars dict. 
Secrets are passed here, never to disk.""" cli_ops_repo, cli_ops_subdir = parse_repo_path(self.config.cli_ops_repo) prod_images_repo, prod_images_subdir = parse_repo_path(self.config.prod_images_repo) + # Two AWS credential sets are surfaced as separate extra_vars so the + # ansible roles can place each set in the right destination: + # - aws_* → S3 access keys (customer-created post- + # provision). The `ibl_platform` role writes + # these to `/ibl/config.yml` root for DM / edX. + # - ecr_aws_* → IBL-provided ECR pull keys. The `awscli` + # role writes these to `~/.aws/credentials` + # `[default]` so `aws ecr get-login-password` + # picks them up. + # When the operator hasn't supplied ECR_AWS_* in .env.setup (older + # single-key-set deployments), fall through to the S3 keys so the + # ECR login path still works — with the caveat that the same key + # is doing both jobs. + ecr_key_id = self.config.ecr_aws_access_key_id or self.config.aws_access_key_id + ecr_secret = self.config.ecr_aws_secret_access_key or self.config.aws_secret_access_key + ecr_region = self.config.ecr_aws_default_region or self.config.aws_default_region extra = { "aws_access_key_id": self.config.aws_access_key_id, "aws_secret_access_key": self.config.aws_secret_access_key, "aws_default_region": self.config.aws_default_region, + "ecr_aws_access_key_id": ecr_key_id, + "ecr_aws_secret_access_key": ecr_secret, + "ecr_aws_default_region": ecr_region, "git_access_token": self.config.git_access_token, "github_org": self.config.github_org, "cli_ops_repo": cli_ops_repo, diff --git a/src/iblai_infra/ansible/templates/single-server/roles/awscli/tasks/main.yml b/src/iblai_infra/ansible/templates/single-server/roles/awscli/tasks/main.yml index 1ee076d..6378e15 100644 --- a/src/iblai_infra/ansible/templates/single-server/roles/awscli/tasks/main.yml +++ b/src/iblai_infra/ansible/templates/single-server/roles/awscli/tasks/main.yml @@ -27,19 +27,28 @@ changed_when: false ignore_errors: true -- name: Configure 
AWS credentials - when: aws_access_key_id | length > 0 and aws_secret_access_key | length > 0 +# Writes the ECR pull credentials (ibl.ai-provided) to +# ~/.aws/credentials [default] so `aws ecr get-login-password` picks them +# up by default everywhere on the host. The S3 access keys (customer- +# created post-provision) live in /ibl/config.yml — written by the +# `ibl_platform` role, NOT here. +# +# `ecr_aws_*` falls back to `aws_*` upstream in runner.py when the +# operator hasn't supplied a separate ECR set (older single-key-set +# deployments). +- name: Configure ECR credentials in ~/.aws/credentials [default] + when: ecr_aws_access_key_id | length > 0 and ecr_aws_secret_access_key | length > 0 become: false block: - - name: Set AWS access key ID - command: /usr/local/bin/aws configure set aws_access_key_id "{{ aws_access_key_id }}" + - name: Set AWS access key ID (ECR) + command: /usr/local/bin/aws configure set aws_access_key_id "{{ ecr_aws_access_key_id }}" - - name: Set AWS secret access key - command: /usr/local/bin/aws configure set aws_secret_access_key "{{ aws_secret_access_key }}" + - name: Set AWS secret access key (ECR) + command: /usr/local/bin/aws configure set aws_secret_access_key "{{ ecr_aws_secret_access_key }}" no_log: true - - name: Set AWS default region - command: /usr/local/bin/aws configure set default.region "{{ aws_default_region }}" + - name: Set AWS default region (ECR) + command: /usr/local/bin/aws configure set default.region "{{ ecr_aws_default_region }}" - name: Set AWS output format command: /usr/local/bin/aws configure set output json diff --git a/src/iblai_infra/ansible/templates/single-server/roles/ibl_launch_services/tasks/main.yml b/src/iblai_infra/ansible/templates/single-server/roles/ibl_launch_services/tasks/main.yml index d6c56a3..dd2bd48 100644 --- a/src/iblai_infra/ansible/templates/single-server/roles/ibl_launch_services/tasks/main.yml +++ 
b/src/iblai_infra/ansible/templates/single-server/roles/ibl_launch_services/tasks/main.yml @@ -4,6 +4,8 @@ # --------------------------------------------------------------------------- - name: Login to ECR + # Uses ~/.aws/credentials [default] profile (populated by the `awscli` + # role with the ECR pull keys ibl.ai supplied via ecr_aws_*). become: false shell: | aws ecr get-login-password --region us-east-1 | docker login --username AWS --password-stdin 765174860755.dkr.ecr.us-east-1.amazonaws.com @@ -11,9 +13,6 @@ executable: /bin/bash environment: HOME: "/home/{{ ansible_user }}" - AWS_ACCESS_KEY_ID: "{{ aws_access_key_id }}" - AWS_SECRET_ACCESS_KEY: "{{ aws_secret_access_key }}" - AWS_DEFAULT_REGION: "{{ aws_default_region }}" # --------------------------------------------------------------------------- # DM restart diff --git a/src/iblai_infra/ansible/templates/single-server/roles/ibl_platform/tasks/main.yml b/src/iblai_infra/ansible/templates/single-server/roles/ibl_platform/tasks/main.yml index c3455dd..9b1ca93 100644 --- a/src/iblai_infra/ansible/templates/single-server/roles/ibl_platform/tasks/main.yml +++ b/src/iblai_infra/ansible/templates/single-server/roles/ibl_platform/tasks/main.yml @@ -234,6 +234,30 @@ HOME: "/home/{{ ansible_user }}" IBL_ROOT: "{{ ibl_root }}/" +# S3 access keys at the root of /ibl/config.yml. Consumed by iblai-cli-ops +# templating + the running DM / edX containers for read / write on the +# three buckets Terraform created. These are the customer-created keys +# scoped by `runtime-iam-policy.json` (S3-only) — NOT the ECR keys (which +# live in ~/.aws/credentials [default] — see the `awscli` role). 
+- name: Write S3 access keys to /ibl/config.yml root + become: false + shell: | + export PYENV_ROOT="{{ pyenv_root }}" + export PATH="$PYENV_ROOT/bin:$PATH" + eval "$(pyenv init -)"; eval "$(pyenv virtualenv-init -)" + pyenv activate "{{ venv_name }}" + ibl config save --set AWS_ACCESS_KEY_ID='{{ aws_access_key_id }}' && \ + ibl config save --set AWS_SECRET_ACCESS_KEY='{{ aws_secret_access_key }}' && \ + ibl config save --set AWS_DEFAULT_REGION='{{ aws_default_region }}' + args: + executable: /bin/bash + chdir: "{{ repo_dir }}" + environment: + HOME: "/home/{{ ansible_user }}" + IBL_ROOT: "{{ ibl_root }}/" + no_log: true + when: aws_access_key_id | length > 0 and aws_secret_access_key | length > 0 + - name: Enable unified API gateway become: false shell: | @@ -428,6 +452,8 @@ when: enable_ai | bool - name: Login to ECR + # Uses ~/.aws/credentials [default] profile (populated by the `awscli` + # role with the ECR pull keys ibl.ai supplied via ecr_aws_*). become: false shell: | aws ecr get-login-password --region us-east-1 | docker login --username AWS --password-stdin 765174860755.dkr.ecr.us-east-1.amazonaws.com @@ -435,6 +461,3 @@ executable: /bin/bash environment: HOME: "/home/{{ ansible_user }}" - AWS_ACCESS_KEY_ID: "{{ aws_access_key_id }}" - AWS_SECRET_ACCESS_KEY: "{{ aws_secret_access_key }}" - AWS_DEFAULT_REGION: "{{ aws_default_region }}" diff --git a/src/iblai_infra/ansible/templates/single-server/roles/ibl_spa/tasks/main.yml b/src/iblai_infra/ansible/templates/single-server/roles/ibl_spa/tasks/main.yml index d33fcd1..af776eb 100644 --- a/src/iblai_infra/ansible/templates/single-server/roles/ibl_spa/tasks/main.yml +++ b/src/iblai_infra/ansible/templates/single-server/roles/ibl_spa/tasks/main.yml @@ -185,6 +185,8 @@ IBL_ROOT: "{{ ibl_root }}/" - name: Login to ECR for SPA images + # Uses ~/.aws/credentials [default] profile (populated by the `awscli` + # role with the ECR pull keys ibl.ai supplied via ecr_aws_*). 
become: false shell: | aws ecr get-login-password --region us-east-1 | docker login --username AWS --password-stdin 765174860755.dkr.ecr.us-east-1.amazonaws.com @@ -192,9 +194,6 @@ executable: /bin/bash environment: HOME: "/home/{{ ansible_user }}" - AWS_ACCESS_KEY_ID: "{{ aws_access_key_id }}" - AWS_SECRET_ACCESS_KEY: "{{ aws_secret_access_key }}" - AWS_DEFAULT_REGION: "{{ aws_default_region }}" - name: Launch Auth SPA become: false diff --git a/src/iblai_infra/env_setup.py b/src/iblai_infra/env_setup.py index 6c885cf..819a41f 100644 --- a/src/iblai_infra/env_setup.py +++ b/src/iblai_infra/env_setup.py @@ -299,6 +299,9 @@ def build_setup_config_from_env( aws_access_key_id=env["AWS_ACCESS_KEY_ID"].strip(), aws_secret_access_key=env["AWS_SECRET_ACCESS_KEY"].strip(), aws_default_region=region, + ecr_aws_access_key_id=(env.get("ECR_AWS_ACCESS_KEY_ID") or "").strip(), + ecr_aws_secret_access_key=(env.get("ECR_AWS_SECRET_ACCESS_KEY") or ""), + ecr_aws_default_region=(env.get("ECR_AWS_DEFAULT_REGION") or "").strip(), git_access_token=git_token, github_org=(env.get("GITHUB_ORG") or "iblai").strip(), cli_ops_repo=(env.get("CLI_OPS_REPO") or "iblai-cli-ops").strip(), diff --git a/src/iblai_infra/models.py b/src/iblai_infra/models.py index 61aa142..637c037 100644 --- a/src/iblai_infra/models.py +++ b/src/iblai_infra/models.py @@ -393,9 +393,22 @@ class SetupConfig(BaseModel): enable_ai: bool = True is_resetup: bool = False create_playwright_platforms: bool = False + # S3 access keys — customer-created post-provision (scoped to the three + # dm-media / dm-static / backups buckets Terraform created). Written to + # `/ibl/config.yml` root by the `ibl_platform` role; consumed by DM / + # edX at runtime via iblai-cli-ops templating. aws_access_key_id: str aws_secret_access_key: str aws_default_region: str + # ECR pull keys — provided by ibl.ai out-of-band. 
Written to + # `~/.aws/credentials` `[default]` profile on the host by the `awscli` + # role; consumed by `aws ecr get-login-password` in any role that does + # `docker login`. Optional — when empty, the S3 keys above fall through + # (backwards-compatible with one-key-set deployments). Secret is + # `Field(exclude=True)` so it never lands in `state.json`. + ecr_aws_access_key_id: str = "" + ecr_aws_secret_access_key: str = Field(default="", exclude=True) + ecr_aws_default_region: str = "" git_access_token: str # GitHub org + repo names for the two private packages this setup # installs (iblai-prod-images directly, iblai-cli-ops transitively). From 4383f35de7e77b51a22da039e704e3b149158716 Mon Sep 17 00:00:00 2001 From: bnsoni Date: Wed, 20 May 2026 12:12:43 +0300 Subject: [PATCH 8/8] =?UTF-8?q?fix(ibl=5Fspa):=20bump=20SPA-ready=20wait?= =?UTF-8?q?=20budget=2010=C3=9715s=20=E2=86=92=2030=C3=9715s=20(150s=20?= =?UTF-8?q?=E2=86=92=20450s)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Root cause: SPA images do NOT ship with node_modules baked in. The container runs `pnpm install` on first boot (~80–120s observed) before Next.js can start. Combined with `docker compose pull` and image- extraction overhead, total cold-start can comfortably exceed the older 150s budget on a slower instance or marginal network — the wait task gives up, the playbook bails, but the SPA finishes installing seconds later and ends up serving 200. False negative. Repro: a fresh `iblai infra setup-env ` run failed at the Auth SPA wait with 10 attempts of `non-zero return code`. SSH'd in immediately after, container was Up 17 minutes, curl `localhost:5000` returned 200. The SPA was healthy — the wait just didn't wait long enough. Fix: 30 retries × 15s = 450s (7.5 min). 
Applied to all six SPA wait tasks across both flows: ibl_spa role (initial setup / setup-env) - Wait for Auth SPA - Wait for Mentor SPA - Wait for Skills SPA ibl_launch_services (AMI launch / launch-env) - Wait for Auth SPA - Wait for Mentor SPA - Wait for Skills SPA Each task gets an inline comment explaining the 450s budget rationale so a future maintainer doesn't shrink it without re-tracing this. Note: a node_modules-prebake at the image level would fix this more elegantly, but that's an iblai-prod-images concern, outside this repo. This change makes the ansible-side wait robust to the current image shape. Co-Authored-By: Claude Opus 4.7 (1M context) --- .../roles/ibl_launch_services/tasks/main.yml | 12 +++++++++--- .../roles/ibl_spa/tasks/main.yml | 19 ++++++++++++++++--- 2 files changed, 25 insertions(+), 6 deletions(-) diff --git a/src/iblai_infra/ansible/templates/single-server/roles/ibl_launch_services/tasks/main.yml b/src/iblai_infra/ansible/templates/single-server/roles/ibl_launch_services/tasks/main.yml index dd2bd48..968cc98 100644 --- a/src/iblai_infra/ansible/templates/single-server/roles/ibl_launch_services/tasks/main.yml +++ b/src/iblai_infra/ansible/templates/single-server/roles/ibl_launch_services/tasks/main.yml @@ -181,6 +181,10 @@ delay: 5 - name: Wait for Auth SPA to be ready + # 30 × 15s = 450s. The SPA image runs `pnpm install` on first boot + # before Next.js starts, which can push cold-start past the older + # 150s budget on slower hosts. See ibl_spa role for the long-form + # comment. become: false shell: | curl -s -o /dev/null -w '%{http_code}' http://localhost:5000/ 2>/dev/null | grep -qE '^(200|301|302)$' @@ -188,7 +192,7 @@ executable: /bin/bash register: auth_health until: auth_health.rc == 0 - retries: 10 + retries: 30 delay: 15 - name: Restart Mentor SPA @@ -203,6 +207,7 @@ delay: 5 - name: Wait for Mentor SPA to be ready + # 450s cold-start budget — same rationale as the Auth SPA wait above. 
become: false shell: | curl -s -o /dev/null -w '%{http_code}' http://localhost:5001/ 2>/dev/null | grep -qE '^(200|301|302)$' @@ -210,7 +215,7 @@ executable: /bin/bash register: mentor_health until: mentor_health.rc == 0 - retries: 10 + retries: 30 delay: 15 - name: Restart Skills SPA @@ -225,6 +230,7 @@ delay: 5 - name: Wait for Skills SPA to be ready + # 450s cold-start budget — same rationale as the Auth SPA wait above. become: false shell: | curl -s -o /dev/null -w '%{http_code}' http://localhost:5002/ 2>/dev/null | grep -qE '^(200|301|302)$' @@ -232,7 +238,7 @@ executable: /bin/bash register: skills_health until: skills_health.rc == 0 - retries: 10 + retries: 30 delay: 15 # --------------------------------------------------------------------------- diff --git a/src/iblai_infra/ansible/templates/single-server/roles/ibl_spa/tasks/main.yml b/src/iblai_infra/ansible/templates/single-server/roles/ibl_spa/tasks/main.yml index af776eb..dd86bb2 100644 --- a/src/iblai_infra/ansible/templates/single-server/roles/ibl_spa/tasks/main.yml +++ b/src/iblai_infra/ansible/templates/single-server/roles/ibl_spa/tasks/main.yml @@ -207,6 +207,13 @@ retries: 2 delay: 5 +# Budget: 30 × 15s = 450s (7.5 min). SPA images don't ship with +# node_modules baked in — the container runs `pnpm install` on first +# boot (~80–120s observed) before Next.js can start. Combined with +# `docker compose pull` and image-extraction overhead, cold start can +# easily exceed the older 10-retry / 150s budget on a slower instance +# or marginal network. 450s comfortably covers that without making +# real failures take an unreasonable amount of time to surface. 
- name: Wait for Auth SPA to be ready become: false shell: | @@ -215,7 +222,7 @@ executable: /bin/bash register: auth_health until: auth_health.rc == 0 - retries: 10 + retries: 30 delay: 15 - name: Launch Mentor SPA @@ -231,6 +238,9 @@ delay: 5 - name: Wait for Mentor SPA to be ready + # Same 450s cold-start budget rationale as the Auth SPA wait above — + # node_modules install on first boot pushes ready-time past the older + # 150s budget on slower hosts. become: false shell: | curl -s -o /dev/null -w '%{http_code}' http://localhost:5001/ 2>/dev/null | grep -qE '^(200|301|302)$' @@ -238,7 +248,7 @@ executable: /bin/bash register: mentor_health until: mentor_health.rc == 0 - retries: 10 + retries: 30 delay: 15 - name: Launch Skills SPA @@ -254,6 +264,9 @@ delay: 5 - name: Wait for Skills SPA to be ready + # Same 450s cold-start budget rationale as the Auth SPA wait above — + # node_modules install on first boot pushes ready-time past the older + # 150s budget on slower hosts. become: false shell: | curl -s -o /dev/null -w '%{http_code}' http://localhost:5002/ 2>/dev/null | grep -qE '^(200|301|302)$' @@ -261,7 +274,7 @@ executable: /bin/bash register: skills_health until: skills_health.rc == 0 - retries: 10 + retries: 30 delay: 15 - name: Save config and reload proxy after SPA launch