diff --git a/.env.example b/.env.example index d13c608..0da345d 100644 --- a/.env.example +++ b/.env.example @@ -24,8 +24,9 @@ DOMAIN=platform.example.com # GitHub GIT_TOKEN=ghp_xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx -# Platform Admin -ADMIN_USERNAME=ibl_admin +# Platform Admin. +# NOTE: `ibl_admin` is reserved for system use — pick a different name. +ADMIN_USERNAME=platform_admin ADMIN_EMAIL=admin@example.com ADMIN_PASSWORD=change-me-min-8-chars diff --git a/.env.provision.example b/.env.provision.example index 49f3ee4..610f2ff 100644 --- a/.env.provision.example +++ b/.env.provision.example @@ -34,8 +34,10 @@ SSH_KEY_METHOD=generate # generate | existing_file | aws_keypair # SSH_KEY_NAME=my-existing-keypair-name # --- Compute (optional, defaults shown) --- +# NOTE: t3.2xlarge has 32 GB RAM. If you enable AI features in the setup +# step, 64 GB (e.g. m5.4xlarge or r5.2xlarge) is strongly recommended. INSTANCE_TYPE=t3.2xlarge -VOLUME_SIZE=50 # min 20 GB +VOLUME_SIZE=100 # min 100 GB VOLUME_TYPE=gp3 # gp2 | gp3 | io1 # --- Network (optional) --- diff --git a/.env.setup.example b/.env.setup.example index cc3e84f..1ba71a2 100644 --- a/.env.setup.example +++ b/.env.setup.example @@ -8,16 +8,45 @@ # REQUIRED FOR BOTH MODES # ============================================================================ -# AWS credentials (passed to ansible as extra_vars; embedded in /ibl/config.yml) +# ============================================================================ +# AWS credentials — TWO distinct sets serve two distinct purposes +# ============================================================================ + +# --- S3 access (customer-created, post-provision) --- +# Read / write on the dm-media, dm-static, and backups buckets Terraform +# created in YOUR OWN AWS account. Written to the root of /ibl/config.yml +# by the `ibl_platform` role; consumed by DM / edX containers at runtime. 
+# +# After `iblai infra provision-env` finishes it prints the exact S3-only +# IAM policy JSON + three `aws iam` commands to mint a minimum-privilege +# user. Paste that user's AccessKeyId + SecretAccessKey below. +# +# Do NOT reuse your provisioning admin keys here. AWS_ACCESS_KEY_ID=AKIAIOSFODNN7EXAMPLE AWS_SECRET_ACCESS_KEY=wJalrXUtnFEMI/K7MDENG/bPxRfiCYEXAMPLEKEY +# --- ECR pulls (provided by ibl.ai) --- +# `docker login` against IBL's container registry. Written to +# ~/.aws/credentials [default] on the host by the `awscli` role so +# `aws ecr get-login-password` picks them up without env-var overrides. +# +# For ECR images, use AWS credentials provided by ibl.ai — or contact us +# at https://ibl.ai/contact +# +# If you leave these blank, the S3 keys above fall through to ECR (the +# old single-key-set behavior — works only when one key has both scopes). +ECR_AWS_ACCESS_KEY_ID=AKIAIOSFODNN7EXAMPLE +ECR_AWS_SECRET_ACCESS_KEY=wJalrXUtnFEMI/K7MDENG/bPxRfiCYEXAMPLEKEY +# ECR_AWS_DEFAULT_REGION=us-east-1 # defaults to AWS_DEFAULT_REGION + # GitHub PAT — needs read on iblai/iblai-cli-ops + iblai/iblai-prod-images # (or your overrides via GITHUB_ORG / CLI_OPS_REPO / PROD_IMAGES_REPO below) GIT_TOKEN= -# Platform admin (created on the LMS + DM) -ADMIN_USERNAME=ibl_admin +# Platform admin (created on the LMS + DM). +# NOTE: `ibl_admin` is reserved for system use (owns SPA OAuth records). +# Pick any other username — e.g. `platform_admin`, your name, etc. +ADMIN_USERNAME=platform_admin ADMIN_EMAIL=admin@example.com ADMIN_PASSWORD=change-me-min-8-chars @@ -43,8 +72,16 @@ ADMIN_PASSWORD=change-me-min-8-chars # CLI_OPS_RELEASE_TAG=3.19.0 # PROD_IMAGES_TAG=main -# Platform identity (used by SSO roles to derive backend_name + platform_key) -# PLATFORM_NAME=main +# Platform identity. 
Used by SSO roles to derive backend_name + platform_key, +# AND by ibl_tenant_platform to launch a tenant Platform via run_launch_steps +# (Platform + admin User + UserPlatformLink) when set to a non-default value. +# - Leave unset / blank → defaults to 'main' (system default tenant +# created by the platform itself, no extra launch) +# - PLATFORM_NAME=main → REJECTED ('main' is reserved as an explicit input) +# - PLATFORM_NAME=acme → launches an 'acme' tenant with admin user +# 'acmeadmin@' (password printed +# one-time at the end of setup) +# PLATFORM_NAME=acme # Feature toggles # ENABLE_AI=true diff --git a/CHANGELOG.md b/CHANGELOG.md index 65dd640..f8717d8 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,45 @@ # Changelog +## [1.11.0] — 2026-05-20 + +### Added +- **Post-provision S3 IAM helper** (`src/iblai_infra/runtime_iam.py`). After `provision` / `provision-env` succeeds, the CLI prints the exact **S3-only** minimum-privilege IAM policy JSON the operator needs to attach to a scoped runtime user in their own AWS account — and writes the same JSON to `/runtime-iam-policy.json` so it can be piped into `aws iam put-user-policy --policy-document file://...`. The policy scopes S3 to the literal bucket ARNs Terraform just created (no wildcards, no `s3:*`, no bucket-policy / lifecycle / encryption mutation). Skipped automatically for `DeploymentType.CALL` (no S3 buckets). +- **Three copy-paste `aws iam` commands** in the post-provision output (`create-user`, `put-user-policy`, `create-access-key`) using `--s3-runtime` as the user name — operator pastes the resulting `AccessKeyId` + `SecretAccessKey` directly into `.env.setup`. +- **README sub-section** under "Provision infrastructure" documenting the S3 IAM step + the scope table, plus a credential-set table clarifying that **ECR pull credentials are a separate IBL-provided handoff**, not part of this flow. 
+ +### Changed +- **Two-credential split end-to-end.** Previously a single `AWS_ACCESS_KEY_ID` / `AWS_SECRET_ACCESS_KEY` from `.env.setup` had to serve both ECR auth (IBL's account) and S3 access (customer's account) — works only when one key happens to have both scopes. Now: + - `AWS_ACCESS_KEY_ID` / `AWS_SECRET_ACCESS_KEY` carry the **S3** keys (customer-created post-provision) and are written to the root of `/ibl/config.yml` by a new task in the `ibl_platform` role; consumed by DM / edX at runtime via iblai-cli-ops templating. + - New `ECR_AWS_ACCESS_KEY_ID` / `ECR_AWS_SECRET_ACCESS_KEY` (optional `ECR_AWS_DEFAULT_REGION`) carry the **ECR** keys (ibl.ai-provided). The `awscli` role writes these to `~/.aws/credentials` `[default]` profile on the host so `aws ecr get-login-password` finds them without env-var overrides anywhere. + - The four `Login to ECR` tasks across `ibl_spa`, `ibl_launch_services`, `ibl_platform`, `ibl_service_update` no longer set `AWS_ACCESS_KEY_ID` / `AWS_SECRET_ACCESS_KEY` env-vars at command time — they rely on the default profile populated by `awscli`. + - `SetupConfig` gains `ecr_aws_access_key_id`, `ecr_aws_secret_access_key`, `ecr_aws_default_region` (all optional). Secret is `Field(exclude=True)`. + - `runner.py::_build_extra_vars` passes both sets as separate ansible extra-vars. When `ECR_AWS_*` is empty, the S3 keys fall through to the ECR slot — backwards-compatible with single-key-set deployments. +- **`.env.setup.example`** now shows two clearly-labeled `AWS_*` blocks (S3 + ECR) with usage / destination spelled out inline. +- **Section 4 of the README** (non-interactive `.env` flow) renumbered as a 3-step sequence (provision → mint S3 user → setup) so the IAM step isn't missed. +- **README credential-set table** under "Provision infrastructure" gains a "Lives in" column documenting `/ibl/config.yml` root vs `~/.aws/credentials [default]` so the operator knows exactly where each set lands on the server. 
+ +## [1.10.0] — 2026-05-20 + +### Added +- **`ibl_tenant_platform` ansible role** — launches a tenant `Platform` (Platform + admin User + UserPlatformLink) via `run_launch_steps` when `PLATFORM_NAME` is set to anything other than `main`. NOT a raw `Platform.objects.create()` — the state machine fires every after_launch signal (default apps, edX hooks, UserPlatformLink flags). Wired into both `playbook.yml` (setup / setup-env) and `launch_playbook.yml` (launch / launch-env). Skips + logs on re-runs when the tenant already exists. Also writes `PLATFORM_NAME=` (uppercase) at the root of `/ibl/config.yml` and enforces `Platform.show_paywall=False` + `Platform.is_advertising=False` as defense in depth. Surfaces the generated admin password via the `IBLAI_FIXTURE_OUTPUT` pipeline — printed once after the Rich Live display tears down, never persisted to disk. +- **Microsoft SSO writes `IBL_SPA.AUTH`** — `microsoft_sso_config` now also patches `EXTERNAL_IDP_LOGOUT_URL` and `IBL_DIRECT_SSO_URL` (using `microsoft_sso_tenant_id`, falling back to `common`), then restarts the Auth + Mentor SPAs so the new auth flow takes effect. +- **`INSTANCE_RAM_GB` helper + 32 GB memory warning** — non-blocking heads-up suggesting 64 GB (e.g. `m5.4xlarge` / `r5.2xlarge`) when the operator picks a 32 GB instance. Always shown in the interactive provision wizard and `provision-env`; conditional in `launch` / `launch-env` (only when AI is enabled). +- **Final `ibl global-proxy reload`** added as `post_tasks` in both `playbook.yml` and `launch_playbook.yml`, so any nginx state touched by SSO roles (edX restarts in `google_sso_config` / `microsoft_sso_config`) is reloaded before the playbook exits. +- **`RESERVED_ADMIN_USERNAMES` + `RESERVED_PLATFORM_NAMES`** — `models.py` constants, surfaced via `is_reserved_admin_username()` and `is_reserved_platform_name()` helpers and an `InfraConfig` model_validator. 
+ +### Changed +- **Stripe billing UI off by default** — `IBL_SPA.MENTOR.STRIPE_ENABLED=false` and `IBL_SPA.MENTOR.ENABLE_ADVERTISING=false` are now written unconditionally by `ibl_spa` (fresh installs) and `ibl_launch_services` (AMI launches). **Behavior change:** Stripe-using deployments must explicitly flip `IBL_SPA.MENTOR.STRIPE_ENABLED` back to `'true'` post-setup. The previous "always on" SPA flag surfaced billing UI even when Stripe wasn't actually configured. +- **100 GB minimum root volume for single / multi server** — enforced by Pydantic (`InfraConfig` model_validator gated on `DeploymentType.SINGLE`, plus `MultiServerConfig.validate_volume_sizes`) and matching interactive + CLI + .env input checks. **Behavior change:** values below 100 GB are now rejected upfront. Default `ComputeConfig.volume_size` bumped 50 → 100. Call-server unchanged (LiveKit only needs ~40 GB). +- **`ADMIN_USERNAME=ibl_admin` rejected at every input layer** — reserved for the SPA OAuth Application owner the platform itself maintains. New default suggestion is `platform_admin`. Interactive prompts, `.env` parsers, and `--admin-username` flag all reject `ibl_admin` with a clear reserved-name error. **Behavior change:** scripted deploys passing `ADMIN_USERNAME=ibl_admin` must rename. +- **`PLATFORM_NAME=main` rejected as an explicit input** — unset / blank silently resolves to `main` (preserving SSO `backend_name=main-oauth2` and skipping the tenant launcher). **Behavior change:** scripted deploys passing `PLATFORM_NAME=main` should drop the line. +- **README** — refreshed against current playbook (16 roles, phase-grouped table), three deployment topologies, sizing guidance, tenant launcher, reserved-name rules. -50 lines net. + +### Removed +- **All references to a specific canonical-client name** from comments, docstrings, prompt instructions, error hints, and example .env files. Placeholders: `<client>` for monorepo org names, `acme` for tenant-key examples.
+ +### Fixed +- **Slow `_test_ssh()` retry-path tests** — five tests in `tests/ansible/test_runner.py` exercise the SSH-retry exhaust path (10 retries × 15 s sleep). They now mock `time.sleep` alongside the existing `subprocess.run` mock, cutting ~11 minutes off the full suite. Test count: 562 passing in ~1.3 s. + ## [1.7.0] — 2026-05-06 ### Added @@ -38,7 +78,7 @@ ## [1.5.0] — 2026-04-30 ### Added -- **Monorepo subdirectory installs** — `--cli-ops-repo` / `--prod-images-repo` (and the matching setup prompts) now accept a `repo/subdir` path, e.g. `kaplan-iblai-infra-ops/kaplan-iblai-prod-images`. The ansible role appends `&subdirectory=` to the install URL so a single client monorepo can host both `iblai-cli-ops` and the prod-images package +- **Monorepo subdirectory installs** — `--cli-ops-repo` / `--prod-images-repo` (and the matching setup prompts) now accept a `repo/subdir` path, e.g. `-iblai-infra-ops/-iblai-prod-images`. The ansible role appends `&subdirectory=` to the install URL so a single client monorepo can host both `iblai-cli-ops` and the prod-images package - **`parse_repo_path()` helper** in `models.py` — splits operator input into `(repo, subdir)`. Bare `iblai-cli-ops` keeps the canonical behavior; subdir-form unlocks per-client monorepo deployments - **`cli_ops_subdir` / `prod_images_subdir` extra-vars** passed through `AnsibleRunner` to the `ibl_cli_ops` role (single-server + call-server templates) diff --git a/CLAUDE.md b/CLAUDE.md index 341450e..dfcfe16 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -170,7 +170,7 @@ Sets `state.provider = "launch"` to distinguish from interactive provisioning. `iblai infra setup-env [] -f .env` — non-interactive Ansible bootstrap from a `.env` file. Single-server only (multi/call rejected upstream). Two modes: -- **Provisioned-name:** `setup-env kapsix -f .env` — loads `ProjectState`, derives `target_host` / `ssh_private_key_path` / `base_domain` / `aws_default_region` from it. 
`.env` only carries credentials, image tags, admin user, optional integrations. +- **Provisioned-name:** `setup-env <name> -f .env` — loads `ProjectState`, derives `target_host` / `ssh_private_key_path` / `base_domain` / `aws_default_region` from it. `.env` only carries credentials, image tags, admin user, optional integrations. - **Free-standing:** `setup-env -f .env` (no name) — builds a synthetic `ProjectState` with `provider="bootstrap"` (matching `_run_setup_interactive`). `.env` must include `PROJECT_NAME`, `TARGET_HOST`, `SSH_PRIVATE_KEY_PATH`, `BASE_DOMAIN`. **Schema** (`.env.setup.example` is the source of truth). Always required: AWS keys, `GIT_TOKEN` (or `GIT_ACCESS_TOKEN`), `ADMIN_USERNAME`/`ADMIN_EMAIL`/`ADMIN_PASSWORD`. Free-standing additionally needs the four "where to deploy" fields. Optional integrations follow the same trigger pattern as `iblai infra launch` — SMTP enabled when `SMTP_HOST` set, Stripe when `STRIPE_SECRET_KEY` set, Google SSO when `GOOGLE_SSO_CLIENT_ID` set, Microsoft SSO when `MICROSOFT_SSO_CLIENT_ID` set. diff --git a/README.md b/README.md index dbc7d01..4d25e2e 100644 --- a/README.md +++ b/README.md @@ -37,7 +37,7 @@ The following are installed as Python package dependencies when you install ibla The setup phase installs and configures the following on the provisioned EC2 instance: -- **[iblai-cli-ops](https://github.com/iblai/ibl-cli-ops)** -- the IBL platform management CLI, cloned and installed inside a pyenv virtualenv on the server. This is a required dependency for all service launches. **Note:** This is a private repository -- unauthenticated users or those without access will see a 404. +- **[iblai-cli-ops](https://github.com/iblai/iblai-cli-ops)** -- the IBL platform management CLI, installed inside a pyenv virtualenv on the server. Required by every service launch.
**Private repository — unauthenticated requests see a 404.** - **Docker Engine** with docker compose - **pyenv** with Python 3.11.8 - **AWS CLI v2** for ECR authentication and S3 access @@ -112,13 +112,51 @@ iblai infra provision Interactive wizard that walks you through: 1. **AWS credentials** -- profile, access keys, or environment variables -2. **Project & compute** -- name, environment (dev/staging/prod), instance type, volume size -3. **Network & SSH** -- VPC CIDR, VPN IP for SSH access, SSH key setup -4. **Domain & certificates** -- base domain, Route53 integration, certificate method (ACM, upload, or none) -5. **Review** -- full summary before applying +2. **Deployment topology** -- single-server, multi-server (N app servers + 1 services server), or call-server (standalone LiveKit) +3. **Project & compute** -- name, environment (dev/staging/prod), instance type, volume size +4. **Network & SSH** -- VPC CIDR, VPN IP for SSH access, SSH key setup +5. **Domain & certificates** -- base domain, Route53 integration, certificate method (ACM, upload, or none) +6. **Review** -- full summary before applying + +Sizing guidance: single / multi-server require a **100 GB minimum** root volume. Picking a 32 GB-RAM instance prints a non-blocking heads-up suggesting 64 GB (e.g. `m5.4xlarge` / `r5.2xlarge`) when AI features will be enabled. Terraform runs with real-time progress showing each resource as it's created. 
+#### After provision succeeds — create an S3 IAM user + +Two distinct AWS credential sets serve the running platform: + +| Credential set | Provided by | Lives in (on the server) | Used for | +|---|---|---|---| +| **S3** — the runtime user this section is about | You, in your own AWS account, post-provision | `/ibl/config.yml` root (`AWS_ACCESS_KEY_ID` / `AWS_SECRET_ACCESS_KEY`) | DM / edX runtime read / write on the three buckets Terraform created | +| **ECR** — image-registry pulls | AWS credentials provided by ibl.ai — or contact us at [ibl.ai/contact](https://ibl.ai/contact) | `~/.aws/credentials` `[default]` profile | `aws ecr get-login-password` for `docker login` against IBL's registry | + +Two separate `AWS_*` blocks in `.env.setup` carry each set: `AWS_ACCESS_KEY_ID` / `AWS_SECRET_ACCESS_KEY` for S3, and `ECR_AWS_ACCESS_KEY_ID` / `ECR_AWS_SECRET_ACCESS_KEY` for ECR. If `ECR_AWS_*` is left blank, the S3 keys fall through to ECR (backwards-compatible with older single-key-set deployments). + +This section covers only the **S3** set. + +When `provision` / `provision-env` finishes it prints the exact S3-only IAM policy JSON (also saved to `<project-dir>/runtime-iam-policy.json`) plus three `aws` commands to copy-paste: + +```bash +aws iam create-user --user-name <project>-<environment>-s3-runtime +aws iam put-user-policy \ + --user-name <project>-<environment>-s3-runtime \ + --policy-name iblai-s3-runtime \ + --policy-document file://<project-dir>/runtime-iam-policy.json +aws iam create-access-key --user-name <project>-<environment>-s3-runtime +``` + +Paste the resulting `AccessKeyId` + `SecretAccessKey` into `.env.setup` (`AWS_ACCESS_KEY_ID` + `AWS_SECRET_ACCESS_KEY`).
+ +**Policy scope** — S3 only, scoped to the literal bucket ARNs Terraform created (no wildcards): + +| Resource | Verbs | +|---|---| +| Objects in the three buckets (`arn:aws:s3:::/*`) | `GetObject` `PutObject` `DeleteObject` `GetObjectAcl` `PutObjectAcl` | +| The buckets themselves (`arn:aws:s3:::`) | `ListBucket` `GetBucketLocation` | + +No `s3:*`, no bucket-policy mutation, no lifecycle / encryption config, no IAM rights. Skipped automatically for `--deployment-type call-server` (no S3 buckets). + ### 3. Setup the platform ```bash @@ -131,54 +169,42 @@ Both paths run the same Ansible playbook. The difference is where the inputs com - **With a project name** -- auto-populates IP, domain, SSH key, and AWS credentials from the Terraform state - **Without a project name** -- prompts for server IP, SSH key, domain, image tags, and credentials interactively. No Terraform required. -The playbook runs 9 sequential roles: - -| Role | What it does | -|------|-------------| -| `docker` | Installs Docker Engine, docker compose, and apache2-utils | -| `awscli` | Installs AWS CLI v2 for ECR and S3 access | -| `python` | Installs pyenv and Python 3.11.8 | -| `ibl_cli_ops` | Installs [iblai-prod-images](https://github.com/iblai/iblai-prod-images) (which includes iblai-cli-ops and pinned image versions) via `uv pip install` | -| `ibl_platform` | Configures base domain, environment, image tags, CORS, RBAC, unified API gateway, and service defaults | -| `ibl_dm` | Launches iblai-dm-pro (PostgreSQL with pgvector, Redis, Django, Celery, Langfuse, Minio) | -| `ibl_edx` | Launches iblai-edx-pro (LMS, CMS, MySQL, MongoDB, Redis, Elasticsearch, MFE) | -| `ibl_spa` | Creates OAuth2 apps, configures and launches Auth, Mentor AI, and Skills AI SPAs | -| `final_steps` | Reloads proxy, OAuth/OIDC setup, syncs edX with DM, creates super admins, seeds CSRF domains, flows, LLM registry, mentors, and RBAC data | - -The setup wizard prompts for: -- Target host IP and SSH key path -- Base domain 
and environment config -- iblai-cli-ops release tag (image versions are pinned by [iblai-prod-images](https://github.com/iblai/iblai-prod-images)) -- Whether to enable AI features -- OpenAI API key (optional) -- Super admin credentials (username, email, password) -- GitHub PAT and AWS credentials for the VM +The playbook runs sequential roles grouped by concern: + +| Phase | Roles | What it does | +|---|---|---| +| Host setup | `docker`, `awscli`, `python` | Docker Engine + compose, AWS CLI v2, pyenv + Python 3.11.8 | +| Platform install | `ibl_cli_ops`, `ibl_platform` | Installs [iblai-prod-images](https://github.com/iblai/iblai-prod-images) (pins `iblai-cli-ops` + image versions); configures base domain, CORS, RBAC, gateway, defaults | +| Core services | `ibl_dm`, `ibl_edx`, `ibl_spa` | Launches DM (Django + Postgres + Redis + Celery + Flowise + Langfuse), edX (LMS / CMS / MySQL / MongoDB / Elasticsearch / Forum), and the Auth / Mentor / Skills SPAs | +| Finalization | `integrations`, `admin_setup`, `data_seeding`, `ibl_tenant_platform` | OAuth/OIDC setup, syncs edX with DM, creates super admin, seeds CSRF / flows / LLM registry / mentors / RBAC; launches a tenant `Platform` via `run_launch_steps` when `PLATFORM_NAME` is set to anything other than `main` | +| Optional integrations | `smtp_config`, `stripe_config`, `google_sso_config`, `microsoft_sso_config` | Each role no-ops unless its trigger key (`SMTP_HOST` / `STRIPE_SECRET_KEY` / `GOOGLE_SSO_CLIENT_ID` / `MICROSOFT_SSO_CLIENT_ID`) is set | +| Post-tasks | `ibl global-proxy reload` | Final nginx reload so any SSO-driven edX/SPA restarts are picked up before exit | + +The setup wizard prompts for: target host + SSH key, base domain, tenant platform name (blank for `main` — `main` itself is reserved), `iblai-cli-ops` release tag, enable-AI toggle, OpenAI key, super admin credentials, GitHub PAT, and AWS credentials. Reserved usernames (e.g. 
`ibl_admin`) are rejected — the new default suggestion is `platform_admin`. Stripe billing UI and advertising are **off by default**; passing `STRIPE_SECRET_KEY` runs the `stripe_config` role, but the billing UI stays hidden until `IBL_SPA.MENTOR.STRIPE_ENABLED` is set back to `'true'` post-setup. ### 4. Non-interactive provision + setup (`.env` file) -Skip the wizards. Same Terraform + same 9-role Ansible playbook as the interactive flow, just driven from a `.env` file. **Single-server only** (multi/call still use the wizard). +Skip the wizards. Same Terraform + same Ansible roles as the interactive flow, driven from a `.env` file. **Single-server only** (multi / call still use the wizard). ```bash -# Provision (Terraform) — fresh single-server, no AMI required -cp .env.provision.example .env.provision -$EDITOR .env.provision # fill in PROJECT_NAME, DOMAIN, AWS creds, etc. +# 1. Provision (Terraform) — fresh single-server, no AMI required +cp .env.provision.example .env.provision && $EDITOR .env.provision iblai infra provision-env -f .env.provision -# Bootstrap (Ansible) — runs against the project just provisioned -cp .env.setup.example .env.setup -$EDITOR .env.setup # fill in GIT_TOKEN, admin creds, etc. +# 2. Create the runtime IAM user (one-time) — run the 3 `aws iam ...` +# commands printed by step 1, then paste the resulting AccessKeyId + +# SecretAccessKey into .env.setup as AWS_ACCESS_KEY_ID / AWS_SECRET_ACCESS_KEY. + +# 3. Bootstrap (Ansible) — against the just-provisioned project +cp .env.setup.example .env.setup && $EDITOR .env.setup iblai infra setup-env -f .env.setup ``` -**Free-standing server (any cloud, no Terraform):** omit the project name and add `TARGET_HOST`, `SSH_PRIVATE_KEY_PATH`, `BASE_DOMAIN`, `PROJECT_NAME` to your `.env.setup`: +**Free-standing server** (any cloud, no Terraform): omit the project name and add `TARGET_HOST`, `SSH_PRIVATE_KEY_PATH`, `BASE_DOMAIN`, `PROJECT_NAME` to `.env.setup`, then `iblai infra setup-env -f .env.setup`.
-```bash -iblai infra setup-env -f .env.setup # builds a synthetic ProjectState, runs Ansible -``` +**Schema:** `.env.provision.example` and `.env.setup.example` document every key inline (required vs. optional, defaults, integration triggers). -**`.env` schema:** `.env.provision.example` and `.env.setup.example` document every key with synthetic placeholders. Required vs. optional, defaults, and integration triggers (SMTP / Stripe / Google SSO / Microsoft SSO — each enabled when its trigger key is set) are inline. - -**Security note:** populated `.env` files are gitignored by default (`.gitignore` blocks `.env.*` except the `*.example` templates). Never commit a real `.env`. The CLI never persists secrets to `state.json` — they ride `--extra-vars` into Ansible at run time only. +**Security:** populated `.env` files are gitignored (`.env.*` blocked except `*.example`). The CLI never persists secrets to `state.json` — they ride `--extra-vars` into Ansible at run time only. ### 5. Re-setup an existing environment @@ -192,66 +218,25 @@ Use this when you need to change the domain or rotate credentials on a running e ### 6. Launch from AMI -**Simplest way — using a `.env` file:** +One-shot Terraform + Ansible from a pre-built AMI. Two equivalent entry points — `.env` for ergonomics, flags for CI/CD. ```bash -cp .env.example .env # Copy the template -vim .env # Fill in your values -iblai infra launch-env # Review summary, confirm, launch -``` +# .env-driven (review + confirm) +cp .env.example .env && $EDITOR .env +iblai infra launch-env -The CLI reads `.env` from the current directory, shows a summary of what will be launched, and asks for confirmation before proceeding. 
- -**Non-interactive (CI/CD) — using flags:** - -```bash +# Fully non-interactive (CI/CD pipelines) iblai infra launch \ - --ami-id $AMI_ID \ - --domain $DOMAIN \ - --hosted-zone-id $HOSTED_ZONE_ID \ - --aws-key-id $AWS_ACCESS_KEY_ID \ - --aws-secret-key $AWS_SECRET_ACCESS_KEY \ - --ssh-public-key "$SSH_PUBLIC_KEY" \ - --ssh-key $SSH_KEY_PATH \ - --git-token $GIT_TOKEN \ - --admin-email $ADMIN_EMAIL \ - --admin-password $ADMIN_PASSWORD \ - --vpn-ip $VPN_IP -``` - -Fully non-interactive command for CI/CD pipelines (e.g. GitHub Actions). Provisions AWS infrastructure from a pre-built AMI via Terraform, then configures the platform via Ansible — all in one step. - -**What it does:** -1. **Terraform** -- creates VPC, ALB, ACM certificates, Route53 DNS records, and launches EC2 from the specified AMI -2. **Ansible** -- sets domain, rotates secrets, syncs database passwords, restarts all services (DM, edX, SPAs), runs final setup (OAuth, admin creation, data seeding) - -**Cleanup:** - -```bash -iblai infra destroy # Tears down all Terraform resources + --ami-id $AMI_ID --domain $DOMAIN --hosted-zone-id $HOSTED_ZONE_ID \ + --aws-key-id $AWS_ACCESS_KEY_ID --aws-secret-key $AWS_SECRET_ACCESS_KEY \ + --ssh-public-key "$SSH_PUBLIC_KEY" --ssh-key $SSH_KEY_PATH \ + --git-token $GIT_TOKEN --vpn-ip $VPN_IP \ + --admin-email $ADMIN_EMAIL --admin-password $ADMIN_PASSWORD ``` -**Using a `.env` file:** - -Copy `.env.example` to `.env`, fill in real values, then: - -```bash -source .env -iblai infra launch \ - --ami-id $AMI_ID \ - --domain $DOMAIN \ - --hosted-zone-id $HOSTED_ZONE_ID \ - --aws-key-id $AWS_ACCESS_KEY_ID \ - --aws-secret-key $AWS_SECRET_ACCESS_KEY \ - --ssh-public-key "$SSH_PUBLIC_KEY" \ - --ssh-key $SSH_KEY_PATH \ - --git-token $GIT_TOKEN \ - --admin-email $ADMIN_EMAIL \ - --admin-password $ADMIN_PASSWORD \ - --vpn-ip $VPN_IP -``` +**Flow:** Terraform creates VPC / ALB / ACM / Route53 and launches EC2 from the AMI → Ansible sets the domain, rotates secrets, syncs DB 
passwords, restarts services, runs OAuth + admin + seeding → final `ibl global-proxy reload`. -See `iblai infra launch --help` for all optional flags (instance type, volume size, region, AI features, etc.). +See `iblai infra launch --help` for optional flags (instance type, volume size, region, `--platform-name`, SMTP / Stripe / SSO toggles, `--enable-ai`). ### 7. Service update (image updates, CI/CD) @@ -369,15 +354,19 @@ iblai-infra-ops/ │ ├── app.py # Application logic │ ├── models.py # Pydantic models │ ├── ui.py # Rich terminal UI +│ ├── env_provision.py # .env → InfraConfig (provision-env) +│ ├── env_setup.py # .env → SetupConfig (setup-env) │ ├── prompts/ # Interactive questionary prompts │ ├── providers/ # AWS provider (STS, EC2, S3) -│ ├── terraform/ # Terraform runner and templates -│ │ └── templates/aws/single-server/ -│ └── ansible/ # Ansible runner and templates +│ ├── terraform/ # Terraform runner + templates +│ │ └── templates/aws/ # single-server, multi-server, call-server +│ └── ansible/ # Ansible runner + templates │ └── templates/single-server/ -│ ├── playbook.yml -│ └── roles/ # 9 Ansible roles -├── tests/ # 357 tests +│ ├── playbook.yml # interactive setup + setup-env +│ ├── launch_playbook.yml # AMI launch + launch-env +│ ├── service_update_playbook.yml +│ └── roles/ # ansible roles (see playbook table) +├── tests/ # 562 tests, ~1.3s ├── docs/ # Architecture diagrams └── pyproject.toml ``` diff --git a/src/iblai_infra/__init__.py b/src/iblai_infra/__init__.py index 12479d2..643a832 100644 --- a/src/iblai_infra/__init__.py +++ b/src/iblai_infra/__init__.py @@ -1,3 +1,3 @@ """ibl.ai Infrastructure Provisioning Tool.""" -__version__ = "1.9.0" +__version__ = "1.11.0" diff --git a/src/iblai_infra/ansible/runner.py b/src/iblai_infra/ansible/runner.py index 8ab1ad0..8b3c2cb 100644 --- a/src/iblai_infra/ansible/runner.py +++ b/src/iblai_infra/ansible/runner.py @@ -39,6 +39,7 @@ "integrations": "OAuth & Integrations", "admin_setup": "Admin & CORS 
Setup", "data_seeding": "Data Seeding", + "ibl_tenant_platform": "Tenant Platform", "stripe_config": "Stripe Config", "google_sso_config": "Google SSO Config", "microsoft_sso_config": "Microsoft SSO Config", @@ -52,6 +53,7 @@ "integrations": "OAuth & Integrations", "admin_setup": "Admin & CORS Setup", "data_seeding": "Data Seeding", + "ibl_tenant_platform": "Tenant Platform", "stripe_config": "Stripe Config", "google_sso_config": "Google SSO Config", "microsoft_sso_config": "Microsoft SSO Config", @@ -460,10 +462,29 @@ def _build_extra_vars(self) -> dict: """Build the extra-vars dict. Secrets are passed here, never to disk.""" cli_ops_repo, cli_ops_subdir = parse_repo_path(self.config.cli_ops_repo) prod_images_repo, prod_images_subdir = parse_repo_path(self.config.prod_images_repo) + # Two AWS credential sets are surfaced as separate extra_vars so the + # ansible roles can place each set in the right destination: + # - aws_* → S3 access keys (customer-created post- + # provision). The `ibl_platform` role writes + # these to `/ibl/config.yml` root for DM / edX. + # - ecr_aws_* → IBL-provided ECR pull keys. The `awscli` + # role writes these to `~/.aws/credentials` + # `[default]` so `aws ecr get-login-password` + # picks them up. + # When the operator hasn't supplied ECR_AWS_* in .env.setup (older + # single-key-set deployments), fall through to the S3 keys so the + # ECR login path still works — with the caveat that the same key + # is doing both jobs. 
+ ecr_key_id = self.config.ecr_aws_access_key_id or self.config.aws_access_key_id + ecr_secret = self.config.ecr_aws_secret_access_key or self.config.aws_secret_access_key + ecr_region = self.config.ecr_aws_default_region or self.config.aws_default_region extra = { "aws_access_key_id": self.config.aws_access_key_id, "aws_secret_access_key": self.config.aws_secret_access_key, "aws_default_region": self.config.aws_default_region, + "ecr_aws_access_key_id": ecr_key_id, + "ecr_aws_secret_access_key": ecr_secret, + "ecr_aws_default_region": ecr_region, "git_access_token": self.config.git_access_token, "github_org": self.config.github_org, "cli_ops_repo": cli_ops_repo, diff --git a/src/iblai_infra/ansible/templates/single-server/launch_playbook.yml b/src/iblai_infra/ansible/templates/single-server/launch_playbook.yml index 9378671..b60a2bb 100644 --- a/src/iblai_infra/ansible/templates/single-server/launch_playbook.yml +++ b/src/iblai_infra/ansible/templates/single-server/launch_playbook.yml @@ -30,6 +30,7 @@ - integrations - admin_setup - data_seeding + - ibl_tenant_platform - stripe_config - google_sso_config - microsoft_sso_config @@ -39,3 +40,25 @@ ansible.builtin.include_role: name: playwright_test_platforms when: create_playwright_platforms | default(false) | bool + + post_tasks: + # Final unconditional proxy reload. Runs AFTER every role and the + # optional Playwright include, so any nginx state touched by SSO + # roles (edX restarts in google_sso_config / microsoft_sso_config) + # is picked up before the playbook exits. Mirrors the post_tasks + # block in playbook.yml so `launch` / `launch-env` get the same + # final-reload guarantee that `setup` / `setup-env` already have. 
+ - name: Reload global proxy (final step) + become: false + ansible.builtin.shell: | + export PYENV_ROOT="{{ pyenv_root }}" + export PATH="$PYENV_ROOT/bin:$PATH" + eval "$(pyenv init -)"; eval "$(pyenv virtualenv-init -)" + pyenv activate "{{ venv_name }}" + ibl global-proxy reload + args: + executable: /bin/bash + chdir: "{{ repo_dir }}" + environment: + HOME: "/home/{{ ansible_user }}" + IBL_ROOT: "{{ ibl_root }}/" diff --git a/src/iblai_infra/ansible/templates/single-server/playbook.yml b/src/iblai_infra/ansible/templates/single-server/playbook.yml index 2428595..3444285 100644 --- a/src/iblai_infra/ansible/templates/single-server/playbook.yml +++ b/src/iblai_infra/ansible/templates/single-server/playbook.yml @@ -48,6 +48,7 @@ - integrations - admin_setup - data_seeding + - ibl_tenant_platform - stripe_config - google_sso_config - microsoft_sso_config @@ -60,3 +61,23 @@ ansible.builtin.include_role: name: playwright_test_platforms when: create_playwright_platforms | default(false) | bool + + post_tasks: + # Final unconditional proxy reload. Runs AFTER every role and the + # optional Playwright include, so any nginx state touched by SSO + # roles (edX restarts in google_sso_config / microsoft_sso_config) + # is picked up before the playbook exits. 
+ - name: Reload global proxy (final step) + become: false + ansible.builtin.shell: | + export PYENV_ROOT="{{ pyenv_root }}" + export PATH="$PYENV_ROOT/bin:$PATH" + eval "$(pyenv init -)"; eval "$(pyenv virtualenv-init -)" + pyenv activate "{{ venv_name }}" + ibl global-proxy reload + args: + executable: /bin/bash + chdir: "{{ repo_dir }}" + environment: + HOME: "/home/{{ ansible_user }}" + IBL_ROOT: "{{ ibl_root }}/" diff --git a/src/iblai_infra/ansible/templates/single-server/roles/awscli/tasks/main.yml b/src/iblai_infra/ansible/templates/single-server/roles/awscli/tasks/main.yml index 1ee076d..6378e15 100644 --- a/src/iblai_infra/ansible/templates/single-server/roles/awscli/tasks/main.yml +++ b/src/iblai_infra/ansible/templates/single-server/roles/awscli/tasks/main.yml @@ -27,19 +27,28 @@ changed_when: false ignore_errors: true -- name: Configure AWS credentials - when: aws_access_key_id | length > 0 and aws_secret_access_key | length > 0 +# Writes the ECR pull credentials (ibl.ai-provided) to +# ~/.aws/credentials [default] so `aws ecr get-login-password` picks them +# up by default everywhere on the host. The S3 access keys (customer- +# created post-provision) live in /ibl/config.yml — written by the +# `ibl_platform` role, NOT here. +# +# `ecr_aws_*` falls back to `aws_*` upstream in runner.py when the +# operator hasn't supplied a separate ECR set (older single-key-set +# deployments). 
+- name: Configure ECR credentials in ~/.aws/credentials [default] + when: ecr_aws_access_key_id | length > 0 and ecr_aws_secret_access_key | length > 0 become: false block: - - name: Set AWS access key ID - command: /usr/local/bin/aws configure set aws_access_key_id "{{ aws_access_key_id }}" + - name: Set AWS access key ID (ECR) + command: /usr/local/bin/aws configure set aws_access_key_id "{{ ecr_aws_access_key_id }}" - - name: Set AWS secret access key - command: /usr/local/bin/aws configure set aws_secret_access_key "{{ aws_secret_access_key }}" + - name: Set AWS secret access key (ECR) + command: /usr/local/bin/aws configure set aws_secret_access_key "{{ ecr_aws_secret_access_key }}" no_log: true - - name: Set AWS default region - command: /usr/local/bin/aws configure set default.region "{{ aws_default_region }}" + - name: Set AWS default region (ECR) + command: /usr/local/bin/aws configure set default.region "{{ ecr_aws_default_region }}" - name: Set AWS output format command: /usr/local/bin/aws configure set output json diff --git a/src/iblai_infra/ansible/templates/single-server/roles/ibl_cli_ops/tasks/main.yml b/src/iblai_infra/ansible/templates/single-server/roles/ibl_cli_ops/tasks/main.yml index 25a6650..584b214 100644 --- a/src/iblai_infra/ansible/templates/single-server/roles/ibl_cli_ops/tasks/main.yml +++ b/src/iblai_infra/ansible/templates/single-server/roles/ibl_cli_ops/tasks/main.yml @@ -44,7 +44,7 @@ # and crashes on `ibl --help`. Reinstalling here overrides the wrong # package with the right one (e.g. iblai/iblai-cli-ops@5.8.1, or a # subdir of a client monorepo like -# kaplan-iblai-infra-ops/iblai-cli-ops@v1.0.1). +# <client>-iblai-infra-ops/iblai-cli-ops@v1.0.1). 
- name: Install iblai-cli-ops (override transitive ibl-cli with correct repo+tag) become: false shell: | diff --git a/src/iblai_infra/ansible/templates/single-server/roles/ibl_launch_services/tasks/main.yml b/src/iblai_infra/ansible/templates/single-server/roles/ibl_launch_services/tasks/main.yml index 4bdb4af..968cc98 100644 --- a/src/iblai_infra/ansible/templates/single-server/roles/ibl_launch_services/tasks/main.yml +++ b/src/iblai_infra/ansible/templates/single-server/roles/ibl_launch_services/tasks/main.yml @@ -4,6 +4,8 @@ # --------------------------------------------------------------------------- - name: Login to ECR + # Uses ~/.aws/credentials [default] profile (populated by the `awscli` + # role with the ECR pull keys ibl.ai supplied via ecr_aws_*). become: false shell: | aws ecr get-login-password --region us-east-1 | docker login --username AWS --password-stdin 765174860755.dkr.ecr.us-east-1.amazonaws.com @@ -11,9 +13,6 @@ executable: /bin/bash environment: HOME: "/home/{{ ansible_user }}" - AWS_ACCESS_KEY_ID: "{{ aws_access_key_id }}" - AWS_SECRET_ACCESS_KEY: "{{ aws_secret_access_key }}" - AWS_DEFAULT_REGION: "{{ aws_default_region }}" # --------------------------------------------------------------------------- # DM restart @@ -117,6 +116,55 @@ retries: 40 delay: 15 +# --------------------------------------------------------------------------- +# SPA config — flip stripe / advertising off by default (override the +# values baked into the AMI's config.yml). Matches the fresh-install +# defaults set in `ibl_spa`. Direct yaml patch because `ibl config save +# --set` cannot round-trip quoted-string booleans. 
+# --------------------------------------------------------------------------- + +- name: Set SPA quoted boolean defaults in config.yml + become: false + shell: | + export PYENV_ROOT="{{ pyenv_root }}" + export PATH="$PYENV_ROOT/bin:$PATH" + eval "$(pyenv init -)"; eval "$(pyenv virtualenv-init -)" + pyenv activate "{{ venv_name }}" + python3 -c " + import yaml + config_path = '{{ ibl_root }}/config.yml' + with open(config_path) as f: + cfg = yaml.safe_load(f) or {} + spa = cfg.setdefault('IBL_SPA', {}) + mentor = spa.setdefault('MENTOR', {}) + mentor['STRIPE_ENABLED'] = 'false' + mentor['ENABLE_ADVERTISING'] = 'false' + with open(config_path, 'w') as f: + yaml.dump(cfg, f, default_flow_style=False) + print('SPA boolean config values set') + " + args: + executable: /bin/bash + chdir: "{{ repo_dir }}" + environment: + HOME: "/home/{{ ansible_user }}" + IBL_ROOT: "{{ ibl_root }}/" + +- name: Re-render templates with SPA config defaults + become: false + shell: | + export PYENV_ROOT="{{ pyenv_root }}" + export PATH="$PYENV_ROOT/bin:$PATH" + eval "$(pyenv init -)"; eval "$(pyenv virtualenv-init -)" + pyenv activate "{{ venv_name }}" + ibl config save + args: + executable: /bin/bash + chdir: "{{ repo_dir }}" + environment: + HOME: "/home/{{ ansible_user }}" + IBL_ROOT: "{{ ibl_root }}/" + # --------------------------------------------------------------------------- # SPA restart # --------------------------------------------------------------------------- @@ -133,6 +181,10 @@ delay: 5 - name: Wait for Auth SPA to be ready + # 30 × 15s = 450s. The SPA image runs `pnpm install` on first boot + # before Next.js starts, which can push cold-start past the older + # 150s budget on slower hosts. See ibl_spa role for the long-form + # comment. 
become: false shell: | curl -s -o /dev/null -w '%{http_code}' http://localhost:5000/ 2>/dev/null | grep -qE '^(200|301|302)$' @@ -140,7 +192,7 @@ executable: /bin/bash register: auth_health until: auth_health.rc == 0 - retries: 10 + retries: 30 delay: 15 - name: Restart Mentor SPA @@ -155,6 +207,7 @@ delay: 5 - name: Wait for Mentor SPA to be ready + # 450s cold-start budget — same rationale as the Auth SPA wait above. become: false shell: | curl -s -o /dev/null -w '%{http_code}' http://localhost:5001/ 2>/dev/null | grep -qE '^(200|301|302)$' @@ -162,7 +215,7 @@ executable: /bin/bash register: mentor_health until: mentor_health.rc == 0 - retries: 10 + retries: 30 delay: 15 - name: Restart Skills SPA @@ -177,6 +230,7 @@ delay: 5 - name: Wait for Skills SPA to be ready + # 450s cold-start budget — same rationale as the Auth SPA wait above. become: false shell: | curl -s -o /dev/null -w '%{http_code}' http://localhost:5002/ 2>/dev/null | grep -qE '^(200|301|302)$' @@ -184,7 +238,7 @@ executable: /bin/bash register: skills_health until: skills_health.rc == 0 - retries: 10 + retries: 30 delay: 15 # --------------------------------------------------------------------------- diff --git a/src/iblai_infra/ansible/templates/single-server/roles/ibl_platform/tasks/main.yml b/src/iblai_infra/ansible/templates/single-server/roles/ibl_platform/tasks/main.yml index c3455dd..9b1ca93 100644 --- a/src/iblai_infra/ansible/templates/single-server/roles/ibl_platform/tasks/main.yml +++ b/src/iblai_infra/ansible/templates/single-server/roles/ibl_platform/tasks/main.yml @@ -234,6 +234,30 @@ HOME: "/home/{{ ansible_user }}" IBL_ROOT: "{{ ibl_root }}/" +# S3 access keys at the root of /ibl/config.yml. Consumed by iblai-cli-ops +# templating + the running DM / edX containers for read / write on the +# three buckets Terraform created. 
These are the customer-created keys +# scoped by `runtime-iam-policy.json` (S3-only) — NOT the ECR keys (which +# live in ~/.aws/credentials [default] — see the `awscli` role). +- name: Write S3 access keys to /ibl/config.yml root + become: false + shell: | + export PYENV_ROOT="{{ pyenv_root }}" + export PATH="$PYENV_ROOT/bin:$PATH" + eval "$(pyenv init -)"; eval "$(pyenv virtualenv-init -)" + pyenv activate "{{ venv_name }}" + ibl config save --set AWS_ACCESS_KEY_ID='{{ aws_access_key_id }}' && \ + ibl config save --set AWS_SECRET_ACCESS_KEY='{{ aws_secret_access_key }}' && \ + ibl config save --set AWS_DEFAULT_REGION='{{ aws_default_region }}' + args: + executable: /bin/bash + chdir: "{{ repo_dir }}" + environment: + HOME: "/home/{{ ansible_user }}" + IBL_ROOT: "{{ ibl_root }}/" + no_log: true + when: aws_access_key_id | length > 0 and aws_secret_access_key | length > 0 + - name: Enable unified API gateway become: false shell: | @@ -428,6 +452,8 @@ when: enable_ai | bool - name: Login to ECR + # Uses ~/.aws/credentials [default] profile (populated by the `awscli` + # role with the ECR pull keys ibl.ai supplied via ecr_aws_*). 
become: false shell: | aws ecr get-login-password --region us-east-1 | docker login --username AWS --password-stdin 765174860755.dkr.ecr.us-east-1.amazonaws.com @@ -435,6 +461,3 @@ executable: /bin/bash environment: HOME: "/home/{{ ansible_user }}" - AWS_ACCESS_KEY_ID: "{{ aws_access_key_id }}" - AWS_SECRET_ACCESS_KEY: "{{ aws_secret_access_key }}" - AWS_DEFAULT_REGION: "{{ aws_default_region }}" diff --git a/src/iblai_infra/ansible/templates/single-server/roles/ibl_spa/tasks/main.yml b/src/iblai_infra/ansible/templates/single-server/roles/ibl_spa/tasks/main.yml index 8c87f50..dd86bb2 100644 --- a/src/iblai_infra/ansible/templates/single-server/roles/ibl_spa/tasks/main.yml +++ b/src/iblai_infra/ansible/templates/single-server/roles/ibl_spa/tasks/main.yml @@ -150,7 +150,8 @@ spa = cfg.setdefault('IBL_SPA', {}) mentor = spa.setdefault('MENTOR', {}) mentor['ENABLE_RBAC'] = 'true' - mentor['STRIPE_ENABLED'] = 'true' + mentor['STRIPE_ENABLED'] = 'false' + mentor['ENABLE_ADVERTISING'] = 'false' mentor['SKIP_TEST'] = 'true' mentor['ENABLE_APP_SITE_ASSOCIATION'] = 'true' mentor['CANVAS_ADMIN_ONLY'] = 'false' @@ -184,6 +185,8 @@ IBL_ROOT: "{{ ibl_root }}/" - name: Login to ECR for SPA images + # Uses ~/.aws/credentials [default] profile (populated by the `awscli` + # role with the ECR pull keys ibl.ai supplied via ecr_aws_*). become: false shell: | aws ecr get-login-password --region us-east-1 | docker login --username AWS --password-stdin 765174860755.dkr.ecr.us-east-1.amazonaws.com @@ -191,9 +194,6 @@ executable: /bin/bash environment: HOME: "/home/{{ ansible_user }}" - AWS_ACCESS_KEY_ID: "{{ aws_access_key_id }}" - AWS_SECRET_ACCESS_KEY: "{{ aws_secret_access_key }}" - AWS_DEFAULT_REGION: "{{ aws_default_region }}" - name: Launch Auth SPA become: false @@ -207,6 +207,13 @@ retries: 2 delay: 5 +# Budget: 30 × 15s = 450s (7.5 min). 
SPA images don't ship with +# node_modules baked in — the container runs `pnpm install` on first +# boot (~80–120s observed) before Next.js can start. Combined with +# `docker compose pull` and image-extraction overhead, cold start can +# easily exceed the older 10-retry / 150s budget on a slower instance +# or marginal network. 450s comfortably covers that without making +# real failures take an unreasonable amount of time to surface. - name: Wait for Auth SPA to be ready become: false shell: | @@ -215,7 +222,7 @@ executable: /bin/bash register: auth_health until: auth_health.rc == 0 - retries: 10 + retries: 30 delay: 15 - name: Launch Mentor SPA @@ -231,6 +238,9 @@ delay: 5 - name: Wait for Mentor SPA to be ready + # Same 450s cold-start budget rationale as the Auth SPA wait above — + # node_modules install on first boot pushes ready-time past the older + # 150s budget on slower hosts. become: false shell: | curl -s -o /dev/null -w '%{http_code}' http://localhost:5001/ 2>/dev/null | grep -qE '^(200|301|302)$' @@ -238,7 +248,7 @@ executable: /bin/bash register: mentor_health until: mentor_health.rc == 0 - retries: 10 + retries: 30 delay: 15 - name: Launch Skills SPA @@ -254,6 +264,9 @@ delay: 5 - name: Wait for Skills SPA to be ready + # Same 450s cold-start budget rationale as the Auth SPA wait above — + # node_modules install on first boot pushes ready-time past the older + # 150s budget on slower hosts. 
 become: false shell: | curl -s -o /dev/null -w '%{http_code}' http://localhost:5002/ 2>/dev/null | grep -qE '^(200|301|302)$' @@ -261,7 +274,7 @@ executable: /bin/bash register: skills_health until: skills_health.rc == 0 - retries: 10 + retries: 30 delay: 15 - name: Save config and reload proxy after SPA launch diff --git a/src/iblai_infra/ansible/templates/single-server/roles/ibl_tenant_platform/tasks/main.yml b/src/iblai_infra/ansible/templates/single-server/roles/ibl_tenant_platform/tasks/main.yml new file mode 100644 index 0000000..bd6c8b0 --- /dev/null +++ b/src/iblai_infra/ansible/templates/single-server/roles/ibl_tenant_platform/tasks/main.yml @@ -0,0 +1,216 @@ +--- +# Launch a tenant platform via `run_launch_steps` (NOT raw +# `Platform.objects.create()`) so the launcher's state machine fires +# all the after_launch signals — default apps, edX hooks, +# UserPlatformLink with the right role flags, etc. This mirrors the +# shape of a canonical multi-tenant deployment, where a Platform + +# UserPlatformLink + tenant admin User were all created together by +# the launcher. +# +# Gating: +# - Skipped when `platform_name == 'main'` (the IBL default tenant +# that the platform itself maintains via `ibl launch`). +# - Skipped on re-runs when a Platform with key=platform_name already +# exists — the launcher does not upsert, so re-running it would +# either create a duplicate or 500 at the DB layer. +# +# Tenant admin credentials are derived from `platform_name` + +# `base_domain`: +# username = <platform_name>admin (hyphens/underscores stripped, lowercased) +# email = <username>@<base_domain> +# password = secrets.token_hex(16) (generated on each launch) +# The generated password is surfaced via the IBLAI_FIXTURE_OUTPUT +# markers (see runner.py::_maybe_capture_fixture) — it's printed +# AFTER the Rich Live display tears down so the operator gets one +# clean chance to copy it. It is never persisted to disk. 
+ +- name: Wait for DM web container ready (tenant launch) + become: false + ansible.builtin.shell: + cmd: | + docker inspect --format='{{ '{{' }}.State.Running{{ '}}' }}' ibl_dm_pro_web 2>/dev/null | grep -q true && \ + docker exec ibl_dm_pro_web echo "DM ready" + executable: /bin/bash + register: dm_ready_tenant + until: dm_ready_tenant.rc == 0 + retries: 30 + delay: 10 + changed_when: false + when: (platform_name | default('main')) != 'main' + +# Write PLATFORM_NAME (uppercase) at the root of /ibl/config.yml so the +# iblai-cli-ops templates that read it (LMS display name, etc.) pick up +# the tenant identity. Matches the on-disk shape seen on the canonical +# multi-tenant deployments (`PLATFORM_NAME: ACME`). `ibl config save --set` is +# naturally idempotent — re-running with the same value is a no-op. +# Skipped when `platform_name == 'main'` per the same gate as the +# launcher tasks below. +- name: Set PLATFORM_NAME (uppercase) in /ibl/config.yml + become: false + ansible.builtin.shell: + cmd: | + export PYENV_ROOT="{{ pyenv_root }}" + export PATH="$PYENV_ROOT/bin:$PATH" + eval "$(pyenv init -)"; eval "$(pyenv virtualenv-init -)" + pyenv activate "{{ venv_name }}" + ibl config save --set 'PLATFORM_NAME={{ platform_name | upper }}' + executable: /bin/bash + chdir: "{{ repo_dir }}" + environment: + HOME: "/home/{{ ansible_user }}" + IBL_ROOT: "{{ ibl_root }}/" + when: (platform_name | default('main')) != 'main' + +- name: Check whether tenant platform already exists + become: false + ansible.builtin.shell: + cmd: | + docker exec -i ibl_dm_pro_web python manage.py shell <<'PY' + from core.models import Platform + key = "{{ platform_name }}" + p = Platform.objects.filter(key=key).first() + if p is None: + print("TENANT_PLATFORM_STATUS:ABSENT key=" + key) + else: + print("TENANT_PLATFORM_STATUS:PRESENT key=" + key + " pk=" + str(p.pk)) + PY + executable: /bin/bash + register: tenant_check + changed_when: false + when: (platform_name | default('main')) != 'main' + 
+- name: Launch tenant platform via run_launch_steps + become: false + ansible.builtin.shell: + cmd: | + docker exec -i ibl_dm_pro_web python manage.py shell <<'PY' + import logging + import secrets + import sys + + # Quiet down the transitions state-machine chatter so the captured + # stdout stays readable; the launcher prints one INFO line per + # state transition by default. + logging.getLogger("transitions").setLevel(logging.WARNING) + + from dl_iblai_services_app.services.launchers import run_launch_steps + + KEY = "{{ platform_name }}" + BASE_DOMAIN = "{{ base_domain }}" + + # admin_username must be alphanumeric (skill §4); strip hyphens and + # underscores from the platform key. + admin_username = (KEY + "admin").replace("-", "").replace("_", "").lower() + admin_email = admin_username + "@" + BASE_DOMAIN + display_name = KEY.replace("-", " ").replace("_", " ").title() + password = secrets.token_hex(16) + + launch_data = { + "username": admin_username, + "email": admin_email, + "firstname": display_name, + "lastname": "Admin", + "password": password, + "role": "org-instructor", + "org": KEY, + "key": KEY, + "name": display_name, + "lms_url": "https://learn." + KEY + "." + BASE_DOMAIN, + "cms_url": "https://studio.learn." + KEY + "." + BASE_DOMAIN, + "portal_url": KEY + "." + BASE_DOMAIN, + } + resp = run_launch_steps(launch_data) + if not resp.get("success"): + print("TENANT_LAUNCH:FAILED message=" + str(resp.get("message"))) + print("TRACEBACK: " + str(resp.get("traceback"))) + sys.exit(1) + + # Marker block parsed by the follow-up debug task. Keep on its own + # lines for readability — the debug task wraps these in + # IBLAI_FIXTURE_OUTPUT_BEGIN/END so the runner replays them after + # the Live display tears down. 
+ print("====TENANT_ADMIN_CREDS_BEGIN====") + print("Platform key: " + KEY) + print("Platform name: " + display_name) + print("Admin username: " + admin_username) + print("Admin email: " + admin_email) + print("Admin password: " + password) + print("Launch ID: " + str(resp.get("id"))) + print("LMS URL: " + launch_data["lms_url"]) + print("CMS URL: " + launch_data["cms_url"]) + print("Portal URL: " + launch_data["portal_url"]) + print("====TENANT_ADMIN_CREDS_END====") + print("TENANT_LAUNCH:OK key=" + KEY) + PY + executable: /bin/bash + register: tenant_launch + changed_when: "'TENANT_LAUNCH:OK' in (tenant_launch.stdout | default(''))" + no_log: true # keep the generated password out of -vvv ansible logs + when: + - (platform_name | default('main')) != 'main' + - "'TENANT_PLATFORM_STATUS:ABSENT' in (tenant_check.stdout | default(''))" + +# Reprint the generated admin password through the fixture-output +# pipeline so the operator sees it AFTER the Rich Live display tears +# down. Ansible's default callback JSON-encodes multi-line debug msgs +# onto a single stdout line, which is exactly what +# `_maybe_capture_fixture` in runner.py expects (both BEGIN/END markers +# on the same line). +- name: Show tenant admin credentials (one-time, save this output) + ansible.builtin.debug: + msg: | + IBLAI_FIXTURE_OUTPUT_BEGIN + TENANT PLATFORM LAUNCHED — SAVE THIS OUTPUT + The admin password below is generated fresh on every launch and + is NEVER persisted to disk. Capture it now. + + {{ tenant_launch.stdout_lines | default([]) | join('\n') }} + IBLAI_FIXTURE_OUTPUT_END + when: + - (platform_name | default('main')) != 'main' + - tenant_launch is defined + - tenant_launch.changed | default(false) + +# Defense-in-depth: ensure show_paywall=False AND is_advertising=False on +# the tenant Platform row, regardless of how the launcher / model defaults +# behave today. Matches the canonical multi-tenant shape (Platform.show_paywall=False, +# is_advertising=False). 
Runs whether the launcher ran fresh OR was skipped +# because the platform already exists — so re-runs reconcile any flag drift. +# `.update()` is idempotent: writes only if the resolved value differs, and +# matches zero rows if the platform doesn't exist (no-op). +- name: Ensure tenant paywall + advertising disabled + become: false + ansible.builtin.shell: + cmd: | + docker exec -i ibl_dm_pro_web python manage.py shell <<'PY' + from core.models import Platform + key = "{{ platform_name }}" + changed = Platform.objects.filter(key=key).exclude( + show_paywall=False, is_advertising=False, + ).update(show_paywall=False, is_advertising=False) + if changed: + print(f"PAYWALL_FLAGS:UPDATED key={key} rows={changed}") + else: + print(f"PAYWALL_FLAGS:UNCHANGED key={key}") + PY + executable: /bin/bash + register: paywall_enforce + changed_when: "'PAYWALL_FLAGS:UPDATED' in (paywall_enforce.stdout | default(''))" + when: (platform_name | default('main')) != 'main' + +- name: Confirm tenant platform skipped (already present) + ansible.builtin.debug: + msg: >- + Tenant platform '{{ platform_name }}' already exists — skipping launcher + (run_launch_steps does not upsert). Manage the existing Platform / admin + via the DM Django shell if you need to repair flags. + when: + - (platform_name | default('main')) != 'main' + - "'TENANT_PLATFORM_STATUS:PRESENT' in (tenant_check.stdout | default(''))" + +- name: Confirm tenant platform skipped (platform_name == 'main') + ansible.builtin.debug: + msg: >- + platform_name='main' — skipping tenant launch (the IBL default tenant is + maintained by `ibl launch`, not by this role). 
+ when: (platform_name | default('main')) == 'main' diff --git a/src/iblai_infra/ansible/templates/single-server/roles/microsoft_sso_config/tasks/main.yml b/src/iblai_infra/ansible/templates/single-server/roles/microsoft_sso_config/tasks/main.yml index b9642cf..68a485e 100644 --- a/src/iblai_infra/ansible/templates/single-server/roles/microsoft_sso_config/tasks/main.yml +++ b/src/iblai_infra/ansible/templates/single-server/roles/microsoft_sso_config/tasks/main.yml @@ -257,3 +257,135 @@ ansible.builtin.debug: msg: "Microsoft SSO configured for learn.{{ base_domain }} (platform={{ platform_name | default('main') }}, organization={{ microsoft_sso_organization or '(none)' }})" when: microsoft_sso_enabled | default(false) | bool + +# --------------------------------------------------------------------------- +# IBL_SPA.AUTH patches consumed by the Auth + Mentor SPAs at boot. +# +# - EXTERNAL_IDP_LOGOUT_URL — hit by the Auth SPA on sign-out to redirect +# the user through Microsoft's logout endpoint, then back to our own +# auth host (so the Azure session is killed alongside ours). +# - IBL_DIRECT_SSO_URL — the "Sign in with Microsoft" deep link the +# SPA renders. Points at the LMS python-social-auth login endpoint for +# this platform's backend (`<platform_name>-oauth2`), with an inner +# `?next=...` that completes the OAuth dance back into the SPA's +# /login/complete callback. The inner client_id MUST be the same +# EDX_SSO_CLIENT_ID the ibl_spa role minted for the `spa-sso` +# Application — we read it from config.yml rather than re-deriving it. +# +# Tenant ID: uses `microsoft_sso_tenant_id` when set; falls back to "common" +# (multi-tenant) when blank. Same pattern as the existing logout_url in the +# OAuth2ProviderConfig.other_settings JSON above. +# +# Idempotent: only marks `changed` (and triggers the downstream +# `ibl config save` + SPA restart) when the resolved values differ from +# what's already in config.yml. 
+# --------------------------------------------------------------------------- + +- name: Patch IBL_SPA.AUTH with Microsoft SSO logout + direct-SSO URLs + become: false + ansible.builtin.shell: + cmd: | + python3 <<'PY' + import sys + import yaml + from urllib.parse import quote + + PATH = "{{ ibl_root }}/config.yml" + BASE_DOMAIN = "{{ base_domain }}" + PLATFORM_NAME = "{{ platform_name | default('main') }}" + TENANT_ID = "{{ microsoft_sso_tenant_id }}".strip() or "common" + BACKEND_NAME = PLATFORM_NAME + "-oauth2" + + with open(PATH) as f: + cfg = yaml.safe_load(f) or {} + + spa = cfg.setdefault("IBL_SPA", {}) + auth = spa.setdefault("AUTH", {}) + + spa_sso_client_id = (spa.get("EDX_SSO_CLIENT_ID") or "").strip() + if not spa_sso_client_id: + print("ERROR: IBL_SPA.EDX_SSO_CLIENT_ID is empty — ibl_spa role must run first") + sys.exit(1) + + external_logout = ( + "https://login.microsoftonline.com/" + TENANT_ID + + "/oauth2/v2.0/logout?post_logout_redirect_uri=" + + "https://auth." + BASE_DOMAIN + ) + + # Inner /oauth2/authorize URL the LMS completes into after Microsoft + # auth. URL-encoded verbatim into the outer `next=` query param so + # python-social-auth treats the whole thing as one opaque target. + inner_next = ( + "/oauth2/authorize?response_type=code" + "&client_id=" + spa_sso_client_id + + "&scope=profile email" + "&redirect_uri=https://auth." + BASE_DOMAIN + "/login/complete" + ) + direct_sso = ( + "https://learn." 
+ BASE_DOMAIN + "/auth/login/" + BACKEND_NAME + + "/?auth_entry=login&next=" + quote(inner_next, safe="") + ) + + desired = { + "EXTERNAL_IDP_LOGOUT_URL": external_logout, + "IBL_DIRECT_SSO_URL": direct_sso, + } + current = {k: auth.get(k) for k in desired} + if current == desired: + print("SPA_AUTH_UNCHANGED") + else: + auth.update(desired) + with open(PATH, "w") as f: + yaml.safe_dump(cfg, f, default_flow_style=False, sort_keys=False) + print("SPA_AUTH_PATCHED") + PY + executable: /bin/bash + register: spa_auth_patch + changed_when: "'SPA_AUTH_PATCHED' in (spa_auth_patch.stdout | default(''))" + when: microsoft_sso_enabled | default(false) | bool + +- name: Save platform config (regenerate SPA env files) + become: false + ansible.builtin.shell: + cmd: | + export PYENV_ROOT="{{ pyenv_root }}" + export PATH="$PYENV_ROOT/bin:$PATH" + eval "$(pyenv init -)"; eval "$(pyenv virtualenv-init -)" + pyenv activate "{{ venv_name }}" + ibl config save + executable: /bin/bash + chdir: "{{ repo_dir }}" + environment: + HOME: "/home/{{ ansible_user }}" + IBL_ROOT: "{{ ibl_root }}/" + when: + - microsoft_sso_enabled | default(false) | bool + - spa_auth_patch.changed | default(false) + +# `docker compose down && up -d` (not `restart`) because compose only +# re-reads the env file at container creation time. The Auth SPA owns the +# IBL_DIRECT_SSO_URL / EXTERNAL_IDP_LOGOUT_URL renders directly; the Mentor +# SPA reuses the same AUTH block for its login deep link. 
+- name: Restart Auth + Mentor SPAs to pick up new IBL_SPA.AUTH values + become: false + ansible.builtin.shell: + cmd: | + set -e + cd {{ ibl_root }}/app/ibl-spa/auth/ && docker compose down && docker compose up -d + cd {{ ibl_root }}/app/ibl-spa/mentor/ && docker compose down && docker compose up -d + executable: /bin/bash + environment: + HOME: "/home/{{ ansible_user }}" + when: + - microsoft_sso_enabled | default(false) | bool + - spa_auth_patch.changed | default(false) + +- name: Confirm IBL_SPA.AUTH patched for Microsoft SSO + ansible.builtin.debug: + msg: >- + IBL_SPA.AUTH updated: EXTERNAL_IDP_LOGOUT_URL → login.microsoftonline.com/{{ microsoft_sso_tenant_id | default('common', true) }}/..., + IBL_DIRECT_SSO_URL → learn.{{ base_domain }}/auth/login/{{ platform_name | default('main') }}-oauth2/... + when: + - microsoft_sso_enabled | default(false) | bool + - spa_auth_patch.changed | default(false) diff --git a/src/iblai_infra/app.py b/src/iblai_infra/app.py index cba809e..568381d 100644 --- a/src/iblai_infra/app.py +++ b/src/iblai_infra/app.py @@ -160,6 +160,13 @@ def show_results(config: InfraConfig, outputs: dict, ws: Path) -> None: ui.info(f"SSH key: [highlight]{config.ssh.private_key_path}[/highlight]") ui.newline() + # Print the post-provision IAM-user setup. Operator needs to create a + # scoped runtime user in their own AWS account before `setup-env` runs + # — see src/iblai_infra/runtime_iam.py for the policy shape + why. + # Skipped for call-server (no S3 buckets, different credential flow). 
+ from iblai_infra.runtime_iam import render_runtime_access_instructions + render_runtime_access_instructions(config, outputs, ws) + def _offer_setup(config: InfraConfig, state) -> None: """After successful provision, offer to run platform setup.""" diff --git a/src/iblai_infra/cli.py b/src/iblai_infra/cli.py index 171bcfb..39d6026 100644 --- a/src/iblai_infra/cli.py +++ b/src/iblai_infra/cli.py @@ -519,7 +519,7 @@ def launch( github_org: str = typer.Option("iblai", "--github-org", help="GitHub org owning the private CLI ops + prod images repos"), cli_ops_repo: str = typer.Option("iblai-cli-ops", "--cli-ops-repo", help="CLI ops repo, or 'repo/subdir' to install from a subdirectory of a monorepo"), prod_images_repo: str = typer.Option("iblai-prod-images", "--prod-images-repo", help="Prod images repo, or 'repo/subdir' to install from a subdirectory of a monorepo"), - admin_username: str = typer.Option("ibl_admin", "--admin-username", help="Admin username"), + admin_username: str = typer.Option("platform_admin", "--admin-username", help="Admin username (cannot be a reserved name like 'ibl_admin')"), openai_key: str = typer.Option("", "--openai-key", help="OpenAI API key (optional)"), enable_ai: bool = typer.Option(True, "--enable-ai/--no-ai", help="Enable AI features"), create_playwright_platforms: bool = typer.Option( @@ -547,8 +547,11 @@ def launch( google_sso_client_id: str = typer.Option("", "--google-sso-client-id", help="Google OAuth Client ID. Setting this enables the Google SSO ansible role."), google_sso_client_secret: str = typer.Option("", "--google-sso-client-secret", help="Google OAuth Client Secret"), google_sso_organization: str = typer.Option("", "--google-sso-organization", help="Organization short name to attach to the OAuth2ProviderConfig (optional)"), - # Platform name — drives SSO backend_name + platform_key. Always populated; defaults to "main" - platform_name: str = typer.Option("main", "--platform-name", help="Platform identifier (lowercase). 
Used to derive SSO backend_name (-oauth2) and other_settings.platform_key. Default 'main'."), + # Platform name — drives SSO backend_name + platform_key AND the + # ibl_tenant_platform role. Unset (or empty) resolves to 'main' (system + # default tenant — no tenant launch). 'main' is reserved as an explicit + # input so operators pick a real tenant key or leave it alone. + platform_name: str | None = typer.Option(None, "--platform-name", help="Tenant platform key (lowercase). Leave unset for 'main' (system default, no tenant launch). 'main' is reserved as an explicit value."), # Microsoft SSO — `--microsoft-sso-client-id` is the trigger; if empty, the role no-ops microsoft_sso_client_id: str = typer.Option("", "--microsoft-sso-client-id", help="Microsoft Azure AD Application (Client) ID. Setting this enables the Microsoft SSO ansible role."), microsoft_sso_client_secret: str = typer.Option("", "--microsoft-sso-client-secret", help="Microsoft Azure AD Client Secret value"), @@ -591,6 +594,46 @@ def launch( if not admin_email or not admin_password: ui.error("--admin-email and --admin-password are required for single/multi-server deployments.") raise typer.Exit(1) + from iblai_infra.models import ( + RESERVED_ADMIN_USERNAMES, + RESERVED_PLATFORM_NAMES, + is_reserved_admin_username, + is_reserved_platform_name, + ) + if is_reserved_admin_username(admin_username): + reserved = ", ".join(sorted(RESERVED_ADMIN_USERNAMES)) + ui.error( + f"--admin-username {admin_username!r} is reserved for system use." + ) + ui.muted(f"Reserved: {reserved}. Pick a different name (e.g. 'platform_admin').") + raise typer.Exit(1) + # --platform-name 'main' is rejected explicitly. Unset resolves to + # 'main' silently (system default tenant, no tenant launch). + if platform_name is not None and is_reserved_platform_name(platform_name): + reserved = ", ".join(sorted(RESERVED_PLATFORM_NAMES)) + ui.error( + f"--platform-name {platform_name!r} is reserved for the system default tenant." 
+ ) + ui.muted( + f"Reserved: {reserved}. Omit --platform-name for the default, " + "or pick a tenant key like 'acme'." + ) + raise typer.Exit(1) + # Resolve None → 'main' downstream so SetupConfig + ansible see a value. + platform_name = (platform_name or "main").strip().lower() + + # Heads-up if the operator picked a 32 GB box AND wants AI on. Not blocking — + # they can still proceed. Skipped for call-server (LiveKit has different + # sizing constraints) and for instance types we don't know the RAM of. + if deployment_type != "call-server" and enable_ai: + from iblai_infra.models import instance_ram_gb + ram = instance_ram_gb(instance_type) + if ram is not None and ram <= 32: + ui.warning( + f"--instance-type [highlight]{instance_type}[/highlight] has {ram} GB RAM " + f"and AI features are enabled." + ) + ui.muted(" 64 GB (e.g. m5.4xlarge or r5.2xlarge) is strongly recommended for AI workloads.") _run_launch( ami_id=ami_id, domain=domain, hosted_zone_id=hosted_zone_id, @@ -706,7 +749,18 @@ def launch_env( volume_size = int(env.get("VOLUME_SIZE", "200")) environment = env.get("ENVIRONMENT", "staging") cli_tag = env.get("CLI_TAG", "3.19.0") - admin_username = env.get("ADMIN_USERNAME", "ibl_admin") + admin_username = env.get("ADMIN_USERNAME", "platform_admin").strip() + from iblai_infra.models import ( + RESERVED_ADMIN_USERNAMES, + is_reserved_admin_username, + ) + if is_reserved_admin_username(admin_username): + reserved = ", ".join(sorted(RESERVED_ADMIN_USERNAMES)) + ui.error( + f"ADMIN_USERNAME={admin_username!r} is reserved for system use." + ) + ui.muted(f"Reserved: {reserved}. Pick a different name (e.g. 
'platform_admin').") + raise typer.Exit(1) openai_key = env.get("OPENAI_API_KEY", "") enable_ai = env.get("ENABLE_AI", "true").lower() in ("true", "1", "yes") create_playwright_platforms = env.get("CREATE_PLAYWRIGHT_PLATFORMS", "false").lower() in ("true", "1", "yes") @@ -730,7 +784,29 @@ def launch_env( google_sso_client_id = env.get("GOOGLE_SSO_CLIENT_ID", "") google_sso_client_secret = env.get("GOOGLE_SSO_CLIENT_SECRET", "") google_sso_organization = env.get("GOOGLE_SSO_ORGANIZATION", "") - platform_name = env.get("PLATFORM_NAME", "main") + # PLATFORM_NAME: blank/absent → 'main' (system default tenant, no tenant + # launch). Explicit 'main' is rejected — operator must either leave it + # alone or pick a real tenant key. + raw_platform_name = env.get("PLATFORM_NAME") + if raw_platform_name is not None and raw_platform_name.strip(): + from iblai_infra.models import ( + RESERVED_PLATFORM_NAMES, + is_reserved_platform_name, + ) + candidate = raw_platform_name.strip().lower() + if is_reserved_platform_name(candidate): + reserved = ", ".join(sorted(RESERVED_PLATFORM_NAMES)) + ui.error( + f"PLATFORM_NAME={candidate!r} is reserved for the system default tenant." + ) + ui.muted( + f"Reserved: {reserved}. Remove the line (or leave it unset) to " + "use the default, or pick a tenant key like 'acme'." + ) + raise typer.Exit(1) + platform_name = candidate + else: + platform_name = "main" microsoft_sso_client_id = env.get("MICROSOFT_SSO_CLIENT_ID", "") microsoft_sso_client_secret = env.get("MICROSOFT_SSO_CLIENT_SECRET", "") microsoft_sso_tenant_id = env.get("MICROSOFT_SSO_TENANT_ID", "") @@ -757,6 +833,16 @@ def launch_env( ] ui.summary_panel("Launch Configuration", rows) + # Same memory heads-up as the `launch` flag-driven flow. + if enable_ai: + from iblai_infra.models import instance_ram_gb + ram = instance_ram_gb(instance_type) + if ram is not None and ram <= 32: + ui.warning( + f"INSTANCE_TYPE={instance_type!r} has {ram} GB RAM and AI features are enabled." 
+ ) + ui.muted(" 64 GB (e.g. m5.4xlarge or r5.2xlarge) is strongly recommended for AI workloads.") + confirm = questionary.confirm( "Proceed with launch?", default=True, @@ -857,6 +943,23 @@ def provision_env( rows.append(("AWS profile", config.credentials.profile)) ui.summary_panel("Provision Configuration", rows) + # Same memory heads-up the interactive `provision` wizard and the launch + # flows surface. provision-env doesn't know whether AI will be enabled + # downstream (that's a setup-step decision), so we warn unconditionally + # on 32 GB boxes — the operator can ignore if they're sure AI stays off. + from iblai_infra.models import instance_ram_gb + ram = instance_ram_gb(config.compute.instance_type) + if ram is not None and ram <= 32: + ui.warning( + f"INSTANCE_TYPE={config.compute.instance_type!r} has {ram} GB RAM." + ) + ui.muted( + " If you plan to enable AI features during setup (the default for IBL deployments)," + ) + ui.muted( + " 64 GB (e.g. m5.4xlarge or r5.2xlarge) is strongly recommended." 
+ ) + ui.newline() ui.console.print(" [brand]Provisioning infrastructure...[/brand]") diff --git a/src/iblai_infra/env_provision.py b/src/iblai_infra/env_provision.py index b75ba80..dd69351 100644 --- a/src/iblai_infra/env_provision.py +++ b/src/iblai_infra/env_provision.py @@ -211,7 +211,7 @@ def _build_network(env: dict[str, str]) -> NetworkConfig: def _build_compute(env: dict[str, str]) -> ComputeConfig: instance_type = (env.get("INSTANCE_TYPE") or "t3.2xlarge").strip() volume_type = (env.get("VOLUME_TYPE") or "gp3").strip() - volume_raw = (env.get("VOLUME_SIZE") or "50").strip() + volume_raw = (env.get("VOLUME_SIZE") or "100").strip() try: volume_size = int(volume_raw) except ValueError: diff --git a/src/iblai_infra/env_setup.py b/src/iblai_infra/env_setup.py index 5cdae66..819a41f 100644 --- a/src/iblai_infra/env_setup.py +++ b/src/iblai_infra/env_setup.py @@ -35,9 +35,13 @@ InfraConfig, NetworkConfig, ProjectState, + RESERVED_ADMIN_USERNAMES, + RESERVED_PLATFORM_NAMES, SetupConfig, SSHConfig, SSHKeyMethod, + is_reserved_admin_username, + is_reserved_platform_name, ) from iblai_infra.prompts.setup import validate_key_permissions from iblai_infra.terraform.state import WORKSPACE_ROOT, load_state, save_state @@ -200,6 +204,33 @@ def build_setup_config_from_env( admin_password = env["ADMIN_PASSWORD"] if len(admin_password) < 8: raise _fail("ADMIN_PASSWORD must be at least 8 characters.") + admin_username = env["ADMIN_USERNAME"].strip() + if is_reserved_admin_username(admin_username): + reserved = ", ".join(sorted(RESERVED_ADMIN_USERNAMES)) + raise _fail( + f"ADMIN_USERNAME={admin_username!r} is reserved for system use.", + hint=f"Reserved usernames: {reserved}. Pick a different one (e.g. 'platform_admin').", + ) + + # PLATFORM_NAME: blank/absent → resolves to 'main' (system default + # tenant, no tenant launch). 
Explicitly setting it to 'main' is + # rejected — operators shouldn't pick the reserved name; they should + # either leave it unset or pick a real tenant key. + raw_platform_name = env.get("PLATFORM_NAME") + if raw_platform_name is not None and raw_platform_name.strip(): + candidate = raw_platform_name.strip().lower() + if is_reserved_platform_name(candidate): + reserved = ", ".join(sorted(RESERVED_PLATFORM_NAMES)) + raise _fail( + f"PLATFORM_NAME={candidate!r} is reserved for the system default tenant.", + hint=( + f"Reserved: {reserved}. Leave PLATFORM_NAME unset (or remove the line) " + f"to use the default, or pick a tenant key like 'acme'." + ), + ) + platform_name = candidate + else: + platform_name = "main" # Resolve "where to deploy" fields, allowing env to override state. target_host = (env.get("TARGET_HOST") or "").strip() @@ -268,12 +299,15 @@ def build_setup_config_from_env( aws_access_key_id=env["AWS_ACCESS_KEY_ID"].strip(), aws_secret_access_key=env["AWS_SECRET_ACCESS_KEY"].strip(), aws_default_region=region, + ecr_aws_access_key_id=(env.get("ECR_AWS_ACCESS_KEY_ID") or "").strip(), + ecr_aws_secret_access_key=(env.get("ECR_AWS_SECRET_ACCESS_KEY") or ""), + ecr_aws_default_region=(env.get("ECR_AWS_DEFAULT_REGION") or "").strip(), git_access_token=git_token, github_org=(env.get("GITHUB_ORG") or "iblai").strip(), cli_ops_repo=(env.get("CLI_OPS_REPO") or "iblai-cli-ops").strip(), prod_images_repo=(env.get("PROD_IMAGES_REPO") or "iblai-prod-images").strip(), openai_api_key=(env.get("OPENAI_API_KEY") or "").strip(), - admin_username=env["ADMIN_USERNAME"].strip(), + admin_username=admin_username, admin_email=admin_email, admin_password=admin_password, # SMTP @@ -299,7 +333,7 @@ def build_setup_config_from_env( env.get("STRIPE_CONNECT_WEBHOOK_SECRET") or "" ).strip(), # Platform name + SSO - platform_name=(env.get("PLATFORM_NAME") or "main").strip().lower(), + platform_name=platform_name, google_sso_enabled=google_sso_enabled, 
google_sso_client_id=google_sso_client_id, google_sso_client_secret=(env.get("GOOGLE_SSO_CLIENT_SECRET") or "").strip(), diff --git a/src/iblai_infra/models.py b/src/iblai_infra/models.py index c31ec28..637c037 100644 --- a/src/iblai_infra/models.py +++ b/src/iblai_infra/models.py @@ -9,7 +9,7 @@ from pathlib import Path from typing import Literal -from pydantic import BaseModel, Field, field_validator +from pydantic import BaseModel, Field, field_validator, model_validator # --------------------------------------------------------------------------- @@ -21,7 +21,7 @@ def parse_repo_path(value: str) -> tuple[str, str | None]: Used by the ansible runner to point installs at a package inside a monorepo. `iblai-cli-ops` -> ('iblai-cli-ops', None); - `kaplan-iblai-infra-ops/iblai-cli-ops` -> ('kaplan-iblai-infra-ops', + `-iblai-infra-ops/iblai-cli-ops` -> ('-iblai-infra-ops', 'iblai-cli-ops'). """ cleaned = (value or "").strip().strip("/") @@ -99,6 +99,22 @@ class DeploymentType(str, Enum): "r5.2xlarge": "8 vCPU, 64 GB RAM — Memory optimized", } +# RAM (in GB) for the instance types we publish in the picker. Used by the +# prompt + launch flows to warn operators when they pick a 32 GB box — +# AI-enabled platforms benefit substantially from 64 GB. +INSTANCE_RAM_GB: dict[str, int] = { + "t3.xlarge": 16, + "t3.2xlarge": 32, + "m5.2xlarge": 32, + "m5.4xlarge": 64, + "r5.2xlarge": 64, +} + + +def instance_ram_gb(instance_type: str) -> int | None: + """Return RAM in GB for a known instance type, or None for unknown/custom.""" + return INSTANCE_RAM_GB.get((instance_type or "").strip()) + # LiveKit (call-server) sizing recommendations. Per LiveKit's self-hosting # guide, SFU-only workloads fit on 2 vCPU boxes; egress/recording benefits # from CPU-optimized (c5) families. 
@@ -164,10 +180,17 @@ def validate_ip(cls, v: str) -> str: class ComputeConfig(BaseModel): instance_type: str = "t3.2xlarge" - volume_size: int = 50 + volume_size: int = 100 volume_type: str = "gp3" ami_id: str | None = None + # Floor of 20 GB here is the lower bound for *any* compute config — + # call-server reuses ComputeConfig as the synced-from-CallServerConfig + # placeholder and runs LiveKit on a small disk (~40 GB). The + # IBL-platform-minimum 100 GB floor is enforced on `InfraConfig` (and + # at the prompt / CLI / .env input layers) only for `DeploymentType.SINGLE`, + # since multi-server uses `MultiServerConfig.{app_server,services}_volume_size` + # and multi's own validator handles that case. @field_validator("volume_size") @classmethod def validate_volume(cls, v: int) -> int: @@ -236,8 +259,8 @@ def validate_app_server_count(cls, v: int) -> int: @field_validator("app_server_volume_size", "services_volume_size") @classmethod def validate_volume_sizes(cls, v: int) -> int: - if v < 20: - raise ValueError("Volume size must be at least 20 GB") + if v < 100: + raise ValueError("Volume size must be at least 100 GB") return v @@ -290,6 +313,19 @@ def validate_project_name(cls, v: str) -> str: raise ValueError("Project name must be 32 characters or fewer") return v + # Enforce the 100 GB platform-disk floor on SINGLE deployments. MULTI is + # already covered by `MultiServerConfig.validate_volume_sizes`; CALL uses + # `CallServerConfig.volume_size` (LiveKit only needs ~40 GB) and reuses + # ComputeConfig as a placeholder, so we don't enforce a 100 GB floor on it. 
+ @model_validator(mode="after") + def _validate_single_server_volume_size(self) -> "InfraConfig": + if self.deployment_type == DeploymentType.SINGLE and self.compute.volume_size < 100: + raise ValueError( + "Single-server volume size must be at least 100 GB " + f"(got {self.compute.volume_size})" + ) + return self + @property def resource_prefix(self) -> str: return f"{self.project_name}-{self.environment.value}" @@ -316,6 +352,34 @@ class ProjectState(BaseModel): # Setup config — contract between setup prompts and AnsibleRunner # --------------------------------------------------------------------------- +# Usernames reserved for system / platform-internal use. The ibl_spa role +# looks up `ibl_admin` to own the `spa-sso` and `ibl_web` OAuth2 Application +# records on the LMS — that user is created by the platform's own bootstrap +# (`ibl edx` / `ibl dm` launch flows) before ibl_spa runs. Operators must +# pick a different name for their human superuser so the system account +# stays separate. +RESERVED_ADMIN_USERNAMES: frozenset[str] = frozenset({"ibl_admin"}) + + +def is_reserved_admin_username(value: str) -> bool: + """Return True if `value` collides with a reserved system username.""" + return (value or "").strip().lower() in RESERVED_ADMIN_USERNAMES + + +# Platform identifiers reserved for system / platform-internal use. `main` +# is the IBL default tenant the platform itself creates and maintains via +# `ibl launch`. Operators can't pick `main` as a tenant name — instead they +# leave the field blank/unset, which silently resolves to `main` for SSO +# backwards-compat (backend_name=`main-oauth2`) and skips the tenant +# launcher (see `ibl_tenant_platform` ansible role). 
+RESERVED_PLATFORM_NAMES: frozenset[str] = frozenset({"main"}) + + +def is_reserved_platform_name(value: str) -> bool: + """Return True if `value` collides with a reserved system platform name.""" + return (value or "").strip().lower() in RESERVED_PLATFORM_NAMES + + class SetupConfig(BaseModel): """Variables needed to bootstrap a provisioned VM. Never persisted to disk.""" ssh_private_key_path: Path @@ -329,9 +393,22 @@ class SetupConfig(BaseModel): enable_ai: bool = True is_resetup: bool = False create_playwright_platforms: bool = False + # S3 access keys — customer-created post-provision (scoped to the three + # dm-media / dm-static / backups buckets Terraform created). Written to + # `/ibl/config.yml` root by the `ibl_platform` role; consumed by DM / + # edX at runtime via iblai-cli-ops templating. aws_access_key_id: str aws_secret_access_key: str aws_default_region: str + # ECR pull keys — provided by ibl.ai out-of-band. Written to + # `~/.aws/credentials` `[default]` profile on the host by the `awscli` + # role; consumed by `aws ecr get-login-password` in any role that does + # `docker login`. Optional — when empty, the S3 keys above fall through + # (backwards-compatible with one-key-set deployments). Secret is + # `Field(exclude=True)` so it never lands in `state.json`. + ecr_aws_access_key_id: str = "" + ecr_aws_secret_access_key: str = Field(default="", exclude=True) + ecr_aws_default_region: str = "" git_access_token: str # GitHub org + repo names for the two private packages this setup # installs (iblai-prod-images directly, iblai-cli-ops transitively). @@ -339,13 +416,13 @@ class SetupConfig(BaseModel): # own repos. Defaults reflect the canonical IBL deployment. github_org: str = "iblai" # Each repo field accepts either a bare repo name (`iblai-cli-ops`) or a - # `repo/subdir` path (`kaplan-iblai-infra-ops/iblai-cli-ops`) to point at + # `repo/subdir` path (`-iblai-infra-ops/iblai-cli-ops`) to point at # a package inside a monorepo. 
Parsed by `parse_repo_path()` before the # install URL is built. cli_ops_repo: str = "iblai-cli-ops" prod_images_repo: str = "iblai-prod-images" openai_api_key: str = "" - admin_username: str = "ibl_admin" + admin_username: str = "platform_admin" admin_email: str = "" admin_password: str = "" # SMTP for outbound email (magic-link tests etc.). Disabled by default; @@ -402,6 +479,18 @@ class SetupConfig(BaseModel): microsoft_sso_tenant_id: str = "" microsoft_sso_organization: str = "" + @field_validator("admin_username") + @classmethod + def _validate_admin_username(cls, v: str) -> str: + s = (v or "").strip() + if not s: + raise ValueError("admin_username must not be empty") + if s.lower() in RESERVED_ADMIN_USERNAMES: + raise ValueError( + f"'{s}' is reserved for system use; pick a different admin username" + ) + return s + # --------------------------------------------------------------------------- # Ingress — pre-provisioned domain endpoints diff --git a/src/iblai_infra/prompts/infrastructure.py b/src/iblai_infra/prompts/infrastructure.py index f5bffe3..d593600 100644 --- a/src/iblai_infra/prompts/infrastructure.py +++ b/src/iblai_infra/prompts/infrastructure.py @@ -23,7 +23,31 @@ SSHConfig, SSHKeyMethod, generate_password, + instance_ram_gb, ) + + +def _warn_if_low_memory(instance_type: str, *, context: str = "") -> None: + """Warn the operator when they pick a 32 GB (or smaller) instance. + + AI workloads on the platform (mentor LLMs, embedding generation, retrieval) + can easily exhaust 32 GB once concurrent users + edX + DM are all in play. + We surface a non-blocking heads-up so the operator can revise their pick + before terraform provisions the box. Unknown / custom instance types are + skipped — we can't reason about their memory. 
+ """ + ram = instance_ram_gb(instance_type) + if ram is None or ram > 32: + return + label = f"[highlight]{instance_type}[/highlight] ({ram} GB RAM)" + where = f" for {context}" if context else "" + ui.warning(f"Selected{where}: {label}.") + ui.muted( + " If you plan to enable AI features (the default for IBL deployments)," + ) + ui.muted( + " 64 GB (e.g. [brand]m5.4xlarge[/brand] or [brand]r5.2xlarge[/brand]) is strongly recommended." + ) from iblai_infra.providers.aws import ( detect_current_ip, get_session, @@ -156,11 +180,13 @@ def prompt_project_and_compute() -> ( if instance_type is None: ui.abort() + _warn_if_low_memory(instance_type) + # ----- single-server: volume ----- volume_size = questionary.text( "Root volume size in GB:", - default="50", - validate=lambda v: (v.isdigit() and int(v) >= 20) or "Must be a number >= 20", + default="100", + validate=lambda v: (v.isdigit() and int(v) >= 100) or "Must be a number >= 100", style=ui.PROMPT_STYLE, qmark=ui.QMARK, ).ask() @@ -231,10 +257,12 @@ def _prompt_multi_server_config() -> MultiServerConfig: if app_instance_type is None: ui.abort() + _warn_if_low_memory(app_instance_type, context="app server") + app_volume = questionary.text( "App server volume size (GB):", default="250", - validate=lambda v: (v.isdigit() and int(v) >= 20) or "Must be >= 20", + validate=lambda v: (v.isdigit() and int(v) >= 100) or "Must be >= 100", style=ui.PROMPT_STYLE, qmark=ui.QMARK, ).ask() @@ -270,10 +298,12 @@ def _prompt_multi_server_config() -> MultiServerConfig: if svc_instance_type is None: ui.abort() + _warn_if_low_memory(svc_instance_type, context="services server") + svc_volume = questionary.text( "Services server volume size (GB):", default="500", - validate=lambda v: (v.isdigit() and int(v) >= 20) or "Must be >= 20", + validate=lambda v: (v.isdigit() and int(v) >= 100) or "Must be >= 100", style=ui.PROMPT_STYLE, qmark=ui.QMARK, ).ask() diff --git a/src/iblai_infra/prompts/setup.py b/src/iblai_infra/prompts/setup.py index 
1fc0532..d2afd57 100644 --- a/src/iblai_infra/prompts/setup.py +++ b/src/iblai_infra/prompts/setup.py @@ -8,7 +8,44 @@ import questionary from iblai_infra import ui -from iblai_infra.models import ProjectState, SetupConfig, SSHKeyMethod +from iblai_infra.models import ( + ProjectState, + RESERVED_ADMIN_USERNAMES, + RESERVED_PLATFORM_NAMES, + SetupConfig, + SSHKeyMethod, + is_reserved_admin_username, + is_reserved_platform_name, +) + + +def _validate_admin_username(value: str) -> bool | str: + """questionary-compatible validator. Returns True or an error string.""" + s = (value or "").strip() + if not s: + return "Admin username is required" + if is_reserved_admin_username(s): + reserved = ", ".join(sorted(RESERVED_ADMIN_USERNAMES)) + return f"'{s}' is reserved for system use. Reserved: {reserved}" + return True + + +def _validate_tenant_platform_name(value: str) -> bool | str: + """questionary-compatible validator. Blank is accepted (resolves to + `main` implicitly downstream). Explicit `main` is rejected so the + operator can't co-opt the system default tenant. + """ + s = (value or "").strip().lower() + if not s: + return True + if is_reserved_platform_name(s): + reserved = ", ".join(sorted(RESERVED_PLATFORM_NAMES)) + return ( + f"'{s}' is reserved for the system default tenant. " + f"Leave blank to use the default, or pick a different name. " + f"Reserved: {reserved}" + ) + return True SETUP_STEPS = 3 BOOTSTRAP_STEPS = 4 @@ -97,21 +134,26 @@ def _prompt_platform_config( ui.success(f"Domain: [highlight]{base_domain}[/highlight]") - # Platform name — first thing the operator sets in step 2. Drives the - # SSO ansible roles (backend_name = `-oauth2`, - # other_settings.platform_key). Defaults to "main" for canonical IBL - # single-tenant deploys; tenant deployments override. 
+ # Platform name — drives the SSO ansible roles (backend_name = + # `<platform_name>-oauth2`, other_settings.platform_key) AND the + # ibl_tenant_platform role (launches a Platform + admin via + # run_launch_steps when value != 'main'). Blank input resolves to + # 'main' implicitly (the system default tenant the platform itself + # creates). Operators can't pick 'main' explicitly — it's reserved. platform_name = questionary.text( - "Platform name (lowercase identifier, default 'main'):", - default="main", - validate=lambda v: bool(v.strip()) or "Platform name is required", + "Tenant platform name (leave blank for default 'main', no tenant launch):", + default="", + validate=_validate_tenant_platform_name, style=ui.PROMPT_STYLE, qmark=ui.QMARK, ).ask() if platform_name is None: ui.abort() - platform_name = platform_name.strip().lower() - ui.success(f"Platform: [highlight]{platform_name}[/highlight]") + platform_name = platform_name.strip().lower() or "main" + if platform_name == "main": + ui.success("Platform: [highlight]main[/highlight] (default, no tenant launch)") + else: + ui.success(f"Tenant platform: [highlight]{platform_name}[/highlight] (will be launched)") edx_version = "sumac" ui.success(f"Open edX version: [highlight]Sumac[/highlight]") @@ -560,7 +602,7 @@ def _prompt_credentials( cli_ops_repo = questionary.text( "CLI ops repo (or repo/subdir for monorepo):", default="iblai-cli-ops", - instruction="(e.g. iblai-cli-ops, or kaplan-iblai-infra-ops/iblai-cli-ops)", + instruction="(e.g. iblai-cli-ops, or -iblai-infra-ops/iblai-cli-ops)", validate=lambda v: len(v.strip()) > 0 or "Required", style=ui.PROMPT_STYLE, qmark=ui.QMARK, @@ -572,7 +614,7 @@ def _prompt_credentials( prod_images_repo = questionary.text( "Prod images repo (or repo/subdir for monorepo):", default="iblai-prod-images", - instruction="(e.g. iblai-prod-images, or kaplan-iblai-infra-ops/kaplan-iblai-prod-images)", + instruction="(e.g. 
iblai-prod-images, or -iblai-infra-ops/-iblai-prod-images)", validate=lambda v: len(v.strip()) > 0 or "Required", style=ui.PROMPT_STYLE, qmark=ui.QMARK, @@ -665,7 +707,8 @@ def _prompt_credentials( admin_username = questionary.text( "Admin username:", - default="ibl_admin", + default="platform_admin", + validate=_validate_admin_username, style=ui.PROMPT_STYLE, qmark=ui.QMARK, ).ask() diff --git a/src/iblai_infra/runtime_iam.py b/src/iblai_infra/runtime_iam.py new file mode 100644 index 0000000..0169e30 --- /dev/null +++ b/src/iblai_infra/runtime_iam.py @@ -0,0 +1,174 @@ +"""Post-provision runtime IAM helper. + +Two distinct AWS credential sets are needed on the running platform: + + 1. **S3 access** to the dm-media, dm-static, and backups buckets + Terraform just created in the **operator's own AWS account**. The + operator mints these themselves by attaching the policy this module + generates to a scoped IAM user — one-time, post-provision. + 2. **ECR pulls** against IBL's image registry. These credentials are + **provided separately by IBL** (out-of-band) and are NOT in scope + for this module. The post-provision instructions intentionally do + not mention them — operators should follow IBL's hand-off + procedure for the ECR keys. + +The policy here is therefore **S3-only**: scoped to the literal bucket +ARNs Terraform created, with the verbs the platform actually uses (no +`s3:*`, no bucket-policy mutation, no lifecycle config — Terraform +configured those at provision time and the platform never revisits). 
+ +The JSON is also written to the project workspace +(`<workspace>/runtime-iam-policy.json`) so the operator can pipe it +directly into the CLI: + + aws iam put-user-policy \\ + --user-name <project>-<env>-s3-runtime \\ + --policy-name iblai-s3-runtime \\ + --policy-document file://<workspace>/runtime-iam-policy.json +""" + +from __future__ import annotations + +import json +from pathlib import Path + +from iblai_infra import ui +from iblai_infra.models import DeploymentType, InfraConfig + +# Tight S3 verbs the platform actually uses at runtime. Notably excludes +# bucket-policy / ACL mutations, lifecycle config, encryption config — all +# of which Terraform set up at provision time and the platform never +# revisits. +_S3_OBJECT_ACTIONS = [ + "s3:GetObject", + "s3:PutObject", + "s3:DeleteObject", + "s3:GetObjectAcl", + "s3:PutObjectAcl", +] +_S3_BUCKET_ACTIONS = [ + "s3:ListBucket", + "s3:GetBucketLocation", +] + +POLICY_FILENAME = "runtime-iam-policy.json" + + +def build_runtime_iam_policy(bucket_names: list[str]) -> dict: + """Build the **S3-only** IAM policy JSON document for the runtime user. + + `bucket_names` must be the literal S3 bucket names Terraform created + (the values of `s3_bucket_*` outputs). Returns a dict ready to + `json.dumps()` — no formatting opinions baked in here. + + ECR access is intentionally not included: the IBL provider hands off + those credentials separately (see module docstring). 
+ """ + if not bucket_names: + raise ValueError("at least one S3 bucket name is required") + + bucket_arns = [f"arn:aws:s3:::{b}" for b in bucket_names] + object_arns = [f"arn:aws:s3:::{b}/*" for b in bucket_names] + + return { + "Version": "2012-10-17", + "Statement": [ + { + "Sid": "PlatformBucketObjects", + "Effect": "Allow", + "Action": _S3_OBJECT_ACTIONS, + "Resource": object_arns, + }, + { + "Sid": "PlatformBucketList", + "Effect": "Allow", + "Action": _S3_BUCKET_ACTIONS, + "Resource": bucket_arns, + }, + ], + } + + +def extract_bucket_names(outputs: dict) -> list[str]: + """Pull bucket names out of a terraform outputs dict. + + Reads the three `s3_bucket_{backups,media,static}` outputs that the + single-server template emits. Returns an empty list when none are + present (e.g. call-server, which has no buckets). + """ + keys = ("s3_bucket_backups", "s3_bucket_media", "s3_bucket_static") + return [outputs[k] for k in keys if outputs.get(k)] + + +def render_runtime_access_instructions( + config: InfraConfig, + outputs: dict, + ws: Path, +) -> None: + """Print post-provision IAM-user setup instructions to the operator. + + Skips silently for `DeploymentType.CALL` (no S3 buckets and the call + stack uses its own credentials flow). Writes the policy JSON to the + workspace at `runtime-iam-policy.json` so the operator can pipe it + into `aws iam put-user-policy --policy-document file://...`. + """ + if config.deployment_type == DeploymentType.CALL: + return + + bucket_names = extract_bucket_names(outputs) + if not bucket_names: + # No buckets in outputs — terraform template might not have run S3 + # creation, or the operator pointed at a deployment shape we don't + # cover. Surface a soft note instead of printing a half-policy. + ui.muted( + "Skipping runtime IAM instructions: no S3 buckets in terraform " + "outputs." 
+ ) + return + + policy = build_runtime_iam_policy(bucket_names) + policy_path = ws / POLICY_FILENAME + policy_path.write_text(json.dumps(policy, indent=2) + "\n") + + user_name = f"{config.project_name}-{config.environment.value}-s3-runtime" + + ui.newline() + ui.console.rule("[bold yellow]Next: create the S3 IAM user[/]") + ui.console.print( + "The platform server reads / writes the three S3 buckets Terraform\n" + "just created in [bold]your own AWS account[/bold]. Create a scoped IAM user\n" + "with the policy below and paste its access key into [highlight].env.setup[/highlight].\n" + ) + ui.console.print( + " [muted]The policy has already been saved to:[/muted]\n" + f" [highlight]{policy_path}[/highlight]\n" + ) + + # Show the policy verbatim so the operator can sanity-check before + # creating anything. Indented blob renders monospace via the IBL theme. + ui.console.rule("[muted]runtime-iam-policy.json[/muted]") + ui.console.print(json.dumps(policy, indent=2)) + ui.console.rule() + ui.newline() + + ui.console.print(" [bold]One-time setup — copy/paste into your shell:[/]\n") + ui.console.print( + f" [highlight]aws iam create-user --user-name {user_name}[/highlight]\n" + f" [highlight]aws iam put-user-policy \\\n" + f" --user-name {user_name} \\\n" + f" --policy-name iblai-s3-runtime \\\n" + f" --policy-document file://{policy_path}[/highlight]\n" + f" [highlight]aws iam create-access-key --user-name {user_name}[/highlight]\n" + ) + ui.console.print( + " Copy the [bold]AccessKeyId[/bold] + [bold]SecretAccessKey[/bold] from the last command into your\n" + " [highlight].env.setup[/highlight] as [highlight]AWS_ACCESS_KEY_ID[/highlight] and [highlight]AWS_SECRET_ACCESS_KEY[/highlight], then run:\n" + ) + ui.console.print( + f" [brand]iblai infra setup-env {config.project_name} -f .env.setup[/brand]\n" + ) + ui.muted( + " For ECR images, use AWS credentials provided by ibl.ai — " + "or contact us at https://ibl.ai/contact" + ) + ui.newline() diff --git 
a/tests/ansible/test_runner.py b/tests/ansible/test_runner.py index bc6dc71..8ca72be 100644 --- a/tests/ansible/test_runner.py +++ b/tests/ansible/test_runner.py @@ -408,15 +408,15 @@ def test_total_roles_matches_labels(self): assert TOTAL_ROLES == len(ROLE_LABELS) def test_expected_roles(self): - expected = {"docker", "awscli", "python", "ibl_cli_ops", "ibl_platform", "smtp_config", "ibl_dm", "ibl_edx", "ibl_spa", "integrations", "admin_setup", "data_seeding", "stripe_config", "google_sso_config", "microsoft_sso_config"} + expected = {"docker", "awscli", "python", "ibl_cli_ops", "ibl_platform", "smtp_config", "ibl_dm", "ibl_edx", "ibl_spa", "integrations", "admin_setup", "data_seeding", "ibl_tenant_platform", "stripe_config", "google_sso_config", "microsoft_sso_config"} assert set(ROLE_LABELS.keys()) == expected def test_launch_role_labels(self): - expected = {"ibl_cli_ops", "ibl_launch", "smtp_config", "ibl_launch_services", "integrations", "admin_setup", "data_seeding", "stripe_config", "google_sso_config", "microsoft_sso_config"} + expected = {"ibl_cli_ops", "ibl_launch", "smtp_config", "ibl_launch_services", "integrations", "admin_setup", "data_seeding", "ibl_tenant_platform", "stripe_config", "google_sso_config", "microsoft_sso_config"} assert set(LAUNCH_ROLE_LABELS.keys()) == expected def test_launch_role_labels_count(self): - assert len(LAUNCH_ROLE_LABELS) == 10 + assert len(LAUNCH_ROLE_LABELS) == 11 def test_service_update_role_labels(self): expected = {"ibl_cli_ops", "ibl_service_update"} @@ -454,12 +454,19 @@ def test_ssh_permission_denied(self, runner): with patch("iblai_infra.ansible.runner.subprocess.run", return_value=mock_result): assert runner._test_ssh() is False + # The five tests below exercise the retry path of `_test_ssh()` + # (10 retries × 15s sleep). They mock `time.sleep` so they finish + # in milliseconds while still asserting the exhaust-retries → False + # behavior end-to-end. 
def test_ssh_connection_refused(self, runner): mock_result = MagicMock() mock_result.returncode = 255 mock_result.stderr = "Connection refused" - with patch("iblai_infra.ansible.runner.subprocess.run", return_value=mock_result): + with ( + patch("iblai_infra.ansible.runner.subprocess.run", return_value=mock_result), + patch("iblai_infra.ansible.runner.time.sleep"), + ): assert runner._test_ssh() is False def test_ssh_connection_timed_out(self, runner): @@ -467,7 +474,10 @@ def test_ssh_connection_timed_out(self, runner): mock_result.returncode = 255 mock_result.stderr = "Connection timed out" - with patch("iblai_infra.ansible.runner.subprocess.run", return_value=mock_result): + with ( + patch("iblai_infra.ansible.runner.subprocess.run", return_value=mock_result), + patch("iblai_infra.ansible.runner.time.sleep"), + ): assert runner._test_ssh() is False def test_ssh_no_route(self, runner): @@ -475,7 +485,10 @@ def test_ssh_no_route(self, runner): mock_result.returncode = 255 mock_result.stderr = "No route to host" - with patch("iblai_infra.ansible.runner.subprocess.run", return_value=mock_result): + with ( + patch("iblai_infra.ansible.runner.subprocess.run", return_value=mock_result), + patch("iblai_infra.ansible.runner.time.sleep"), + ): assert runner._test_ssh() is False def test_ssh_other_error(self, runner): @@ -483,7 +496,10 @@ def test_ssh_other_error(self, runner): mock_result.returncode = 1 mock_result.stderr = "Some unexpected error" - with patch("iblai_infra.ansible.runner.subprocess.run", return_value=mock_result): + with ( + patch("iblai_infra.ansible.runner.subprocess.run", return_value=mock_result), + patch("iblai_infra.ansible.runner.time.sleep"), + ): assert runner._test_ssh() is False def test_ssh_empty_stderr(self, runner): @@ -491,7 +507,10 @@ def test_ssh_empty_stderr(self, runner): mock_result.returncode = 1 mock_result.stderr = "" - with patch("iblai_infra.ansible.runner.subprocess.run", return_value=mock_result): + with ( + 
patch("iblai_infra.ansible.runner.subprocess.run", return_value=mock_result), + patch("iblai_infra.ansible.runner.time.sleep"), + ): assert runner._test_ssh() is False diff --git a/tests/conftest.py b/tests/conftest.py index e9e0b96..067ed2e 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -47,7 +47,7 @@ def infra_config(aws_credentials: AWSCredentials) -> InfraConfig: environment=Environment.DEV, credentials=aws_credentials, network=NetworkConfig(vpc_cidr="10.0.0.0/16", vpn_ip="203.0.113.42"), - compute=ComputeConfig(instance_type="t3.2xlarge", volume_size=50, volume_type="gp3"), + compute=ComputeConfig(instance_type="t3.2xlarge", volume_size=100, volume_type="gp3"), ssh=SSHConfig( method=SSHKeyMethod.GENERATE, key_name="testproject-dev", diff --git a/tests/prompts/test_review.py b/tests/prompts/test_review.py index 3c5c45b..017d89f 100644 --- a/tests/prompts/test_review.py +++ b/tests/prompts/test_review.py @@ -41,7 +41,7 @@ def _make_config( account_id="123456789012", ), network=NetworkConfig(vpn_ip="1.2.3.4"), - compute=ComputeConfig(instance_type="t3.2xlarge", volume_size=50, volume_type="gp3"), + compute=ComputeConfig(instance_type="t3.2xlarge", volume_size=100, volume_type="gp3"), ssh=SSHConfig( method=ssh_method, key_name="test-key", diff --git a/tests/prompts/test_setup.py b/tests/prompts/test_setup.py index 84a6443..548809d 100644 --- a/tests/prompts/test_setup.py +++ b/tests/prompts/test_setup.py @@ -192,7 +192,7 @@ def test_full_flow_reuse_credentials(self, tmp_path): mock_password.return_value.ask.side_effect = ["ghp_testtoken", "", "Admin1234"] # confirms: enable_ai, create_playwright_platforms, smtp_enabled, stripe_enabled, google_sso_enabled, microsoft_sso_enabled, reuse credentials mock_confirm.return_value.ask.side_effect = [True, False, False, False, False, False, True] - mock_text.return_value.ask.side_effect = ["main", "3.19.0", "iblai", "iblai-cli-ops", "iblai-prod-images", "ibl_admin", "admin@example.com"] + 
mock_text.return_value.ask.side_effect = ["main", "3.19.0", "iblai", "iblai-cli-ops", "iblai-prod-images", "platform_admin", "admin@example.com"] config = prompt_setup(state) @@ -206,7 +206,7 @@ def test_full_flow_reuse_credentials(self, tmp_path): assert config.git_access_token == "ghp_testtoken" assert config.target_host == "54.1.2.3" assert config.base_domain == "example.com" - assert config.admin_username == "ibl_admin" + assert config.admin_username == "platform_admin" assert config.admin_email == "admin@example.com" assert config.admin_password == "Admin1234" @@ -224,7 +224,7 @@ def test_full_flow_new_credentials(self, tmp_path): mock_password.return_value.ask.side_effect = ["ghp_testtoken", "NEW_SECRET", "sk-test-key", "Admin1234"] # confirms: enable_ai, create_playwright_platforms, smtp_enabled, stripe_enabled, google_sso_enabled, microsoft_sso_enabled, don't reuse credentials mock_confirm.return_value.ask.side_effect = [True, False, False, False, False, False, False] - mock_text.return_value.ask.side_effect = ["main", "3.19.0", "iblai", "iblai-cli-ops", "iblai-prod-images", "NEW_ACCESS_KEY", "ibl_admin", "admin@example.com"] + mock_text.return_value.ask.side_effect = ["main", "3.19.0", "iblai", "iblai-cli-ops", "iblai-prod-images", "NEW_ACCESS_KEY", "platform_admin", "admin@example.com"] config = prompt_setup(state) @@ -250,7 +250,7 @@ def test_flow_no_access_keys_prompts_directly(self, tmp_path): mock_password.return_value.ask.side_effect = ["ghp_testtoken", "SECRET", "", "Admin1234"] # confirms: enable_ai, create_playwright_platforms, smtp_enabled, stripe_enabled, google_sso_enabled, microsoft_sso_enabled (no reuse prompt when no access keys) mock_confirm.return_value.ask.side_effect = [True, True, False, False, False, False] - mock_text.return_value.ask.side_effect = ["main", "3.19.0", "iblai", "iblai-cli-ops", "iblai-prod-images", "ACCESS_KEY", "ibl_admin", "admin@example.com"] + mock_text.return_value.ask.side_effect = ["main", "3.19.0", "iblai", 
"iblai-cli-ops", "iblai-prod-images", "ACCESS_KEY", "platform_admin", "admin@example.com"] config = prompt_setup(state) @@ -280,7 +280,7 @@ def test_ssh_key_not_found_prompts(self, tmp_path): # confirms: enable_ai, create_playwright_platforms, smtp_enabled, stripe_enabled, google_sso_enabled, microsoft_sso_enabled, reuse credentials mock_confirm.return_value.ask.side_effect = [True, False, False, False, False, False, True] mock_path.return_value.ask.return_value = str(new_key) - mock_text.return_value.ask.side_effect = ["main", "3.19.0", "iblai", "iblai-cli-ops", "iblai-prod-images", "ibl_admin", "admin@example.com"] + mock_text.return_value.ask.side_effect = ["main", "3.19.0", "iblai", "iblai-cli-ops", "iblai-prod-images", "platform_admin", "admin@example.com"] config = prompt_setup(state) @@ -306,7 +306,7 @@ def test_existing_file_method_prompts_for_key(self, tmp_path): # confirms: enable_ai, create_playwright_platforms, smtp_enabled, stripe_enabled, google_sso_enabled, microsoft_sso_enabled, reuse credentials mock_confirm.return_value.ask.side_effect = [True, False, False, False, False, False, True] mock_path.return_value.ask.return_value = str(key) - mock_text.return_value.ask.side_effect = ["main", "3.19.0", "iblai", "iblai-cli-ops", "iblai-prod-images", "ibl_admin", "admin@example.com"] + mock_text.return_value.ask.side_effect = ["main", "3.19.0", "iblai", "iblai-cli-ops", "iblai-prod-images", "platform_admin", "admin@example.com"] config = prompt_setup(state) @@ -332,7 +332,7 @@ def test_aws_keypair_method_prompts_for_key(self, tmp_path): # confirms: enable_ai, create_playwright_platforms, smtp_enabled, stripe_enabled, google_sso_enabled, microsoft_sso_enabled, reuse credentials mock_confirm.return_value.ask.side_effect = [True, False, False, False, False, False, True] mock_path.return_value.ask.return_value = str(key) - mock_text.return_value.ask.side_effect = ["main", "3.19.0", "iblai", "iblai-cli-ops", "iblai-prod-images", "ibl_admin", 
"admin@example.com"] + mock_text.return_value.ask.side_effect = ["main", "3.19.0", "iblai", "iblai-cli-ops", "iblai-prod-images", "platform_admin", "admin@example.com"] config = prompt_setup(state) @@ -376,7 +376,7 @@ def test_full_flow_smtp_enabled(self, tmp_path): "iblai", "iblai-cli-ops", "iblai-prod-images", - "ibl_admin", + "platform_admin", "admin@example.com", ] @@ -433,7 +433,7 @@ def test_full_flow_stripe_enabled(self, tmp_path): "iblai", "iblai-cli-ops", "iblai-prod-images", - "ibl_admin", + "platform_admin", "admin@example.com", ] @@ -487,7 +487,7 @@ def test_full_flow_google_sso_enabled(self, tmp_path): "iblai", "iblai-cli-ops", "iblai-prod-images", - "ibl_admin", + "platform_admin", "admin@example.com", ] @@ -539,7 +539,7 @@ def test_full_flow_microsoft_sso_enabled(self, tmp_path): "iblai", "iblai-cli-ops", "iblai-prod-images", - "ibl_admin", + "platform_admin", "admin@example.com", ] @@ -575,7 +575,7 @@ def test_platform_name_lowercased_and_stripped(self, tmp_path): "iblai", "iblai-cli-ops", "iblai-prod-images", - "ibl_admin", + "platform_admin", "admin@example.com", ] @@ -637,7 +637,7 @@ def test_full_resetup_flow(self, tmp_path): # Only one confirm: reuse credentials mock_confirm.return_value.ask.return_value = True # text prompts: base_domain, cli_ops_release_tag, admin_username, admin_email - mock_text.return_value.ask.side_effect = ["new.example.com", "3.19.0", "iblai", "iblai-cli-ops", "iblai-prod-images", "ibl_admin", "admin@example.com"] + mock_text.return_value.ask.side_effect = ["new.example.com", "3.19.0", "iblai", "iblai-cli-ops", "iblai-prod-images", "platform_admin", "admin@example.com"] config = prompt_resetup(state) @@ -648,7 +648,7 @@ def test_full_resetup_flow(self, tmp_path): assert config.aws_access_key_id == "AKIA" assert config.aws_secret_access_key == "SECRET" assert config.git_access_token == "ghp_testtoken" - assert config.admin_username == "ibl_admin" + assert config.admin_username == "platform_admin" assert 
config.admin_email == "admin@example.com" assert config.admin_password == "Admin1234" @@ -666,7 +666,7 @@ def test_resetup_prompts_for_base_domain(self, tmp_path): ): mock_password.return_value.ask.side_effect = ["ghp_testtoken", "", "Admin1234"] mock_confirm.return_value.ask.return_value = True - mock_text.return_value.ask.side_effect = ["changed.example.com", "3.19.0", "iblai", "iblai-cli-ops", "iblai-prod-images", "ibl_admin", "admin@example.com"] + mock_text.return_value.ask.side_effect = ["changed.example.com", "3.19.0", "iblai", "iblai-cli-ops", "iblai-prod-images", "platform_admin", "admin@example.com"] config = prompt_resetup(state) @@ -688,7 +688,7 @@ def test_resetup_ssh_key_resolved(self, tmp_path): ): mock_password.return_value.ask.side_effect = ["ghp_testtoken", "", "Admin1234"] mock_confirm.return_value.ask.return_value = True - mock_text.return_value.ask.side_effect = ["new.example.com", "3.19.0", "iblai", "iblai-cli-ops", "iblai-prod-images", "ibl_admin", "admin@example.com"] + mock_text.return_value.ask.side_effect = ["new.example.com", "3.19.0", "iblai", "iblai-cli-ops", "iblai-prod-images", "platform_admin", "admin@example.com"] config = prompt_resetup(state) @@ -710,7 +710,7 @@ def test_resetup_new_credentials(self, tmp_path): # Decline reusing credentials mock_confirm.return_value.ask.return_value = False # Region is pre-populated from state, so not prompted - mock_text.return_value.ask.side_effect = ["new.example.com", "3.19.0", "iblai", "iblai-cli-ops", "iblai-prod-images", "NEW_KEY", "ibl_admin", "admin@example.com"] + mock_text.return_value.ask.side_effect = ["new.example.com", "3.19.0", "iblai", "iblai-cli-ops", "iblai-prod-images", "NEW_KEY", "platform_admin", "admin@example.com"] config = prompt_resetup(state) @@ -741,7 +741,7 @@ def test_resetup_with_ingress_selection(self, tmp_path): mock_password.return_value.ask.side_effect = ["ghp_testtoken", "", "Admin1234"] mock_confirm.return_value.ask.return_value = True # text prompts: 
cli_ops_release_tag, admin_username, admin_email - mock_text.return_value.ask.side_effect = ["main", "3.19.0", "iblai", "iblai-cli-ops", "iblai-prod-images", "ibl_admin", "admin@example.com"] + mock_text.return_value.ask.side_effect = ["main", "3.19.0", "iblai", "iblai-cli-ops", "iblai-prod-images", "platform_admin", "admin@example.com"] config = prompt_resetup(state) @@ -771,7 +771,7 @@ def test_resetup_ingress_custom_fallback(self, tmp_path): # text prompts: custom domain, cli_ops_release_tag, # github_org, cli_ops_repo, prod_images_repo, # admin_username, admin_email - mock_text.return_value.ask.side_effect = ["custom.example.com", "3.19.0", "iblai", "iblai-cli-ops", "iblai-prod-images", "ibl_admin", "admin@example.com"] + mock_text.return_value.ask.side_effect = ["custom.example.com", "3.19.0", "iblai", "iblai-cli-ops", "iblai-prod-images", "platform_admin", "admin@example.com"] config = prompt_resetup(state) diff --git a/tests/terraform/test_runner.py b/tests/terraform/test_runner.py index 8260e8c..a562663 100644 --- a/tests/terraform/test_runner.py +++ b/tests/terraform/test_runner.py @@ -169,7 +169,7 @@ def test_basic_tfvars(self, infra_config, tmp_path): assert 'environment = "dev"' in tfvars assert 'region = "us-east-1"' in tfvars assert 'instance_type = "t3.2xlarge"' in tfvars - assert "root_volume_size = 50" in tfvars + assert "root_volume_size = 100" in tfvars assert 'base_domain = "example.com"' in tfvars assert "create_key_pair = true" in tfvars diff --git a/tests/test_env_setup.py b/tests/test_env_setup.py index 7f4af9e..898ac13 100644 --- a/tests/test_env_setup.py +++ b/tests/test_env_setup.py @@ -34,7 +34,7 @@ def _required_env(**overrides) -> dict[str, str]: "AWS_ACCESS_KEY_ID": "AKIAIOSFODNN7EXAMPLE", "AWS_SECRET_ACCESS_KEY": "wJalrXUtnFEMI/K7MDENG/bPxRfiCYEXAMPLEKEY", "GIT_TOKEN": "test-pat-value", - "ADMIN_USERNAME": "ibl_admin", + "ADMIN_USERNAME": "platform_admin", "ADMIN_EMAIL": "admin@example.com", "ADMIN_PASSWORD": "change-me-min-8-chars", 
} @@ -100,7 +100,7 @@ def test_minimal_env_produces_valid_config(self, project_state): assert config.target_host == "54.123.45.67" # from project_state fixture assert config.base_domain == "example.com" assert config.aws_default_region == "us-east-1" - assert config.admin_username == "ibl_admin" + assert config.admin_username == "platform_admin" def test_aws_default_region_derived_from_state(self, project_state): project_state.config.credentials.region = "eu-west-1" diff --git a/tests/test_models.py b/tests/test_models.py index 7325015..cb15427 100644 --- a/tests/test_models.py +++ b/tests/test_models.py @@ -161,10 +161,14 @@ def test_default_vpc_cidr(self): class TestComputeConfigValidation: def test_valid_volume_size(self): - cc = ComputeConfig(volume_size=100) - assert cc.volume_size == 100 + cc = ComputeConfig(volume_size=200) + assert cc.volume_size == 200 def test_minimum_volume_size(self): + # ComputeConfig itself only enforces 20 GB (the call-server placeholder + # path reuses this model with ~40 GB). The 100 GB IBL-platform floor + # is enforced on `InfraConfig` for `DeploymentType.SINGLE` — see + # TestSingleServerVolumeFloor below. 
cc = ComputeConfig(volume_size=20) assert cc.volume_size == 20 @@ -183,7 +187,7 @@ def test_negative_raises(self): def test_defaults(self): cc = ComputeConfig() assert cc.instance_type == "t3.2xlarge" - assert cc.volume_size == 50 + assert cc.volume_size == 100 assert cc.volume_type == "gp3" diff --git a/tests/test_runtime_iam.py b/tests/test_runtime_iam.py new file mode 100644 index 0000000..92cf9c6 --- /dev/null +++ b/tests/test_runtime_iam.py @@ -0,0 +1,123 @@ +"""Tests for iblai_infra.runtime_iam — IAM policy generator + post-provision output.""" + +from __future__ import annotations + +import json +from pathlib import Path + +import pytest + +from iblai_infra.models import DeploymentType +from iblai_infra.runtime_iam import ( + POLICY_FILENAME, + build_runtime_iam_policy, + extract_bucket_names, + render_runtime_access_instructions, +) + + +class TestBuildPolicy: + def test_single_bucket(self): + policy = build_runtime_iam_policy(["my-backups"]) + assert policy["Version"] == "2012-10-17" + sids = {s["Sid"] for s in policy["Statement"]} + # S3-only by design — ECR credentials are provided separately by IBL. + assert sids == {"PlatformBucketObjects", "PlatformBucketList"} + + def test_no_ecr_statements(self): + # IBL's image registry creds are an out-of-band handoff. The + # customer-created policy must not include ECR scope. + policy = build_runtime_iam_policy(["b"]) + for stmt in policy["Statement"]: + for action in stmt["Action"]: + assert not action.startswith("ecr:"), ( + f"runtime IAM policy must be S3-only; found {action!r}" + ) + + def test_three_buckets_arn_shape(self): + policy = build_runtime_iam_policy([ + "p-staging-backups", + "p-staging-dm-media", + "p-staging-dm-static", + ]) + objects_stmt = next(s for s in policy["Statement"] if s["Sid"] == "PlatformBucketObjects") + list_stmt = next(s for s in policy["Statement"] if s["Sid"] == "PlatformBucketList") + # Object-level resources get the /* suffix; bucket-level don't. 
+ assert objects_stmt["Resource"] == [ + "arn:aws:s3:::p-staging-backups/*", + "arn:aws:s3:::p-staging-dm-media/*", + "arn:aws:s3:::p-staging-dm-static/*", + ] + assert list_stmt["Resource"] == [ + "arn:aws:s3:::p-staging-backups", + "arn:aws:s3:::p-staging-dm-media", + "arn:aws:s3:::p-staging-dm-static", + ] + + def test_s3_actions_are_tight(self): + policy = build_runtime_iam_policy(["b"]) + obj_actions = next(s["Action"] for s in policy["Statement"] if s["Sid"] == "PlatformBucketObjects") + assert "s3:*" not in obj_actions + # Bucket policy / lifecycle / encryption mutations stay out. + for forbidden in ("s3:PutBucketPolicy", "s3:DeleteBucketPolicy", "s3:PutLifecycleConfiguration"): + assert forbidden not in obj_actions + + def test_empty_buckets_raises(self): + with pytest.raises(ValueError, match="at least one S3 bucket"): + build_runtime_iam_policy([]) + + def test_policy_is_json_serializable(self): + policy = build_runtime_iam_policy(["a", "b", "c"]) + # Round-trip — what we hand the operator must survive `aws iam put-user-policy`. + round_tripped = json.loads(json.dumps(policy)) + assert round_tripped == policy + + +class TestExtractBuckets: + def test_all_three_present(self): + outputs = { + "instance_public_ip": "1.2.3.4", + "s3_bucket_backups": "p-backups", + "s3_bucket_media": "p-media", + "s3_bucket_static": "p-static", + } + assert extract_bucket_names(outputs) == ["p-backups", "p-media", "p-static"] + + def test_partial_outputs(self): + outputs = {"s3_bucket_backups": "only-backups"} + assert extract_bucket_names(outputs) == ["only-backups"] + + def test_no_buckets(self): + assert extract_bucket_names({}) == [] + assert extract_bucket_names({"instance_public_ip": "1.2.3.4"}) == [] + + def test_empty_string_skipped(self): + # Terraform sometimes emits "" for an unset output rather than omitting. 
+ outputs = {"s3_bucket_backups": "", "s3_bucket_media": "p-m"} + assert extract_bucket_names(outputs) == ["p-m"] + + +class TestRenderInstructions: + def test_writes_policy_file(self, infra_config, tmp_path): + outputs = { + "s3_bucket_backups": "test-backups", + "s3_bucket_media": "test-media", + "s3_bucket_static": "test-static", + } + render_runtime_access_instructions(infra_config, outputs, tmp_path) + policy_path = tmp_path / POLICY_FILENAME + assert policy_path.exists() + loaded = json.loads(policy_path.read_text()) + # File contents match what build_runtime_iam_policy emits. + expected = build_runtime_iam_policy(["test-backups", "test-media", "test-static"]) + assert loaded == expected + + def test_call_server_skipped(self, infra_config, tmp_path): + infra_config.deployment_type = DeploymentType.CALL + outputs = {"s3_bucket_backups": "would-not-be-touched"} + render_runtime_access_instructions(infra_config, outputs, tmp_path) + assert not (tmp_path / POLICY_FILENAME).exists() + + def test_no_buckets_skips_write(self, infra_config, tmp_path): + render_runtime_access_instructions(infra_config, outputs={}, ws=tmp_path) + assert not (tmp_path / POLICY_FILENAME).exists()