From 2aa38d28542d7141e1eff3e97a1bee34c97eafd4 Mon Sep 17 00:00:00 2001 From: Charith Nuwan Bimsara <59943919+nuwangeek@users.noreply.github.com> Date: Thu, 18 Jun 2026 13:04:15 +0530 Subject: [PATCH] Fix for Vault configuration issue (#473) * fixed vault configuration issue * address Vault via unique rag-vault alias to avoid cross -stack DNS collision * fixed vector indexer statistics analysis issue --- DSL/CronManager/DSL/data_resync.yml | 2 +- DSL/CronManager/DSL/delete_from_vault.yml | 2 +- DSL/CronManager/DSL/store_in_vault.yml | 2 +- .../script/delete_secrets_from_vault.sh | 4 +- .../script/store_secrets_in_vault.sh | 4 +- docker-compose-ec2.yml | 12 +- docker-compose.yml | 14 +- docs/VAULT_SECURITY_ARCHITECTURE.md | 81 ++-- docs/VAULT_SETUP_AND_USAGE.md | 355 ++++++++++++++++++ src/vector_indexer/contextual_processor.py | 14 +- src/vector_indexer/error_logger.py | 4 +- src/vector_indexer/main_indexer.py | 80 ++-- src/vector_indexer/models.py | 8 + vault-init.sh | 175 +++++---- vault/agents/cron/cron-agent.hcl | 6 +- vault/agents/gui/gui-agent.hcl | 6 +- vault/agents/llm/agent.hcl | 6 +- vault/config/vault.hcl | 44 +-- 18 files changed, 624 insertions(+), 195 deletions(-) create mode 100644 docs/VAULT_SETUP_AND_USAGE.md diff --git a/DSL/CronManager/DSL/data_resync.yml b/DSL/CronManager/DSL/data_resync.yml index b5994d1e..a232ba39 100644 --- a/DSL/CronManager/DSL/data_resync.yml +++ b/DSL/CronManager/DSL/data_resync.yml @@ -2,4 +2,4 @@ agency_data_resync: trigger: "0 0 0/1 * * ?" 
# trigger: off type: exec - command: "../app/scripts/agency_data_resync.sh -s 10" \ No newline at end of file + command: "/app/scripts/agency_data_resync.sh -s 10" \ No newline at end of file diff --git a/DSL/CronManager/DSL/delete_from_vault.yml b/DSL/CronManager/DSL/delete_from_vault.yml index d7f06cea..cde1df27 100644 --- a/DSL/CronManager/DSL/delete_from_vault.yml +++ b/DSL/CronManager/DSL/delete_from_vault.yml @@ -2,4 +2,4 @@ delete_secrets: trigger: off type: exec command: "/app/scripts/delete_secrets_from_vault.sh" - allowedEnvs: ['cookie','vaultUuid','llmPlatform', 'llmModel','embeddingModel','embeddingPlatform'] + allowedEnvs: ['cookie','vaultUuid','llmPlatform', 'llmModel','embeddingModel','embeddingPlatform', 'vaultAgentUrl'] diff --git a/DSL/CronManager/DSL/store_in_vault.yml b/DSL/CronManager/DSL/store_in_vault.yml index fa1a6ac1..46f861e6 100644 --- a/DSL/CronManager/DSL/store_in_vault.yml +++ b/DSL/CronManager/DSL/store_in_vault.yml @@ -2,4 +2,4 @@ store_secrets: trigger: off type: exec command: "/app/scripts/store_secrets_in_vault.sh" - allowedEnvs: ['cookie','vaultUuid','llmPlatform', 'llmModel','secretKey','accessKey','deploymentName','targetUrl','apiKey','embeddingModel','embeddingPlatform','embeddingAccessKey','embeddingSecretKey','embeddingDeploymentName','embeddingTargetUri','embeddingAzureApiKey','deploymentEnvironment'] \ No newline at end of file + allowedEnvs: ['cookie','vaultUuid','llmPlatform', 'llmModel','secretKey','accessKey','deploymentName','targetUrl','apiKey','embeddingModel','embeddingPlatform','embeddingAccessKey','embeddingSecretKey','embeddingDeploymentName','embeddingTargetUri','embeddingAzureApiKey','deploymentEnvironment', 'vaultAgentUrl'] \ No newline at end of file diff --git a/DSL/CronManager/script/delete_secrets_from_vault.sh b/DSL/CronManager/script/delete_secrets_from_vault.sh index a6423566..3b405927 100644 --- a/DSL/CronManager/script/delete_secrets_from_vault.sh +++ 
b/DSL/CronManager/script/delete_secrets_from_vault.sh @@ -6,9 +6,9 @@ set -e # Exit on any error # Configuration -# Use VAULT_AGENT_URL which points to vault-agent-cron proxy +# Use vaultAgentUrl which points to vault-agent-cron proxy # The agent automatically injects the authentication token -VAULT_ADDR="${VAULT_AGENT_URL:-http://vault-agent-cron:8203}" +VAULT_ADDR="${vaultAgentUrl:-http://vault-agent-cron:8203}" # Logging function log() { diff --git a/DSL/CronManager/script/store_secrets_in_vault.sh b/DSL/CronManager/script/store_secrets_in_vault.sh index 8f4056f8..60784eed 100644 --- a/DSL/CronManager/script/store_secrets_in_vault.sh +++ b/DSL/CronManager/script/store_secrets_in_vault.sh @@ -6,9 +6,9 @@ set -e # Exit on any error # Configuration -# Use VAULT_AGENT_URL which points to vault-agent-cron proxy +# Use vaultAgentUrl which points to vault-agent-cron proxy # The agent automatically injects the authentication token -VAULT_ADDR="${VAULT_AGENT_URL:-http://vault-agent-cron:8203}" +VAULT_ADDR="${vaultAgentUrl:-http://vault-agent-cron:8203}" # Decryption Configuration PRIVATE_KEY_CACHE="" diff --git a/docker-compose-ec2.yml b/docker-compose-ec2.yml index f1ab5f54..e6152dbe 100644 --- a/docker-compose-ec2.yml +++ b/docker-compose-ec2.yml @@ -503,7 +503,11 @@ services: - ./vault/config:/vault/config:ro - ./vault/logs:/vault/logs networks: - - vault-network # Only on vault-network for security + vault-network: # Only on vault-network for security + # Local testing: bare "vault" collides with the ckb stack on the shared + # bykstack network, so expose this Vault under a unique alias instead. 
+ aliases: + - rag-vault restart: unless-stopped healthcheck: test: ["CMD", "sh", "-c", "wget -q -O- http://127.0.0.1:8200/v1/sys/health || exit 0"] @@ -520,7 +524,7 @@ services: vault: condition: service_healthy environment: - VAULT_ADDR: http://vault:8200 + VAULT_ADDR: http://rag-vault:8200 volumes: - vault-data:/vault/data - vault-agent-creds:/agent/credentials @@ -529,8 +533,8 @@ services: - vault-agent-llm-token:/agent/llm-token - ./vault-init.sh:/vault-init.sh:ro networks: - - vault-network # Access vault - - bykstack # Access to write agent tokens + # vault-network only: tokens/creds go via shared volumes, not the network. + - vault-network entrypoint: ["/bin/sh"] command: - -c diff --git a/docker-compose.yml b/docker-compose.yml index 5e9c2962..0befd6d7 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -193,7 +193,7 @@ services: environment: - server.port=9010 - PYTHONPATH=/app:/app/src/vector_indexer:/app/src/intent_data_enrichment:/app/src/api_tool_indexer - - VAULT_AGENT_URL=http://vault-agent-cron:8203 + - vaultAgentUrl=http://vault-agent-cron:8203 ports: - 9010:8080 depends_on: @@ -451,7 +451,11 @@ services: - ./vault/config:/vault/config:ro - ./vault/logs:/vault/logs networks: - - vault-network # Only on vault-network for security + vault-network: # Only on vault-network for security + # Local testing: bare "vault" collides with the ckb stack on the shared + # bykstack network, so expose this Vault under a unique alias instead. 
+ aliases: + - rag-vault restart: unless-stopped healthcheck: test: ["CMD", "sh", "-c", "wget -q -O- http://127.0.0.1:8200/v1/sys/health || exit 0"] @@ -468,7 +472,7 @@ services: vault: condition: service_healthy environment: - VAULT_ADDR: http://vault:8200 + VAULT_ADDR: http://rag-vault:8200 volumes: - vault-data:/vault/data - vault-agent-creds:/agent/credentials @@ -477,8 +481,8 @@ services: - vault-agent-llm-token:/agent/llm-token - ./vault-init.sh:/vault-init.sh:ro networks: - - vault-network # Access vault - - bykstack # Access to write agent tokens + # vault-network only: tokens/creds go via shared volumes, not the network. + - vault-network entrypoint: ["/bin/sh"] command: - -c diff --git a/docs/VAULT_SECURITY_ARCHITECTURE.md b/docs/VAULT_SECURITY_ARCHITECTURE.md index fe6fd741..2c2c6836 100644 --- a/docs/VAULT_SECURITY_ARCHITECTURE.md +++ b/docs/VAULT_SECURITY_ARCHITECTURE.md @@ -197,9 +197,12 @@ Day 0+: Automatic Token Renewal: Container Restart: vault-init: Check if Vault is sealed ↓ - If unsealed: Regenerate secret_id only + If unsealed: Validate existing secret_ids ↓ - vault-agent: Re-authenticate with new secret_id + If valid: Reuse existing secret_id (no churn) + If invalid: Mint new secret_id and write to disk + ↓ + vault-agent: Re-authenticate with secret_id ↓ New token issued and cached ``` @@ -413,8 +416,9 @@ Connected Services: - GUI (React Frontend) Token Lifecycle: - - Default Lease: 768h (32 days) - - Auto-renewal: Before expiration + - Token type: periodic (token_period 20m, no max-TTL) + - Auto-renewal: Every ~13 minutes (~2/3 of period) + - Re-auth: only on agent restart (never in steady state) ``` #### Agent 2: vault-agent-cron @@ -429,8 +433,9 @@ Connected Services: - CronManager (Python worker) Token Lifecycle: - - Default Lease: 768h (32 days) - - Auto-renewal: Before expiration + - Token type: periodic (token_period 30m, no max-TTL) + - Auto-renewal: Every ~20 minutes (~2/3 of period) + - Re-auth: only on agent restart (never in steady 
state) ``` #### Agent 3: vault-agent-llm @@ -445,8 +450,9 @@ Connected Services: - LLM Orchestration Service (FastAPI) Token Lifecycle: - - Default Lease: 1h (shorter for higher security) - - Auto-renewal: Every ~45 minutes + - Token type: periodic (token_period 1h, no max-TTL) + - Auto-renewal: Every ~40 minutes (~2/3 of period) + - Re-auth: only on agent restart (never in steady state) ``` ### Token Caching and Auto-Renewal @@ -464,29 +470,31 @@ T=0: Initial Authentication ├─► POST /v1/auth/approle/login │ Body: { role_id, secret_id } │ - └─► Receives: { token, ttl: 3600s, renewable: true } + └─► Receives: { token, period: 3600s, renewable: true } ← periodic token, no max-TTL │ └─► Cache token in: /agent/llm-token/token -T=45min: Proactive Renewal (75% of TTL) +T≈40min: Proactive Renewal (~2/3 of period) vault-agent monitors expiration │ ├─► POST /v1/auth/token/renew-self │ Header: X-Vault-Token: │ - └─► Receives: { token, ttl: 3600s } (same token, extended) + └─► Receives: { token, period: 3600s } (same token, period reset) │ └─► Update cache: /agent/llm-token/token + │ + └─► Repeats forever — a periodic token never hits a max-TTL, + so steady-state operation never needs approle/login again. 
-T=59min: Renewal Failed (fallback) - If renewal fails: +On agent restart only: + vault-agent re-reads role_id + secret_id from disk │ - ├─► Re-authenticate from scratch - │ POST /v1/auth/approle/login + ├─► POST /v1/auth/approle/login (secret_id must still be valid) │ - └─► New token issued and cached + └─► New periodic token issued and cached Application Request (anytime): @@ -856,15 +864,16 @@ Step 12: Check Vault Seal Status └─► GET /v1/sys/seal-status └─► If unsealed: Skip unseal steps -Step 13: Regenerate Secret IDs Only - └─► POST /v1/auth/approle/role/gui-service/secret-id - └─► POST /v1/auth/approle/role/cron-manager-service/secret-id - └─► POST /v1/auth/approle/role/llm-orchestration-service/secret-id - └─► Write new secret_ids to /agent/credentials/ +Step 13: Validate and Reconcile Secret IDs + └─► For each role (gui, cron-manager, llm-orchestration): + ├─► Test existing on-disk secret_id via AppRole login + ├─► If valid: Reuse (no change to credential file) + └─► If invalid/missing: Mint new secret_id and write to disk Note: role_ids remain unchanged (static identifiers) Note: Existing secrets and policies preserved Note: RSA keypair NOT regenerated (preserved) +Note: Stable secret_ids across restarts reduce credential churn ═══════════════════════════════════════════════════════════════════ COMPLETION @@ -1128,13 +1137,14 @@ Startup Order: vault-init Behavior: - Detects Vault already initialized - Skips initialization steps - - Regenerates secret_ids only - - Updates credential files + - Validates existing secret_ids (reuses if still valid) + - Mints new secret_ids only if existing ones are invalid Result: - All services start with fresh credentials + All services start with validated credentials Existing secrets preserved No manual intervention needed + Stable secret_ids reduce unnecessary credential churn ``` ### Token Regeneration Strategy @@ -1143,22 +1153,23 @@ Result: Current Implementation: 1. 
On Every Container Restart: - └─► vault-init regenerates secret_ids - └─► Vault agents get new tokens - └─► Old tokens remain valid until expiration + └─► vault-init validates existing secret_ids + ├─► If valid: Reuse (agents continue with same credentials) + └─► If invalid: Mint new secret_id, agents re-authenticate 2. Token Lifecycle: - └─► Issue: vault-agent authenticates + └─► Issue: vault-agent authenticates (periodic token, token_period per role) └─► Use: Application makes requests - └─► Renew: vault-agent extends TTL - └─► Expire: Automatic renewal failed - └─► Re-issue: vault-agent re-authenticates + └─► Renew: vault-agent renews within the period (~2/3 of period) + └─► No max-TTL: renewal continues indefinitely + └─► Re-issue: only on agent restart, via secret_id login 3. Security Benefits: - Short-lived tokens (1 hour for LLM, 32 days for others) - Automatic rotation on agent restart - No manual token management - Compromised tokens have limited lifetime + Periodic tokens (period 1h LLM, 30m Cron, 20m GUI), renewed continuously + Steady-state operation never re-runs approle/login (a stale secret_id + cannot strand a running agent) + Stable secret_ids (no unnecessary churn on restart) + Compromised tokens limited to one un-renewed period ``` ### Audit Logging Capabilities diff --git a/docs/VAULT_SETUP_AND_USAGE.md b/docs/VAULT_SETUP_AND_USAGE.md new file mode 100644 index 00000000..e61d362b --- /dev/null +++ b/docs/VAULT_SETUP_AND_USAGE.md @@ -0,0 +1,355 @@ +# Vault Setup & Usage Guide + +A single reference for how HashiCorp Vault is deployed, initialized, and consumed in the +RAG-Module. It covers the topology, the three Vault Agents, the secret layout, and — in +depth — **how each agent renews its token and how secrets are rotated**. 
+ +Source files this document describes: + +- `docker-compose.yml` — service/topology definition +- `vault/config/vault.hcl` — Vault server config +- `vault-init.sh` — one-time bootstrap + per-restart reconcile +- `vault/agents/{gui,cron,llm}/*.hcl` — the three Vault Agent configs +- `DSL/CronManager/script/store_secrets_in_vault.sh` — writes/rotates secrets +- `DSL/CronManager/script/delete_secrets_from_vault.sh` — deletes secrets + +For the security rationale (threat model, defense-in-depth, access matrix) see the +companion `docs/VAULT_SECURITY_ARCHITECTURE.md`. This guide focuses on the *operational* +mechanics. + +--- + +## 1. Topology at a glance + +``` + bykstack (application network) vault-network (internal: true) + ┌───────────────────────────────────────────────┐ ┌──────────────────────────────┐ + │ gui ──────────────► vault-agent-gui :8202 ───┼────────┤ │ + │ cron-manager ─────► vault-agent-cron :8203 ───┼────────┤ vault :8200 │ + │ llm-orchestration ► vault-agent-llm :8201 ───┼────────┤ (Raft storage, KV v2, │ + │ │ │ AppRole auth) │ + │ vault-init (vault-network only) ──────────────┼────────┤ │ + └───────────────────────────────────────────────┘ └──────────────────────────────┘ +``` + +- **`vault`** runs only on `vault-network`, which is `internal: true` — it has **no route to + or from the host or the internet**. Port 8200 is never published. +- **Vault Agents** straddle both networks: they reach `vault` on `vault-network` and are + reachable by their owning application on `bykstack`. +- **Applications** talk *only* to their agent (`VAULT_ADDR=http://vault-agent-*:820x`) and + never hold a Vault token themselves. The agent injects the token transparently. 
+ +| Service | Agent it uses | Agent address | AppRole | Policy | +|---|---|---|---|---| +| `gui` | `vault-agent-gui` | `:8202` | `gui-service` | `gui-policy` | +| `cron-manager` | `vault-agent-cron` | `:8203` | `cron-manager-service` | `cron-manager-policy` | +| `llm-orchestration-service` | `vault-agent-llm` | `:8201` | `llm-orchestration-service` | `llm-orchestration-policy` | + +--- + +## 2. Vault server (`vault/config/vault.hcl`) + +- **Storage:** Raft, single node (`node_id = vault-node-1`, path `/vault/file`, persisted in + the `vault-data` volume). No `retry_join` — a lone node self-bootstraps; adding a self- + pointing join was found to cause "Vault is sealed" boot loops. +- **Listener:** `0.0.0.0:8200`, `tls_disable = true` (TLS is terminated at the network + boundary; the network itself is the isolation layer here). Port `8201` is *not* given its + own listener because Vault uses it as the internal cluster port automatically. +- **Lease defaults:** `default_lease_ttl = 168h` (7 days), `max_lease_ttl = 720h` (30 days). + These are *system ceilings*; the per-AppRole token TTLs (below) are much shorter and are + what actually governs agent renewal cadence. +- `disable_mlock = false`, `ui = false`, JSON logs at INFO. + +Vault boots **sealed**. It must be unsealed before any operation — that is `vault-init`'s +first job. + +--- + +## 3. Bootstrap & reconcile (`vault-init.sh`) + +`vault-init` is a **run-once-then-exit** container (`restart: "no"`). The agents declare +`depends_on: vault-init: condition: service_completed_successfully`, so they only start +after init has finished cleanly. It runs `su vault -s /bin/sh /vault-init.sh` after creating +and `chown`ing the shared agent directories. + +The script has two branches, selected by the presence of `/vault/data/.initialized`. + +### 3.1 First-time deployment + +1. Wait for `/v1/sys/health` to respond. +2. **Initialize** with Shamir's Secret Sharing: `secret_shares=5`, `secret_threshold=3`. 
+ The full response (5 unseal keys + root token) is written to + `/vault/data/unseal-keys.json`. +3. **Unseal** by submitting 3 of the 5 keys. +4. **Enable engines:** KV v2 at `secret/`, and the AppRole auth method. +5. **Create three ACL policies** (see §5). +6. **Create three AppRoles** issuing periodic tokens (see §4 — this is the heart of renewal), + via the `ensure_approles` helper. The same helper re-runs on subsequent deploys, so AppRole + config changes land without re-initializing Vault. +7. **Issue credentials:** for each role, fetch the static `role_id` and mint a `secret_id`, + writing both to `/agent/credentials/{agent}_role_id` and `{agent}_secret_id` (`chmod 640`). +8. **Generate an RSA-2048 keypair** with `openssl` and store it in Vault at + `secret/encryption/public_key` and `secret/encryption/private_key` + (algorithm `RSA-OAEP`, with `key_id` and `created_at` metadata). +9. Seed a test LLM secret, then `touch /vault/data/.initialized`. + +### 3.2 Subsequent deployment (restart) + +1. Check `/v1/sys/seal-status`; if sealed, reload the 3 unseal keys from + `unseal-keys.json` and unseal. +2. **Reconcile each secret_id** via `reconcile_secret_id`: + - `ensure_role_id` — make sure the `role_id` file exists (re-fetch from Vault if missing). + - `validate_secret_id` — attempt an AppRole login with the on-disk `role_id` + `secret_id`. + If it returns a `client_token`, the credential is still good. + - **Valid → reuse** the existing `secret_id` (no churn). + - **Invalid/missing → `mint_secret_id`** writes a fresh one. + +This is deliberate: because the AppRoles are created with `secret_id_ttl=0` and +`secret_id_num_uses=0` (non-expiring, unlimited-use), a single long-lived `secret_id` +survives normal restarts instead of being regenerated every boot. The RSA keypair, policies, +and stored secrets are all preserved across restarts. + +> **Note on file permissions:** `vault-init.sh` writes credential files with `chmod 640`. 
+> (The older architecture doc mentions `644`; the script is the source of truth — `640`.) + +--- + +## 4. The three Vault Agents — auth, renewal & rotation + +This is the core of the question. All three agents are the same Vault binary +(`hashicorp/vault:1.20.3`) run as `vault agent -config=...`. They differ only in which +credentials they read, which token sink they write, and their listener port. + +### 4.1 What an agent config actually does + +Example (`vault/agents/llm/agent.hcl`; gui/cron are identical in shape): + +```hcl +vault { address = "http://vault:8200"; retry { num_retries = 5 } } + +auto_auth { + method "approle" { + mount_path = "auth/approle" + config = { + role_id_file_path = "/agent/credentials/llm_role_id" + secret_id_file_path = "/agent/credentials/llm_secret_id" + remove_secret_id_file_after_reading = false + } + } + sink "file" { config = { path = "/agent/llm-token/token"; mode = 0640 } } +} + +cache { default_lease_duration = "1h" } +listener "tcp" { address = "0.0.0.0:8201"; tls_disable = true } +api_proxy { use_auto_auth_token = true } +``` + +Three mechanisms are at work: + +1. **`auto_auth` (authentication + renewal):** On startup the agent reads `role_id` + + `secret_id` and calls `POST /v1/auth/approle/login`. Vault returns a **periodic token** + (the AppRoles set `token_period`, defined in `vault-init.sh`, *not* in the HCL). The agent + then runs Vault's **auto-auth lifecycle manager**, which **renews the token automatically + in the background** before each period elapses. A periodic token has **no max-TTL**, so the + agent renews it indefinitely and — during normal operation — **never has to call + `approle/login` again**. The agent only re-authenticates (and thus only needs the + `secret_id` again) if it is **restarted** or if a renewal is missed long enough for the + token to lapse. 
`remove_secret_id_file_after_reading = false` keeps the `secret_id` on disk + so the agent can re-auth after a restart without `vault-init` re-minting. + + > **Why periodic tokens?** An earlier design issued tokens with `token_ttl`/`token_max_ttl`, + > which forced a full re-login every time `token_max_ttl` was reached. If the `secret_id` + > had become invalid by then (expiry, clock skew, server re-init), the agent got stuck in an + > `invalid role or secret ID` 400 backoff loop with no way to self-heal. Periodic tokens + > remove that re-login from the steady state, so a stale `secret_id` can no longer strand a + > running agent. +2. **`sink "file"` (token hand-off):** Every time the agent obtains/renews a token it writes + it to a file (`/agent/{agent}-token/token`, mode `0640`). The compose **health check** for + each agent is simply `test -f {token_file} && test -s {token_file}` — a non-empty token file means + the agent has authenticated successfully. +3. **`api_proxy { use_auto_auth_token = true }` (transparent injection):** The agent also + listens as an HTTP proxy on its port. When the application sends a token-less request, the + agent injects `X-Vault-Token: {token}` and forwards it to `vault:8200`. + This is why application code never sets `VAULT_TOKEN`. + +> **`cache.default_lease_duration` is not the token TTL.** It is the agent's cache lease +> hint. The authoritative token lifetime comes from the AppRole's `token_period` in +> `vault-init.sh`. The per-agent cache hint is set to match the period. + +### 4.2 Per-agent renewal parameters + +AppRole token settings are created in `vault-init.sh`; all three use +`token_period` (periodic token, **no max-TTL**), `secret_id_ttl=0`, `secret_id_num_uses=0`, +`token_num_uses=0`, `bind_secret_id=true`. 
+ +| Agent | AppRole | `token_period` | Proactive renewal (~⅔ of period) | Re-login (`approle/login`) | +|---|---|---|---|---| +| `vault-agent-gui` | `gui-service` | **20m** | ~every 13 min | only on agent restart | +| `vault-agent-cron` | `cron-manager-service` | **30m** | ~every 20 min | only on agent restart | +| `vault-agent-llm` | `llm-orchestration-service` | **1h** | ~every 40 min | only on agent restart | + +Reading the lifecycle for, e.g., the LLM agent: + +``` +T=0 login → periodic token (period 1h) → written to /agent/llm-token/token +T≈40m renew-self → period resets to 1h → token file refreshed +... renew repeats forever; token never hits a max-TTL +(restart) agent re-runs approle/login with the on-disk secret_id → fresh token +``` + +The periods are tuned per service (shorter for the GUI, which only reads the public key; +longer for the high-traffic LLM read path), but functionally all three behave the same: +**renew forever, re-login only on restart.** + +### 4.3 Two distinct "rotation" concepts — keep them separate + +1. **Token rotation (automatic, continuous):** Handled entirely by the agent's `auto_auth` + loop as described above — the periodic token is renewed indefinitely with no human action + and no `vault-init` involvement. +2. **`secret_id` rotation (rare):** The `secret_id` is the long-lived credential the agent + uses to *log in* (at startup/restart only, now that tokens are periodic). It is configured + non-expiring (`secret_id_ttl=0`, `secret_id_num_uses=0`) and is only replaced by + `vault-init` on a restart when the existing one fails validation (§3.2). To force rotation, + delete the `secret_id` file (or invalidate it in Vault) and re-run `vault-init`, then + restart the agent so it logs in with the freshly minted one. 
+ + > **Operational caveat (learned the hard way):** if a `secret_id` ever does become invalid + > while an agent is running, the periodic-token design means a *running* agent keeps working + > (it only renews, never re-logs-in). But a **restarted** agent needs a valid `secret_id` to + > log in. Recovery is always: re-run `vault-init` (mints a fresh `secret_id` via the §3.2 + > reconcile) → restart the affected agent. See `docs/` runbook / the troubleshooting note + > below. + +### 4.4 Restart behavior + +- **Restart an agent:** It re-reads `role_id`/`secret_id` from the (read-only) creds volume + and re-authenticates. New token, written to the sink. App sees a brief blip. +- **Restart `vault`:** Data persists; `vault-init` (or the existing agent tokens, if still + valid) handle re-unseal/re-auth. Existing tokens remain valid if not expired. +- **Full `down && up`:** Order is `vault → vault-init → agents → apps`. `vault-init` detects + the `.initialized` flag, skips first-time setup, reconciles secret_ids, and the agents + start with validated credentials. + +--- + +## 5. Authorization — policies (who can touch what) + +Created in `vault-init.sh`. Paths are KV v2, so data lives under `secret/data/...` and +listing/metadata under `secret/metadata/...`. + +| Path | `gui-policy` | `cron-manager-policy` | `llm-orchestration-policy` | +|---|---|---|---| +| `secret/data/encryption/public_key` | **read** | read | — | +| `secret/data/encryption/private_key` | **deny** | **read** | — | +| `secret/data/encryption/*` | — | — | **deny** | +| `secret/data/llm/connections/*` | deny | **create/read/update/delete** | **read, list** | +| `secret/data/embeddings/connections/*` | deny | **create/read/update/delete** | **read, list** | +| `auth/token/lookup-self` | — | read | read | + +The intent, by tier: + +- **GUI** — can read *only* the public key, to encrypt user-entered credentials in the + browser before they ever leave it. Everything else is explicitly denied. 
+- **CronManager** — the only writer. Reads the **private key** to decrypt what the GUI + encrypted, then writes plaintext credentials into Vault. Full CRUD on connection secrets. +- **LLM Orchestration** — read-only consumer of connection secrets. **Explicitly denied** all + encryption keys, so a compromise of this hot-path service cannot exfiltrate the private key. + +--- + +## 6. Secret layout (KV v2 under `secret/`) + +``` +secret/ +├── llm/connections/{platform}/{vaultUuid} ← e.g. aws_bedrock, azure_openai +├── embeddings/connections/{platform}/{vaultUuid} +└── encryption/ + ├── public_key { key, algorithm: RSA-OAEP, key_size: 2048, key_id, created_at } + └── private_key { key, algorithm: RSA-OAEP, key_size: 2048, key_id, created_at } +``` + +The current write/delete scripts key connection secrets by a stable **`vaultUuid`** as the +final path segment (environment is tracked in the DB, not the path). KV v2 versions every +write, so updating a credential keeps prior versions for audit/rollback. + +LLM secret shape (AWS): `{ connection_id, access_key, secret_key, model, tags }`. +Azure: `{ connection_id, endpoint, api_key, deployment_name, model, api_version, tags }`. + +--- + +## 7. Usage flows + +### 7.1 Storing / rotating a credential (`store_secrets_in_vault.sh`, via cron-manager) + +1. GUI encrypts the raw key with the RSA **public** key and submits it. +2. The cron-manager job runs the script against `vault-agent-cron:8203` (no token — the agent + injects it). +3. The script **fetches the private key** (`GET secret/data/encryption/private_key`), then + decrypts each sensitive field in-memory via `decrypt_vault_secrets.py` (RSA-OAEP). +4. It builds the JSON payload with `jq` and `POST`s plaintext to + `secret/data/{llm|embeddings}/connections/{platform}/{vaultUuid}`. Re-posting the same path + = a KV v2 version bump = credential rotation. +5. Sensitive shell variables are `unset` immediately after use. 
+ +### 7.2 Deleting a credential (`delete_secrets_from_vault.sh`) + +`DELETE`s both `secret/data/...` and `secret/metadata/...` for the connection (404 treated as +success), again through `vault-agent-cron` with no explicit token. + +### 7.3 Reading a credential (LLM orchestration) + +The LLM service issues a token-less `GET http://vault-agent-llm:8201/v1/secret/data/llm/...`. +`vault-agent-llm` injects its cached token, Vault validates it against +`llm-orchestration-policy`, and returns the secret. The service then calls AWS/Azure with it. + +--- + +## 8. Operational notes & known trade-offs + +- **Unseal keys + root token sit in the `vault-data` volume** (`unseal-keys.json`). This makes + auto-unseal on restart trivial but is a **dev/test convenience**. For production, switch to + auto-unseal backed by a cloud KMS/HSM and remove the keys from the volume. +- **Root token** is used only by `vault-init` and is never injected into app containers. Best + practice for production is to revoke it after bootstrap and use scoped admin policies. +- **TLS is disabled** on the Vault listener and agent listeners; isolation relies on the + `internal: true` `vault-network`. Add TLS for any non-local deployment. +- **Audit logging is available but not enabled.** Turn it on with + `vault audit enable file file_path=/vault/logs/audit.log` (the `./vault/logs` mount already + exists) for a full request trail. +- **Credential files are readable by every agent on the shared volume** (mode 640, single owner, + but all agents mount the same `vault-agent-creds` volume read-only) — isolation is at the + volume level, not per-file. Fine for this trust boundary; note it if the threat model + tightens. + +--- + +## 9. Troubleshooting: agents looping on `invalid role or secret ID` + +**Symptom:** an agent logs `lifetime watcher done channel triggered, re-authenticating` +followed by repeating `PUT .../auth/approle/login → Code: 400 ... invalid role or secret ID` +with growing backoff. 
Token *renewals* had been succeeding up to that point. + +**Cause:** the agent's `secret_id` became invalid server-side (expiry, clock skew, or a Vault +re-init), and the agent reached a point where it had to do a full `approle/login`. With the +old `token_ttl`/`token_max_ttl` design this happened on every `token_max_ttl` cycle; the +switch to **periodic tokens** (§4) removes re-login from steady state, so a *running* agent no +longer hits this — but a **restarted** agent still needs a valid `secret_id`. + +**Recovery:** + +```bash +# Mint fresh secret_ids (vault-init's reconcile detects the invalid ones and replaces them) +docker compose up -d --force-recreate vault-init +docker wait vault-init +# Restart the affected agents so they log in with the fresh secret_id +docker compose restart vault-agent-gui vault-agent-cron vault-agent-llm +``` + +**Confirm root cause (read-only):** + +```bash +ROOT=$(docker exec vault sh -c "grep -o '\"root_token\":\"[^\"]*\"' /vault/file/unseal-keys.json | cut -d: -f2 | tr -d '\"'") +docker exec -e VAULT_TOKEN=$ROOT -e VAULT_ADDR=http://127.0.0.1:8200 vault \ + vault read auth/approle/role/gui-service # expect token_period set, secret_id_ttl=0 +echo "host: $(date -u)"; docker exec vault date -u # check for WSL2/Docker clock drift +``` diff --git a/src/vector_indexer/contextual_processor.py b/src/vector_indexer/contextual_processor.py index b225cf30..6b21d326 100644 --- a/src/vector_indexer/contextual_processor.py +++ b/src/vector_indexer/contextual_processor.py @@ -41,7 +41,7 @@ def __init__( async def process_document( self, document: ProcessingDocument - ) -> List[ContextualChunk]: + ) -> tuple[List[ContextualChunk], int]: """ Process single document into contextual chunks. 
@@ -49,7 +49,8 @@ async def process_document( document: Document to process Returns: - List of contextual chunks with embeddings + Tuple of (contextual chunks with embeddings, number of chunks + dropped due to context-generation failure) """ logger.info( f"Processing document {document.document_hash} ({len(document.content)} characters)" @@ -69,11 +70,13 @@ async def process_document( # Step 3: Create contextual chunks (filter out failed context generations) contextual_chunks: List[ContextualChunk] = [] valid_contextual_contents: List[str] = [] + failed_chunks = 0 for i, (base_chunk, context) in enumerate( zip(base_chunks, contexts, strict=True) ): if isinstance(context, Exception): + failed_chunks += 1 self.error_logger.log_context_generation_failure( document.document_hash, i, str(context), self.config.max_retries ) @@ -128,7 +131,7 @@ async def process_document( logger.error( f"No valid chunks created for document {document.document_hash}" ) - return [] + return [], failed_chunks # Step 4: Create embeddings for all valid contextual chunks try: @@ -154,9 +157,10 @@ async def process_document( raise logger.info( - f"Successfully processed document {document.document_hash}: {len(contextual_chunks)} chunks" + f"Successfully processed document {document.document_hash}: " + f"{len(contextual_chunks)} chunks ({failed_chunks} dropped)" ) - return contextual_chunks + return contextual_chunks, failed_chunks except Exception as e: logger.error( diff --git a/src/vector_indexer/error_logger.py b/src/vector_indexer/error_logger.py index 1d11cba1..c62de79c 100644 --- a/src/vector_indexer/error_logger.py +++ b/src/vector_indexer/error_logger.py @@ -158,15 +158,17 @@ def log_processing_stats(self, stats: ProcessingStats) -> None: stats_dict["end_time"] = stats.end_time.isoformat() stats_dict["duration"] = stats.duration stats_dict["success_rate"] = stats.success_rate + stats_dict["chunk_success_rate"] = stats.chunk_success_rate with open(self.config.stats_log_file, "w", 
encoding="utf-8") as f: json.dump(stats_dict, f, indent=2) logger.info( f"Processing completed - Success rate: {stats.success_rate:.1%}, " + f"Chunk success rate: {stats.chunk_success_rate:.1%}, " f"Duration: {stats.duration}, " f"Processed: {stats.documents_processed}/{stats.total_documents} documents, " - f"Chunks: {stats.total_chunks_processed}" + f"Chunks: {stats.total_chunks_processed} ok / {stats.total_chunks_failed} failed" ) except Exception as e: logger.error(f"Failed to write stats log: {e}") diff --git a/src/vector_indexer/main_indexer.py b/src/vector_indexer/main_indexer.py index 45ce5ff6..bf407682 100644 --- a/src/vector_indexer/main_indexer.py +++ b/src/vector_indexer/main_indexer.py @@ -15,7 +15,7 @@ sys.path.append(str(Path(__file__).parent.parent)) from vector_indexer.config.config_loader import ConfigLoader -from vector_indexer.document_loader import DocumentLoader +from vector_indexer.document_loader import DocumentLoader, DocumentLoadError from vector_indexer.contextual_processor import ContextualProcessor from vector_indexer.qdrant_manager import QdrantManager from vector_indexer.error_logger import ErrorLogger @@ -169,7 +169,7 @@ async def process_all_documents(self) -> ProcessingStats: # Process documents with controlled concurrency semaphore = asyncio.Semaphore(self.config.max_concurrent_documents) - tasks: List[asyncio.Task[tuple[int, str]]] = [] + tasks: List[asyncio.Task[tuple[int, str, int]]] = [] for doc_info in documents: task = asyncio.create_task( @@ -189,6 +189,9 @@ async def process_all_documents(self) -> ProcessingStats: chunks_info: Dict[ str, Dict[str, Any] ] = {} # Track chunk counts for metadata update + # Only documents that processed successfully are marked as + # processed in DVC tracking, so failures are retried next run. 
+ processed_documents: List[DocumentInfo] = [] for i, result in enumerate(results): if isinstance(result, Exception): doc_info = documents[i] @@ -200,16 +203,18 @@ async def process_all_documents(self) -> ProcessingStats: doc_info.document_hash, str(result) ) else: - # Result should be tuple of (chunk_count, content_hash) + # Result should be tuple of (chunk_count, content_hash, failed_chunks) doc_info = documents[i] self.stats.documents_processed += 1 - if isinstance(result, tuple) and len(result) == 2: - chunk_count, content_hash = result + processed_documents.append(doc_info) + if isinstance(result, tuple) and len(result) == 3: + chunk_count, content_hash, failed_chunks = result self.stats.total_chunks_processed += chunk_count + self.stats.total_chunks_failed += failed_chunks # Track chunk count using content_hash (not directory hash) chunks_info[content_hash] = {"chunk_count": chunk_count} logger.info( - f"CHUNK COUNT: Document {doc_info.document_hash[:12]}... (content: {content_hash[:12]}...) -> {chunk_count} chunks" + f"CHUNK COUNT: Document {doc_info.document_hash[:12]}... (content: {content_hash[:12]}...) -> {chunk_count} chunks ({failed_chunks} failed)" ) # Log the complete chunks_info dictionary @@ -227,10 +232,10 @@ async def process_all_documents(self) -> ProcessingStats: # Step 4: Update processed files tracking (even if no new documents processed) if diff_detector: try: - # Update metadata for newly processed files - if documents: + # Update metadata for newly processed files (successful only) + if processed_documents: processed_paths = [ - doc.cleaned_txt_path for doc in documents + doc.cleaned_txt_path for doc in processed_documents ] if processed_paths: logger.debug( @@ -290,7 +295,7 @@ async def _process_single_document( doc_info: DocumentInfo, qdrant_manager: QdrantManager, semaphore: asyncio.Semaphore, - ) -> tuple[int, str]: + ) -> tuple[int, str, int]: """ Process a single document with contextual retrieval. 
@@ -300,7 +305,9 @@ async def _process_single_document( semaphore: Concurrency control semaphore Returns: - tuple: (chunk_count: int, content_hash: str) or Exception on error + tuple: (chunk_count: int, content_hash: str, failed_chunks: int). + Raises on any failure (including load failure or zero usable chunks), + so the document is counted as failed rather than as success. """ async with semaphore: logger.info(f"Processing document: {doc_info.document_hash}") @@ -310,29 +317,31 @@ async def _process_single_document( document = self.document_loader.load_document(doc_info) if not document: - logger.warning(f"Could not load document: {doc_info.document_hash}") - return (0, doc_info.document_hash) + raise DocumentLoadError( + f"Could not load document: {doc_info.document_hash}" + ) # Process document with contextual retrieval - contextual_chunks = await self.contextual_processor.process_document( - document - ) + ( + contextual_chunks, + failed_chunks, + ) = await self.contextual_processor.process_document(document) if not contextual_chunks: - logger.warning( - f"No chunks created for document: {doc_info.document_hash}" + raise RuntimeError( + f"No chunks created for document: {doc_info.document_hash} " + f"({failed_chunks} chunks failed context generation)" ) - return (0, document.document_hash) # Store chunks in Qdrant await qdrant_manager.store_chunks(contextual_chunks) logger.info( f"Successfully processed document {doc_info.document_hash}: " - f"{len(contextual_chunks)} chunks" + f"{len(contextual_chunks)} chunks ({failed_chunks} dropped)" ) - return (len(contextual_chunks), document.document_hash) + return (len(contextual_chunks), document.document_hash, failed_chunks) except Exception as e: logger.error(f"Error processing document {doc_info.document_hash}: {e}") @@ -352,10 +361,12 @@ def _log_final_summary(self) -> None: logger.info(f" • Failed Chunks: {self.stats.total_chunks_failed}") if self.stats.total_documents > 0: - success_rate = ( - 
self.stats.documents_processed / self.stats.total_documents - ) * 100 - logger.info(f"Success Rate: {success_rate:.1f}%") + logger.info(f"Success Rate: {self.stats.success_rate * 100:.1f}%") + + if self.stats.total_chunks_processed + self.stats.total_chunks_failed > 0: + logger.info( + f"Chunk Success Rate: {self.stats.chunk_success_rate * 100:.1f}%" + ) logger.info(f"Processing Duration: {self.stats.duration}") @@ -365,6 +376,11 @@ def _log_final_summary(self) -> None: ) logger.info("Check failure logs for details") + if self.stats.total_chunks_failed > 0: + logger.warning( + f" {self.stats.total_chunks_failed} chunks failed processing" + ) + async def run_health_check(self) -> bool: """ Run health check on all components. @@ -617,12 +633,20 @@ async def _execute_cleanup_operations( return total_deleted def _cleanup_datasets(self) -> None: - """Remove datasets folder after processing.""" + """Remove datasets folder contents after processing. + + Only the folder's contents are removed, not the folder itself, since + the datasets path is a mounted volume in the container. 
+ """ try: datasets_path = Path(self.config.dataset_base_path) if datasets_path.exists(): - shutil.rmtree(str(datasets_path)) - logger.info(f"Datasets folder cleaned up: {datasets_path}") + for child in datasets_path.iterdir(): + if child.is_dir(): + shutil.rmtree(str(child)) + else: + child.unlink() + logger.info(f"Datasets folder contents cleaned up: {datasets_path}") else: logger.debug(f"Datasets folder does not exist: {datasets_path}") except Exception as e: diff --git a/src/vector_indexer/models.py b/src/vector_indexer/models.py index 752ea02a..41ae1ce1 100644 --- a/src/vector_indexer/models.py +++ b/src/vector_indexer/models.py @@ -96,6 +96,14 @@ def success_rate(self) -> float: return self.documents_processed / self.total_documents return 0.0 + @property + def chunk_success_rate(self) -> float: + """Calculate chunk success rate (processed vs processed + failed).""" + total_chunks = self.total_chunks_processed + self.total_chunks_failed + if total_chunks > 0: + return self.total_chunks_processed / total_chunks + return 0.0 + class ProcessingError(BaseModel): """Error information for failed processing.""" diff --git a/vault-init.sh b/vault-init.sh index eada7518..0e759f8e 100644 --- a/vault-init.sh +++ b/vault-init.sh @@ -7,6 +7,88 @@ INIT_FLAG="/vault/data/.initialized" echo "=== Vault Initialization Script ===" +# --------------------------------------------------------------------------- +# Helpers (used by the SUBSEQUENT DEPLOYMENT branch) +# --------------------------------------------------------------------------- + +# Ensure a role_id file exists on disk; fetch from Vault if missing. +# Usage: ensure_role_id +ensure_role_id() { + role="$1"; rid_file="$2" + if [ -f "$rid_file" ] && [ -s "$rid_file" ]; then + return 0 + fi + echo "Fetching role_id for $role..." 
+ rid=$(wget -q -O- \ + --header="X-Vault-Token: $ROOT_TOKEN" \ + "$VAULT_ADDR/v1/auth/approle/role/$role/role-id" | \ + grep -o '"role_id":"[^"]*"' | cut -d':' -f2 | tr -d '"') + echo "$rid" > "$rid_file" + chmod 640 "$rid_file" +} + +# Return 0 if the on-disk role_id + secret_id still authenticate, 1 otherwise. +# Usage: validate_secret_id +validate_secret_id() { + rid_file="$1"; sid_file="$2" + [ -f "$rid_file" ] && [ -f "$sid_file" ] || return 1 + rid=$(cat "$rid_file"); sid=$(cat "$sid_file") + [ -n "$rid" ] && [ -n "$sid" ] || return 1 + # wget returns non-zero on HTTP 400 (invalid creds); also confirm a token came back. + resp=$(wget -q -O- \ + --post-data="{\"role_id\":\"$rid\",\"secret_id\":\"$sid\"}" \ + --header='Content-Type: application/json' \ + "$VAULT_ADDR/v1/auth/approle/login" 2>/dev/null) || return 1 + echo "$resp" | grep -q '"client_token"' || return 1 + return 0 +} + +# Mint a fresh secret_id for a role and write it to disk. +# Usage: mint_secret_id +mint_secret_id() { + role="$1"; sid_file="$2" + sid=$(wget -q -O- --post-data='' \ + --header="X-Vault-Token: $ROOT_TOKEN" \ + "$VAULT_ADDR/v1/auth/approle/role/$role/secret-id" | \ + grep -o '"secret_id":"[^"]*"' | cut -d':' -f2 | tr -d '"') + echo "$sid" > "$sid_file" + chmod 640 "$sid_file" +} + +# Reuse the existing secret_id if it still authenticates; otherwise mint a new one. +# Usage: reconcile_secret_id +reconcile_secret_id() { + role="$1"; rid_file="$2"; sid_file="$3" + ensure_role_id "$role" "$rid_file" + if validate_secret_id "$rid_file" "$sid_file"; then + echo "$role: existing secret_id still valid - reusing" + else + echo "$role: secret_id invalid or missing - minting a new one" + mint_secret_id "$role" "$sid_file" + fi +} + +# Create or update an AppRole that issues a PERIODIC token (no max_ttl): the +# agent renews it forever and never re-runs approle/login in steady state. +# secret_id_ttl=0 + secret_id_num_uses=0 keep the secret_id valid across +# restarts. 
Idempotent: does not invalidate existing secret_ids, safe per run. +# Usage: upsert_approle +upsert_approle() { + role="$1"; policy="$2"; period="$3" + wget -q -O- --post-data='{"token_policies":["'"$policy"'"],"token_period":"'"$period"'","token_num_uses":0,"secret_id_ttl":"0","secret_id_num_uses":0,"bind_secret_id":true}' \ + --header="X-Vault-Token: $ROOT_TOKEN" \ + --header='Content-Type: application/json' \ + "$VAULT_ADDR/v1/auth/approle/role/$role" >/dev/null +} + +# Apply the current AppRole definitions for all three services. +ensure_approles() { + echo "Ensuring AppRole configs (periodic tokens)..." + upsert_approle "gui-service" "gui-policy" "20m" + upsert_approle "cron-manager-service" "cron-manager-policy" "30m" + upsert_approle "llm-orchestration-service" "llm-orchestration-policy" "1h" +} + # Wait for Vault to be ready echo "Waiting for Vault..." for i in $(seq 1 30); do @@ -114,27 +196,9 @@ path "auth/token/lookup-self" { capabilities = ["read"] }' --header='Content-Type: application/json' \ "$VAULT_ADDR/v1/sys/policies/acl/llm-orchestration-policy" >/dev/null - # Create GUI AppRole - echo "Creating gui-service AppRole..." - wget -q -O- --post-data='{"token_policies":["gui-policy"],"token_no_default_policy":true,"token_ttl":"15m","token_max_ttl":"1h","secret_id_ttl":"24h","secret_id_num_uses":0,"bind_secret_id":true}' \ - --header="X-Vault-Token: $ROOT_TOKEN" \ - --header='Content-Type: application/json' \ - "$VAULT_ADDR/v1/auth/approle/role/gui-service" >/dev/null - - # Create CronManager AppRole - echo "Creating cron-manager-service AppRole..." 
- wget -q -O- --post-data='{"token_policies":["cron-manager-policy"],"token_no_default_policy":true,"token_ttl":"30m","token_max_ttl":"8h","secret_id_ttl":"24h","secret_id_num_uses":0,"bind_secret_id":true}' \ - --header="X-Vault-Token: $ROOT_TOKEN" \ - --header='Content-Type: application/json' \ - "$VAULT_ADDR/v1/auth/approle/role/cron-manager-service" >/dev/null - - # Create LLM Orchestration AppRole - echo "Creating llm-orchestration-service AppRole..." - wget -q -O- --post-data='{"token_policies":["llm-orchestration-policy"],"token_no_default_policy":true,"token_ttl":"1h","token_max_ttl":"8h","secret_id_ttl":"24h","secret_id_num_uses":0,"bind_secret_id":true}' \ - --header="X-Vault-Token: $ROOT_TOKEN" \ - --header='Content-Type: application/json' \ - "$VAULT_ADDR/v1/auth/approle/role/llm-orchestration-service" >/dev/null - + # Create the three AppRoles (periodic tokens - see upsert_approle). + ensure_approles + # Ensure credentials directory exists mkdir -p /agent/credentials @@ -276,65 +340,22 @@ else # Get root token ROOT_TOKEN=$(grep -o '"root_token":"[^"]*"' "$UNSEAL_KEYS_FILE" | cut -d':' -f2 | tr -d '"') export VAULT_TOKEN="$ROOT_TOKEN" - + + # Re-apply AppRole definitions so config changes (e.g. periodic tokens) + # take effect on redeploy without re-initializing Vault. Idempotent and + # does not invalidate existing secret_ids. + ensure_approles + # Ensure credentials directory exists mkdir -p /agent/credentials - # Always regenerate all secret_ids on restart - echo "Regenerating GUI secret_id..." - GUI_SECRET_ID=$(wget -q -O- --post-data='' \ - --header="X-Vault-Token: $ROOT_TOKEN" \ - "$VAULT_ADDR/v1/auth/approle/role/gui-service/secret-id" | \ - grep -o '"secret_id":"[^"]*"' | cut -d':' -f2 | tr -d '"') - echo "$GUI_SECRET_ID" > /agent/credentials/gui_secret_id - - echo "Regenerating CronManager secret_id..." 
- CRON_SECRET_ID=$(wget -q -O- --post-data='' \ - --header="X-Vault-Token: $ROOT_TOKEN" \ - "$VAULT_ADDR/v1/auth/approle/role/cron-manager-service/secret-id" | \ - grep -o '"secret_id":"[^"]*"' | cut -d':' -f2 | tr -d '"') - echo "$CRON_SECRET_ID" > /agent/credentials/cron_secret_id - - echo "Regenerating LLM secret_id..." - LLM_SECRET_ID=$(wget -q -O- --post-data='' \ - --header="X-Vault-Token: $ROOT_TOKEN" \ - "$VAULT_ADDR/v1/auth/approle/role/llm-orchestration-service/secret-id" | \ - grep -o '"secret_id":"[^"]*"' | cut -d':' -f2 | tr -d '"') - echo "$LLM_SECRET_ID" > /agent/credentials/llm_secret_id - - # Set permissions - chmod 640 /agent/credentials/*_secret_id - - # Ensure role_ids exist - if [ ! -f /agent/credentials/gui_role_id ]; then - echo "Copying GUI role_id..." - GUI_ROLE_ID=$(wget -q -O- \ - --header="X-Vault-Token: $ROOT_TOKEN" \ - "$VAULT_ADDR/v1/auth/approle/role/gui-service/role-id" | \ - grep -o '"role_id":"[^"]*"' | cut -d':' -f2 | tr -d '"') - echo "$GUI_ROLE_ID" > /agent/credentials/gui_role_id - chmod 640 /agent/credentials/gui_role_id - fi - - if [ ! -f /agent/credentials/cron_role_id ]; then - echo "Copying CronManager role_id..." - CRON_ROLE_ID=$(wget -q -O- \ - --header="X-Vault-Token: $ROOT_TOKEN" \ - "$VAULT_ADDR/v1/auth/approle/role/cron-manager-service/role-id" | \ - grep -o '"role_id":"[^"]*"' | cut -d':' -f2 | tr -d '"') - echo "$CRON_ROLE_ID" > /agent/credentials/cron_role_id - chmod 640 /agent/credentials/cron_role_id - fi - - if [ ! -f /agent/credentials/llm_role_id ]; then - echo "Copying LLM role_id..." 
- LLM_ROLE_ID=$(wget -q -O- \ - --header="X-Vault-Token: $ROOT_TOKEN" \ - "$VAULT_ADDR/v1/auth/approle/role/llm-orchestration-service/role-id" | \ - grep -o '"role_id":"[^"]*"' | cut -d':' -f2 | tr -d '"') - echo "$LLM_ROLE_ID" > /agent/credentials/llm_role_id - chmod 640 /agent/credentials/llm_role_id - fi + # Reconcile secret_ids: reuse the existing one if it still authenticates, + # mint a new one only if invalid or missing - keeps one stable secret_id + # across restarts instead of rotating every boot. reconcile_secret_id also + # ensures the role_id file exists first (validation needs both). + reconcile_secret_id "gui-service" /agent/credentials/gui_role_id /agent/credentials/gui_secret_id + reconcile_secret_id "cron-manager-service" /agent/credentials/cron_role_id /agent/credentials/cron_secret_id + reconcile_secret_id "llm-orchestration-service" /agent/credentials/llm_role_id /agent/credentials/llm_secret_id fi echo "=== Vault init complete ===" \ No newline at end of file diff --git a/vault/agents/cron/cron-agent.hcl b/vault/agents/cron/cron-agent.hcl index f2db227e..9454c9b7 100644 --- a/vault/agents/cron/cron-agent.hcl +++ b/vault/agents/cron/cron-agent.hcl @@ -2,7 +2,9 @@ # This agent provides CronManager with access to encryption keys and write access to secrets vault { - address = "http://vault:8200" + # Local testing: use rag-vault, not bare "vault" — that name collides with the + # ckb stack on the shared bykstack network and authenticates the wrong Vault. 
+ address = "http://rag-vault:8200" retry { num_retries = 5 } @@ -42,6 +44,4 @@ listener "tcp" { # API proxy configuration api_proxy { use_auto_auth_token = true - enforce_consistency = "always" - when_inconsistent = "forward" } diff --git a/vault/agents/gui/gui-agent.hcl b/vault/agents/gui/gui-agent.hcl index a28db871..672d6d4d 100644 --- a/vault/agents/gui/gui-agent.hcl +++ b/vault/agents/gui/gui-agent.hcl @@ -2,7 +2,9 @@ # This agent provides GUI with access to public encryption key only vault { - address = "http://vault:8200" + # Local testing: use rag-vault, not bare "vault" — that name collides with the + # ckb stack on the shared bykstack network and authenticates the wrong Vault. + address = "http://rag-vault:8200" retry { num_retries = 5 } @@ -42,6 +44,4 @@ listener "tcp" { # API proxy configuration api_proxy { use_auto_auth_token = true - enforce_consistency = "always" - when_inconsistent = "forward" } diff --git a/vault/agents/llm/agent.hcl b/vault/agents/llm/agent.hcl index d7237be7..1a575260 100644 --- a/vault/agents/llm/agent.hcl +++ b/vault/agents/llm/agent.hcl @@ -1,5 +1,7 @@ vault { - address = "http://vault:8200" + # Local testing: use rag-vault, not bare "vault" — that name collides with the + # ckb stack on the shared bykstack network and authenticates the wrong Vault. 
+ address = "http://rag-vault:8200" retry { num_retries = 5 } @@ -34,6 +36,4 @@ listener "tcp" { api_proxy { use_auto_auth_token = true - enforce_consistency = "always" - when_inconsistent = "forward" } diff --git a/vault/config/vault.hcl b/vault/config/vault.hcl index eaef415a..64ab325e 100644 --- a/vault/config/vault.hcl +++ b/vault/config/vault.hcl @@ -1,22 +1,27 @@ # HashiCorp Vault Server Configuration -# Production-ready configuration for LLM Orchestration Service +# Single-node Raft for the RAG-Module services -# Storage backend - Raft for high availability +# Storage backend - Raft storage "raft" { path = "/vault/file" node_id = "vault-node-1" - - # Retry join configuration for clustering (single node for now) - retry_join { - leader_api_addr = "http://vault:8200" - } + + # NOTE: No retry_join for a single node. A lone node self-bootstraps. + # A retry_join pointing at itself causes repeated + # "failed to get raft challenge ... Vault is sealed" errors and a + # messy double Raft init on every boot. Add retry_join back only when + # you actually have peer nodes to join. } -# HTTP listener configuration +# HTTP API listener. +# Vault automatically uses the next port up (8201) as its internal +# cluster port, so do NOT define a separate listener on 8201 — that +# collides with the cluster listener ("bind: address already in use") +# and degrades the login/request-forwarding path the agents rely on. listener "tcp" { - address = "0.0.0.0:8200" - tls_disable = true - + address = "0.0.0.0:8200" + tls_disable = true + # Enable CORS for web UI access cors_enabled = true cors_allowed_origins = [ @@ -25,14 +30,9 @@ listener "tcp" { ] } -# Cluster listener for HA (required even for single node) -listener "tcp" { - address = "0.0.0.0:8201" - cluster_addr = "http://0.0.0.0:8201" - tls_disable = true -} - -# API and cluster addresses +# API and cluster addresses. 
+# cluster_addr tells Vault where its internal cluster port (8201) is +# reachable; Vault binds that port itself — no listener block needed. api_addr = "http://vault:8200" cluster_addr = "http://vault:8201" @@ -46,9 +46,5 @@ default_lease_ttl = "168h" # 7 days max_lease_ttl = "720h" # 30 days # Logging configuration -log_level = "INFO" +log_level = "INFO" log_format = "json" - -# Development settings (remove in production) -# Note: In production, you should not use dev mode -# and should properly initialize and unseal the vault \ No newline at end of file