From 2aa38d28542d7141e1eff3e97a1bee34c97eafd4 Mon Sep 17 00:00:00 2001
From: Charith Nuwan Bimsara <59943919+nuwangeek@users.noreply.github.com>
Date: Thu, 18 Jun 2026 13:04:15 +0530
Subject: [PATCH] Fix for Vault configuration issue (#473)

* fixed vault configuration issue

* address Vault via unique rag-vault alias to avoid cross-stack DNS collision

* fixed vector indexer statistics analysis issue
---
 DSL/CronManager/DSL/data_resync.yml           |   2 +-
 DSL/CronManager/DSL/delete_from_vault.yml     |   2 +-
 DSL/CronManager/DSL/store_in_vault.yml        |   2 +-
 .../script/delete_secrets_from_vault.sh       |   4 +-
 .../script/store_secrets_in_vault.sh          |   4 +-
 docker-compose-ec2.yml                        |  12 +-
 docker-compose.yml                            |  14 +-
 docs/VAULT_SECURITY_ARCHITECTURE.md           |  81 ++--
 docs/VAULT_SETUP_AND_USAGE.md                 | 355 ++++++++++++++++++
 src/vector_indexer/contextual_processor.py    |  14 +-
 src/vector_indexer/error_logger.py            |   4 +-
 src/vector_indexer/main_indexer.py            |  80 ++--
 src/vector_indexer/models.py                  |   8 +
 vault-init.sh                                 | 175 +++++----
 vault/agents/cron/cron-agent.hcl              |   6 +-
 vault/agents/gui/gui-agent.hcl                |   6 +-
 vault/agents/llm/agent.hcl                    |   6 +-
 vault/config/vault.hcl                        |  44 +--
 18 files changed, 624 insertions(+), 195 deletions(-)
 create mode 100644 docs/VAULT_SETUP_AND_USAGE.md

diff --git a/DSL/CronManager/DSL/data_resync.yml b/DSL/CronManager/DSL/data_resync.yml
index b5994d1e..a232ba39 100644
--- a/DSL/CronManager/DSL/data_resync.yml
+++ b/DSL/CronManager/DSL/data_resync.yml
@@ -2,4 +2,4 @@ agency_data_resync:
   trigger: "0 0 0/1 * * ?"
   # trigger: off
   type: exec
-  command: "../app/scripts/agency_data_resync.sh -s 10"
\ No newline at end of file
+  command: "/app/scripts/agency_data_resync.sh -s 10"
\ No newline at end of file
diff --git a/DSL/CronManager/DSL/delete_from_vault.yml b/DSL/CronManager/DSL/delete_from_vault.yml
index d7f06cea..cde1df27 100644
--- a/DSL/CronManager/DSL/delete_from_vault.yml
+++ b/DSL/CronManager/DSL/delete_from_vault.yml
@@ -2,4 +2,4 @@ delete_secrets:
   trigger: off
   type: exec
   command: "/app/scripts/delete_secrets_from_vault.sh"
-  allowedEnvs: ['cookie','vaultUuid','llmPlatform', 'llmModel','embeddingModel','embeddingPlatform']
+  allowedEnvs: ['cookie','vaultUuid','llmPlatform', 'llmModel','embeddingModel','embeddingPlatform', 'vaultAgentUrl']
diff --git a/DSL/CronManager/DSL/store_in_vault.yml b/DSL/CronManager/DSL/store_in_vault.yml
index fa1a6ac1..46f861e6 100644
--- a/DSL/CronManager/DSL/store_in_vault.yml
+++ b/DSL/CronManager/DSL/store_in_vault.yml
@@ -2,4 +2,4 @@ store_secrets:
   trigger: off
   type: exec
   command: "/app/scripts/store_secrets_in_vault.sh"
-  allowedEnvs: ['cookie','vaultUuid','llmPlatform', 'llmModel','secretKey','accessKey','deploymentName','targetUrl','apiKey','embeddingModel','embeddingPlatform','embeddingAccessKey','embeddingSecretKey','embeddingDeploymentName','embeddingTargetUri','embeddingAzureApiKey','deploymentEnvironment']
\ No newline at end of file
+  allowedEnvs: ['cookie','vaultUuid','llmPlatform', 'llmModel','secretKey','accessKey','deploymentName','targetUrl','apiKey','embeddingModel','embeddingPlatform','embeddingAccessKey','embeddingSecretKey','embeddingDeploymentName','embeddingTargetUri','embeddingAzureApiKey','deploymentEnvironment', 'vaultAgentUrl']
\ No newline at end of file
diff --git a/DSL/CronManager/script/delete_secrets_from_vault.sh b/DSL/CronManager/script/delete_secrets_from_vault.sh
index a6423566..3b405927 100644
--- a/DSL/CronManager/script/delete_secrets_from_vault.sh
+++ b/DSL/CronManager/script/delete_secrets_from_vault.sh
@@ -6,9 +6,9 @@
 set -e  # Exit on any error
 
 # Configuration
-# Use VAULT_AGENT_URL which points to vault-agent-cron proxy
+# Use vaultAgentUrl which points to vault-agent-cron proxy
 # The agent automatically injects the authentication token
-VAULT_ADDR="${VAULT_AGENT_URL:-http://vault-agent-cron:8203}"
+VAULT_ADDR="${vaultAgentUrl:-http://vault-agent-cron:8203}"
 
 # Logging function
 log() {
diff --git a/DSL/CronManager/script/store_secrets_in_vault.sh b/DSL/CronManager/script/store_secrets_in_vault.sh
index 8f4056f8..60784eed 100644
--- a/DSL/CronManager/script/store_secrets_in_vault.sh
+++ b/DSL/CronManager/script/store_secrets_in_vault.sh
@@ -6,9 +6,9 @@
 set -e  # Exit on any error
 
 # Configuration
-# Use VAULT_AGENT_URL which points to vault-agent-cron proxy
+# Use vaultAgentUrl which points to vault-agent-cron proxy
 # The agent automatically injects the authentication token
-VAULT_ADDR="${VAULT_AGENT_URL:-http://vault-agent-cron:8203}"
+VAULT_ADDR="${vaultAgentUrl:-http://vault-agent-cron:8203}"
 
 # Decryption Configuration
 PRIVATE_KEY_CACHE=""
diff --git a/docker-compose-ec2.yml b/docker-compose-ec2.yml
index f1ab5f54..e6152dbe 100644
--- a/docker-compose-ec2.yml
+++ b/docker-compose-ec2.yml
@@ -503,7 +503,11 @@ services:
       - ./vault/config:/vault/config:ro
       - ./vault/logs:/vault/logs
     networks:
-      - vault-network  # Only on vault-network for security
+      vault-network:  # Only on vault-network for security
+        # Local testing: bare "vault" collides with the ckb stack on the shared
+        # bykstack network, so expose this Vault under a unique alias instead.
+        aliases:
+          - rag-vault
     restart: unless-stopped
     healthcheck:
       test: ["CMD", "sh", "-c", "wget -q -O- http://127.0.0.1:8200/v1/sys/health || exit 0"]
@@ -520,7 +524,7 @@ services:
       vault:
         condition: service_healthy
     environment:
-      VAULT_ADDR: http://vault:8200
+      VAULT_ADDR: http://rag-vault:8200
     volumes:
       - vault-data:/vault/data
       - vault-agent-creds:/agent/credentials
@@ -529,8 +533,8 @@ services:
       - vault-agent-llm-token:/agent/llm-token
       - ./vault-init.sh:/vault-init.sh:ro
     networks:
-      - vault-network  # Access vault
-      - bykstack       # Access to write agent tokens
+      # vault-network only: tokens/creds go via shared volumes, not the network.
+      - vault-network
     entrypoint: ["/bin/sh"]
     command:
       - -c
diff --git a/docker-compose.yml b/docker-compose.yml
index 5e9c2962..0befd6d7 100644
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -193,7 +193,7 @@ services:
     environment:
       - server.port=9010
       - PYTHONPATH=/app:/app/src/vector_indexer:/app/src/intent_data_enrichment:/app/src/api_tool_indexer
-      - VAULT_AGENT_URL=http://vault-agent-cron:8203
+      - vaultAgentUrl=http://vault-agent-cron:8203
     ports:
       - 9010:8080
     depends_on:
@@ -451,7 +451,11 @@ services:
       - ./vault/config:/vault/config:ro
       - ./vault/logs:/vault/logs
     networks:
-      - vault-network  # Only on vault-network for security
+      vault-network:  # Only on vault-network for security
+        # Local testing: bare "vault" collides with the ckb stack on the shared
+        # bykstack network, so expose this Vault under a unique alias instead.
+        aliases:
+          - rag-vault
     restart: unless-stopped
     healthcheck:
       test: ["CMD", "sh", "-c", "wget -q -O- http://127.0.0.1:8200/v1/sys/health || exit 0"]
@@ -468,7 +472,7 @@ services:
       vault:
         condition: service_healthy
     environment:
-      VAULT_ADDR: http://vault:8200
+      VAULT_ADDR: http://rag-vault:8200
     volumes:
       - vault-data:/vault/data
       - vault-agent-creds:/agent/credentials
@@ -477,8 +481,8 @@ services:
       - vault-agent-llm-token:/agent/llm-token
       - ./vault-init.sh:/vault-init.sh:ro
     networks:
-      - vault-network  # Access vault
-      - bykstack       # Access to write agent tokens
+      # vault-network only: tokens/creds go via shared volumes, not the network.
+      - vault-network
     entrypoint: ["/bin/sh"]
     command:
       - -c
diff --git a/docs/VAULT_SECURITY_ARCHITECTURE.md b/docs/VAULT_SECURITY_ARCHITECTURE.md
index fe6fd741..2c2c6836 100644
--- a/docs/VAULT_SECURITY_ARCHITECTURE.md
+++ b/docs/VAULT_SECURITY_ARCHITECTURE.md
@@ -197,9 +197,12 @@ Day 0+: Automatic Token Renewal:
 Container Restart:
   vault-init: Check if Vault is sealed
              ↓
-  If unsealed: Regenerate secret_id only
+  If unsealed: Validate existing secret_ids
              ↓
-  vault-agent: Re-authenticate with new secret_id
+  If valid: Reuse existing secret_id (no churn)
+  If invalid: Mint new secret_id and write to disk
+             ↓
+  vault-agent: Re-authenticate with secret_id
              ↓
   New token issued and cached
 ```
@@ -413,8 +416,9 @@ Connected Services:
   - GUI (React Frontend)
 
 Token Lifecycle:
-  - Default Lease: 768h (32 days)
-  - Auto-renewal: Before expiration
+  - Token type: periodic (token_period 20m, no max-TTL)
+  - Auto-renewal: Every ~13 minutes (~2/3 of period)
+  - Re-auth: only on agent restart (never in steady state)
 ```
 
 #### Agent 2: vault-agent-cron
@@ -429,8 +433,9 @@ Connected Services:
   - CronManager (Python worker)
 
 Token Lifecycle:
-  - Default Lease: 768h (32 days)
-  - Auto-renewal: Before expiration
+  - Token type: periodic (token_period 30m, no max-TTL)
+  - Auto-renewal: Every ~20 minutes (~2/3 of period)
+  - Re-auth: only on agent restart (never in steady state)
 ```
 
 #### Agent 3: vault-agent-llm
@@ -445,8 +450,9 @@ Connected Services:
   - LLM Orchestration Service (FastAPI)
 
 Token Lifecycle:
-  - Default Lease: 1h (shorter for higher security)
-  - Auto-renewal: Every ~45 minutes
+  - Token type: periodic (token_period 1h, no max-TTL)
+  - Auto-renewal: Every ~40 minutes (~2/3 of period)
+  - Re-auth: only on agent restart (never in steady state)
 ```
 
 ### Token Caching and Auto-Renewal
@@ -464,29 +470,31 @@ T=0: Initial Authentication
         ├─► POST /v1/auth/approle/login
         │   Body: { role_id, secret_id }
         │
-        └─► Receives: { token, ttl: 3600s, renewable: true }
+        └─► Receives: { token, period: 3600s, renewable: true }   ← periodic token, no max-TTL
              │
              └─► Cache token in: /agent/llm-token/token
 
 
-T=45min: Proactive Renewal (75% of TTL)
+T≈40min: Proactive Renewal (~2/3 of period)
      vault-agent monitors expiration
         │
         ├─► POST /v1/auth/token/renew-self
         │   Header: X-Vault-Token: <current_token>
         │
-        └─► Receives: { token, ttl: 3600s } (same token, extended)
+        └─► Receives: { token, period: 3600s } (same token, period reset)
              │
              └─► Update cache: /agent/llm-token/token
+             │
+             └─► Repeats forever — a periodic token never hits a max-TTL,
+                 so steady-state operation never needs approle/login again.
 
 
-T=59min: Renewal Failed (fallback)
-     If renewal fails:
+On agent restart only:
+     vault-agent re-reads role_id + secret_id from disk
         │
-        ├─► Re-authenticate from scratch
-        │   POST /v1/auth/approle/login
+        ├─► POST /v1/auth/approle/login   (secret_id must still be valid)
         │
-        └─► New token issued and cached
+        └─► New periodic token issued and cached
 
 
 Application Request (anytime):
@@ -856,15 +864,16 @@ Step 12: Check Vault Seal Status
    └─► GET /v1/sys/seal-status
        └─► If unsealed: Skip unseal steps
 
-Step 13: Regenerate Secret IDs Only
-   └─► POST /v1/auth/approle/role/gui-service/secret-id
-   └─► POST /v1/auth/approle/role/cron-manager-service/secret-id
-   └─► POST /v1/auth/approle/role/llm-orchestration-service/secret-id
-   └─► Write new secret_ids to /agent/credentials/
+Step 13: Validate and Reconcile Secret IDs
+   └─► For each role (gui, cron-manager, llm-orchestration):
+       ├─► Test existing on-disk secret_id via AppRole login
+       ├─► If valid: Reuse (no change to credential file)
+       └─► If invalid/missing: Mint new secret_id and write to disk
 
 Note: role_ids remain unchanged (static identifiers)
 Note: Existing secrets and policies preserved
 Note: RSA keypair NOT regenerated (preserved)
+Note: Stable secret_ids across restarts reduce credential churn
 
 ═══════════════════════════════════════════════════════════════════
 COMPLETION
@@ -1128,13 +1137,14 @@ Startup Order:
 vault-init Behavior:
   - Detects Vault already initialized
   - Skips initialization steps
-  - Regenerates secret_ids only
-  - Updates credential files
+  - Validates existing secret_ids (reuses if still valid)
+  - Mints new secret_ids only if existing ones are invalid
 
 Result:
-   All services start with fresh credentials
+   All services start with validated credentials
    Existing secrets preserved
    No manual intervention needed
+   Stable secret_ids reduce unnecessary credential churn
 ```
 
 ### Token Regeneration Strategy
@@ -1143,22 +1153,23 @@ Result:
 Current Implementation:
 
 1. On Every Container Restart:
-   └─► vault-init regenerates secret_ids
-       └─► Vault agents get new tokens
-           └─► Old tokens remain valid until expiration
+   └─► vault-init validates existing secret_ids
+       ├─► If valid: Reuse (agents continue with same credentials)
+       └─► If invalid: Mint new secret_id, agents re-authenticate
 
 2. Token Lifecycle:
-   └─► Issue: vault-agent authenticates
+   └─► Issue: vault-agent authenticates (periodic token, token_period per role)
    └─► Use: Application makes requests
-   └─► Renew: vault-agent extends TTL
-   └─► Expire: Automatic renewal failed
-   └─► Re-issue: vault-agent re-authenticates
+   └─► Renew: vault-agent renews within the period (~2/3 of period)
+   └─► No max-TTL: renewal continues indefinitely
+   └─► Re-issue: only on agent restart, via secret_id login
 
 3. Security Benefits:
-    Short-lived tokens (1 hour for LLM, 32 days for others)
-    Automatic rotation on agent restart
-    No manual token management
-    Compromised tokens have limited lifetime
+    Periodic tokens (period 1h LLM, 30m Cron, 20m GUI), renewed continuously
+    Steady-state operation never re-runs approle/login (a stale secret_id
+      cannot strand a running agent)
+    Stable secret_ids (no unnecessary churn on restart)
+    Compromised tokens limited to one un-renewed period
 ```
 
 ### Audit Logging Capabilities
diff --git a/docs/VAULT_SETUP_AND_USAGE.md b/docs/VAULT_SETUP_AND_USAGE.md
new file mode 100644
index 00000000..e61d362b
--- /dev/null
+++ b/docs/VAULT_SETUP_AND_USAGE.md
@@ -0,0 +1,355 @@
+# Vault Setup & Usage Guide
+
+A single reference for how HashiCorp Vault is deployed, initialized, and consumed in the
+RAG-Module. It covers the topology, the three Vault Agents, the secret layout, and — in
+depth — **how each agent renews its token and how secrets are rotated**.
+
+Source files this document describes:
+
+- `docker-compose.yml` — service/topology definition
+- `vault/config/vault.hcl` — Vault server config
+- `vault-init.sh` — one-time bootstrap + per-restart reconcile
+- `vault/agents/{gui,cron,llm}/*.hcl` — the three Vault Agent configs
+- `DSL/CronManager/script/store_secrets_in_vault.sh` — writes/rotates secrets
+- `DSL/CronManager/script/delete_secrets_from_vault.sh` — deletes secrets
+
+For the security rationale (threat model, defense-in-depth, access matrix) see the
+companion `docs/VAULT_SECURITY_ARCHITECTURE.md`. This guide focuses on the *operational*
+mechanics.
+
+---
+
+## 1. Topology at a glance
+
+```
+                bykstack (application network)                  vault-network (internal: true)
+ ┌───────────────────────────────────────────────┐        ┌──────────────────────────────┐
+ │  gui ──────────────► vault-agent-gui  :8202 ───┼────────┤                              │
+ │  cron-manager ─────► vault-agent-cron :8203 ───┼────────┤        vault  :8200          │
+ │  llm-orchestration ► vault-agent-llm  :8201 ───┼────────┤   (Raft storage, KV v2,      │
+ │                                                │        │    AppRole auth)             │
+ │  vault-init (vault-network only) ──────────────┼────────┤                              │
+ └───────────────────────────────────────────────┘        └──────────────────────────────┘
+```
+
+- **`vault`** runs only on `vault-network`, which is `internal: true` — it has **no route to
+  or from the host or the internet**. Port 8200 is never published.
+- **Vault Agents** straddle both networks: they reach `vault` on `vault-network` and are
+  reachable by their owning application on `bykstack`.
+- **Applications** talk *only* to their agent (`VAULT_ADDR=http://vault-agent-*:820x`) and
+  never hold a Vault token themselves. The agent injects the token transparently.
+
+| Service | Agent it uses | Agent address | AppRole | Policy |
+|---|---|---|---|---|
+| `gui` | `vault-agent-gui` | `:8202` | `gui-service` | `gui-policy` |
+| `cron-manager` | `vault-agent-cron` | `:8203` | `cron-manager-service` | `cron-manager-policy` |
+| `llm-orchestration-service` | `vault-agent-llm` | `:8201` | `llm-orchestration-service` | `llm-orchestration-policy` |
+
+---
+
+## 2. Vault server (`vault/config/vault.hcl`)
+
+- **Storage:** Raft, single node (`node_id = vault-node-1`, path `/vault/file`, persisted in
+  the `vault-data` volume). No `retry_join` — a lone node self-bootstraps; adding a self-
+  pointing join was found to cause "Vault is sealed" boot loops.
+- **Listener:** `0.0.0.0:8200`, `tls_disable = true` (TLS is terminated at the network
+  boundary; the network itself is the isolation layer here). Port `8201` is *not* given its
+  own listener because Vault uses it as the internal cluster port automatically.
+- **Lease defaults:** `default_lease_ttl = 168h` (7 days), `max_lease_ttl = 720h` (30 days).
+  These are *system ceilings*; the per-AppRole token TTLs (below) are much shorter and are
+  what actually governs agent renewal cadence.
+- `disable_mlock = false`, `ui = false`, JSON logs at INFO.
+
+Vault boots **sealed**. It must be unsealed before any operation — that is `vault-init`'s
+first job.
+
+---
+
+## 3. Bootstrap & reconcile (`vault-init.sh`)
+
+`vault-init` is a **run-once-then-exit** container (`restart: "no"`). The agents declare
+`depends_on: vault-init: condition: service_completed_successfully`, so they only start
+after init has finished cleanly. It runs `su vault -s /bin/sh /vault-init.sh` after creating
+and `chown`ing the shared agent directories.
+
+The script has two branches, selected by the presence of `/vault/data/.initialized`.
+
+### 3.1 First-time deployment
+
+1. Wait for `/v1/sys/health` to respond.
+2. **Initialize** with Shamir's Secret Sharing: `secret_shares=5`, `secret_threshold=3`.
+   The full response (5 unseal keys + root token) is written to
+   `/vault/data/unseal-keys.json`.
+3. **Unseal** by submitting 3 of the 5 keys.
+4. **Enable engines:** KV v2 at `secret/`, and the AppRole auth method.
+5. **Create three ACL policies** (see §5).
+6. **Create three AppRoles** issuing periodic tokens (see §4 — this is the heart of renewal),
+   via the `ensure_approles` helper. The same helper re-runs on subsequent deploys, so AppRole
+   config changes land without re-initializing Vault.
+7. **Issue credentials:** for each role, fetch the static `role_id` and mint a `secret_id`,
+   writing both to `/agent/credentials/<svc>_role_id` and `<svc>_secret_id` (`chmod 640`).
+8. **Generate an RSA-2048 keypair** with `openssl` and store it in Vault at
+   `secret/encryption/public_key` and `secret/encryption/private_key`
+   (algorithm `RSA-OAEP`, with `key_id` and `created_at` metadata).
+9. Seed a test LLM secret, then `touch /vault/data/.initialized`.
+
+### 3.2 Subsequent deployment (restart)
+
+1. Check `/v1/sys/seal-status`; if sealed, reload the 3 unseal keys from
+   `unseal-keys.json` and unseal.
+2. **Reconcile each secret_id** via `reconcile_secret_id`:
+   - `ensure_role_id` — make sure the `role_id` file exists (re-fetch from Vault if missing).
+   - `validate_secret_id` — attempt an AppRole login with the on-disk `role_id` + `secret_id`.
+     If it returns a `client_token`, the credential is still good.
+   - **Valid → reuse** the existing `secret_id` (no churn).
+   - **Invalid/missing → `mint_secret_id`** writes a fresh one.
+
+This is deliberate: because the AppRoles are created with `secret_id_ttl=0` and
+`secret_id_num_uses=0` (non-expiring, unlimited-use), a single long-lived `secret_id`
+survives normal restarts instead of being regenerated every boot. The RSA keypair, policies,
+and stored secrets are all preserved across restarts.
+
+> **Note on file permissions:** `vault-init.sh` writes credential files with `chmod 640`.
+> (The older architecture doc mentions `644`; the script is the source of truth — `640`.)
+
+---
+
+## 4. The three Vault Agents — auth, renewal & rotation
+
+This is the core of this guide. All three agents are the same Vault binary
+(`hashicorp/vault:1.20.3`) run as `vault agent -config=...`. They differ only in which
+credentials they read, which token sink they write, and their listener port.
+
+### 4.1 What an agent config actually does
+
+Example (`vault/agents/llm/agent.hcl`; gui/cron are identical in shape):
+
+```hcl
+vault { address = "http://vault:8200"; retry { num_retries = 5 } }
+
+auto_auth {
+  method "approle" {
+    mount_path = "auth/approle"
+    config = {
+      role_id_file_path   = "/agent/credentials/llm_role_id"
+      secret_id_file_path = "/agent/credentials/llm_secret_id"
+      remove_secret_id_file_after_reading = false
+    }
+  }
+  sink "file" { config = { path = "/agent/llm-token/token"; mode = 0640 } }
+}
+
+cache { default_lease_duration = "1h" }
+listener "tcp" { address = "0.0.0.0:8201"; tls_disable = true }
+api_proxy { use_auto_auth_token = true }
+```
+
+Three mechanisms are at work:
+
+1. **`auto_auth` (authentication + renewal):** On startup the agent reads `role_id` +
+   `secret_id` and calls `POST /v1/auth/approle/login`. Vault returns a **periodic token**
+   (the AppRoles set `token_period`, defined in `vault-init.sh`, *not* in the HCL). The agent
+   then runs Vault's **auto-auth lifecycle manager**, which **renews the token automatically
+   in the background** before each period elapses. A periodic token has **no max-TTL**, so the
+   agent renews it indefinitely and — during normal operation — **never has to call
+   `approle/login` again**. The agent only re-authenticates (and thus only needs the
+   `secret_id` again) if it is **restarted** or if a renewal is missed long enough for the
+   token to lapse. `remove_secret_id_file_after_reading = false` keeps the `secret_id` on disk
+   so the agent can re-auth after a restart without `vault-init` re-minting.
+
+   > **Why periodic tokens?** An earlier design issued tokens with `token_ttl`/`token_max_ttl`,
+   > which forced a full re-login every time `token_max_ttl` was reached. If the `secret_id`
+   > had become invalid by then (expiry, clock skew, server re-init), the agent got stuck in an
+   > `invalid role or secret ID` 400 backoff loop with no way to self-heal. Periodic tokens
+   > remove that re-login from the steady state, so a stale `secret_id` can no longer strand a
+   > running agent.
+2. **`sink "file"` (token hand-off):** Every time the agent obtains/renews a token it writes
+   it to a file (`/agent/<svc>-token/token`, mode `0640`). The compose **health check** for
+   each agent is simply `test -f <token> && test -s <token>` — a non-empty token file means
+   the agent has authenticated successfully.
+3. **`api_proxy { use_auto_auth_token = true }` (transparent injection):** The agent also
+   listens as an HTTP proxy on its port. When the application sends a token-less request, the
+   agent injects `X-Vault-Token: <current cached token>` and forwards it to `vault:8200`.
+   This is why application code never sets `VAULT_TOKEN`.
+
+> **`cache.default_lease_duration` is not the token TTL.** It is the agent's cache lease
+> hint. The authoritative token lifetime comes from the AppRole's `token_period` in
+> `vault-init.sh`. The per-agent cache hint is set to match the period.
+
+### 4.2 Per-agent renewal parameters
+
+AppRole token settings are created in `vault-init.sh`; all three use
+`token_period` (periodic token, **no max-TTL**), `secret_id_ttl=0`, `secret_id_num_uses=0`,
+`token_num_uses=0`, `bind_secret_id=true`.
+
+| Agent | AppRole | `token_period` | Proactive renewal (~⅔ of period) | Re-login (`approle/login`) |
+|---|---|---|---|---|
+| `vault-agent-gui` | `gui-service` | **20m** | ~every 13 min | only on agent restart |
+| `vault-agent-cron` | `cron-manager-service` | **30m** | ~every 20 min | only on agent restart |
+| `vault-agent-llm` | `llm-orchestration-service` | **1h** | ~every 40 min | only on agent restart |
+
+Reading the lifecycle for, e.g., the LLM agent:
+
+```
+T=0       login → periodic token (period 1h)        → written to /agent/llm-token/token
+T≈40m     renew-self → period resets to 1h          → token file refreshed
+...       renew repeats forever; token never hits a max-TTL
+(restart) agent re-runs approle/login with the on-disk secret_id → fresh token
+```
+
+The periods are tuned per service (shorter for the GUI, which only reads the public key;
+longer for the high-traffic LLM read path), but functionally all three behave the same:
+**renew forever, re-login only on restart.**
+
+### 4.3 Two distinct "rotation" concepts — keep them separate
+
+1. **Token rotation (automatic, continuous):** Handled entirely by the agent's `auto_auth`
+   loop as described above — the periodic token is renewed indefinitely with no human action
+   and no `vault-init` involvement.
+2. **`secret_id` rotation (rare):** The `secret_id` is the long-lived credential the agent
+   uses to *log in* (at startup/restart only, now that tokens are periodic). It is configured
+   non-expiring (`secret_id_ttl=0`, `secret_id_num_uses=0`) and is only replaced by
+   `vault-init` on a restart when the existing one fails validation (§3.2). To force rotation,
+   delete the `secret_id` file (or invalidate it in Vault) and re-run `vault-init`, then
+   restart the agent so it logs in with the freshly minted one.
+
+   > **Operational caveat (learned the hard way):** if a `secret_id` ever does become invalid
+   > while an agent is running, the periodic-token design means a *running* agent keeps working
+   > (it only renews, never re-logs-in). But a **restarted** agent needs a valid `secret_id` to
+   > log in. Recovery is always: re-run `vault-init` (mints a fresh `secret_id` via the §3.2
+   > reconcile) → restart the affected agent. See the troubleshooting section (§9)
+   > below.
+
+### 4.4 Restart behavior
+
+- **Restart an agent:** It re-reads `role_id`/`secret_id` from the (read-only) creds volume
+  and re-authenticates. New token, written to the sink. App sees a brief blip.
+- **Restart `vault`:** Data persists; `vault-init` handles the re-unseal, and agents keep
+  using their existing tokens, which remain valid as long as they have not expired.
+- **Full `down && up`:** Order is `vault → vault-init → agents → apps`. `vault-init` detects
+  the `.initialized` flag, skips first-time setup, reconciles secret_ids, and the agents
+  start with validated credentials.
+
+---
+
+## 5. Authorization — policies (who can touch what)
+
+Created in `vault-init.sh`. Paths are KV v2, so data lives under `secret/data/...` and
+listing/metadata under `secret/metadata/...`.
+
+| Path | `gui-policy` | `cron-manager-policy` | `llm-orchestration-policy` |
+|---|---|---|---|
+| `secret/data/encryption/public_key` | **read** | read | — |
+| `secret/data/encryption/private_key` | **deny** | **read** | — |
+| `secret/data/encryption/*` | — | — | **deny** |
+| `secret/data/llm/connections/*` | deny | **create/read/update/delete** | **read, list** |
+| `secret/data/embeddings/connections/*` | deny | **create/read/update/delete** | **read, list** |
+| `auth/token/lookup-self` | — | read | read |
+
+The intent, by tier:
+
+- **GUI** — can read *only* the public key, to encrypt user-entered credentials in the
+  browser before they ever leave it. Everything else is explicitly denied.
+- **CronManager** — the only writer. Reads the **private key** to decrypt what the GUI
+  encrypted, then writes plaintext credentials into Vault. Full CRUD on connection secrets.
+- **LLM Orchestration** — read-only consumer of connection secrets. **Explicitly denied** all
+  encryption keys, so a compromise of this hot-path service cannot exfiltrate the private key.
+
+---
+
+## 6. Secret layout (KV v2 under `secret/`)
+
+```
+secret/
+├── llm/connections/<platform>/<vaultUuid>          ← e.g. aws_bedrock, azure_openai
+├── embeddings/connections/<platform>/<vaultUuid>
+└── encryption/
+    ├── public_key     { key, algorithm: RSA-OAEP, key_size: 2048, key_id, created_at }
+    └── private_key    { key, algorithm: RSA-OAEP, key_size: 2048, key_id, created_at }
+```
+
+The current write/delete scripts key connection secrets by a stable **`vaultUuid`** as the
+final path segment (environment is tracked in the DB, not the path). KV v2 versions every
+write, so updating a credential keeps prior versions for audit/rollback.
+
+LLM secret shape (AWS): `{ connection_id, access_key, secret_key, model, tags }`.
+Azure: `{ connection_id, endpoint, api_key, deployment_name, model, api_version, tags }`.
+
+---
+
+## 7. Usage flows
+
+### 7.1 Storing / rotating a credential (`store_secrets_in_vault.sh`, via cron-manager)
+
+1. GUI encrypts the raw key with the RSA **public** key and submits it.
+2. The cron-manager job runs the script against `vault-agent-cron:8203` (no token — the agent
+   injects it).
+3. The script **fetches the private key** (`GET secret/data/encryption/private_key`), then
+   decrypts each sensitive field in-memory via `decrypt_vault_secrets.py` (RSA-OAEP).
+4. It builds the JSON payload with `jq` and `POST`s plaintext to
+   `secret/data/<llm|embeddings>/connections/<platform>/<vaultUuid>`. Re-posting the same path
+   = a KV v2 version bump = credential rotation.
+5. Sensitive shell variables are `unset` immediately after use.
+
+### 7.2 Deleting a credential (`delete_secrets_from_vault.sh`)
+
+`DELETE`s both `secret/data/...` and `secret/metadata/...` for the connection (404 treated as
+success), again through `vault-agent-cron` with no explicit token.
+
+### 7.3 Reading a credential (LLM orchestration)
+
+The LLM service issues a token-less `GET http://vault-agent-llm:8201/v1/secret/data/llm/...`.
+`vault-agent-llm` injects its cached token, Vault validates it against
+`llm-orchestration-policy`, and returns the secret. The service then calls AWS/Azure with it.
+
+---
+
+## 8. Operational notes & known trade-offs
+
+- **Unseal keys + root token sit in the `vault-data` volume** (`unseal-keys.json`). This makes
+  auto-unseal on restart trivial but is a **dev/test convenience**. For production, switch to
+  auto-unseal backed by a cloud KMS/HSM and remove the keys from the volume.
+- **Root token** is used only by `vault-init` and is never injected into app containers. Best
+  practice for production is to revoke it after bootstrap and use scoped admin policies.
+- **TLS is disabled** on the Vault listener and agent listeners; isolation relies on the
+  `internal: true` `vault-network`. Add TLS for any non-local deployment.
+- **Audit logging is available but not enabled.** Turn it on with
+  `vault audit enable file file_path=/vault/logs/audit.log` (the `./vault/logs` mount already
+  exists) for a full request trail.
+- **Credential files are readable by every agent that mounts the shared volume** (mode 640,
+  single owner, but all agents mount the same `vault-agent-creds` volume read-only) — isolation
+  is at the volume level, not per-file. Fine for this trust boundary; note it if the threat
+  model tightens.
+
+---
+
+## 9. Troubleshooting: agents looping on `invalid role or secret ID`
+
+**Symptom:** an agent logs `lifetime watcher done channel triggered, re-authenticating`
+followed by repeating `PUT .../auth/approle/login → Code: 400 ... invalid role or secret ID`
+with growing backoff. Token *renewals* had been succeeding up to that point.
+
+**Cause:** the agent's `secret_id` became invalid server-side (expiry, clock skew, or a Vault
+re-init), and the agent reached a point where it had to do a full `approle/login`. With the
+old `token_ttl`/`token_max_ttl` design this happened on every `token_max_ttl` cycle; the
+switch to **periodic tokens** (§4) removes re-login from steady state, so a *running* agent no
+longer hits this — but a **restarted** agent still needs a valid `secret_id`.
+
+**Recovery:**
+
+```bash
+# Mint fresh secret_ids (vault-init's reconcile detects the invalid ones and replaces them)
+docker compose up -d --force-recreate vault-init
+docker wait vault-init
+# Restart the affected agents so they log in with the fresh secret_id
+docker compose restart vault-agent-gui vault-agent-cron vault-agent-llm
+```
+
+**Confirm root cause (read-only):**
+
+```bash
+ROOT=$(docker exec vault sh -c "grep -o '\"root_token\":\"[^\"]*\"' /vault/file/unseal-keys.json | cut -d: -f2 | tr -d '\"'")
+docker exec -e VAULT_TOKEN=$ROOT -e VAULT_ADDR=http://127.0.0.1:8200 vault \
+  vault read auth/approle/role/gui-service          # expect token_period set, secret_id_ttl=0
+echo "host: $(date -u)"; docker exec vault date -u  # check for WSL2/Docker clock drift
+```
diff --git a/src/vector_indexer/contextual_processor.py b/src/vector_indexer/contextual_processor.py
index b225cf30..6b21d326 100644
--- a/src/vector_indexer/contextual_processor.py
+++ b/src/vector_indexer/contextual_processor.py
@@ -41,7 +41,7 @@ def __init__(
 
     async def process_document(
         self, document: ProcessingDocument
-    ) -> List[ContextualChunk]:
+    ) -> tuple[List[ContextualChunk], int]:
         """
         Process single document into contextual chunks.
 
@@ -49,7 +49,8 @@ async def process_document(
             document: Document to process
 
         Returns:
-            List of contextual chunks with embeddings
+            Tuple of (contextual chunks with embeddings, number of chunks
+            dropped due to context-generation failure)
         """
         logger.info(
             f"Processing document {document.document_hash} ({len(document.content)} characters)"
@@ -69,11 +70,13 @@ async def process_document(
             # Step 3: Create contextual chunks (filter out failed context generations)
             contextual_chunks: List[ContextualChunk] = []
             valid_contextual_contents: List[str] = []
+            failed_chunks = 0
 
             for i, (base_chunk, context) in enumerate(
                 zip(base_chunks, contexts, strict=True)
             ):
                 if isinstance(context, Exception):
+                    failed_chunks += 1
                     self.error_logger.log_context_generation_failure(
                         document.document_hash, i, str(context), self.config.max_retries
                     )
@@ -128,7 +131,7 @@ async def process_document(
                 logger.error(
                     f"No valid chunks created for document {document.document_hash}"
                 )
-                return []
+                return [], failed_chunks
 
             # Step 4: Create embeddings for all valid contextual chunks
             try:
@@ -154,9 +157,10 @@ async def process_document(
                 raise
 
             logger.info(
-                f"Successfully processed document {document.document_hash}: {len(contextual_chunks)} chunks"
+                f"Successfully processed document {document.document_hash}: "
+                f"{len(contextual_chunks)} chunks ({failed_chunks} dropped)"
             )
-            return contextual_chunks
+            return contextual_chunks, failed_chunks
 
         except Exception as e:
             logger.error(
diff --git a/src/vector_indexer/error_logger.py b/src/vector_indexer/error_logger.py
index 1d11cba1..c62de79c 100644
--- a/src/vector_indexer/error_logger.py
+++ b/src/vector_indexer/error_logger.py
@@ -158,15 +158,17 @@ def log_processing_stats(self, stats: ProcessingStats) -> None:
                 stats_dict["end_time"] = stats.end_time.isoformat()
             stats_dict["duration"] = stats.duration
             stats_dict["success_rate"] = stats.success_rate
+            stats_dict["chunk_success_rate"] = stats.chunk_success_rate
 
             with open(self.config.stats_log_file, "w", encoding="utf-8") as f:
                 json.dump(stats_dict, f, indent=2)
 
             logger.info(
                 f"Processing completed - Success rate: {stats.success_rate:.1%}, "
+                f"Chunk success rate: {stats.chunk_success_rate:.1%}, "
                 f"Duration: {stats.duration}, "
                 f"Processed: {stats.documents_processed}/{stats.total_documents} documents, "
-                f"Chunks: {stats.total_chunks_processed}"
+                f"Chunks: {stats.total_chunks_processed} ok / {stats.total_chunks_failed} failed"
             )
         except Exception as e:
             logger.error(f"Failed to write stats log: {e}")
diff --git a/src/vector_indexer/main_indexer.py b/src/vector_indexer/main_indexer.py
index 45ce5ff6..bf407682 100644
--- a/src/vector_indexer/main_indexer.py
+++ b/src/vector_indexer/main_indexer.py
@@ -15,7 +15,7 @@
 sys.path.append(str(Path(__file__).parent.parent))
 
 from vector_indexer.config.config_loader import ConfigLoader
-from vector_indexer.document_loader import DocumentLoader
+from vector_indexer.document_loader import DocumentLoader, DocumentLoadError
 from vector_indexer.contextual_processor import ContextualProcessor
 from vector_indexer.qdrant_manager import QdrantManager
 from vector_indexer.error_logger import ErrorLogger
@@ -169,7 +169,7 @@ async def process_all_documents(self) -> ProcessingStats:
 
                 # Process documents with controlled concurrency
                 semaphore = asyncio.Semaphore(self.config.max_concurrent_documents)
-                tasks: List[asyncio.Task[tuple[int, str]]] = []
+                tasks: List[asyncio.Task[tuple[int, str, int]]] = []
 
                 for doc_info in documents:
                     task = asyncio.create_task(
@@ -189,6 +189,9 @@ async def process_all_documents(self) -> ProcessingStats:
                 chunks_info: Dict[
                     str, Dict[str, Any]
                 ] = {}  # Track chunk counts for metadata update
+                # Only documents that processed successfully are marked as
+                # processed in DVC tracking, so failures are retried next run.
+                processed_documents: List[DocumentInfo] = []
                 for i, result in enumerate(results):
                     if isinstance(result, Exception):
                         doc_info = documents[i]
@@ -200,16 +203,18 @@ async def process_all_documents(self) -> ProcessingStats:
                             doc_info.document_hash, str(result)
                         )
                     else:
-                        # Result should be tuple of (chunk_count, content_hash)
+                        # Result should be tuple of (chunk_count, content_hash, failed_chunks)
                         doc_info = documents[i]
                         self.stats.documents_processed += 1
-                        if isinstance(result, tuple) and len(result) == 2:
-                            chunk_count, content_hash = result
+                        processed_documents.append(doc_info)
+                        if isinstance(result, tuple) and len(result) == 3:
+                            chunk_count, content_hash, failed_chunks = result
                             self.stats.total_chunks_processed += chunk_count
+                            self.stats.total_chunks_failed += failed_chunks
                             # Track chunk count using content_hash (not directory hash)
                             chunks_info[content_hash] = {"chunk_count": chunk_count}
                             logger.info(
-                                f"CHUNK COUNT: Document {doc_info.document_hash[:12]}... (content: {content_hash[:12]}...) -> {chunk_count} chunks"
+                                f"CHUNK COUNT: Document {doc_info.document_hash[:12]}... (content: {content_hash[:12]}...) -> {chunk_count} chunks ({failed_chunks} failed)"
                             )
 
                 # Log the complete chunks_info dictionary
@@ -227,10 +232,10 @@ async def process_all_documents(self) -> ProcessingStats:
                 # Step 4: Update processed files tracking (even if no new documents processed)
                 if diff_detector:
                     try:
-                        # Update metadata for newly processed files
-                        if documents:
+                        # Update metadata for newly processed files (successful only)
+                        if processed_documents:
                             processed_paths = [
-                                doc.cleaned_txt_path for doc in documents
+                                doc.cleaned_txt_path for doc in processed_documents
                             ]
                             if processed_paths:
                                 logger.debug(
@@ -290,7 +295,7 @@ async def _process_single_document(
         doc_info: DocumentInfo,
         qdrant_manager: QdrantManager,
         semaphore: asyncio.Semaphore,
-    ) -> tuple[int, str]:
+    ) -> tuple[int, str, int]:
         """
         Process a single document with contextual retrieval.
 
@@ -300,7 +305,9 @@ async def _process_single_document(
             semaphore: Concurrency control semaphore
 
         Returns:
-            tuple: (chunk_count: int, content_hash: str) or Exception on error
+            tuple: (chunk_count: int, content_hash: str, failed_chunks: int).
+            Raises on any failure (including load failure or zero usable chunks),
+            so the document is counted as failed rather than as success.
         """
         async with semaphore:
             logger.info(f"Processing document: {doc_info.document_hash}")
@@ -310,29 +317,31 @@ async def _process_single_document(
                 document = self.document_loader.load_document(doc_info)
 
                 if not document:
-                    logger.warning(f"Could not load document: {doc_info.document_hash}")
-                    return (0, doc_info.document_hash)
+                    raise DocumentLoadError(
+                        f"Could not load document: {doc_info.document_hash}"
+                    )
 
                 # Process document with contextual retrieval
-                contextual_chunks = await self.contextual_processor.process_document(
-                    document
-                )
+                (
+                    contextual_chunks,
+                    failed_chunks,
+                ) = await self.contextual_processor.process_document(document)
 
                 if not contextual_chunks:
-                    logger.warning(
-                        f"No chunks created for document: {doc_info.document_hash}"
+                    raise RuntimeError(
+                        f"No chunks created for document: {doc_info.document_hash} "
+                        f"({failed_chunks} chunks failed context generation)"
                     )
-                    return (0, document.document_hash)
 
                 # Store chunks in Qdrant
                 await qdrant_manager.store_chunks(contextual_chunks)
 
                 logger.info(
                     f"Successfully processed document {doc_info.document_hash}: "
-                    f"{len(contextual_chunks)} chunks"
+                    f"{len(contextual_chunks)} chunks ({failed_chunks} dropped)"
                 )
 
-                return (len(contextual_chunks), document.document_hash)
+                return (len(contextual_chunks), document.document_hash, failed_chunks)
 
             except Exception as e:
                 logger.error(f"Error processing document {doc_info.document_hash}: {e}")
@@ -352,10 +361,12 @@ def _log_final_summary(self) -> None:
         logger.info(f"   • Failed Chunks: {self.stats.total_chunks_failed}")
 
         if self.stats.total_documents > 0:
-            success_rate = (
-                self.stats.documents_processed / self.stats.total_documents
-            ) * 100
-            logger.info(f"Success Rate: {success_rate:.1f}%")
+            logger.info(f"Success Rate: {self.stats.success_rate * 100:.1f}%")
+
+        if self.stats.total_chunks_processed + self.stats.total_chunks_failed > 0:
+            logger.info(
+                f"Chunk Success Rate: {self.stats.chunk_success_rate * 100:.1f}%"
+            )
 
         logger.info(f"Processing Duration: {self.stats.duration}")
 
@@ -365,6 +376,11 @@ def _log_final_summary(self) -> None:
             )
             logger.info("Check failure logs for details")
 
+        if self.stats.total_chunks_failed > 0:
+            logger.warning(
+                f"  {self.stats.total_chunks_failed} chunks failed processing"
+            )
+
     async def run_health_check(self) -> bool:
         """
         Run health check on all components.
@@ -617,12 +633,20 @@ async def _execute_cleanup_operations(
         return total_deleted
 
     def _cleanup_datasets(self) -> None:
-        """Remove datasets folder after processing."""
+        """Remove datasets folder contents after processing.
+
+        Only the folder's contents are removed, not the folder itself, since
+        the datasets path is a mounted volume in the container.
+        """
         try:
             datasets_path = Path(self.config.dataset_base_path)
             if datasets_path.exists():
-                shutil.rmtree(str(datasets_path))
-                logger.info(f"Datasets folder cleaned up: {datasets_path}")
+                for child in datasets_path.iterdir():
+                    if child.is_dir():
+                        shutil.rmtree(str(child))
+                    else:
+                        child.unlink()
+                logger.info(f"Datasets folder contents cleaned up: {datasets_path}")
             else:
                 logger.debug(f"Datasets folder does not exist: {datasets_path}")
         except Exception as e:
diff --git a/src/vector_indexer/models.py b/src/vector_indexer/models.py
index 752ea02a..41ae1ce1 100644
--- a/src/vector_indexer/models.py
+++ b/src/vector_indexer/models.py
@@ -96,6 +96,14 @@ def success_rate(self) -> float:
             return self.documents_processed / self.total_documents
         return 0.0
 
+    @property
+    def chunk_success_rate(self) -> float:
+        """Share of attempted chunks that succeeded; 0.0 when none were attempted."""
+        succeeded = self.total_chunks_processed
+        attempted = succeeded + self.total_chunks_failed
+        # Guard the division: a run that attempted no chunks reports 0.0.
+        return succeeded / attempted if attempted > 0 else 0.0
+
 
 class ProcessingError(BaseModel):
     """Error information for failed processing."""
diff --git a/vault-init.sh b/vault-init.sh
index eada7518..0e759f8e 100644
--- a/vault-init.sh
+++ b/vault-init.sh
@@ -7,6 +7,88 @@ INIT_FLAG="/vault/data/.initialized"
 
 echo "=== Vault Initialization Script ==="
 
+# ---------------------------------------------------------------------------
+# Helpers (used by the SUBSEQUENT DEPLOYMENT branch)
+# ---------------------------------------------------------------------------
+
+# Ensure a non-empty role_id file exists on disk; fetch it from Vault if not.
+# Returns 1 (writing nothing) when Vault yields no role_id, so a failed fetch
+# cannot leave a blank file that later runs would treat as already present.
+# Usage: ensure_role_id <role-name> <role_id_file>
+ensure_role_id() {
+    role="$1"; rid_file="$2"
+    # Check content, not size: a newline-only file passes `-s` but holds no id.
+    if [ -n "$(cat "$rid_file" 2>/dev/null)" ]; then return 0; fi
+    echo "Fetching role_id for $role..."
+    rid=$(wget -q -O- --header="X-Vault-Token: $ROOT_TOKEN" \
+        "$VAULT_ADDR/v1/auth/approle/role/$role/role-id" | \
+        grep -o '"role_id":"[^"]*"' | cut -d':' -f2 | tr -d '"')
+    [ -n "$rid" ] || { echo "ERROR: no role_id returned for $role" >&2; return 1; }
+    echo "$rid" > "$rid_file" && chmod 640 "$rid_file"
+}
+
+# Check whether the role_id/secret_id pair stored on disk can still log in.
+# Exit status 0 = the login response carried a client_token, 1 = anything else.
+# Usage: validate_secret_id <role_id_file> <secret_id_file>
+validate_secret_id() {
+    rid_file="$1"; sid_file="$2"
+    if [ ! -f "$rid_file" ] || [ ! -f "$sid_file" ]; then return 1; fi
+    rid=$(cat "$rid_file"); sid=$(cat "$sid_file")
+    if [ -z "$rid" ] || [ -z "$sid" ]; then return 1; fi
+    # wget returns non-zero on HTTP 400 (invalid creds); also confirm a token came back.
+    if ! resp=$(wget -q -O- \
+        --post-data="{\"role_id\":\"$rid\",\"secret_id\":\"$sid\"}" \
+        --header='Content-Type: application/json' \
+        "$VAULT_ADDR/v1/auth/approle/login" 2>/dev/null); then return 1; fi
+    if echo "$resp" | grep -q '"client_token"'; then return 0; else return 1; fi
+}
+
+# Mint a fresh secret_id for a role and write it to disk. Returns 1 and leaves
+# the existing file untouched when Vault does not return a secret_id.
+# Usage: mint_secret_id <role-name> <secret_id_file>
+mint_secret_id() {
+    role="$1"; sid_file="$2"
+    sid=$(wget -q -O- --post-data='' --header="X-Vault-Token: $ROOT_TOKEN" \
+        "$VAULT_ADDR/v1/auth/approle/role/$role/secret-id" | \
+        grep -o '"secret_id":"[^"]*"' | cut -d':' -f2 | tr -d '"')
+    [ -n "$sid" ] || { echo "ERROR: no secret_id returned for $role" >&2; return 1; }
+    echo "$sid" > "$sid_file" && chmod 640 "$sid_file"
+}
+
+# Keep the on-disk secret_id when it still authenticates; mint a new one if not.
+# Usage: reconcile_secret_id <role-name> <role_id_file> <secret_id_file>
+reconcile_secret_id() {
+    role="$1"; rid_file="$2"; sid_file="$3"
+    ensure_role_id "$role" "$rid_file"
+    if ! validate_secret_id "$rid_file" "$sid_file"; then
+        echo "$role: secret_id invalid or missing - minting a new one"
+        mint_secret_id "$role" "$sid_file"
+    else
+        echo "$role: existing secret_id still valid - reusing"
+    fi
+}
+
+# Create or update an AppRole issuing a PERIODIC token (no max_ttl) so the agent
+# keeps renewing instead of re-running approle/login. secret_id_ttl=0 and
+# secret_id_num_uses=0 keep secret_ids valid across restarts. Safe to re-run;
+# returns 1 with a message on stderr if the write fails.
+# NOTE(review): token_no_default_policy is not sent - confirm that is intended.
+# Usage: upsert_approle <role-name> <policy-name> <token-period>
+upsert_approle() {
+    role="$1"; policy="$2"; period="$3"
+    wget -q -O- --post-data='{"token_policies":["'"$policy"'"],"token_period":"'"$period"'","token_num_uses":0,"secret_id_ttl":"0","secret_id_num_uses":0,"bind_secret_id":true}' \
+        --header="X-Vault-Token: $ROOT_TOKEN" --header='Content-Type: application/json' \
+        "$VAULT_ADDR/v1/auth/approle/role/$role" >/dev/null || { echo "ERROR: failed to create/update AppRole $role" >&2; return 1; }
+}
+
+# Upsert the AppRoles for all three services; arguments are <role> <policy> <token-period>.
+ensure_approles() {
+    echo "Ensuring AppRole configs (periodic tokens)..."
+    upsert_approle "gui-service"               "gui-policy"                 "20m"
+    upsert_approle "cron-manager-service"      "cron-manager-policy"        "30m"
+    upsert_approle "llm-orchestration-service" "llm-orchestration-policy"   "1h"
+}
+
 # Wait for Vault to be ready
 echo "Waiting for Vault..."
 for i in $(seq 1 30); do
@@ -114,27 +196,9 @@ path "auth/token/lookup-self" { capabilities = ["read"] }'
         --header='Content-Type: application/json' \
         "$VAULT_ADDR/v1/sys/policies/acl/llm-orchestration-policy" >/dev/null
     
-    # Create GUI AppRole
-    echo "Creating gui-service AppRole..."
-    wget -q -O- --post-data='{"token_policies":["gui-policy"],"token_no_default_policy":true,"token_ttl":"15m","token_max_ttl":"1h","secret_id_ttl":"24h","secret_id_num_uses":0,"bind_secret_id":true}' \
-        --header="X-Vault-Token: $ROOT_TOKEN" \
-        --header='Content-Type: application/json' \
-        "$VAULT_ADDR/v1/auth/approle/role/gui-service" >/dev/null
-    
-    # Create CronManager AppRole
-    echo "Creating cron-manager-service AppRole..."
-    wget -q -O- --post-data='{"token_policies":["cron-manager-policy"],"token_no_default_policy":true,"token_ttl":"30m","token_max_ttl":"8h","secret_id_ttl":"24h","secret_id_num_uses":0,"bind_secret_id":true}' \
-        --header="X-Vault-Token: $ROOT_TOKEN" \
-        --header='Content-Type: application/json' \
-        "$VAULT_ADDR/v1/auth/approle/role/cron-manager-service" >/dev/null
-    
-    # Create LLM Orchestration AppRole
-    echo "Creating llm-orchestration-service AppRole..."
-    wget -q -O- --post-data='{"token_policies":["llm-orchestration-policy"],"token_no_default_policy":true,"token_ttl":"1h","token_max_ttl":"8h","secret_id_ttl":"24h","secret_id_num_uses":0,"bind_secret_id":true}' \
-        --header="X-Vault-Token: $ROOT_TOKEN" \
-        --header='Content-Type: application/json' \
-        "$VAULT_ADDR/v1/auth/approle/role/llm-orchestration-service" >/dev/null
-    
+    # Create the three AppRoles (periodic tokens - see upsert_approle).
+    ensure_approles
+
     # Ensure credentials directory exists
     mkdir -p /agent/credentials
     
@@ -276,65 +340,22 @@ else
     # Get root token
     ROOT_TOKEN=$(grep -o '"root_token":"[^"]*"' "$UNSEAL_KEYS_FILE" | cut -d':' -f2 | tr -d '"')
     export VAULT_TOKEN="$ROOT_TOKEN"
-    
+
+    # Re-apply AppRole definitions so config changes (e.g. periodic tokens)
+    # take effect on redeploy without re-initializing Vault. Idempotent and
+    # does not invalidate existing secret_ids.
+    ensure_approles
+
     # Ensure credentials directory exists
     mkdir -p /agent/credentials
     
-    # Always regenerate all secret_ids on restart
-    echo "Regenerating GUI secret_id..."
-    GUI_SECRET_ID=$(wget -q -O- --post-data='' \
-        --header="X-Vault-Token: $ROOT_TOKEN" \
-        "$VAULT_ADDR/v1/auth/approle/role/gui-service/secret-id" | \
-        grep -o '"secret_id":"[^"]*"' | cut -d':' -f2 | tr -d '"')
-    echo "$GUI_SECRET_ID" > /agent/credentials/gui_secret_id
-    
-    echo "Regenerating CronManager secret_id..."
-    CRON_SECRET_ID=$(wget -q -O- --post-data='' \
-        --header="X-Vault-Token: $ROOT_TOKEN" \
-        "$VAULT_ADDR/v1/auth/approle/role/cron-manager-service/secret-id" | \
-        grep -o '"secret_id":"[^"]*"' | cut -d':' -f2 | tr -d '"')
-    echo "$CRON_SECRET_ID" > /agent/credentials/cron_secret_id
-    
-    echo "Regenerating LLM secret_id..."
-    LLM_SECRET_ID=$(wget -q -O- --post-data='' \
-        --header="X-Vault-Token: $ROOT_TOKEN" \
-        "$VAULT_ADDR/v1/auth/approle/role/llm-orchestration-service/secret-id" | \
-        grep -o '"secret_id":"[^"]*"' | cut -d':' -f2 | tr -d '"')
-    echo "$LLM_SECRET_ID" > /agent/credentials/llm_secret_id
-    
-    # Set permissions
-    chmod 640 /agent/credentials/*_secret_id
-    
-    # Ensure role_ids exist
-    if [ ! -f /agent/credentials/gui_role_id ]; then
-        echo "Copying GUI role_id..."
-        GUI_ROLE_ID=$(wget -q -O- \
-            --header="X-Vault-Token: $ROOT_TOKEN" \
-            "$VAULT_ADDR/v1/auth/approle/role/gui-service/role-id" | \
-            grep -o '"role_id":"[^"]*"' | cut -d':' -f2 | tr -d '"')
-        echo "$GUI_ROLE_ID" > /agent/credentials/gui_role_id
-        chmod 640 /agent/credentials/gui_role_id
-    fi
-    
-    if [ ! -f /agent/credentials/cron_role_id ]; then
-        echo "Copying CronManager role_id..."
-        CRON_ROLE_ID=$(wget -q -O- \
-            --header="X-Vault-Token: $ROOT_TOKEN" \
-            "$VAULT_ADDR/v1/auth/approle/role/cron-manager-service/role-id" | \
-            grep -o '"role_id":"[^"]*"' | cut -d':' -f2 | tr -d '"')
-        echo "$CRON_ROLE_ID" > /agent/credentials/cron_role_id
-        chmod 640 /agent/credentials/cron_role_id
-    fi
-    
-    if [ ! -f /agent/credentials/llm_role_id ]; then
-        echo "Copying LLM role_id..."
-        LLM_ROLE_ID=$(wget -q -O- \
-            --header="X-Vault-Token: $ROOT_TOKEN" \
-            "$VAULT_ADDR/v1/auth/approle/role/llm-orchestration-service/role-id" | \
-            grep -o '"role_id":"[^"]*"' | cut -d':' -f2 | tr -d '"')
-        echo "$LLM_ROLE_ID" > /agent/credentials/llm_role_id
-        chmod 640 /agent/credentials/llm_role_id
-    fi
+    # Reconcile secret_ids: reuse the existing one if it still authenticates,
+    # mint a new one only if invalid or missing - keeps one stable secret_id
+    # across restarts instead of rotating every boot. reconcile_secret_id also
+    # ensures the role_id file exists first (validation needs both).
+    reconcile_secret_id "gui-service"                 /agent/credentials/gui_role_id  /agent/credentials/gui_secret_id
+    reconcile_secret_id "cron-manager-service"        /agent/credentials/cron_role_id /agent/credentials/cron_secret_id
+    reconcile_secret_id "llm-orchestration-service"   /agent/credentials/llm_role_id  /agent/credentials/llm_secret_id
 fi
 
 echo "=== Vault init complete ==="
\ No newline at end of file
diff --git a/vault/agents/cron/cron-agent.hcl b/vault/agents/cron/cron-agent.hcl
index f2db227e..9454c9b7 100644
--- a/vault/agents/cron/cron-agent.hcl
+++ b/vault/agents/cron/cron-agent.hcl
@@ -2,7 +2,9 @@
 # This agent provides CronManager with access to encryption keys and write access to secrets
 
 vault {
-  address = "http://vault:8200"
+  # Local testing: use rag-vault, not bare "vault" — that name also resolves to the
+  # ckb stack's Vault on the shared bykstack network, so the agent would log in to the wrong one.
+  address = "http://rag-vault:8200"
   retry {
     num_retries = 5
   }
@@ -42,6 +44,4 @@ listener "tcp" {
 # API proxy configuration
 api_proxy {
   use_auto_auth_token = true
-  enforce_consistency = "always"
-  when_inconsistent = "forward"
 }
diff --git a/vault/agents/gui/gui-agent.hcl b/vault/agents/gui/gui-agent.hcl
index a28db871..672d6d4d 100644
--- a/vault/agents/gui/gui-agent.hcl
+++ b/vault/agents/gui/gui-agent.hcl
@@ -2,7 +2,9 @@
 # This agent provides GUI with access to public encryption key only
 
 vault {
-  address = "http://vault:8200"
+  # Local testing: use rag-vault, not bare "vault" — that name also resolves to the
+  # ckb stack's Vault on the shared bykstack network, so the agent would log in to the wrong one.
+  address = "http://rag-vault:8200"
   retry {
     num_retries = 5
   }
@@ -42,6 +44,4 @@ listener "tcp" {
 # API proxy configuration
 api_proxy {
   use_auto_auth_token = true
-  enforce_consistency = "always"
-  when_inconsistent = "forward"
 }
diff --git a/vault/agents/llm/agent.hcl b/vault/agents/llm/agent.hcl
index d7237be7..1a575260 100644
--- a/vault/agents/llm/agent.hcl
+++ b/vault/agents/llm/agent.hcl
@@ -1,5 +1,7 @@
 vault {
-  address = "http://vault:8200"
+  # Local testing: use rag-vault, not bare "vault" — that name also resolves to the
+  # ckb stack's Vault on the shared bykstack network, so the agent would log in to the wrong one.
+  address = "http://rag-vault:8200"
   retry {
     num_retries = 5
   }
@@ -34,6 +36,4 @@ listener "tcp" {
 
 api_proxy {
   use_auto_auth_token = true
-  enforce_consistency = "always"
-  when_inconsistent = "forward"
 }
diff --git a/vault/config/vault.hcl b/vault/config/vault.hcl
index eaef415a..64ab325e 100644
--- a/vault/config/vault.hcl
+++ b/vault/config/vault.hcl
@@ -1,22 +1,27 @@
 # HashiCorp Vault Server Configuration
-# Production-ready configuration for LLM Orchestration Service
+# Single-node Raft for the RAG-Module services
 
-# Storage backend - Raft for high availability
+# Storage backend - Raft
 storage "raft" {
   path    = "/vault/file"
   node_id = "vault-node-1"
-  
-  # Retry join configuration for clustering (single node for now)
-  retry_join {
-    leader_api_addr = "http://vault:8200"
-  }
+
+  # NOTE: No retry_join for a single node. A lone node self-bootstraps.
+  # A retry_join pointing at itself causes repeated
+  # "failed to get raft challenge ... Vault is sealed" errors and a
+  # messy double Raft init on every boot. Add retry_join back only when
+  # you actually have peer nodes to join.
 }
 
-# HTTP listener configuration
+# HTTP API listener.
+# Vault automatically uses the next port up (8201) as its internal
+# cluster port, so do NOT define a separate listener on 8201 — that
+# collides with the cluster listener ("bind: address already in use")
+# and degrades the login/request-forwarding path the agents rely on.
 listener "tcp" {
-  address       = "0.0.0.0:8200"
-  tls_disable   = true
-  
+  address     = "0.0.0.0:8200"
+  tls_disable = true
+
   # Enable CORS for web UI access
   cors_enabled = true
   cors_allowed_origins = [
@@ -25,14 +30,9 @@ listener "tcp" {
   ]
 }
 
-# Cluster listener for HA (required even for single node)
-listener "tcp" {
-  address       = "0.0.0.0:8201"
-  cluster_addr  = "http://0.0.0.0:8201"
-  tls_disable   = true
-}
-
-# API and cluster addresses
+# API and cluster addresses.
+# cluster_addr tells Vault where its internal cluster port (8201) is
+# reachable; Vault binds that port itself — no listener block needed.
 api_addr     = "http://vault:8200"
 cluster_addr = "http://vault:8201"
 
@@ -46,9 +46,5 @@ default_lease_ttl = "168h"  # 7 days
 max_lease_ttl     = "720h"  # 30 days
 
 # Logging configuration
-log_level = "INFO"
+log_level  = "INFO"
 log_format = "json"
-
-# Development settings (remove in production)
-# Note: In production, you should not use dev mode
-# and should properly initialize and unseal the vault
\ No newline at end of file