From b743c8aa4b72a828afdb37515bc4ae1f83ffd6d2 Mon Sep 17 00:00:00 2001
From: Casey Brooks
Date: Tue, 19 May 2026 11:50:02 +0000
Subject: [PATCH] fix(k8s): default bootstrap to single node

Default the k3d cluster to one server and zero agents. The counts can be
overridden with the K3D_SERVERS / K3D_AGENTS environment variables of
apply.sh or the `servers` / `agents` inputs of the provision action.
The action now also logs Docker, disk and Kubernetes metrics around
provisioning and fails when the node count differs from the request.

Node counts are always evaluated in base 10 so that zero-padded input
such as "08" is not misread as octal, and the Terraform variables reject
fractional values.
---
NOTE(review): line breaks and indentation of this patch were rebuilt from
the hunk headers. If a hunk is rejected because of whitespace in a context
line, retry with `git apply --ignore-whitespace`.

 .github/actions/provision/action.yml      | 68 +++++++++++++++++++++++
 .github/scripts/verify_platform_health.sh |  3 +
 apply.sh                                  | 48 +++++++++++++++-
 stacks/k8s/.terraform.lock.hcl            |  2 +
 stacks/k8s/variables.tf                   | 12 +++-
 5 files changed, 130 insertions(+), 3 deletions(-)

diff --git a/.github/actions/provision/action.yml b/.github/actions/provision/action.yml
index 0165a3d9..8f9c0133 100644
--- a/.github/actions/provision/action.yml
+++ b/.github/actions/provision/action.yml
@@ -5,6 +5,14 @@ inputs:
     description: Bootstrap ref to provision from.
     required: false
     default: main
+  servers:
+    description: Number of k3d server nodes to provision.
+    required: false
+    default: "1"
+  agents:
+    description: Number of k3d agent nodes to provision.
+    required: false
+    default: "0"
 outputs:
   kubeconfig:
     description: Absolute path to the generated kubeconfig file.
@@ -16,11 +24,15 @@ runs:
       shell: bash
       run: |
         set -euo pipefail
+        echo "Docker usage before cleanup:"
+        docker system df
         echo "Disk usage before cleanup:"
         df -h
         sudo rm -rf /usr/local/lib/android /usr/local/share/boost /opt/ghc /usr/share/dotnet /usr/share/swift
         sudo apt-get clean
         docker system prune -af
+        echo "Docker usage after cleanup:"
+        docker system df
         echo "Disk usage after cleanup:"
         df -h
 
@@ -48,11 +60,43 @@ runs:
       with:
         terraform_version: 1.6.6
 
+    - name: Log pre-provision metrics
+      shell: bash
+      run: |
+        set -euo pipefail
+        echo "Docker usage before provisioning:"
+        docker system df
+        echo "Disk usage before provisioning:"
+        df -h
+
     - name: Apply bootstrap stacks
       shell: bash
       working-directory: bootstrap
+      env:
+        K3D_SERVERS: ${{ inputs.servers }}
+        K3D_AGENTS: ${{ inputs.agents }}
       run: ./apply.sh -y
 
+    - name: Log post-provision metrics
+      if: always()
+      shell: bash
+      working-directory: bootstrap
+      run: |
+        set -euo pipefail
+        kubeconfig_path="$(pwd)/stacks/k8s/.kube/agyn-local-kubeconfig.yaml"
+        echo "Docker usage after provisioning:"
+        docker system df
+        echo "Disk usage after provisioning:"
+        df -h
+        if [[ -f "${kubeconfig_path}" ]]; then
+          echo "Kubernetes nodes after provisioning:"
+          kubectl --kubeconfig "${kubeconfig_path}" get nodes -o wide || true
+          echo "Kubernetes pods after provisioning:"
+          kubectl --kubeconfig "${kubeconfig_path}" get pods -A -o wide || true
+        else
+          echo "Kubeconfig not found at ${kubeconfig_path}; skipping Kubernetes metrics."
+        fi
+
     - name: Export kubeconfig
       id: kubeconfig
       shell: bash
@@ -67,6 +111,30 @@ runs:
         echo "kubeconfig=${kubeconfig_path}" >> "$GITHUB_OUTPUT"
         echo "KUBECONFIG=${kubeconfig_path}" >> "$GITHUB_ENV"
 
+    - name: Verify cluster node count
+      shell: bash
+      working-directory: bootstrap
+      env:
+        EXPECTED_K3D_SERVERS: ${{ inputs.servers }}
+        EXPECTED_K3D_AGENTS: ${{ inputs.agents }}
+      run: |
+        set -euo pipefail
+        if ! [[ "${EXPECTED_K3D_SERVERS}" =~ ^[0-9]+$ && "${EXPECTED_K3D_AGENTS}" =~ ^[0-9]+$ ]]; then
+          echo "Expected k3d server and agent counts must be integers." >&2
+          exit 1
+        fi
+        expected_nodes=$(( 10#${EXPECTED_K3D_SERVERS} + 10#${EXPECTED_K3D_AGENTS} ))
+        actual_nodes=$(kubectl --kubeconfig "$KUBECONFIG" get nodes --no-headers | wc -l | tr -d '[:space:]')
+        echo "Kubernetes nodes: expected=${expected_nodes} actual=${actual_nodes}"
+        echo "Kubernetes node details:"
+        kubectl --kubeconfig "$KUBECONFIG" get nodes -o wide
+        echo "Kubernetes node taints:"
+        kubectl --kubeconfig "$KUBECONFIG" get nodes -o custom-columns='NAME:.metadata.name,TAINTS:.spec.taints'
+        if [[ "${actual_nodes}" != "${expected_nodes}" ]]; then
+          echo "Expected ${expected_nodes} Kubernetes node(s), found ${actual_nodes}." >&2
+          exit 1
+        fi
+
     - name: Verify platform health
       shell: bash
       working-directory: bootstrap
diff --git a/.github/scripts/verify_platform_health.sh b/.github/scripts/verify_platform_health.sh
index 96ec3776..23028edf 100755
--- a/.github/scripts/verify_platform_health.sh
+++ b/.github/scripts/verify_platform_health.sh
@@ -36,6 +36,9 @@ log() {
 
 dump_diagnostics() {
   log "Collecting diagnostics before exit"
+  kubectl --kubeconfig "$KUBECONFIG_PATH" get nodes -o wide || true
+  kubectl --kubeconfig "$KUBECONFIG_PATH" get nodes -o custom-columns='NAME:.metadata.name,TAINTS:.spec.taints' || true
+  kubectl --kubeconfig "$KUBECONFIG_PATH" get pods -A -o wide || true
   kubectl --kubeconfig "$KUBECONFIG_PATH" -n "$PLATFORM_NAMESPACE" get pods -o wide || true
   kubectl --kubeconfig "$KUBECONFIG_PATH" -n "$PLATFORM_NAMESPACE" describe pods || true
   kubectl --kubeconfig "$KUBECONFIG_PATH" -n "$ARGO_NAMESPACE" get applications.argoproj.io -o yaml | grep -E "(name:|status:)" | sed -n '1,200p' || true
diff --git a/apply.sh b/apply.sh
index 490ade3c..10597d2d 100755
--- a/apply.sh
+++ b/apply.sh
@@ -4,6 +4,8 @@ set -euo pipefail
 
 DEFAULT_DOMAIN="agyn.dev"
 DEFAULT_PORT="2496"
+DEFAULT_K3D_SERVERS="1"
+DEFAULT_K3D_AGENTS="0"
 DEFAULT_OIDC_ISSUER_URL="https://mockauth.dev/r/301ebb13-15a8-48f4-baac-e3fa25be29fc/oidc"
 DEFAULT_OIDC_CLIENT_ID="client_MU95KU3gHQf5Ir7p"
 DEFAULT_OIDC_CLIENT_SECRET="XPKka2i9uzISrKZ95zxli8sY51BK4eTJ"
@@ -22,6 +24,8 @@ Options:
 Environment variables:
   DOMAIN           Override the ingress domain (default: agyn.dev)
   PORT             Override the ingress port (default: 2496)
+  K3D_SERVERS      Override k3d server node count (default: 1)
+  K3D_AGENTS       Override k3d agent node count (default: 0)
   OIDC_ISSUER_URL  Override the OIDC issuer URL (default: https://mockauth.dev/r/301ebb13-15a8-48f4-baac-e3fa25be29fc/oidc)
   OIDC_CLIENT_ID   Override the OIDC client ID (default: client_MU95KU3gHQf5Ir7p)
   TRACING_APP_OIDC_CLIENT_ID  Override the tracing-app OIDC client ID (default: client_tzqVFAYTvpkfUzy5)
@@ -112,6 +116,40 @@ if (( port < 1 || port > 65535 )); then
   exit 1
 fi
 
+k3d_servers="${K3D_SERVERS:-${DEFAULT_K3D_SERVERS}}"
+if [[ -n "${K3D_SERVERS:-}" ]]; then
+  echo "K3D server node count provided via K3D_SERVERS environment variable: ${k3d_servers}"
+else
+  echo "K3D server node count defaulting to ${k3d_servers}."
+fi
+
+if ! [[ "${k3d_servers}" =~ ^[0-9]+$ ]]; then
+  echo "Error: K3D_SERVERS must be an integer." >&2
+  exit 1
+fi
+k3d_servers=$(( 10#${k3d_servers} ))  # force base 10: zero-padded "08" must not be read as octal
+if (( k3d_servers < 1 )); then
+  echo "Error: K3D_SERVERS must be greater than or equal to 1." >&2
+  exit 1
+fi
+
+k3d_agents="${K3D_AGENTS:-${DEFAULT_K3D_AGENTS}}"
+if [[ -n "${K3D_AGENTS:-}" ]]; then
+  echo "K3D agent node count provided via K3D_AGENTS environment variable: ${k3d_agents}"
+else
+  echo "K3D agent node count defaulting to ${k3d_agents}."
+fi
+
+if ! [[ "${k3d_agents}" =~ ^[0-9]+$ ]]; then
+  echo "Error: K3D_AGENTS must be an integer." >&2
+  exit 1
+fi
+k3d_agents=$(( 10#${k3d_agents} ))  # force base 10: zero-padded "08" must not be read as octal
+if (( k3d_agents < 0 )); then
+  echo "Error: K3D_AGENTS must be greater than or equal to 0." >&2
+  exit 1
+fi
+
 oidc_issuer_url="${OIDC_ISSUER_URL:-}"
 if [[ -z "${oidc_issuer_url}" ]]; then
   if [[ "${auto_approve}" == "true" ]]; then
@@ -161,7 +199,8 @@ fi
 ghcr_username="${GHCR_USERNAME:-}"
 ghcr_token="${GHCR_TOKEN:-}"
 
-printf '\nUsing domain: %s\nUsing port: %s\n\n' "${domain}" "${port}"
+printf '\nUsing domain: %s\nUsing port: %s\nUsing k3d servers: %s\nUsing k3d agents: %s\n\n' \
+  "${domain}" "${port}" "${k3d_servers}" "${k3d_agents}"
 
 run_stack() {
   local stack="$1"
@@ -179,7 +218,12 @@ run_stack() {
 
   local apply_cmd=(terraform -chdir="stacks/${stack}" apply)
   if [[ "${stack}" == "k8s" ]]; then
-    apply_cmd+=(-var "domain=${domain}" -var "port=${port}")
+    apply_cmd+=(
+      -var "domain=${domain}"
+      -var "port=${port}"
+      -var "servers=${k3d_servers}"
+      -var "agents=${k3d_agents}"
+    )
   fi
 
   if [[ "${stack}" == "platform" ]]; then
diff --git a/stacks/k8s/.terraform.lock.hcl b/stacks/k8s/.terraform.lock.hcl
index 445577a5..54594b34 100644
--- a/stacks/k8s/.terraform.lock.hcl
+++ b/stacks/k8s/.terraform.lock.hcl
@@ -7,6 +7,7 @@ provider "registry.terraform.io/agynio/k3d" {
   hashes = [
     "h1:7rRtyBqzmGsS8zzpLpFUmsEF6dI3j8EB+ka9PGDC5ww=",
     "h1:obPj2o7tS99lnM+ZbIrlWoxi0eUpcjnPoQK7SAYNjbY=",
+    "h1:zG3YHPySNClFKO4AyMuq17ALU75l0WeVGUx22CwV4ZM=",
     "zh:106474ecb58ca44940a3708016d885c3c1dc824ddcf9ca544a3272b9d2db2f4a",
     "zh:6caf22c0ec0c4a06289b7ed0c30355d060a34d63ed6170d692f0711ca555d88b",
     "zh:6e85c053e430c4efac45542f7ffa17ecd81fe714963d0733cdf5d04537d852db",
@@ -21,6 +22,7 @@ provider "registry.terraform.io/hashicorp/local" {
   constraints = "~> 2.5"
   hashes = [
     "h1:2RYa3j7m/0WmET2fqotY4CHxE1Hpk0fgn47/126l+Og=",
+    "h1:ZM8+dCHsSogRknZkPUeUMzHyY1UAZ0GWFJ24YP8v+AQ=",
     "h1:sSwlfp2etjCaE9hIF7bJBDjRIhDCVFglEOVyiCI7vgs=",
     "zh:261fec71bca13e0a7812dc0d8ae9af2b4326b24d9b2e9beab3d2400fab5c5f9a",
     "zh:308da3b5376a9ede815042deec5af1050ec96a5a5410a2206ae847d82070a23e",
diff --git a/stacks/k8s/variables.tf b/stacks/k8s/variables.tf
index 3bcf6021..013cc1ba 100644
--- a/stacks/k8s/variables.tf
+++ b/stacks/k8s/variables.tf
@@ -8,12 +8,22 @@ variable "servers" {
   type        = number
   description = "Number of server nodes"
   default     = 1
+
+  validation {
+    condition     = var.servers >= 1 && floor(var.servers) == var.servers
+    error_message = "Servers must be a whole number greater than or equal to 1."
+  }
 }
 
 variable "agents" {
   type        = number
   description = "Number of agent nodes"
-  default     = 2
+  default     = 0
+
+  validation {
+    condition     = var.agents >= 0 && floor(var.agents) == var.agents
+    error_message = "Agents must be a whole number greater than or equal to 0."
+  }
 }
 
 variable "k3s_version" {