diff --git a/hack/e2e/README.md b/hack/e2e/README.md index c328a0cb..b7e7725b 100644 --- a/hack/e2e/README.md +++ b/hack/e2e/README.md @@ -29,7 +29,7 @@ The default `all` command runs: 1. Build the local `aks-flex-node` binary unless `--binary` or `--skip-build` is used. 2. Deploy AKS and three VMs with Bicep. 3. Join all three VMs. -4. Validate node readiness and run smoke workloads. +4. Validate node readiness and node-problem-detector status, then run smoke workloads. 5. Unjoin all Flex Nodes and verify they are absent. 6. Rejoin all Flex Nodes and validate again. 7. Run local-machine-driven repave validation. @@ -51,7 +51,7 @@ The default `all` command runs: | `unjoin-msi` | Unjoin only the managed-identity node. | | `unjoin-token` | Unjoin only the bootstrap-token node. | | `unjoin-kubeadm` | Unjoin only the kubeadm-style node. | -| `validate` | Verify joined nodes and run smoke tests. | +| `validate` | Verify joined nodes and node-problem-detector status, then run smoke tests. | | `validate-absent` | Verify Flex Node objects are absent after unjoin. | | `smoke` | Run smoke workloads only. | | `upgrade-drift` | Validate local-machine-driven repave to the alternate nspawn side. | @@ -197,6 +197,6 @@ Logs are collected under `$E2E_WORK_DIR/logs/`. - **Missing prerequisites:** run `./hack/e2e/run.sh --help` and confirm `az`, `jq`, `kubectl`, `ssh`, `scp`, and `openssl` are available. - **Azure auth failures:** run `az account show` and `az login` if needed. - **SSH failures:** inspect `state.json` for VM public IPs and confirm the SSH key configured by `E2E_SSH_KEY_FILE` is available. -- **Node join failures:** run `./hack/e2e/run.sh logs` and inspect agent, bootstrap unit, kubelet, and containerd logs. +- **Node join failures:** run `./hack/e2e/run.sh logs` and inspect agent, bootstrap unit, kubelet, containerd, and node-problem-detector logs. - **Repave failures:** check `aks-flex-node-agent` logs, `machinectl list`, and kubelet versions inside `kube1` and `kube2`. 
- **Leftover resources:** run `E2E_RESOURCE_GROUP= ./hack/e2e/run.sh cleanup`. diff --git a/hack/e2e/lib/cleanup.sh b/hack/e2e/lib/cleanup.sh index fcc98f9a..f4957e60 100755 --- a/hack/e2e/lib/cleanup.sh +++ b/hack/e2e/lib/cleanup.sh @@ -52,6 +52,39 @@ _collect_vm_logs() { fi" \ > "${E2E_LOG_DIR}/${prefix}-containerd.log" 2>/dev/null || true + remote_exec "${vm_ip}" "bash -s" <<'REMOTE' > "${E2E_LOG_DIR}/${prefix}-npd.log" 2>&1 || true +npd_service="node-problem-detector.service" +active_machine="$(sudo python3 - <<'PY' +import json +import sys + +try: + with open("/etc/aks-flex-node/daemon-state.json", encoding="utf-8") as state: + active_machine = json.load(state).get("activeMachine", "") + if active_machine: + print(active_machine) + else: + print("daemon state does not include activeMachine", file=sys.stderr) +except FileNotFoundError as exc: + print(f"daemon state not found: {exc}", file=sys.stderr) +except json.JSONDecodeError as exc: + print(f"daemon state is not valid JSON: {exc}", file=sys.stderr) +except PermissionError as exc: + print(f"daemon state permission denied: {exc}", file=sys.stderr) +PY +)" +if [ -n "${active_machine}" ]; then + echo "=== ${npd_service} logs (${active_machine}) ===" + # Match the agent and kubelet log depth; NPD entries are sparse but useful across node lifecycle phases. + sudo systemd-run --machine="${active_machine}" --quiet --pipe journalctl -u "${npd_service}" -n 500 --no-pager || \ + echo "warning: failed to collect ${npd_service} logs from ${active_machine}" +else + echo "warning: active machine unknown; falling back to host journal" + sudo journalctl -u "${npd_service}" -n 500 --no-pager || \ + echo "warning: failed to collect ${npd_service} logs from host" +fi +REMOTE + # Collect CNI config and nspawn machine state for networking diagnostics. # Read directly from the nspawn rootfs at /var/lib/machines/kube1/. 
local kube1_root="/var/lib/machines/kube1" diff --git a/hack/e2e/lib/validate.sh b/hack/e2e/lib/validate.sh index 3bf60097..4b24edda 100755 --- a/hack/e2e/lib/validate.sh +++ b/hack/e2e/lib/validate.sh @@ -5,6 +5,7 @@ # Functions: # validate_node_joined - Wait for a specific node to appear in kubectl # validate_all_nodes - Verify MSI, token, and kubeadm nodes joined +# validate_npd_status - Verify node-problem-detector is active # validate_node_absent - Wait for a node to disappear from kubectl # validate_all_nodes_absent - Verify all flex nodes are gone after unjoin # smoke_test