Azure · Copilot · Jun 16, 2026 · Jun 16, 2026 · Jun 16, 2026 · Jun 16, 2026
diff --git a/hack/e2e/README.md b/hack/e2e/README.md
@@ -29,7 +29,7 @@ The default `all` command runs:
 1. Build the local `aks-flex-node` binary unless `--binary` or `--skip-build` is used.
 2. Deploy AKS and three VMs with Bicep.
 3. Join all three VMs.
-4. Validate node readiness and run smoke workloads.
+4. Validate node readiness and node-problem-detector status, then run smoke workloads.
 5. Unjoin all Flex Nodes and verify they are absent.
 6. Rejoin all Flex Nodes and validate again.
 7. Run local-machine-driven repave validation.
@@ -51,7 +51,7 @@ The default `all` command runs:
 | `unjoin-msi` | Unjoin only the managed-identity node. |
 | `unjoin-token` | Unjoin only the bootstrap-token node. |
 | `unjoin-kubeadm` | Unjoin only the kubeadm-style node. |
-| `validate` | Verify joined nodes and run smoke tests. |
+| `validate` | Verify joined nodes and node-problem-detector status, then run smoke tests. |
 | `validate-absent` | Verify Flex Node objects are absent after unjoin. |
 | `smoke` | Run smoke workloads only. |
 | `upgrade-drift` | Validate local-machine-driven repave to the alternate nspawn side. |
@@ -197,6 +197,6 @@ Logs are collected under `$E2E_WORK_DIR/logs/`.
 - **Missing prerequisites:** run `./hack/e2e/run.sh --help` and confirm `az`, `jq`, `kubectl`, `ssh`, `scp`, and `openssl` are available.
 - **Azure auth failures:** run `az account show` and `az login` if needed.
 - **SSH failures:** inspect `state.json` for VM public IPs and confirm the SSH key configured by `E2E_SSH_KEY_FILE` is available.
-- **Node join failures:** run `./hack/e2e/run.sh logs` and inspect agent, bootstrap unit, kubelet, and containerd logs.
+- **Node join failures:** run `./hack/e2e/run.sh logs` and inspect agent, bootstrap unit, kubelet, containerd, and node-problem-detector logs.
 - **Repave failures:** check `aks-flex-node-agent` logs, `machinectl list`, and kubelet versions inside `kube1` and `kube2`.
 - **Leftover resources:** run `E2E_RESOURCE_GROUP=<rg> ./hack/e2e/run.sh cleanup`.
diff --git a/hack/e2e/lib/cleanup.sh b/hack/e2e/lib/cleanup.sh
@@ -52,6 +52,39 @@ _collect_vm_logs() {
      fi" \
     > "${E2E_LOG_DIR}/${prefix}-containerd.log" 2>/dev/null || true
 
+  remote_exec "${vm_ip}" "bash -s" <<'REMOTE' > "${E2E_LOG_DIR}/${prefix}-npd.log" 2>&1 || true
+npd_service="node-problem-detector.service"
+active_machine="$(sudo python3 - <<'PY'
+import json
+import sys
+
+try:
+    with open("/etc/aks-flex-node/daemon-state.json", encoding="utf-8") as state:
+        active_machine = json.load(state).get("activeMachine", "")
+    if active_machine:
+        print(active_machine)
+    else:
+        print("daemon state does not include activeMachine", file=sys.stderr)
+except FileNotFoundError as exc:
+    print(f"daemon state not found: {exc}", file=sys.stderr)
+except json.JSONDecodeError as exc:
+    print(f"daemon state is not valid JSON: {exc}", file=sys.stderr)
+except PermissionError as exc:
+    print(f"daemon state permission denied: {exc}", file=sys.stderr)
+PY
+)"
+if [ -n "${active_machine}" ]; then
+  echo "=== ${npd_service} logs (${active_machine}) ==="
+  # Match the agent and kubelet log depth; NPD entries are sparse but useful across node lifecycle phases.
+  sudo systemd-run --machine="${active_machine}" --quiet --pipe journalctl -u "${npd_service}" -n 500 --no-pager || \
+    echo "warning: failed to collect ${npd_service} logs from ${active_machine}"
+else
+  echo "warning: active machine unknown; falling back to host journal"
+  sudo journalctl -u "${npd_service}" -n 500 --no-pager || \
+    echo "warning: failed to collect ${npd_service} logs from host"
+fi
+REMOTE
+
   # Collect CNI config and nspawn machine state for networking diagnostics.
   # Read directly from the nspawn rootfs at /var/lib/machines/kube1/.
   local kube1_root="/var/lib/machines/kube1"

diff --git a/hack/e2e/lib/validate.sh b/hack/e2e/lib/validate.sh
@@ -5,6 +5,7 @@
 # Functions:
 #   validate_node_joined  <vm_name>  - Wait for a specific node to appear in kubectl
 #   validate_all_nodes                - Verify MSI, token, and kubeadm nodes joined
+#   validate_npd_status   <vm_name> <vm_ip> - Verify node-problem-detector is active and reporting
 #   validate_node_absent  <vm_name>  - Wait for a node to disappear from kubectl
 #   validate_all_nodes_absent         - Verify all flex nodes are gone after unjoin
 #   smoke_test            <vm_name> <label>  - Schedule an nginx pod on a node
@@ -73,6 +74,93 @@ validate_node_ip() {
   return 1
 }
 
+# ---------------------------------------------------------------------------
+# validate_npd_status - Ensure node-problem-detector is active and reporting
+#
+# Phase 1 runs on the VM through remote_exec: it reads activeMachine from
+# /etc/aks-flex-node/daemon-state.json and polls that machine until
+# node-problem-detector.service is active.
+# Phase 2 runs locally: it polls `kubectl get node` until the KernelDeadlock
+# condition reports "False".
+#
+# Arguments:
+#   $1 - vm_name  node name used for kubectl lookups and log messages
+#   $2 - vm_ip    address handed to remote_exec
+# Globals:
+#   E2E_NODE_JOIN_TIMEOUT (read)  timeout, in whole seconds, applied to each phase
+#   E2E_WORK_DIR (read)           directory receiving the kubectl stderr capture
+# Returns:
+#   0 when both phases succeed, 1 otherwise
+# ---------------------------------------------------------------------------
+validate_npd_status() {
+  local vm_name="$1"
+  local vm_ip="$2"
+  local timeout="${E2E_NODE_JOIN_TIMEOUT}"
+  local elapsed=0
+  local npd_condition_jsonpath='{.status.conditions[?(@.type=="KernelDeadlock")].status}'
+  local condition_error="${E2E_WORK_DIR}/npd-condition-${vm_name}.err"
+  local quoted_timeout
+  local remote_rc=0
+
+  log_info "Validating node-problem-detector on '${vm_name}'..."
+
+  if ! [[ "${timeout}" =~ ^[0-9]+$ ]]; then
+    log_error "E2E_NODE_JOIN_TIMEOUT must be numeric, got '${timeout}'"
+    return 1
+  fi
+  # Force base 10 so a value such as "090" is not rejected as invalid octal by
+  # the arithmetic below or by the remote script.
+  timeout=$((10#${timeout}))
+  printf -v quoted_timeout "%q" "${timeout}"
+
+  # Capture the remote status explicitly: callers invoke this function as
+  # `validate_npd_status ... || failed=1`, which disables `set -e` inside the
+  # function, so an unchecked failure here would be silently discarded.
+  remote_exec "${vm_ip}" "E2E_NODE_JOIN_TIMEOUT=${quoted_timeout} bash -s" <<'REMOTE' || remote_rc=$?
+set -euo pipefail
+
+deadline=$((SECONDS + E2E_NODE_JOIN_TIMEOUT))
+active_machine=""
+# Keep stderr captures in a private directory that is removed on every exit path.
+scratch_dir="$(mktemp -d)"
+trap 'rm -rf -- "${scratch_dir}"' EXIT
+active_machine_error="${scratch_dir}/active-machine.err"
+status_error="${scratch_dir}/npd-status.err"
+while true; do
+  if [[ ! -f /etc/aks-flex-node/daemon-state.json ]]; then
+    active_machine=""
+    echo "/etc/aks-flex-node/daemon-state.json is missing" > "${active_machine_error}"
+  else
+    # `or ""` maps a JSON null (or empty) activeMachine to an empty string
+    # instead of printing the literal text "None".
+    active_machine="$(sudo python3 - <<'PY' 2>"${active_machine_error}" || true
+import json
+with open("/etc/aks-flex-node/daemon-state.json", encoding="utf-8") as state:
+    print(json.load(state).get("activeMachine") or "")
+PY
+)"
+  fi
+  if [[ -n "${active_machine}" ]] && machinectl show "${active_machine}" &>/dev/null; then
+    # `systemctl is-active` exits non-zero for any state other than active;
+    # `|| true` keeps polling instead of tripping `set -e`.
+    status="$(sudo systemd-run --machine="${active_machine}" --quiet --pipe systemctl is-active node-problem-detector.service 2>"${status_error}" || true)"
+    if [[ "${status}" == "active" ]]; then
+      echo "node-problem-detector.service is active in ${active_machine}"
+      exit 0
+    fi
+  fi
+
+  if (( SECONDS >= deadline )); then
+    echo "node-problem-detector.service did not become active"
+    if [[ -s "${active_machine_error}" ]]; then
+      cat "${active_machine_error}"
+    fi
+    if [[ -s "${status_error}" ]]; then
+      cat "${status_error}"
+    fi
+    machinectl list --no-pager || true
+    if [[ -n "${active_machine:-}" ]]; then
+      sudo systemd-run --machine="${active_machine}" --quiet --pipe systemctl status node-problem-detector.service --no-pager -l || true
+      sudo systemd-run --machine="${active_machine}" --quiet --pipe journalctl -u node-problem-detector.service -n 50 --no-pager || true
+    fi
+    exit 1
+  fi
+
+  sleep 5
+done
+REMOTE
+
+  if (( remote_rc != 0 )); then
+    log_error "node-problem-detector.service check failed on '${vm_name}' (remote exit ${remote_rc})"
+    return 1
+  fi
+
+  local kernel_deadlock
+  while (( elapsed < timeout )); do
+    # kubectl failures are tolerated while polling; the most recent stderr is
+    # kept so it can be printed if the wait times out.
+    kernel_deadlock="$(kubectl get node "${vm_name}" -o jsonpath="${npd_condition_jsonpath}" 2>"${condition_error}" || true)"
+    if [[ "${kernel_deadlock}" == "False" ]]; then
+      # The stderr capture is only useful for failure diagnostics.
+      rm -f -- "${condition_error}"
+      log_success "node-problem-detector is active and reporting on '${vm_name}'"
+      return 0
+    fi
+
+    sleep 10
+    elapsed=$((elapsed + 10))
+    log_debug "Waiting for node-problem-detector condition on ${vm_name}... (${elapsed}/${timeout}s)"
+  done
+
+  log_error "node-problem-detector did not report KernelDeadlock=False on '${vm_name}' within ${timeout}s"
+  if [[ -s "${condition_error}" ]]; then
+    cat "${condition_error}" >&2
+  fi
+  kubectl describe node "${vm_name}" 2>&1 || true
+  return 1
+}
+
 # ---------------------------------------------------------------------------
 # validate_all_nodes - Check all MSI, token, and kubeadm VMs joined
 # ---------------------------------------------------------------------------
@@ -91,17 +179,24 @@ validate_all_nodes() {
     --admin
 
   local msi_vm_name token_vm_name kubeadm_vm_name
+  local msi_vm_ip token_vm_ip kubeadm_vm_ip
   local token_vm_private_ip
   msi_vm_name="$(state_get msi_vm_name)"
   token_vm_name="$(state_get token_vm_name)"
   kubeadm_vm_name="$(state_get kubeadm_vm_name)"
+  msi_vm_ip="$(state_get msi_vm_ip)"
+  token_vm_ip="$(state_get token_vm_ip)"
+  kubeadm_vm_ip="$(state_get kubeadm_vm_ip)"
   token_vm_private_ip="$(state_get token_vm_private_ip)"
 
   local failed=0
   validate_node_joined "${msi_vm_name}" || failed=1
   validate_node_joined "${token_vm_name}" || failed=1
   validate_node_joined "${kubeadm_vm_name}" || failed=1
   validate_node_ip "${token_vm_name}" "${token_vm_private_ip}" || failed=1
+  validate_npd_status "${msi_vm_name}" "${msi_vm_ip}" || failed=1
+  validate_npd_status "${token_vm_name}" "${token_vm_ip}" || failed=1
+  validate_npd_status "${kubeadm_vm_name}" "${kubeadm_vm_ip}" || failed=1
 
   if [[ "${failed}" -eq 1 ]]; then
     log_error "One or more nodes failed to join"