diff --git a/hack/e2e/README.md b/hack/e2e/README.md
index c328a0cb..b7e7725b 100644
--- a/hack/e2e/README.md
+++ b/hack/e2e/README.md
@@ -29,7 +29,7 @@ The default `all` command runs:
 1. Build the local `aks-flex-node` binary unless `--binary` or `--skip-build` is used.
 2. Deploy AKS and three VMs with Bicep.
 3. Join all three VMs.
-4. Validate node readiness and run smoke workloads.
+4. Validate node readiness and node-problem-detector status, then run smoke workloads.
 5. Unjoin all Flex Nodes and verify they are absent.
 6. Rejoin all Flex Nodes and validate again.
 7. Run local-machine-driven repave validation.
@@ -51,7 +51,7 @@ The default `all` command runs:
 | `unjoin-msi` | Unjoin only the managed-identity node. |
 | `unjoin-token` | Unjoin only the bootstrap-token node. |
 | `unjoin-kubeadm` | Unjoin only the kubeadm-style node. |
-| `validate` | Verify joined nodes and run smoke tests. |
+| `validate` | Verify joined nodes and node-problem-detector status, then run smoke tests. |
 | `validate-absent` | Verify Flex Node objects are absent after unjoin. |
 | `smoke` | Run smoke workloads only. |
 | `upgrade-drift` | Validate local-machine-driven repave to the alternate nspawn side. |
@@ -197,6 +197,6 @@ Logs are collected under `$E2E_WORK_DIR/logs/`.
 - **Missing prerequisites:** run `./hack/e2e/run.sh --help` and confirm `az`, `jq`, `kubectl`, `ssh`, `scp`, and `openssl` are available.
 - **Azure auth failures:** run `az account show` and `az login` if needed.
 - **SSH failures:** inspect `state.json` for VM public IPs and confirm the SSH key configured by `E2E_SSH_KEY_FILE` is available.
-- **Node join failures:** run `./hack/e2e/run.sh logs` and inspect agent, bootstrap unit, kubelet, and containerd logs.
+- **Node join failures:** run `./hack/e2e/run.sh logs` and inspect agent, bootstrap unit, kubelet, containerd, and node-problem-detector logs.
 - **Repave failures:** check `aks-flex-node-agent` logs, `machinectl list`, and kubelet versions inside `kube1` and `kube2`.
 - **Leftover resources:** run `E2E_RESOURCE_GROUP=<rg> ./hack/e2e/run.sh cleanup`.
diff --git a/hack/e2e/lib/cleanup.sh b/hack/e2e/lib/cleanup.sh
index fcc98f9a..f4957e60 100755
--- a/hack/e2e/lib/cleanup.sh
+++ b/hack/e2e/lib/cleanup.sh
@@ -52,6 +52,39 @@ _collect_vm_logs() {
      fi" \
     > "${E2E_LOG_DIR}/${prefix}-containerd.log" 2>/dev/null || true
 
+  remote_exec "${vm_ip}" "bash -s" <<'REMOTE' > "${E2E_LOG_DIR}/${prefix}-npd.log" 2>&1 || true
+npd_service="node-problem-detector.service"
+active_machine="$(sudo python3 - <<'PY'
+import json
+import sys
+
+try:
+    with open("/etc/aks-flex-node/daemon-state.json", encoding="utf-8") as state:
+        active_machine = json.load(state).get("activeMachine", "")
+    if active_machine:
+        print(active_machine)
+    else:
+        print("daemon state does not include activeMachine", file=sys.stderr)
+except FileNotFoundError as exc:
+    print(f"daemon state not found: {exc}", file=sys.stderr)
+except json.JSONDecodeError as exc:
+    print(f"daemon state is not valid JSON: {exc}", file=sys.stderr)
+except PermissionError as exc:
+    print(f"daemon state permission denied: {exc}", file=sys.stderr)
+PY
+)"
+if [ -n "${active_machine}" ]; then
+  echo "=== ${npd_service} logs (${active_machine}) ==="
+  # Match the agent and kubelet log depth; NPD entries are sparse but useful across node lifecycle phases.
+  sudo systemd-run --machine="${active_machine}" --quiet --pipe journalctl -u "${npd_service}" -n 500 --no-pager || \
+    echo "warning: failed to collect ${npd_service} logs from ${active_machine}"
+else
+  echo "warning: active machine unknown; falling back to host journal"
+  sudo journalctl -u "${npd_service}" -n 500 --no-pager || \
+    echo "warning: failed to collect ${npd_service} logs from host"
+fi
+REMOTE
+
   # Collect CNI config and nspawn machine state for networking diagnostics.
   # Read directly from the nspawn rootfs at /var/lib/machines/kube1/.
   local kube1_root="/var/lib/machines/kube1"
diff --git a/hack/e2e/lib/validate.sh b/hack/e2e/lib/validate.sh
index 3bf60097..4b24edda 100755
--- a/hack/e2e/lib/validate.sh
+++ b/hack/e2e/lib/validate.sh
@@ -5,6 +5,7 @@
 # Functions:
 #   validate_node_joined  <vm_name>  - Wait for a specific node to appear in kubectl
 #   validate_all_nodes                - Verify MSI, token, and kubeadm nodes joined
+#   validate_npd_status   <vm_name> <vm_ip> - Verify node-problem-detector is active
 #   validate_node_absent  <vm_name>  - Wait for a node to disappear from kubectl
 #   validate_all_nodes_absent         - Verify all flex nodes are gone after unjoin
 #   smoke_test            <vm_name> <label>  - Schedule an nginx pod on a node
@@ -73,6 +74,93 @@ validate_node_ip() {
   return 1
 }
 
+# ---------------------------------------------------------------------------
+# validate_npd_status - Ensure node-problem-detector is active and reporting
+# ---------------------------------------------------------------------------
+validate_npd_status() {
+  # Args: <vm_name> <vm_ip>. Returns 0 only when the unit is active on the VM AND the node reports KernelDeadlock=False.
+  local vm_name="$1" vm_ip="$2"
+  local timeout="${E2E_NODE_JOIN_TIMEOUT}"
+  local elapsed=0 quoted_timeout kernel_deadlock
+  local npd_condition_jsonpath='{.status.conditions[?(@.type=="KernelDeadlock")].status}'
+  local condition_error="${E2E_WORK_DIR}/npd-condition-${vm_name}.err"
+
+  log_info "Validating node-problem-detector on '${vm_name}'..."
+
+  if ! [[ "${timeout}" =~ ^[0-9]+$ ]]; then
+    log_error "E2E_NODE_JOIN_TIMEOUT must be numeric, got '${timeout}'"
+    return 1
+  fi
+  printf -v quoted_timeout "%q" "${timeout}"
+
+  # Fail fast on a failed remote check: callers run this function as `... || failed=1`, so errexit is ignored in here.
+  remote_exec "${vm_ip}" "E2E_NODE_JOIN_TIMEOUT=${quoted_timeout} bash -s" <<'REMOTE' || { log_error "node-problem-detector.service is not active on '${vm_name}'"; return 1; }
+set -euo pipefail
+
+deadline=$((SECONDS + E2E_NODE_JOIN_TIMEOUT))
+active_machine_error="/tmp/aks-flex-node-e2e-active-machine-$$.err"
+status_error="/tmp/aks-flex-node-e2e-npd-status-$$.err"
+while true; do
+  if [[ ! -f /etc/aks-flex-node/daemon-state.json ]]; then
+    active_machine=""
+    echo "/etc/aks-flex-node/daemon-state.json is missing" > "${active_machine_error}"
+  else
+    active_machine="$(sudo python3 - <<'PY' 2>"${active_machine_error}" || true
+import json
+with open("/etc/aks-flex-node/daemon-state.json", encoding="utf-8") as state:
+    print(json.load(state).get("activeMachine", ""))
+PY
+)"
+  fi
+  if [[ -n "${active_machine}" ]] && sudo machinectl show "${active_machine}" &>/dev/null; then
+    status="$(sudo systemd-run --machine="${active_machine}" --quiet --pipe systemctl is-active node-problem-detector.service 2>"${status_error}" || true)"
+    if [[ "${status}" == "active" ]]; then
+      echo "node-problem-detector.service is active in ${active_machine}"
+      exit 0
+    fi
+  fi
+
+  if (( SECONDS >= deadline )); then
+    echo "node-problem-detector.service did not become active"
+    if [[ -s "${active_machine_error}" ]]; then
+      cat "${active_machine_error}"
+    fi
+    if [[ -s "${status_error}" ]]; then
+      cat "${status_error}"
+    fi
+    sudo machinectl list --no-pager || true
+    if [[ -n "${active_machine:-}" ]]; then
+      sudo systemd-run --machine="${active_machine}" --quiet --pipe systemctl status node-problem-detector.service --no-pager -l || true
+      sudo systemd-run --machine="${active_machine}" --quiet --pipe journalctl -u node-problem-detector.service -n 50 --no-pager || true
+    fi
+    exit 1
+  fi
+
+  sleep 5
+done
+REMOTE
+
+  # The unit is active on the VM; now poll the Node object until the KernelDeadlock condition reads False.
+  while [[ "${elapsed}" -lt "${timeout}" ]]; do
+    kernel_deadlock="$(kubectl get node "${vm_name}" -o jsonpath="${npd_condition_jsonpath}" 2>"${condition_error}" || true)"
+    if [[ "${kernel_deadlock}" == "False" ]]; then
+      log_success "node-problem-detector is active and reporting on '${vm_name}'"
+      return 0
+    fi
+
+    sleep 10
+    elapsed=$((elapsed + 10))
+    log_debug "Waiting for node-problem-detector condition on ${vm_name}... (${elapsed}/${timeout}s)"
+  done
+
+  log_error "node-problem-detector did not report KernelDeadlock=False on '${vm_name}' within ${timeout}s"
+  if [[ -s "${condition_error}" ]]; then
+    cat "${condition_error}" >&2
+  fi
+  kubectl describe node "${vm_name}" 2>&1 || true
+  return 1
+}
+
 # ---------------------------------------------------------------------------
 # validate_all_nodes - Check all MSI, token, and kubeadm VMs joined
 # ---------------------------------------------------------------------------
@@ -91,10 +179,14 @@ validate_all_nodes() {
     --admin
 
   local msi_vm_name token_vm_name kubeadm_vm_name
+  local msi_vm_ip token_vm_ip kubeadm_vm_ip
   local token_vm_private_ip
   msi_vm_name="$(state_get msi_vm_name)"
   token_vm_name="$(state_get token_vm_name)"
   kubeadm_vm_name="$(state_get kubeadm_vm_name)"
+  msi_vm_ip="$(state_get msi_vm_ip)"
+  token_vm_ip="$(state_get token_vm_ip)"
+  kubeadm_vm_ip="$(state_get kubeadm_vm_ip)"
   token_vm_private_ip="$(state_get token_vm_private_ip)"
 
   local failed=0
@@ -102,6 +194,9 @@ validate_all_nodes() {
   validate_node_joined "${token_vm_name}" || failed=1
   validate_node_joined "${kubeadm_vm_name}" || failed=1
   validate_node_ip "${token_vm_name}" "${token_vm_private_ip}" || failed=1
+  validate_npd_status "${msi_vm_name}" "${msi_vm_ip}" || failed=1
+  validate_npd_status "${token_vm_name}" "${token_vm_ip}" || failed=1
+  validate_npd_status "${kubeadm_vm_name}" "${kubeadm_vm_ip}" || failed=1
 
   if [[ "${failed}" -eq 1 ]]; then
     log_error "One or more nodes failed to join"
diff --git a/pkg/daemon/nodeoperator.go b/pkg/daemon/nodeoperator.go
index b7a35a3f..4b8aba0e 100644
--- a/pkg/daemon/nodeoperator.go
+++ b/pkg/daemon/nodeoperator.go
@@ -52,7 +52,7 @@ func (o *nspawnNodeOperator) RestartNode(ctx context.Context, log *slog.Logger)
 		nodestop.StopNode(log, active.Name),
 		nodestart.StartNode(log, gs.NodeStart),
 		nodestart.WaitForKubelet(log, active.Name),
-		npd.Start(cfg, log, gs.RootFS.MachineDir, active.Name),
+		npd.Start(log, gs.NodeStart),
 	).Do(ctx)
 }
 
diff --git a/pkg/daemon/start.go b/pkg/daemon/start.go
index 6d202e43..916fb12a 100644
--- a/pkg/daemon/start.go
+++ b/pkg/daemon/start.go
@@ -47,7 +47,7 @@ func StartNode(
 		),
 		nodestart.StartNode(log, gs.NodeStart),
 		nodestart.WaitForKubelet(log, machineName),
-		npd.Start(cfg, log, gs.RootFS.MachineDir, machineName),
+		npd.Start(log, gs.NodeStart),
 		saveState(store, state),
 	)
 }
diff --git a/pkg/npd/assets/node-problem-detector.service b/pkg/npd/assets/node-problem-detector.service
index cc17ac99..beb414d2 100644
--- a/pkg/npd/assets/node-problem-detector.service
+++ b/pkg/npd/assets/node-problem-detector.service
@@ -3,7 +3,7 @@ Description=Node Problem Detector
 After=network.target
 
 [Service]
-ExecStart={{.NPDBinaryPath}} --apiserver-override="{{.APIServerURL}}?inClusterConfig=false&auth={{.KubeconfigPath}}" --config.system-log-monitor={{.NPDConfigPath}}
+ExecStart={{.NPDBinaryPath}} --hostname-override={{.NodeName}} --apiserver-override="{{.APIServerURL}}?inClusterConfig=false&auth={{.KubeconfigPath}}" --config.system-log-monitor={{.NPDConfigPath}}
 Restart=on-failure
 RestartSec=5s
 
diff --git a/pkg/npd/start.go b/pkg/npd/start.go
index 0b3958c1..8c093708 100644
--- a/pkg/npd/start.go
+++ b/pkg/npd/start.go
@@ -11,9 +11,9 @@ import (
 	"path/filepath"
 	"text/template"
 
-	"github.com/Azure/AKSFlexNode/pkg/config"
 	"github.com/Azure/AKSFlexNode/pkg/utils/utilexec"
 	"github.com/Azure/AKSFlexNode/pkg/utils/utilio"
+	"github.com/Azure/unbounded/pkg/agent/goalstates"
 	"github.com/Azure/unbounded/pkg/agent/phases"
 )
 
@@ -21,8 +21,7 @@ import (
 var serviceTemplate string
 
 const (
-	KubeletKubeconfigPath = "/var/lib/kubelet/kubelet/kubeconfig"
-	systemdUnitNPD        = "node-problem-detector.service"
+	systemdUnitNPD = "node-problem-detector.service"
 )
 
 var tmpl = template.Must(template.New("npd-service").Parse(serviceTemplate))
@@ -33,18 +32,20 @@ type startTask struct {
 	kubeconfigPath string
 	machineDir     string
 	machineName    string
+	nodeName       string
 }
 
 // Start returns a task that renders the NPD systemd unit file into the
 // nspawn machine rootfs and ensures the service is running inside the
 // container via systemd-run --machine.
-func Start(cfg *config.Config, log *slog.Logger, machineDir, machineName string) phases.Task {
+func Start(log *slog.Logger, nodeStart *goalstates.NodeStart) phases.Task {
 	return &startTask{
 		log:            log,
-		apiServer:      cfg.Node.Kubelet.ServerURL,
-		kubeconfigPath: KubeletKubeconfigPath,
-		machineDir:     machineDir,
-		machineName:    machineName,
+		apiServer:      nodeStart.Kubelet.APIServer,
+		kubeconfigPath: goalstates.KubeletKubeconfigPath,
+		machineDir:     nodeStart.MachineDir,
+		machineName:    nodeStart.MachineName,
+		nodeName:       nodeStart.NodeName,
 	}
 }
 
@@ -66,6 +67,7 @@ func (t *startTask) ensureServiceFile() (updated bool, err error) {
 		"APIServerURL":   t.apiServer,
 		"KubeconfigPath": t.kubeconfigPath,
 		"NPDConfigPath":  npdConfigPath,
+		"NodeName":       t.nodeName,
 	}); err != nil {
 		return false, fmt.Errorf("render npd service template: %w", err)
 	}
diff --git a/pkg/npd/start_test.go b/pkg/npd/start_test.go
new file mode 100644
index 00000000..ae1be511
--- /dev/null
+++ b/pkg/npd/start_test.go
@@ -0,0 +1,70 @@
+package npd
+
+import (
+	"log/slog"
+	"os"
+	"path/filepath"
+	"strings"
+	"testing"
+
+	"github.com/Azure/unbounded/pkg/agent/goalstates"
+)
+
+// wantKubeconfigPath is the path the kubelet actually writes its kubeconfig to.
+// The historical bug used a doubled segment ("/var/lib/kubelet/kubelet/kubeconfig"),
+// which made node-problem-detector panic on startup and crash-loop forever:
+//
+//	panic: stat /var/lib/kubelet/kubelet/kubeconfig: no such file or directory
+const wantKubeconfigPath = "/var/lib/kubelet/kubeconfig"
+
+// TestCanonicalKubeletKubeconfigPath guards the value of the shared library
+// constant so the doubled-segment typo cannot reappear from a dependency bump.
+func TestCanonicalKubeletKubeconfigPath(t *testing.T) {
+	t.Parallel()
+	if got := goalstates.KubeletKubeconfigPath; got != wantKubeconfigPath {
+		t.Fatalf("goalstates.KubeletKubeconfigPath = %q, want %q",
+			got, wantKubeconfigPath)
+	}
+}
+
+// TestRenderedNPDUnitUsesCanonicalKubeconfig drives Start() through the real
+// service-file rendering and asserts on the kubeconfig path that ends up in the
+// node-problem-detector systemd unit's ExecStart. Asserting on the rendered unit
+// (the externally observable artifact) rather than internal fields keeps the
+// test stable across refactors while still exercising the Start() wiring.
+func TestRenderedNPDUnitUsesCanonicalKubeconfig(t *testing.T) {
+	t.Parallel()
+	rootDir := t.TempDir()
+	goal := &goalstates.NodeStart{
+		MachineDir:  rootDir,
+		MachineName: "kube1",
+		NodeName:    "vm-e2e-token-1781659839",
+		Kubelet: goalstates.Kubelet{
+			APIServer: "https://example.hcp.westus.azmk8s.io:443",
+		},
+	}
+	npdTask, isStartTask := Start(slog.Default(), goal).(*startTask)
+	if !isStartTask {
+		t.Fatalf("Start did not return *startTask")
+	}
+	if _, renderErr := npdTask.ensureServiceFile(); renderErr != nil {
+		t.Fatalf("ensureServiceFile: %v", renderErr)
+	}
+
+	unitFile := filepath.Join(rootDir, "etc/systemd/system", systemdUnitNPD)
+	raw, err := os.ReadFile(unitFile) //nolint:gosec // path built from test TempDir
+	if err != nil {
+		t.Fatalf("read rendered unit: %v", err)
+	}
+	unit := string(raw)
+
+	if !strings.Contains(unit, "auth="+wantKubeconfigPath) {
+		t.Fatalf("rendered unit missing auth=%s:\n%s", wantKubeconfigPath, unit)
+	}
+	if strings.Contains(unit, "kubelet/kubelet") {
+		t.Fatalf("rendered unit contains doubled 'kubelet' segment:\n%s", unit)
+	}
+	if !strings.Contains(unit, "--hostname-override=vm-e2e-token-1781659839") {
+		t.Fatalf("rendered unit missing hostname override:\n%s", unit)
+	}
+}