From f05f3afc0014d966ff935d9c61a0d54f6142ec3b Mon Sep 17 00:00:00 2001 From: Dhia Gharsallaoui Date: Mon, 15 Jun 2026 10:27:17 +0200 Subject: [PATCH 01/16] fix(npd): point node-problem-detector at the real kubelet kubeconfig NPD was configured with a doubled path segment, "/var/lib/kubelet/kubelet/kubeconfig", so its in-cluster client panicked on startup on every flex node and the systemd unit crash-looped forever: panic: stat /var/lib/kubelet/kubelet/kubeconfig: no such file or directory node-problem-detector.service: Main process exited, status=2/INVALIDARGUMENT node-problem-detector.service: Scheduled restart job, restart counter is at 46793. The kubelet writes its kubeconfig to /var/lib/kubelet/kubeconfig. The imported agent library already defines this as the canonical goalstates.KubeletKubeconfigPath and wires the kubelet to it, so the npd package's separate copy of the constant was both redundant and wrong. Drop the local constant and reuse goalstates.KubeletKubeconfigPath to fix the path and remove the duplicate source of truth. Add regression tests covering the wired path and the rendered systemd unit's auth parameter. 
--- pkg/npd/start.go | 6 ++--- pkg/npd/start_test.go | 61 +++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 64 insertions(+), 3 deletions(-) create mode 100644 pkg/npd/start_test.go diff --git a/pkg/npd/start.go b/pkg/npd/start.go index 0b3958c1..2fe76d11 100644 --- a/pkg/npd/start.go +++ b/pkg/npd/start.go @@ -14,6 +14,7 @@ import ( "github.com/Azure/AKSFlexNode/pkg/config" "github.com/Azure/AKSFlexNode/pkg/utils/utilexec" "github.com/Azure/AKSFlexNode/pkg/utils/utilio" + "github.com/Azure/unbounded/pkg/agent/goalstates" "github.com/Azure/unbounded/pkg/agent/phases" ) @@ -21,8 +22,7 @@ import ( var serviceTemplate string const ( - KubeletKubeconfigPath = "/var/lib/kubelet/kubelet/kubeconfig" - systemdUnitNPD = "node-problem-detector.service" + systemdUnitNPD = "node-problem-detector.service" ) var tmpl = template.Must(template.New("npd-service").Parse(serviceTemplate)) @@ -42,7 +42,7 @@ func Start(cfg *config.Config, log *slog.Logger, machineDir, machineName string) return &startTask{ log: log, apiServer: cfg.Node.Kubelet.ServerURL, - kubeconfigPath: KubeletKubeconfigPath, + kubeconfigPath: goalstates.KubeletKubeconfigPath, machineDir: machineDir, machineName: machineName, } diff --git a/pkg/npd/start_test.go b/pkg/npd/start_test.go new file mode 100644 index 00000000..6301a8f9 --- /dev/null +++ b/pkg/npd/start_test.go @@ -0,0 +1,61 @@ +package npd + +import ( + "log/slog" + "os" + "path/filepath" + "strings" + "testing" + + "github.com/Azure/AKSFlexNode/pkg/config" + "github.com/Azure/unbounded/pkg/agent/goalstates" +) + +// wantKubeconfigPath is the path the kubelet actually writes its kubeconfig to. 
+// The historical bug used a doubled segment ("/var/lib/kubelet/kubelet/kubeconfig"), +// which made node-problem-detector panic on startup and crash-loop forever: +// +// panic: stat /var/lib/kubelet/kubelet/kubeconfig: no such file or directory +const wantKubeconfigPath = "/var/lib/kubelet/kubeconfig" + +// TestCanonicalKubeletKubeconfigPath guards the value of the shared library +// constant so the doubled-segment typo cannot reappear from a dependency bump. +func TestCanonicalKubeletKubeconfigPath(t *testing.T) { + if goalstates.KubeletKubeconfigPath != wantKubeconfigPath { + t.Fatalf("goalstates.KubeletKubeconfigPath = %q, want %q", + goalstates.KubeletKubeconfigPath, wantKubeconfigPath) + } +} + +// TestRenderedNPDUnitUsesCanonicalKubeconfig drives Start() through the real +// service-file rendering and asserts on the kubeconfig path that ends up in the +// node-problem-detector systemd unit's ExecStart. Asserting on the rendered unit +// (the externally observable artifact) rather than internal fields keeps the +// test stable across refactors while still exercising the Start() wiring. 
+func TestRenderedNPDUnitUsesCanonicalKubeconfig(t *testing.T) { + cfg := &config.Config{} + cfg.Node.Kubelet.ServerURL = "https://example.hcp.westus.azmk8s.io:443" + + machineDir := t.TempDir() + task, ok := Start(cfg, slog.Default(), machineDir, "kube1").(*startTask) + if !ok { + t.Fatalf("Start did not return *startTask") + } + if _, err := task.ensureServiceFile(); err != nil { + t.Fatalf("ensureServiceFile: %v", err) + } + + unitPath := filepath.Join(machineDir, "etc/systemd/system", systemdUnitNPD) + data, err := os.ReadFile(unitPath) //nolint:gosec // path built from test TempDir + if err != nil { + t.Fatalf("read rendered unit: %v", err) + } + rendered := string(data) + + if !strings.Contains(rendered, "auth="+wantKubeconfigPath) { + t.Fatalf("rendered unit missing auth=%s:\n%s", wantKubeconfigPath, rendered) + } + if strings.Contains(rendered, "kubelet/kubelet") { + t.Fatalf("rendered unit contains doubled 'kubelet' segment:\n%s", rendered) + } +} From 68265d61322d7d53910323455c01a8309e856335 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Tue, 16 Jun 2026 23:03:37 +0000 Subject: [PATCH 02/16] Initial plan From 3aad1705f8ba4ce7631097acd9a39910da48b759 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Tue, 16 Jun 2026 23:08:20 +0000 Subject: [PATCH 03/16] Validate NPD in e2e flow --- hack/e2e/README.md | 4 +-- hack/e2e/lib/validate.sh | 65 ++++++++++++++++++++++++++++++++++++++++ 2 files changed, 67 insertions(+), 2 deletions(-) diff --git a/hack/e2e/README.md b/hack/e2e/README.md index c328a0cb..bbcfa15c 100644 --- a/hack/e2e/README.md +++ b/hack/e2e/README.md @@ -29,7 +29,7 @@ The default `all` command runs: 1. Build the local `aks-flex-node` binary unless `--binary` or `--skip-build` is used. 2. Deploy AKS and three VMs with Bicep. 3. Join all three VMs. -4. Validate node readiness and run smoke workloads. +4. 
Validate node readiness, node-problem-detector status, and run smoke workloads. 5. Unjoin all Flex Nodes and verify they are absent. 6. Rejoin all Flex Nodes and validate again. 7. Run local-machine-driven repave validation. @@ -51,7 +51,7 @@ The default `all` command runs: | `unjoin-msi` | Unjoin only the managed-identity node. | | `unjoin-token` | Unjoin only the bootstrap-token node. | | `unjoin-kubeadm` | Unjoin only the kubeadm-style node. | -| `validate` | Verify joined nodes and run smoke tests. | +| `validate` | Verify joined nodes, node-problem-detector status, and run smoke tests. | | `validate-absent` | Verify Flex Node objects are absent after unjoin. | | `smoke` | Run smoke workloads only. | | `upgrade-drift` | Validate local-machine-driven repave to the alternate nspawn side. | diff --git a/hack/e2e/lib/validate.sh b/hack/e2e/lib/validate.sh index 3bf60097..c53ba2b8 100755 --- a/hack/e2e/lib/validate.sh +++ b/hack/e2e/lib/validate.sh @@ -5,6 +5,7 @@ # Functions: # validate_node_joined - Wait for a specific node to appear in kubectl # validate_all_nodes - Verify MSI, token, and kubeadm nodes joined +# validate_npd_status - Verify node-problem-detector is active # validate_node_absent - Wait for a node to disappear from kubectl # validate_all_nodes_absent - Verify all flex nodes are gone after unjoin # smoke_test