From 42e6ead709aabffb4484089bbd6722b289ea87d6 Mon Sep 17 00:00:00 2001 From: Igal Tsoiref Date: Mon, 8 Jun 2026 21:10:56 -0400 Subject: [PATCH] fix(hostagent): ensure VFs are unmanaged by NetworkManager via persistent udev rule Add EnsureVFsUnmanaged() to the Backend interface so that NM-specific udev rule logic is encapsulated in NetworkManagerBackend while systemd-networkd cleanly no-ops. Key changes: - Move udev rule logic from hostagent/util into netconfig/nm_udev.go - Call EnsureVFsUnmanaged() at NetworkManager startup, before VFs are created, so the rule is always in place when NM evaluates devices - Mount /etc/udev/rules.d from host into hostagent container to persist the rule across reboots (HostPathDirectoryOrCreate) NetworkManager only evaluates NM_UNMANAGED when a device first appears, so the rule must be present before VFs are created. Signed-off-by: Igal Tsoiref Co-authored-by: Cursor --- .../networkmanager/network_manager.go | 4 + .../hostagent/util/netconfig/backend.go | 5 + .../hostagent/util/netconfig/nm_backend.go | 4 + .../hostagent/util/netconfig/nm_udev.go | 89 +++++++++++ .../hostagent/util/netconfig/nm_udev_test.go | 150 ++++++++++++++++++ .../util/netconfig/systemd_networkd.go | 4 + 6 files changed, 256 insertions(+) create mode 100644 internal/provisioning/hostagent/util/netconfig/nm_udev.go create mode 100644 internal/provisioning/hostagent/util/netconfig/nm_udev_test.go diff --git a/internal/provisioning/hostagent/networkmanager/network_manager.go b/internal/provisioning/hostagent/networkmanager/network_manager.go index 4b932137c..dcc289ff8 100644 --- a/internal/provisioning/hostagent/networkmanager/network_manager.go +++ b/internal/provisioning/hostagent/networkmanager/network_manager.go @@ -91,6 +91,10 @@ func (nm *NetworkManager) Start() error { nm.netBackend = backend klog.Infof("Using network configuration backend: %s", backend.Name()) + if err := nm.netBackend.EnsureVFsUnmanaged(); err != nil { + return fmt.Errorf("failed to ensure VFs are unmanaged by network backend: %w", err) + } + devices, err := hostutil.DiscoverDPUs(hostutil.SysFSRoot) if err != nil { return fmt.Errorf("failed to discovery DPUs: %w", err) diff --git a/internal/provisioning/hostagent/util/netconfig/backend.go b/internal/provisioning/hostagent/util/netconfig/backend.go index 78745666b..70302d65d 100644 --- a/internal/provisioning/hostagent/util/netconfig/backend.go +++ b/internal/provisioning/hostagent/util/netconfig/backend.go @@ -47,6 +47,11 @@ type Backend interface { // IsDHCPConfigured checks if DHCP is enabled for an interface. IsDHCPConfigured(interfaceName string) (bool, error) + + // EnsureVFsUnmanaged ensures that VF interfaces will not be managed by the + // network configuration backend. For NetworkManager this writes a udev rule; + // other backends may no-op. + EnsureVFsUnmanaged() error } // ConfigureNetwork orchestrates PF interface and bridge MTU configuration diff --git a/internal/provisioning/hostagent/util/netconfig/nm_backend.go b/internal/provisioning/hostagent/util/netconfig/nm_backend.go index 4275bc0e4..e18aa553c 100644 --- a/internal/provisioning/hostagent/util/netconfig/nm_backend.go +++ b/internal/provisioning/hostagent/util/netconfig/nm_backend.go @@ -77,6 +77,10 @@ func (n *NetworkManagerBackend) ResetPendingChanges() { n.modifiedConnPaths = nil } +func (n *NetworkManagerBackend) EnsureVFsUnmanaged() error { + return ensureNMUnmanagedUdevRule() +} + // ConfigurePFInterfaces configures physical function network interfaces via NM D-Bus. func (n *NetworkManagerBackend) ConfigurePFInterfaces(pciAddress string, portConfigs []hostutil.PortConfig) (bool, error) { needsApply := false diff --git a/internal/provisioning/hostagent/util/netconfig/nm_udev.go b/internal/provisioning/hostagent/util/netconfig/nm_udev.go new file mode 100644 index 000000000..7ec2d3a45 --- /dev/null +++ b/internal/provisioning/hostagent/util/netconfig/nm_udev.go @@ -0,0 +1,89 @@ +/* +Copyright 2026 NVIDIA + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package netconfig + +import ( + "fmt" + "os" + "os/exec" + "path/filepath" + + "k8s.io/klog/v2" +) + +const ( + nmUnmanagedRulesContent = `ACTION=="add|change|move", ATTRS{device}=="0x101e", ENV{NM_UNMANAGED}="1" +` +) + +// nmUnmanagedRulesPath is the file path for the udev rule. Variable for testability. +var nmUnmanagedRulesPath = "/run/udev/rules.d/10-nm-unmanaged.rules" + +// udevRunner abstracts command execution for testability. +var udevRunner = func(name string, args ...string) ([]byte, error) { + return exec.Command(name, args...).CombinedOutput() +} + +// ensureNMUnmanagedUdevRule writes a udev rule that prevents NetworkManager +// from managing VF interfaces (PCI device ID 0x101e) and reloads/triggers +// udev to apply the rule to both new and already-existing devices. +// Called once at hostagent startup, before VFs are created. +func ensureNMUnmanagedUdevRule() error { + if err := writeUdevRuleFile(); err != nil { + return fmt.Errorf("failed to write udev rule file: %w", err) + } + + if err := reloadAndTriggerUdev(); err != nil { + return fmt.Errorf("failed to reload/trigger udev rules: %w", err) + } + + return nil +} + +func writeUdevRuleFile() error { + dir := filepath.Dir(nmUnmanagedRulesPath) + if err := os.MkdirAll(dir, 0755); err != nil { + return fmt.Errorf("failed to create directory %s: %w", dir, err) + } + + existing, err := os.ReadFile(nmUnmanagedRulesPath) + if err == nil && string(existing) == nmUnmanagedRulesContent { + klog.V(3).Infof("Udev rule %s already up-to-date", nmUnmanagedRulesPath) + return nil + } + + if err := os.WriteFile(nmUnmanagedRulesPath, []byte(nmUnmanagedRulesContent), 0644); err != nil { + return fmt.Errorf("failed to write file %s: %w", nmUnmanagedRulesPath, err) + } + klog.Infof("Wrote udev rule to disable NM management of VFs: %s", nmUnmanagedRulesPath) + return nil +} + +func reloadAndTriggerUdev() error { + output, err := udevRunner("udevadm", "control", "--reload-rules") + if err != nil { + return fmt.Errorf("udevadm control --reload-rules failed: %w, output: %s", err, string(output)) + } + + output, err = udevRunner("udevadm", "trigger", "--subsystem-match=net") + if err != nil { + return fmt.Errorf("udevadm trigger --subsystem-match=net failed: %w, output: %s", err, string(output)) + } + + klog.V(3).Infof("Reloaded udev rules and triggered net subsystem") + return nil +} diff --git a/internal/provisioning/hostagent/util/netconfig/nm_udev_test.go b/internal/provisioning/hostagent/util/netconfig/nm_udev_test.go new file mode 100644 index 000000000..4182eddeb --- /dev/null +++ b/internal/provisioning/hostagent/util/netconfig/nm_udev_test.go @@ -0,0 +1,150 @@ +/* +Copyright 2026 NVIDIA + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package netconfig + +import ( + "fmt" + "os" + "path/filepath" + + . "github.com/onsi/ginkgo/v2" + . "github.com/onsi/gomega" +) + +var _ = Describe("ensureNMUnmanagedUdevRule", func() { + var ( + origPath string + origRunner func(string, ...string) ([]byte, error) + tempDir string + commands [][]string + ) + + BeforeEach(func() { + var err error + tempDir, err = os.MkdirTemp("", "udev-test-*") + Expect(err).NotTo(HaveOccurred()) + + origPath = nmUnmanagedRulesPath + origRunner = udevRunner + + commands = nil + udevRunner = func(name string, args ...string) ([]byte, error) { + commands = append(commands, append([]string{name}, args...)) + return nil, nil + } + }) + + AfterEach(func() { + nmUnmanagedRulesPath = origPath + udevRunner = origRunner + os.RemoveAll(tempDir) + }) + + setRulesPath := func() string { + p := filepath.Join(tempDir, "10-nm-unmanaged.rules") + nmUnmanagedRulesPath = p + return p + } + + It("should write the udev rule file and reload rules", func() { + rulesFile := setRulesPath() + + err := ensureNMUnmanagedUdevRule() + Expect(err).NotTo(HaveOccurred()) + + content, err := os.ReadFile(rulesFile) + Expect(err).NotTo(HaveOccurred()) + Expect(string(content)).To(Equal(nmUnmanagedRulesContent)) + + Expect(commands).To(HaveLen(2)) + Expect(commands[0]).To(Equal([]string{"udevadm", "control", "--reload-rules"})) + Expect(commands[1]).To(Equal([]string{"udevadm", "trigger", "--subsystem-match=net"})) + }) + + It("should still reload/trigger when file is already up-to-date", func() { + rulesFile := setRulesPath() + + err := os.MkdirAll(filepath.Dir(rulesFile), 0755) + Expect(err).NotTo(HaveOccurred()) + err = os.WriteFile(rulesFile, []byte(nmUnmanagedRulesContent), 0644) + Expect(err).NotTo(HaveOccurred()) + + err = ensureNMUnmanagedUdevRule() + Expect(err).NotTo(HaveOccurred()) + + Expect(commands).To(HaveLen(2)) + Expect(commands[0]).To(Equal([]string{"udevadm", "control", "--reload-rules"})) + Expect(commands[1]).To(Equal([]string{"udevadm", "trigger", "--subsystem-match=net"})) + }) + + It("should overwrite if content differs", func() { + rulesFile := setRulesPath() + + err := os.MkdirAll(filepath.Dir(rulesFile), 0755) + Expect(err).NotTo(HaveOccurred()) + err = os.WriteFile(rulesFile, []byte("old content"), 0644) + Expect(err).NotTo(HaveOccurred()) + + err = ensureNMUnmanagedUdevRule() + Expect(err).NotTo(HaveOccurred()) + + content, err := os.ReadFile(rulesFile) + Expect(err).NotTo(HaveOccurred()) + Expect(string(content)).To(Equal(nmUnmanagedRulesContent)) + }) + + It("should create parent directories if they don't exist", func() { + nmUnmanagedRulesPath = filepath.Join(tempDir, "subdir", "rules.d", "10-nm-unmanaged.rules") + + err := ensureNMUnmanagedUdevRule() + Expect(err).NotTo(HaveOccurred()) + + content, err := os.ReadFile(nmUnmanagedRulesPath) + Expect(err).NotTo(HaveOccurred()) + Expect(string(content)).To(Equal(nmUnmanagedRulesContent)) + }) + + It("should return error if udevadm reload fails", func() { + setRulesPath() + + udevRunner = func(name string, args ...string) ([]byte, error) { + if len(args) > 0 && args[0] == "control" { + return []byte("reload failed"), fmt.Errorf("exit status 1") + } + return nil, nil + } + + err := ensureNMUnmanagedUdevRule() + Expect(err).To(HaveOccurred()) + Expect(err.Error()).To(ContainSubstring("udevadm control --reload-rules failed")) + }) + + It("should return error if udevadm trigger fails", func() { + setRulesPath() + + udevRunner = func(name string, args ...string) ([]byte, error) { + if len(args) > 0 && args[0] == "trigger" { + return []byte("trigger failed"), fmt.Errorf("exit status 1") + } + return nil, nil + } + + err := ensureNMUnmanagedUdevRule() + Expect(err).To(HaveOccurred()) + Expect(err.Error()).To(ContainSubstring("udevadm trigger --subsystem-match=net failed")) + }) +}) diff --git a/internal/provisioning/hostagent/util/netconfig/systemd_networkd.go b/internal/provisioning/hostagent/util/netconfig/systemd_networkd.go index c0d3bd01e..7f1c7f764 100644 --- a/internal/provisioning/hostagent/util/netconfig/systemd_networkd.go +++ b/internal/provisioning/hostagent/util/netconfig/systemd_networkd.go @@ -59,3 +59,7 @@ func (s *SystemdNetworkdBackend) ApplyConfiguration() error { func (s *SystemdNetworkdBackend) IsDHCPConfigured(interfaceName string) (bool, error) { return hostutil.IsDHCPConfigured(interfaceName) } + +func (s *SystemdNetworkdBackend) EnsureVFsUnmanaged() error { + return nil +}