From 6e5fdb605cd43372d60ace98665e4c1ec0874e53 Mon Sep 17 00:00:00 2001 From: Peter Hunt Date: Tue, 23 Jun 2026 14:12:37 -0400 Subject: [PATCH] crio: conditionally enable min_injected_gomaxprocs for high-CPU pools Go processes on nodes with many CPUs suffer from excessive runtime threads when GOMAXPROCS isn't set. The go runtime creates threads per CPU it can see, but the kubernetes scheduler binpacks based on CPU requests, leading to scheduling and GC latency issues. CRI-O's min_injected_gomaxprocs feature injects the GOMAXPROCS environment variable into all pods based on their CPU request (set to 2*requested CPUs to allow bursting while mitigating latency). This change automatically enables min_injected_gomaxprocs for any pool (master, worker, arbiter, or custom) where any node has >= 128 CPUs. The controller: - Queries nodes in each pool to find max CPU count - Generates a CRIO drop-in config file when CPU >= 128 - Sets min_injected_gomaxprocs = 1 to enable the feature This approach is pool-aware and only affects high-CPU nodes where the GOMAXPROCS issue is most impactful. Signed-off-by: Peter Hunt --- .../container_runtime_config_controller.go | 15 ++ .../container-runtime-config/helpers.go | 51 ++++ .../container-runtime-config/helpers_test.go | 248 ++++++++++++++++++ 3 files changed, 314 insertions(+) diff --git a/pkg/controller/container-runtime-config/container_runtime_config_controller.go b/pkg/controller/container-runtime-config/container_runtime_config_controller.go index 9771ebb841..063e232bf5 100644 --- a/pkg/controller/container-runtime-config/container_runtime_config_controller.go +++ b/pkg/controller/container-runtime-config/container_runtime_config_controller.go @@ -85,6 +85,7 @@ type Controller struct { templatesDir string client mcfgclientset.Interface + kubeClient clientset.Interface configClient configclientset.Interface eventRecorder record.EventRecorder @@ -160,6 +161,7 @@ func New( ctrl := &Controller{ templatesDir: templatesDir, client: mcfgClient, + kubeClient: kubeClient, configClient: configClient, eventRecorder: ctrlcommon.NamespacedEventRecorder(eventBroadcaster.NewRecorder(scheme.Scheme, corev1.EventSource{Component: "machineconfigcontroller-containerruntimeconfigcontroller"})), queue: workqueue.NewTypedRateLimitingQueueWithConfig( @@ -815,6 +817,19 @@ func (ctrl *Controller) syncContainerRuntimeConfig(key string) error { configFileList = append(configFileList, crioFileConfigs...) } + // For pools with >= 128 CPUs, enable min_injected_gomaxprocs + maxCPU := getMaxCPUCountFromPool(ctrl.kubeClient, pool) + if maxCPU >= 128 { + tomlConf := tomlConfigCRIOMinInjectedGomaxprocs{} + tomlConf.Crio.Runtime.MinInjectedGomaxprocs = 1 + gomaxprocsConfig, err := addTOMLgeneratedConfigFile([]generatedConfigFile{}, crioDropInFilePathMinInjectedGomaxprocs, tomlConf) + if err != nil { + return ctrl.syncStatusOnly(cfg, err, "could not create min_injected_gomaxprocs config for pool %s: %v", pool.Name, err) + } + configFileList = append(configFileList, gomaxprocsConfig...) + klog.Infof("Enabling min_injected_gomaxprocs for pool %s with %d CPUs", pool.Name, maxCPU) + } + if isNotFound { tempIgnCfg := ctrlcommon.NewIgnConfig() mc, err = ctrlcommon.MachineConfigFromIgnConfig(role, managedKey, tempIgnCfg) diff --git a/pkg/controller/container-runtime-config/helpers.go b/pkg/controller/container-runtime-config/helpers.go index e0714add46..796941420b 100644 --- a/pkg/controller/container-runtime-config/helpers.go +++ b/pkg/controller/container-runtime-config/helpers.go @@ -32,6 +32,7 @@ import ( corev1 "k8s.io/api/core/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/labels" + clientset "k8s.io/client-go/kubernetes" "k8s.io/klog/v2" mcfgv1 "github.com/openshift/api/machineconfiguration/v1" @@ -60,6 +61,7 @@ const ( // will be dropped in this is exported so that we can use it in the e2e-tests CRIODropInFilePathDefaultRuntime = "/etc/crio/crio.conf.d/01-ctrcfg-defaultRuntime" crioDropInFilePathAdditionalArtifactStores = "/etc/crio/crio.conf.d/01-ctrcfg-additionalArtifactStores" + crioDropInFilePathMinInjectedGomaxprocs = "/etc/crio/crio.conf.d/01-ctrcfg-minInjectedGomaxprocs" imagepolicyType = "sigstoreSigned" sigstoreRegistriesConfigFilePath = "/etc/containers/registries.d/sigstore-registries.yaml" crioCredentialProviderName = "crio-credential-provider" @@ -145,6 +147,17 @@ type tomlConfigCRIOAdditionalArtifactStores struct { } `toml:"crio"` } +// tomlConfigCRIOMinInjectedGomaxprocs is used for setting min_injected_gomaxprocs +// TOML-friendly (it has all of the explicit tables). It's just used for +// conversions. +type tomlConfigCRIOMinInjectedGomaxprocs struct { + Crio struct { + Runtime struct { + MinInjectedGomaxprocs int64 `toml:"min_injected_gomaxprocs,omitempty"` + } `toml:"runtime"` + } `toml:"crio"` +} + type dockerConfig struct { UseSigstoreAttachments bool `json:"use-sigstore-attachments,omitempty"` } @@ -1553,3 +1566,41 @@ func wrapErrorWithCRIOCredentialProviderConfigCondition(err error, conditionType message, ) } + +// getMaxCPUCountFromPool returns the maximum CPU count from any node in the given pool. +// Returns 0 if no nodes are found or if there's an error querying nodes. +func getMaxCPUCountFromPool(kubeClient clientset.Interface, pool *mcfgv1.MachineConfigPool) int64 { + // Get the node selector from the pool + selector, err := metav1.LabelSelectorAsSelector(pool.Spec.NodeSelector) + if err != nil { + klog.V(2).Infof("error converting node selector to selector for pool %s: %v", pool.Name, err) + return 0 + } + + // List nodes matching the pool's node selector + nodes, err := kubeClient.CoreV1().Nodes().List(context.TODO(), metav1.ListOptions{ + LabelSelector: selector.String(), + }) + if err != nil { + klog.V(2).Infof("error listing nodes for pool %s: %v", pool.Name, err) + return 0 + } + + if len(nodes.Items) == 0 { + klog.V(2).Infof("no nodes found for pool %s", pool.Name) + return 0 + } + + // Find the maximum CPU count across all nodes + var maxCPU int64 + for _, node := range nodes.Items { + cpuQuantity := node.Status.Capacity[corev1.ResourceCPU] + cpuCount := cpuQuantity.Value() + if cpuCount > maxCPU { + maxCPU = cpuCount + } + } + + klog.V(4).Infof("pool %s has max CPU count of %d across %d nodes", pool.Name, maxCPU, len(nodes.Items)) + return maxCPU +} diff --git a/pkg/controller/container-runtime-config/helpers_test.go b/pkg/controller/container-runtime-config/helpers_test.go index 360a72f933..716322b412 100644 --- a/pkg/controller/container-runtime-config/helpers_test.go +++ b/pkg/controller/container-runtime-config/helpers_test.go @@ -26,8 +26,11 @@ import ( corev1 "k8s.io/api/core/v1" "k8s.io/apimachinery/pkg/api/resource" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/runtime" "k8s.io/apimachinery/pkg/util/diff" "k8s.io/apimachinery/pkg/util/yaml" + k8sfake "k8s.io/client-go/kubernetes/fake" + k8stesting "k8s.io/client-go/testing" "k8s.io/utils/ptr" ) @@ -2968,3 +2971,248 @@ providers: } } + +func TestGetMaxCPUCountFromPool(t *testing.T) { + tests := []struct { + name string + pool *mcfgv1.MachineConfigPool + nodes []corev1.Node + expectedCPU int64 + expectedErrLog bool + }{ + { + name: "single node with 128 CPUs", + pool: &mcfgv1.MachineConfigPool{ + ObjectMeta: metav1.ObjectMeta{ + Name: "worker", + }, + Spec: mcfgv1.MachineConfigPoolSpec{ + NodeSelector: &metav1.LabelSelector{ + MatchLabels: map[string]string{ + "node-role.kubernetes.io/worker": "", + }, + }, + }, + }, + nodes: []corev1.Node{ + { + ObjectMeta: metav1.ObjectMeta{ + Name: "worker-1", + Labels: map[string]string{ + "node-role.kubernetes.io/worker": "", + }, + }, + Status: corev1.NodeStatus{ + Capacity: corev1.ResourceList{ + corev1.ResourceCPU: *resource.NewQuantity(128, resource.DecimalSI), + }, + }, + }, + }, + expectedCPU: 128, + }, + { + name: "multiple nodes with different CPU counts", + pool: &mcfgv1.MachineConfigPool{ + ObjectMeta: metav1.ObjectMeta{ + Name: "worker", + }, + Spec: mcfgv1.MachineConfigPoolSpec{ + NodeSelector: &metav1.LabelSelector{ + MatchLabels: map[string]string{ + "node-role.kubernetes.io/worker": "", + }, + }, + }, + }, + nodes: []corev1.Node{ + { + ObjectMeta: metav1.ObjectMeta{ + Name: "worker-1", + Labels: map[string]string{ + "node-role.kubernetes.io/worker": "", + }, + }, + Status: corev1.NodeStatus{ + Capacity: corev1.ResourceList{ + corev1.ResourceCPU: *resource.NewQuantity(64, resource.DecimalSI), + }, + }, + }, + { + ObjectMeta: metav1.ObjectMeta{ + Name: "worker-2", + Labels: map[string]string{ + "node-role.kubernetes.io/worker": "", + }, + }, + Status: corev1.NodeStatus{ + Capacity: corev1.ResourceList{ + corev1.ResourceCPU: *resource.NewQuantity(256, resource.DecimalSI), + }, + }, + }, + { + ObjectMeta: metav1.ObjectMeta{ + Name: "worker-3", + Labels: map[string]string{ + "node-role.kubernetes.io/worker": "", + }, + }, + Status: corev1.NodeStatus{ + Capacity: corev1.ResourceList{ + corev1.ResourceCPU: *resource.NewQuantity(128, resource.DecimalSI), + }, + }, + }, + }, + expectedCPU: 256, // Should return the maximum + }, + { + name: "node with low CPU count", + pool: &mcfgv1.MachineConfigPool{ + ObjectMeta: metav1.ObjectMeta{ + Name: "worker", + }, + Spec: mcfgv1.MachineConfigPoolSpec{ + NodeSelector: &metav1.LabelSelector{ + MatchLabels: map[string]string{ + "node-role.kubernetes.io/worker": "", + }, + }, + }, + }, + nodes: []corev1.Node{ + { + ObjectMeta: metav1.ObjectMeta{ + Name: "worker-1", + Labels: map[string]string{ + "node-role.kubernetes.io/worker": "", + }, + }, + Status: corev1.NodeStatus{ + Capacity: corev1.ResourceList{ + corev1.ResourceCPU: *resource.NewQuantity(16, resource.DecimalSI), + }, + }, + }, + }, + expectedCPU: 16, + }, + { + name: "no nodes matching selector", + pool: &mcfgv1.MachineConfigPool{ + ObjectMeta: metav1.ObjectMeta{ + Name: "worker", + }, + Spec: mcfgv1.MachineConfigPoolSpec{ + NodeSelector: &metav1.LabelSelector{ + MatchLabels: map[string]string{ + "node-role.kubernetes.io/worker": "", + }, + }, + }, + }, + nodes: []corev1.Node{ + { + ObjectMeta: metav1.ObjectMeta{ + Name: "master-1", + Labels: map[string]string{ + "node-role.kubernetes.io/master": "", + }, + }, + Status: corev1.NodeStatus{ + Capacity: corev1.ResourceList{ + corev1.ResourceCPU: *resource.NewQuantity(32, resource.DecimalSI), + }, + }, + }, + }, + expectedCPU: 0, + }, + { + name: "no nodes in cluster", + pool: &mcfgv1.MachineConfigPool{ + ObjectMeta: metav1.ObjectMeta{ + Name: "worker", + }, + Spec: mcfgv1.MachineConfigPoolSpec{ + NodeSelector: &metav1.LabelSelector{ + MatchLabels: map[string]string{ + "node-role.kubernetes.io/worker": "", + }, + }, + }, + }, + nodes: []corev1.Node{}, + expectedCPU: 0, + }, + { + name: "invalid node selector", + pool: &mcfgv1.MachineConfigPool{ + ObjectMeta: metav1.ObjectMeta{ + Name: "worker", + }, + Spec: mcfgv1.MachineConfigPoolSpec{ + NodeSelector: &metav1.LabelSelector{ + MatchExpressions: []metav1.LabelSelectorRequirement{ + { + Key: "invalid", + Operator: "InvalidOperator", + Values: []string{"value"}, + }, + }, + }, + }, + }, + nodes: []corev1.Node{}, + expectedCPU: 0, + expectedErrLog: true, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + // Create fake kubernetes client with the test nodes + objects := []runtime.Object{} + for i := range tt.nodes { + objects = append(objects, &tt.nodes[i]) + } + + fakeKubeClient := k8sfake.NewSimpleClientset(objects...) + + // Call the function + maxCPU := getMaxCPUCountFromPool(fakeKubeClient, tt.pool) + + // Assert the result + assert.Equal(t, tt.expectedCPU, maxCPU, "unexpected CPU count") + }) + } +} + +func TestGetMaxCPUCountFromPool_APIFailure(t *testing.T) { + pool := &mcfgv1.MachineConfigPool{ + ObjectMeta: metav1.ObjectMeta{ + Name: "worker", + }, + Spec: mcfgv1.MachineConfigPoolSpec{ + NodeSelector: &metav1.LabelSelector{ + MatchLabels: map[string]string{ + "node-role.kubernetes.io/worker": "", + }, + }, + }, + } + + // Create fake client with a reactor that returns an error on list + fakeKubeClient := k8sfake.NewSimpleClientset() + fakeKubeClient.PrependReactor("list", "nodes", func(action k8stesting.Action) (handled bool, ret runtime.Object, err error) { + return true, nil, fmt.Errorf("simulated API error") + }) + + // Call the function - should handle the error gracefully + maxCPU := getMaxCPUCountFromPool(fakeKubeClient, pool) + + // Should return 0 when API call fails + assert.Equal(t, int64(0), maxCPU, "expected 0 CPU count when API fails") +}