From 6e5fdb605cd43372d60ace98665e4c1ec0874e53 Mon Sep 17 00:00:00 2001
From: Peter Hunt <pehunt@redhat.com>
Date: Tue, 23 Jun 2026 14:12:37 -0400
Subject: [PATCH] crio: conditionally enable min_injected_gomaxprocs for
 high-CPU pools

Go processes on nodes with many CPUs suffer from excessive runtime
threads when GOMAXPROCS isn't set. The go runtime creates threads
per CPU it can see, but the kubernetes scheduler binpacks based on
CPU requests, leading to scheduling and GC latency issues.

CRI-O's min_injected_gomaxprocs feature injects the GOMAXPROCS
environment variable into all pods based on their CPU request (set
to 2*requested CPUs to allow bursting while mitigating latency).

This change automatically enables min_injected_gomaxprocs for any
pool (master, worker, arbiter, or custom) where any node has >= 128
CPUs. The controller:
- Queries nodes in each pool to find max CPU count
- Generates a CRIO drop-in config file when CPU >= 128
- Sets min_injected_gomaxprocs = 1 to enable the feature

This approach is pool-aware and only affects high-CPU nodes where
the GOMAXPROCS issue is most impactful.

Signed-off-by: Peter Hunt <pehunt@redhat.com>
---
 .../container_runtime_config_controller.go    |  15 ++
 .../container-runtime-config/helpers.go       |  51 ++++
 .../container-runtime-config/helpers_test.go  | 248 ++++++++++++++++++
 3 files changed, 314 insertions(+)

diff --git a/pkg/controller/container-runtime-config/container_runtime_config_controller.go b/pkg/controller/container-runtime-config/container_runtime_config_controller.go
index 9771ebb841..063e232bf5 100644
--- a/pkg/controller/container-runtime-config/container_runtime_config_controller.go
+++ b/pkg/controller/container-runtime-config/container_runtime_config_controller.go
@@ -85,6 +85,7 @@ type Controller struct {
 	templatesDir string
 
 	client        mcfgclientset.Interface
+	kubeClient    clientset.Interface
 	configClient  configclientset.Interface
 	eventRecorder record.EventRecorder
 
@@ -160,6 +161,7 @@ func New(
 	ctrl := &Controller{
 		templatesDir:  templatesDir,
 		client:        mcfgClient,
+		kubeClient:    kubeClient,
 		configClient:  configClient,
 		eventRecorder: ctrlcommon.NamespacedEventRecorder(eventBroadcaster.NewRecorder(scheme.Scheme, corev1.EventSource{Component: "machineconfigcontroller-containerruntimeconfigcontroller"})),
 		queue: workqueue.NewTypedRateLimitingQueueWithConfig(
@@ -815,6 +817,19 @@ func (ctrl *Controller) syncContainerRuntimeConfig(key string) error {
 			configFileList = append(configFileList, crioFileConfigs...)
 		}
 
+		// For pools with >= 128 CPUs, enable min_injected_gomaxprocs
+		maxCPU := getMaxCPUCountFromPool(ctrl.kubeClient, pool)
+		if maxCPU >= 128 {
+			tomlConf := tomlConfigCRIOMinInjectedGomaxprocs{}
+			tomlConf.Crio.Runtime.MinInjectedGomaxprocs = 1
+			gomaxprocsConfig, err := addTOMLgeneratedConfigFile([]generatedConfigFile{}, crioDropInFilePathMinInjectedGomaxprocs, tomlConf)
+			if err != nil {
+				return ctrl.syncStatusOnly(cfg, err, "could not create min_injected_gomaxprocs config for pool %s: %v", pool.Name, err)
+			}
+			configFileList = append(configFileList, gomaxprocsConfig...)
+			klog.Infof("Enabling min_injected_gomaxprocs for pool %s with %d CPUs", pool.Name, maxCPU)
+		}
+
 		if isNotFound {
 			tempIgnCfg := ctrlcommon.NewIgnConfig()
 			mc, err = ctrlcommon.MachineConfigFromIgnConfig(role, managedKey, tempIgnCfg)
diff --git a/pkg/controller/container-runtime-config/helpers.go b/pkg/controller/container-runtime-config/helpers.go
index e0714add46..796941420b 100644
--- a/pkg/controller/container-runtime-config/helpers.go
+++ b/pkg/controller/container-runtime-config/helpers.go
@@ -32,6 +32,7 @@ import (
 	corev1 "k8s.io/api/core/v1"
 	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
 	"k8s.io/apimachinery/pkg/labels"
+	clientset "k8s.io/client-go/kubernetes"
 	"k8s.io/klog/v2"
 
 	mcfgv1 "github.com/openshift/api/machineconfiguration/v1"
@@ -60,6 +61,7 @@ const (
 	// will be dropped in this is exported so that we can use it in the e2e-tests
 	CRIODropInFilePathDefaultRuntime           = "/etc/crio/crio.conf.d/01-ctrcfg-defaultRuntime"
 	crioDropInFilePathAdditionalArtifactStores = "/etc/crio/crio.conf.d/01-ctrcfg-additionalArtifactStores"
+	crioDropInFilePathMinInjectedGomaxprocs    = "/etc/crio/crio.conf.d/01-ctrcfg-minInjectedGomaxprocs"
 	imagepolicyType                            = "sigstoreSigned"
 	sigstoreRegistriesConfigFilePath           = "/etc/containers/registries.d/sigstore-registries.yaml"
 	crioCredentialProviderName                 = "crio-credential-provider"
@@ -145,6 +147,17 @@ type tomlConfigCRIOAdditionalArtifactStores struct {
 	} `toml:"crio"`
 }
 
+// tomlConfigCRIOMinInjectedGomaxprocs is used for setting min_injected_gomaxprocs
+// TOML-friendly (it has all of the explicit tables). It's just used for
+// conversions.
+type tomlConfigCRIOMinInjectedGomaxprocs struct {
+	Crio struct {
+		Runtime struct {
+			MinInjectedGomaxprocs int64 `toml:"min_injected_gomaxprocs,omitempty"`
+		} `toml:"runtime"`
+	} `toml:"crio"`
+}
+
 type dockerConfig struct {
 	UseSigstoreAttachments bool `json:"use-sigstore-attachments,omitempty"`
 }
@@ -1553,3 +1566,41 @@ func wrapErrorWithCRIOCredentialProviderConfigCondition(err error, conditionType
 		message,
 	)
 }
+
+// getMaxCPUCountFromPool returns the maximum CPU count from any node in the given pool.
+// Returns 0 if no nodes are found or if there's an error querying nodes.
+func getMaxCPUCountFromPool(kubeClient clientset.Interface, pool *mcfgv1.MachineConfigPool) int64 {
+	// Get the node selector from the pool
+	selector, err := metav1.LabelSelectorAsSelector(pool.Spec.NodeSelector)
+	if err != nil {
+		klog.V(2).Infof("error converting node selector to selector for pool %s: %v", pool.Name, err)
+		return 0
+	}
+
+	// List nodes matching the pool's node selector
+	nodes, err := kubeClient.CoreV1().Nodes().List(context.TODO(), metav1.ListOptions{
+		LabelSelector: selector.String(),
+	})
+	if err != nil {
+		klog.V(2).Infof("error listing nodes for pool %s: %v", pool.Name, err)
+		return 0
+	}
+
+	if len(nodes.Items) == 0 {
+		klog.V(2).Infof("no nodes found for pool %s", pool.Name)
+		return 0
+	}
+
+	// Find the maximum CPU count across all nodes
+	var maxCPU int64
+	for _, node := range nodes.Items {
+		cpuQuantity := node.Status.Capacity[corev1.ResourceCPU]
+		cpuCount := cpuQuantity.Value()
+		if cpuCount > maxCPU {
+			maxCPU = cpuCount
+		}
+	}
+
+	klog.V(4).Infof("pool %s has max CPU count of %d across %d nodes", pool.Name, maxCPU, len(nodes.Items))
+	return maxCPU
+}
diff --git a/pkg/controller/container-runtime-config/helpers_test.go b/pkg/controller/container-runtime-config/helpers_test.go
index 360a72f933..716322b412 100644
--- a/pkg/controller/container-runtime-config/helpers_test.go
+++ b/pkg/controller/container-runtime-config/helpers_test.go
@@ -26,8 +26,11 @@ import (
 	corev1 "k8s.io/api/core/v1"
 	"k8s.io/apimachinery/pkg/api/resource"
 	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+	"k8s.io/apimachinery/pkg/runtime"
 	"k8s.io/apimachinery/pkg/util/diff"
 	"k8s.io/apimachinery/pkg/util/yaml"
+	k8sfake "k8s.io/client-go/kubernetes/fake"
+	k8stesting "k8s.io/client-go/testing"
 	"k8s.io/utils/ptr"
 )
 
@@ -2968,3 +2971,248 @@ providers:
 	}
 
 }
+
+func TestGetMaxCPUCountFromPool(t *testing.T) {
+	tests := []struct {
+		name           string
+		pool           *mcfgv1.MachineConfigPool
+		nodes          []corev1.Node
+		expectedCPU    int64
+		expectedErrLog bool
+	}{
+		{
+			name: "single node with 128 CPUs",
+			pool: &mcfgv1.MachineConfigPool{
+				ObjectMeta: metav1.ObjectMeta{
+					Name: "worker",
+				},
+				Spec: mcfgv1.MachineConfigPoolSpec{
+					NodeSelector: &metav1.LabelSelector{
+						MatchLabels: map[string]string{
+							"node-role.kubernetes.io/worker": "",
+						},
+					},
+				},
+			},
+			nodes: []corev1.Node{
+				{
+					ObjectMeta: metav1.ObjectMeta{
+						Name: "worker-1",
+						Labels: map[string]string{
+							"node-role.kubernetes.io/worker": "",
+						},
+					},
+					Status: corev1.NodeStatus{
+						Capacity: corev1.ResourceList{
+							corev1.ResourceCPU: *resource.NewQuantity(128, resource.DecimalSI),
+						},
+					},
+				},
+			},
+			expectedCPU: 128,
+		},
+		{
+			name: "multiple nodes with different CPU counts",
+			pool: &mcfgv1.MachineConfigPool{
+				ObjectMeta: metav1.ObjectMeta{
+					Name: "worker",
+				},
+				Spec: mcfgv1.MachineConfigPoolSpec{
+					NodeSelector: &metav1.LabelSelector{
+						MatchLabels: map[string]string{
+							"node-role.kubernetes.io/worker": "",
+						},
+					},
+				},
+			},
+			nodes: []corev1.Node{
+				{
+					ObjectMeta: metav1.ObjectMeta{
+						Name: "worker-1",
+						Labels: map[string]string{
+							"node-role.kubernetes.io/worker": "",
+						},
+					},
+					Status: corev1.NodeStatus{
+						Capacity: corev1.ResourceList{
+							corev1.ResourceCPU: *resource.NewQuantity(64, resource.DecimalSI),
+						},
+					},
+				},
+				{
+					ObjectMeta: metav1.ObjectMeta{
+						Name: "worker-2",
+						Labels: map[string]string{
+							"node-role.kubernetes.io/worker": "",
+						},
+					},
+					Status: corev1.NodeStatus{
+						Capacity: corev1.ResourceList{
+							corev1.ResourceCPU: *resource.NewQuantity(256, resource.DecimalSI),
+						},
+					},
+				},
+				{
+					ObjectMeta: metav1.ObjectMeta{
+						Name: "worker-3",
+						Labels: map[string]string{
+							"node-role.kubernetes.io/worker": "",
+						},
+					},
+					Status: corev1.NodeStatus{
+						Capacity: corev1.ResourceList{
+							corev1.ResourceCPU: *resource.NewQuantity(128, resource.DecimalSI),
+						},
+					},
+				},
+			},
+			expectedCPU: 256, // Should return the maximum
+		},
+		{
+			name: "node with low CPU count",
+			pool: &mcfgv1.MachineConfigPool{
+				ObjectMeta: metav1.ObjectMeta{
+					Name: "worker",
+				},
+				Spec: mcfgv1.MachineConfigPoolSpec{
+					NodeSelector: &metav1.LabelSelector{
+						MatchLabels: map[string]string{
+							"node-role.kubernetes.io/worker": "",
+						},
+					},
+				},
+			},
+			nodes: []corev1.Node{
+				{
+					ObjectMeta: metav1.ObjectMeta{
+						Name: "worker-1",
+						Labels: map[string]string{
+							"node-role.kubernetes.io/worker": "",
+						},
+					},
+					Status: corev1.NodeStatus{
+						Capacity: corev1.ResourceList{
+							corev1.ResourceCPU: *resource.NewQuantity(16, resource.DecimalSI),
+						},
+					},
+				},
+			},
+			expectedCPU: 16,
+		},
+		{
+			name: "no nodes matching selector",
+			pool: &mcfgv1.MachineConfigPool{
+				ObjectMeta: metav1.ObjectMeta{
+					Name: "worker",
+				},
+				Spec: mcfgv1.MachineConfigPoolSpec{
+					NodeSelector: &metav1.LabelSelector{
+						MatchLabels: map[string]string{
+							"node-role.kubernetes.io/worker": "",
+						},
+					},
+				},
+			},
+			nodes: []corev1.Node{
+				{
+					ObjectMeta: metav1.ObjectMeta{
+						Name: "master-1",
+						Labels: map[string]string{
+							"node-role.kubernetes.io/master": "",
+						},
+					},
+					Status: corev1.NodeStatus{
+						Capacity: corev1.ResourceList{
+							corev1.ResourceCPU: *resource.NewQuantity(32, resource.DecimalSI),
+						},
+					},
+				},
+			},
+			expectedCPU: 0,
+		},
+		{
+			name: "no nodes in cluster",
+			pool: &mcfgv1.MachineConfigPool{
+				ObjectMeta: metav1.ObjectMeta{
+					Name: "worker",
+				},
+				Spec: mcfgv1.MachineConfigPoolSpec{
+					NodeSelector: &metav1.LabelSelector{
+						MatchLabels: map[string]string{
+							"node-role.kubernetes.io/worker": "",
+						},
+					},
+				},
+			},
+			nodes:       []corev1.Node{},
+			expectedCPU: 0,
+		},
+		{
+			name: "invalid node selector",
+			pool: &mcfgv1.MachineConfigPool{
+				ObjectMeta: metav1.ObjectMeta{
+					Name: "worker",
+				},
+				Spec: mcfgv1.MachineConfigPoolSpec{
+					NodeSelector: &metav1.LabelSelector{
+						MatchExpressions: []metav1.LabelSelectorRequirement{
+							{
+								Key:      "invalid",
+								Operator: "InvalidOperator",
+								Values:   []string{"value"},
+							},
+						},
+					},
+				},
+			},
+			nodes:          []corev1.Node{},
+			expectedCPU:    0,
+			expectedErrLog: true,
+		},
+	}
+
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			// Create fake kubernetes client with the test nodes
+			objects := []runtime.Object{}
+			for i := range tt.nodes {
+				objects = append(objects, &tt.nodes[i])
+			}
+
+			fakeKubeClient := k8sfake.NewSimpleClientset(objects...)
+
+			// Call the function
+			maxCPU := getMaxCPUCountFromPool(fakeKubeClient, tt.pool)
+
+			// Assert the result
+			assert.Equal(t, tt.expectedCPU, maxCPU, "unexpected CPU count")
+		})
+	}
+}
+
+func TestGetMaxCPUCountFromPool_APIFailure(t *testing.T) {
+	pool := &mcfgv1.MachineConfigPool{
+		ObjectMeta: metav1.ObjectMeta{
+			Name: "worker",
+		},
+		Spec: mcfgv1.MachineConfigPoolSpec{
+			NodeSelector: &metav1.LabelSelector{
+				MatchLabels: map[string]string{
+					"node-role.kubernetes.io/worker": "",
+				},
+			},
+		},
+	}
+
+	// Create fake client with a reactor that returns an error on list
+	fakeKubeClient := k8sfake.NewSimpleClientset()
+	fakeKubeClient.PrependReactor("list", "nodes", func(action k8stesting.Action) (handled bool, ret runtime.Object, err error) {
+		return true, nil, fmt.Errorf("simulated API error")
+	})
+
+	// Call the function - should handle the error gracefully
+	maxCPU := getMaxCPUCountFromPool(fakeKubeClient, pool)
+
+	// Should return 0 when API call fails
+	assert.Equal(t, int64(0), maxCPU, "expected 0 CPU count when API fails")
+}