Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -85,6 +85,7 @@ type Controller struct {
templatesDir string

client mcfgclientset.Interface
kubeClient clientset.Interface
configClient configclientset.Interface
eventRecorder record.EventRecorder

Expand Down Expand Up @@ -160,6 +161,7 @@ func New(
ctrl := &Controller{
templatesDir: templatesDir,
client: mcfgClient,
kubeClient: kubeClient,
configClient: configClient,
eventRecorder: ctrlcommon.NamespacedEventRecorder(eventBroadcaster.NewRecorder(scheme.Scheme, corev1.EventSource{Component: "machineconfigcontroller-containerruntimeconfigcontroller"})),
queue: workqueue.NewTypedRateLimitingQueueWithConfig(
Expand Down Expand Up @@ -815,6 +817,19 @@ func (ctrl *Controller) syncContainerRuntimeConfig(key string) error {
configFileList = append(configFileList, crioFileConfigs...)
}

// For pools with >= 128 CPUs, enable min_injected_gomaxprocs
maxCPU := getMaxCPUCountFromPool(ctrl.kubeClient, pool)
if maxCPU >= 128 {

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Quick design question on this block. Since it reads the live node CPU count on every sync and the answer lands in a MachineConfig, the on/off state can change from things that are not config edits, and each change reboots the pool. For example, if an autoscaler takes a worker pool from 128 CPU nodes down to 64 and later back up, the file gets removed and then added back, so the pool reboots on each change. A fresh pool whose nodes have not registered yet also reads as 0 first and flips on once they join.

Would it be simpler to keep MCO out of the CPU counting and just turn the feature on here, then let CRI-O decide per node at container creation? CRI-O already runs on the node and sits in the injection path (gomaxprocs_hooks_linux.go), so it could skip injection on smaller nodes itself. Then the MachineConfig never changes based on cluster state, the reboot churn goes away, and getMaxCPUCountFromPool plus the kubeClient plumbing would not be needed. Is there a reason the 128 threshold needs to live in MCO, or would moving that decision into CRI-O work for your case?

tomlConf := tomlConfigCRIOMinInjectedGomaxprocs{}
tomlConf.Crio.Runtime.MinInjectedGomaxprocs = 1
gomaxprocsConfig, err := addTOMLgeneratedConfigFile([]generatedConfigFile{}, crioDropInFilePathMinInjectedGomaxprocs, tomlConf)
if err != nil {
return ctrl.syncStatusOnly(cfg, err, "could not create min_injected_gomaxprocs config for pool %s: %v", pool.Name, err)
}
configFileList = append(configFileList, gomaxprocsConfig...)
klog.Infof("Enabling min_injected_gomaxprocs for pool %s with %d CPUs", pool.Name, maxCPU)
}

if isNotFound {
tempIgnCfg := ctrlcommon.NewIgnConfig()
mc, err = ctrlcommon.MachineConfigFromIgnConfig(role, managedKey, tempIgnCfg)
Expand Down
51 changes: 51 additions & 0 deletions pkg/controller/container-runtime-config/helpers.go
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@ import (
corev1 "k8s.io/api/core/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/labels"
clientset "k8s.io/client-go/kubernetes"
"k8s.io/klog/v2"

mcfgv1 "github.com/openshift/api/machineconfiguration/v1"
Expand Down Expand Up @@ -60,6 +61,7 @@ const (
// will be dropped in this is exported so that we can use it in the e2e-tests
CRIODropInFilePathDefaultRuntime = "/etc/crio/crio.conf.d/01-ctrcfg-defaultRuntime"
crioDropInFilePathAdditionalArtifactStores = "/etc/crio/crio.conf.d/01-ctrcfg-additionalArtifactStores"
crioDropInFilePathMinInjectedGomaxprocs = "/etc/crio/crio.conf.d/01-ctrcfg-minInjectedGomaxprocs"
imagepolicyType = "sigstoreSigned"
sigstoreRegistriesConfigFilePath = "/etc/containers/registries.d/sigstore-registries.yaml"
crioCredentialProviderName = "crio-credential-provider"
Expand Down Expand Up @@ -145,6 +147,17 @@ type tomlConfigCRIOAdditionalArtifactStores struct {
} `toml:"crio"`
}

// tomlConfigCRIOMinInjectedGomaxprocs is used for setting min_injected_gomaxprocs
// TOML-friendly (it has all of the explicit tables). It's just used for
// conversions.
type tomlConfigCRIOMinInjectedGomaxprocs struct {
Crio struct {
Runtime struct {
MinInjectedGomaxprocs int64 `toml:"min_injected_gomaxprocs,omitempty"`
} `toml:"runtime"`
} `toml:"crio"`
}

type dockerConfig struct {
UseSigstoreAttachments bool `json:"use-sigstore-attachments,omitempty"`
}
Expand Down Expand Up @@ -1553,3 +1566,41 @@ func wrapErrorWithCRIOCredentialProviderConfigCondition(err error, conditionType
message,
)
}

// getMaxCPUCountFromPool returns the maximum CPU count from any node in the given pool.
// Returns 0 if no nodes are found or if there's an error querying nodes.
func getMaxCPUCountFromPool(kubeClient clientset.Interface, pool *mcfgv1.MachineConfigPool) int64 {
// Get the node selector from the pool
selector, err := metav1.LabelSelectorAsSelector(pool.Spec.NodeSelector)
if err != nil {
klog.V(2).Infof("error converting node selector to selector for pool %s: %v", pool.Name, err)
return 0
}

// List nodes matching the pool's node selector
nodes, err := kubeClient.CoreV1().Nodes().List(context.TODO(), metav1.ListOptions{

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This does a full node List against the API server on every sync, once per pool. The controller already runs informers/listers for its other watched types, but there's no node informer wired in today, so this call is uncached. Wiring a node lister and reading from cache here would cut the per-sync API load and also make the transient failure above much less likely to flap the config. Worth weighing against the cost of adding a node watch to this controller.

LabelSelector: selector.String(),
})
if err != nil {
klog.V(2).Infof("error listing nodes for pool %s: %v", pool.Name, err)
return 0

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This collapses every failure into 0, which is the same value as "no big nodes," and 0 means the drop-in gets dropped from the rendered MachineConfig. So a transient node List error during a sync removes the config and reboots the whole pool, then the next successful sync adds it back and reboots again. A blip on the API shouldn't be able to roll a pool. Can we propagate the error and degrade through syncStatusOnly the same way the TOML path below does, instead of swallowing it?

}

if len(nodes.Items) == 0 {
klog.V(2).Infof("no nodes found for pool %s", pool.Name)
return 0
}

// Find the maximum CPU count across all nodes
var maxCPU int64
for _, node := range nodes.Items {
cpuQuantity := node.Status.Capacity[corev1.ResourceCPU]
cpuCount := cpuQuantity.Value()
if cpuCount > maxCPU {
maxCPU = cpuCount
}
}

klog.V(4).Infof("pool %s has max CPU count of %d across %d nodes", pool.Name, maxCPU, len(nodes.Items))
return maxCPU
}
248 changes: 248 additions & 0 deletions pkg/controller/container-runtime-config/helpers_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -26,8 +26,11 @@ import (
corev1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/api/resource"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/runtime"
"k8s.io/apimachinery/pkg/util/diff"
"k8s.io/apimachinery/pkg/util/yaml"
k8sfake "k8s.io/client-go/kubernetes/fake"
k8stesting "k8s.io/client-go/testing"
"k8s.io/utils/ptr"
)

Expand Down Expand Up @@ -2968,3 +2971,248 @@ providers:
}

}

func TestGetMaxCPUCountFromPool(t *testing.T) {
tests := []struct {
name string
pool *mcfgv1.MachineConfigPool
nodes []corev1.Node
expectedCPU int64
expectedErrLog bool

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

expectedErrLog is set but never asserted on, so it's dead right now. Either assert against captured log output or drop the field. Also, no test covers the new branch in syncContainerRuntimeConfig itself (drop-in appended at >=128, and the degrade path on TOML failure), which is the riskier wiring.

}{
{
name: "single node with 128 CPUs",
pool: &mcfgv1.MachineConfigPool{
ObjectMeta: metav1.ObjectMeta{
Name: "worker",
},
Spec: mcfgv1.MachineConfigPoolSpec{
NodeSelector: &metav1.LabelSelector{
MatchLabels: map[string]string{
"node-role.kubernetes.io/worker": "",
},
},
},
},
nodes: []corev1.Node{
{
ObjectMeta: metav1.ObjectMeta{
Name: "worker-1",
Labels: map[string]string{
"node-role.kubernetes.io/worker": "",
},
},
Status: corev1.NodeStatus{
Capacity: corev1.ResourceList{
corev1.ResourceCPU: *resource.NewQuantity(128, resource.DecimalSI),
},
},
},
},
expectedCPU: 128,
},
{
name: "multiple nodes with different CPU counts",
pool: &mcfgv1.MachineConfigPool{
ObjectMeta: metav1.ObjectMeta{
Name: "worker",
},
Spec: mcfgv1.MachineConfigPoolSpec{
NodeSelector: &metav1.LabelSelector{
MatchLabels: map[string]string{
"node-role.kubernetes.io/worker": "",
},
},
},
},
nodes: []corev1.Node{
{
ObjectMeta: metav1.ObjectMeta{
Name: "worker-1",
Labels: map[string]string{
"node-role.kubernetes.io/worker": "",
},
},
Status: corev1.NodeStatus{
Capacity: corev1.ResourceList{
corev1.ResourceCPU: *resource.NewQuantity(64, resource.DecimalSI),
},
},
},
{
ObjectMeta: metav1.ObjectMeta{
Name: "worker-2",
Labels: map[string]string{
"node-role.kubernetes.io/worker": "",
},
},
Status: corev1.NodeStatus{
Capacity: corev1.ResourceList{
corev1.ResourceCPU: *resource.NewQuantity(256, resource.DecimalSI),
},
},
},
{
ObjectMeta: metav1.ObjectMeta{
Name: "worker-3",
Labels: map[string]string{
"node-role.kubernetes.io/worker": "",
},
},
Status: corev1.NodeStatus{
Capacity: corev1.ResourceList{
corev1.ResourceCPU: *resource.NewQuantity(128, resource.DecimalSI),
},
},
},
},
expectedCPU: 256, // Should return the maximum
},
{
name: "node with low CPU count",
pool: &mcfgv1.MachineConfigPool{
ObjectMeta: metav1.ObjectMeta{
Name: "worker",
},
Spec: mcfgv1.MachineConfigPoolSpec{
NodeSelector: &metav1.LabelSelector{
MatchLabels: map[string]string{
"node-role.kubernetes.io/worker": "",
},
},
},
},
nodes: []corev1.Node{
{
ObjectMeta: metav1.ObjectMeta{
Name: "worker-1",
Labels: map[string]string{
"node-role.kubernetes.io/worker": "",
},
},
Status: corev1.NodeStatus{
Capacity: corev1.ResourceList{
corev1.ResourceCPU: *resource.NewQuantity(16, resource.DecimalSI),
},
},
},
},
expectedCPU: 16,
},
{
name: "no nodes matching selector",
pool: &mcfgv1.MachineConfigPool{
ObjectMeta: metav1.ObjectMeta{
Name: "worker",
},
Spec: mcfgv1.MachineConfigPoolSpec{
NodeSelector: &metav1.LabelSelector{
MatchLabels: map[string]string{
"node-role.kubernetes.io/worker": "",
},
},
},
},
nodes: []corev1.Node{
{
ObjectMeta: metav1.ObjectMeta{
Name: "master-1",
Labels: map[string]string{
"node-role.kubernetes.io/master": "",
},
},
Status: corev1.NodeStatus{
Capacity: corev1.ResourceList{
corev1.ResourceCPU: *resource.NewQuantity(32, resource.DecimalSI),
},
},
},
},
expectedCPU: 0,
},
{
name: "no nodes in cluster",
pool: &mcfgv1.MachineConfigPool{
ObjectMeta: metav1.ObjectMeta{
Name: "worker",
},
Spec: mcfgv1.MachineConfigPoolSpec{
NodeSelector: &metav1.LabelSelector{
MatchLabels: map[string]string{
"node-role.kubernetes.io/worker": "",
},
},
},
},
nodes: []corev1.Node{},
expectedCPU: 0,
},
{
name: "invalid node selector",
pool: &mcfgv1.MachineConfigPool{
ObjectMeta: metav1.ObjectMeta{
Name: "worker",
},
Spec: mcfgv1.MachineConfigPoolSpec{
NodeSelector: &metav1.LabelSelector{
MatchExpressions: []metav1.LabelSelectorRequirement{
{
Key: "invalid",
Operator: "InvalidOperator",
Values: []string{"value"},
},
},
},
},
},
nodes: []corev1.Node{},
expectedCPU: 0,
expectedErrLog: true,
},
}

for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
// Create fake kubernetes client with the test nodes
objects := []runtime.Object{}
for i := range tt.nodes {
objects = append(objects, &tt.nodes[i])
}

fakeKubeClient := k8sfake.NewSimpleClientset(objects...)

// Call the function
maxCPU := getMaxCPUCountFromPool(fakeKubeClient, tt.pool)

// Assert the result
assert.Equal(t, tt.expectedCPU, maxCPU, "unexpected CPU count")
})
}
}

func TestGetMaxCPUCountFromPool_APIFailure(t *testing.T) {
pool := &mcfgv1.MachineConfigPool{
ObjectMeta: metav1.ObjectMeta{
Name: "worker",
},
Spec: mcfgv1.MachineConfigPoolSpec{
NodeSelector: &metav1.LabelSelector{
MatchLabels: map[string]string{
"node-role.kubernetes.io/worker": "",
},
},
},
}

// Create fake client with a reactor that returns an error on list
fakeKubeClient := k8sfake.NewSimpleClientset()
fakeKubeClient.PrependReactor("list", "nodes", func(action k8stesting.Action) (handled bool, ret runtime.Object, err error) {
return true, nil, fmt.Errorf("simulated API error")
})

// Call the function - should handle the error gracefully
maxCPU := getMaxCPUCountFromPool(fakeKubeClient, pool)

// Should return 0 when API call fails
assert.Equal(t, int64(0), maxCPU, "expected 0 CPU count when API fails")
}