-
Notifications
You must be signed in to change notification settings - Fork 505
crio: conditionally enable min_injected_gomaxprocs for high-CPU MCPs #6225
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -32,6 +32,7 @@ import ( | |
| corev1 "k8s.io/api/core/v1" | ||
| metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" | ||
| "k8s.io/apimachinery/pkg/labels" | ||
| clientset "k8s.io/client-go/kubernetes" | ||
| "k8s.io/klog/v2" | ||
|
|
||
| mcfgv1 "github.com/openshift/api/machineconfiguration/v1" | ||
|
|
@@ -60,6 +61,7 @@ const ( | |
| // will be dropped in this is exported so that we can use it in the e2e-tests | ||
| CRIODropInFilePathDefaultRuntime = "/etc/crio/crio.conf.d/01-ctrcfg-defaultRuntime" | ||
| crioDropInFilePathAdditionalArtifactStores = "/etc/crio/crio.conf.d/01-ctrcfg-additionalArtifactStores" | ||
| crioDropInFilePathMinInjectedGomaxprocs = "/etc/crio/crio.conf.d/01-ctrcfg-minInjectedGomaxprocs" | ||
| imagepolicyType = "sigstoreSigned" | ||
| sigstoreRegistriesConfigFilePath = "/etc/containers/registries.d/sigstore-registries.yaml" | ||
| crioCredentialProviderName = "crio-credential-provider" | ||
|
|
@@ -145,6 +147,17 @@ type tomlConfigCRIOAdditionalArtifactStores struct { | |
| } `toml:"crio"` | ||
| } | ||
|
|
||
| // tomlConfigCRIOMinInjectedGomaxprocs is used for setting min_injected_gomaxprocs | ||
| // TOML-friendly (it has all of the explicit tables). It's just used for | ||
| // conversions. | ||
| type tomlConfigCRIOMinInjectedGomaxprocs struct { | ||
| Crio struct { | ||
| Runtime struct { | ||
| MinInjectedGomaxprocs int64 `toml:"min_injected_gomaxprocs,omitempty"` | ||
| } `toml:"runtime"` | ||
| } `toml:"crio"` | ||
| } | ||
|
|
||
| type dockerConfig struct { | ||
| UseSigstoreAttachments bool `json:"use-sigstore-attachments,omitempty"` | ||
| } | ||
|
|
@@ -1553,3 +1566,41 @@ func wrapErrorWithCRIOCredentialProviderConfigCondition(err error, conditionType | |
| message, | ||
| ) | ||
| } | ||
|
|
||
| // getMaxCPUCountFromPool returns the maximum CPU count from any node in the given pool. | ||
| // Returns 0 if no nodes are found or if there's an error querying nodes. | ||
| func getMaxCPUCountFromPool(kubeClient clientset.Interface, pool *mcfgv1.MachineConfigPool) int64 { | ||
| // Get the node selector from the pool | ||
| selector, err := metav1.LabelSelectorAsSelector(pool.Spec.NodeSelector) | ||
| if err != nil { | ||
| klog.V(2).Infof("error converting node selector to selector for pool %s: %v", pool.Name, err) | ||
| return 0 | ||
| } | ||
|
|
||
| // List nodes matching the pool's node selector | ||
| nodes, err := kubeClient.CoreV1().Nodes().List(context.TODO(), metav1.ListOptions{ | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This does a full node List against the API server on every sync, once per pool. The controller already runs informers/listers for its other watched types, but there's no node informer wired in today, so this call is uncached. Wiring a node lister and reading from cache here would cut the per-sync API load and also make the transient failure above much less likely to flap the config. Worth weighing against the cost of adding a node watch to this controller. |
||
| LabelSelector: selector.String(), | ||
| }) | ||
| if err != nil { | ||
| klog.V(2).Infof("error listing nodes for pool %s: %v", pool.Name, err) | ||
| return 0 | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This collapses every failure into |
||
| } | ||
|
|
||
| if len(nodes.Items) == 0 { | ||
| klog.V(2).Infof("no nodes found for pool %s", pool.Name) | ||
| return 0 | ||
| } | ||
|
|
||
| // Find the maximum CPU count across all nodes | ||
| var maxCPU int64 | ||
| for _, node := range nodes.Items { | ||
| cpuQuantity := node.Status.Capacity[corev1.ResourceCPU] | ||
| cpuCount := cpuQuantity.Value() | ||
| if cpuCount > maxCPU { | ||
| maxCPU = cpuCount | ||
| } | ||
| } | ||
|
|
||
| klog.V(4).Infof("pool %s has max CPU count of %d across %d nodes", pool.Name, maxCPU, len(nodes.Items)) | ||
| return maxCPU | ||
| } | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -26,8 +26,11 @@ import ( | |
| corev1 "k8s.io/api/core/v1" | ||
| "k8s.io/apimachinery/pkg/api/resource" | ||
| metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" | ||
| "k8s.io/apimachinery/pkg/runtime" | ||
| "k8s.io/apimachinery/pkg/util/diff" | ||
| "k8s.io/apimachinery/pkg/util/yaml" | ||
| k8sfake "k8s.io/client-go/kubernetes/fake" | ||
| k8stesting "k8s.io/client-go/testing" | ||
| "k8s.io/utils/ptr" | ||
| ) | ||
|
|
||
|
|
@@ -2968,3 +2971,248 @@ providers: | |
| } | ||
|
|
||
| } | ||
|
|
||
| func TestGetMaxCPUCountFromPool(t *testing.T) { | ||
| tests := []struct { | ||
| name string | ||
| pool *mcfgv1.MachineConfigPool | ||
| nodes []corev1.Node | ||
| expectedCPU int64 | ||
| expectedErrLog bool | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
|
||
| }{ | ||
| { | ||
| name: "single node with 128 CPUs", | ||
| pool: &mcfgv1.MachineConfigPool{ | ||
| ObjectMeta: metav1.ObjectMeta{ | ||
| Name: "worker", | ||
| }, | ||
| Spec: mcfgv1.MachineConfigPoolSpec{ | ||
| NodeSelector: &metav1.LabelSelector{ | ||
| MatchLabels: map[string]string{ | ||
| "node-role.kubernetes.io/worker": "", | ||
| }, | ||
| }, | ||
| }, | ||
| }, | ||
| nodes: []corev1.Node{ | ||
| { | ||
| ObjectMeta: metav1.ObjectMeta{ | ||
| Name: "worker-1", | ||
| Labels: map[string]string{ | ||
| "node-role.kubernetes.io/worker": "", | ||
| }, | ||
| }, | ||
| Status: corev1.NodeStatus{ | ||
| Capacity: corev1.ResourceList{ | ||
| corev1.ResourceCPU: *resource.NewQuantity(128, resource.DecimalSI), | ||
| }, | ||
| }, | ||
| }, | ||
| }, | ||
| expectedCPU: 128, | ||
| }, | ||
| { | ||
| name: "multiple nodes with different CPU counts", | ||
| pool: &mcfgv1.MachineConfigPool{ | ||
| ObjectMeta: metav1.ObjectMeta{ | ||
| Name: "worker", | ||
| }, | ||
| Spec: mcfgv1.MachineConfigPoolSpec{ | ||
| NodeSelector: &metav1.LabelSelector{ | ||
| MatchLabels: map[string]string{ | ||
| "node-role.kubernetes.io/worker": "", | ||
| }, | ||
| }, | ||
| }, | ||
| }, | ||
| nodes: []corev1.Node{ | ||
| { | ||
| ObjectMeta: metav1.ObjectMeta{ | ||
| Name: "worker-1", | ||
| Labels: map[string]string{ | ||
| "node-role.kubernetes.io/worker": "", | ||
| }, | ||
| }, | ||
| Status: corev1.NodeStatus{ | ||
| Capacity: corev1.ResourceList{ | ||
| corev1.ResourceCPU: *resource.NewQuantity(64, resource.DecimalSI), | ||
| }, | ||
| }, | ||
| }, | ||
| { | ||
| ObjectMeta: metav1.ObjectMeta{ | ||
| Name: "worker-2", | ||
| Labels: map[string]string{ | ||
| "node-role.kubernetes.io/worker": "", | ||
| }, | ||
| }, | ||
| Status: corev1.NodeStatus{ | ||
| Capacity: corev1.ResourceList{ | ||
| corev1.ResourceCPU: *resource.NewQuantity(256, resource.DecimalSI), | ||
| }, | ||
| }, | ||
| }, | ||
| { | ||
| ObjectMeta: metav1.ObjectMeta{ | ||
| Name: "worker-3", | ||
| Labels: map[string]string{ | ||
| "node-role.kubernetes.io/worker": "", | ||
| }, | ||
| }, | ||
| Status: corev1.NodeStatus{ | ||
| Capacity: corev1.ResourceList{ | ||
| corev1.ResourceCPU: *resource.NewQuantity(128, resource.DecimalSI), | ||
| }, | ||
| }, | ||
| }, | ||
| }, | ||
| expectedCPU: 256, // Should return the maximum | ||
| }, | ||
| { | ||
| name: "node with low CPU count", | ||
| pool: &mcfgv1.MachineConfigPool{ | ||
| ObjectMeta: metav1.ObjectMeta{ | ||
| Name: "worker", | ||
| }, | ||
| Spec: mcfgv1.MachineConfigPoolSpec{ | ||
| NodeSelector: &metav1.LabelSelector{ | ||
| MatchLabels: map[string]string{ | ||
| "node-role.kubernetes.io/worker": "", | ||
| }, | ||
| }, | ||
| }, | ||
| }, | ||
| nodes: []corev1.Node{ | ||
| { | ||
| ObjectMeta: metav1.ObjectMeta{ | ||
| Name: "worker-1", | ||
| Labels: map[string]string{ | ||
| "node-role.kubernetes.io/worker": "", | ||
| }, | ||
| }, | ||
| Status: corev1.NodeStatus{ | ||
| Capacity: corev1.ResourceList{ | ||
| corev1.ResourceCPU: *resource.NewQuantity(16, resource.DecimalSI), | ||
| }, | ||
| }, | ||
| }, | ||
| }, | ||
| expectedCPU: 16, | ||
| }, | ||
| { | ||
| name: "no nodes matching selector", | ||
| pool: &mcfgv1.MachineConfigPool{ | ||
| ObjectMeta: metav1.ObjectMeta{ | ||
| Name: "worker", | ||
| }, | ||
| Spec: mcfgv1.MachineConfigPoolSpec{ | ||
| NodeSelector: &metav1.LabelSelector{ | ||
| MatchLabels: map[string]string{ | ||
| "node-role.kubernetes.io/worker": "", | ||
| }, | ||
| }, | ||
| }, | ||
| }, | ||
| nodes: []corev1.Node{ | ||
| { | ||
| ObjectMeta: metav1.ObjectMeta{ | ||
| Name: "master-1", | ||
| Labels: map[string]string{ | ||
| "node-role.kubernetes.io/master": "", | ||
| }, | ||
| }, | ||
| Status: corev1.NodeStatus{ | ||
| Capacity: corev1.ResourceList{ | ||
| corev1.ResourceCPU: *resource.NewQuantity(32, resource.DecimalSI), | ||
| }, | ||
| }, | ||
| }, | ||
| }, | ||
| expectedCPU: 0, | ||
| }, | ||
| { | ||
| name: "no nodes in cluster", | ||
| pool: &mcfgv1.MachineConfigPool{ | ||
| ObjectMeta: metav1.ObjectMeta{ | ||
| Name: "worker", | ||
| }, | ||
| Spec: mcfgv1.MachineConfigPoolSpec{ | ||
| NodeSelector: &metav1.LabelSelector{ | ||
| MatchLabels: map[string]string{ | ||
| "node-role.kubernetes.io/worker": "", | ||
| }, | ||
| }, | ||
| }, | ||
| }, | ||
| nodes: []corev1.Node{}, | ||
| expectedCPU: 0, | ||
| }, | ||
| { | ||
| name: "invalid node selector", | ||
| pool: &mcfgv1.MachineConfigPool{ | ||
| ObjectMeta: metav1.ObjectMeta{ | ||
| Name: "worker", | ||
| }, | ||
| Spec: mcfgv1.MachineConfigPoolSpec{ | ||
| NodeSelector: &metav1.LabelSelector{ | ||
| MatchExpressions: []metav1.LabelSelectorRequirement{ | ||
| { | ||
| Key: "invalid", | ||
| Operator: "InvalidOperator", | ||
| Values: []string{"value"}, | ||
| }, | ||
| }, | ||
| }, | ||
| }, | ||
| }, | ||
| nodes: []corev1.Node{}, | ||
| expectedCPU: 0, | ||
| expectedErrLog: true, | ||
| }, | ||
| } | ||
|
|
||
| for _, tt := range tests { | ||
| t.Run(tt.name, func(t *testing.T) { | ||
| // Create fake kubernetes client with the test nodes | ||
| objects := []runtime.Object{} | ||
| for i := range tt.nodes { | ||
| objects = append(objects, &tt.nodes[i]) | ||
| } | ||
|
|
||
| fakeKubeClient := k8sfake.NewSimpleClientset(objects...) | ||
|
|
||
| // Call the function | ||
| maxCPU := getMaxCPUCountFromPool(fakeKubeClient, tt.pool) | ||
|
|
||
| // Assert the result | ||
| assert.Equal(t, tt.expectedCPU, maxCPU, "unexpected CPU count") | ||
| }) | ||
| } | ||
| } | ||
|
|
||
| func TestGetMaxCPUCountFromPool_APIFailure(t *testing.T) { | ||
| pool := &mcfgv1.MachineConfigPool{ | ||
| ObjectMeta: metav1.ObjectMeta{ | ||
| Name: "worker", | ||
| }, | ||
| Spec: mcfgv1.MachineConfigPoolSpec{ | ||
| NodeSelector: &metav1.LabelSelector{ | ||
| MatchLabels: map[string]string{ | ||
| "node-role.kubernetes.io/worker": "", | ||
| }, | ||
| }, | ||
| }, | ||
| } | ||
|
|
||
| // Create fake client with a reactor that returns an error on list | ||
| fakeKubeClient := k8sfake.NewSimpleClientset() | ||
| fakeKubeClient.PrependReactor("list", "nodes", func(action k8stesting.Action) (handled bool, ret runtime.Object, err error) { | ||
| return true, nil, fmt.Errorf("simulated API error") | ||
| }) | ||
|
|
||
| // Call the function - should handle the error gracefully | ||
| maxCPU := getMaxCPUCountFromPool(fakeKubeClient, pool) | ||
|
|
||
| // Should return 0 when API call fails | ||
| assert.Equal(t, int64(0), maxCPU, "expected 0 CPU count when API fails") | ||
| } | ||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Quick design question on this block. Since it reads the live node CPU count on every sync and the answer lands in a MachineConfig, the on/off state can change from things that are not config edits, and each change reboots the pool. For example, if an autoscaler takes a worker pool from 128 CPU nodes down to 64 and later back up, the file gets removed and then added back, so the pool reboots on each change. A fresh pool whose nodes have not registered yet also reads as 0 first and flips on once they join.
Would it be simpler to keep MCO out of the CPU counting and just turn the feature on here, then let CRI-O decide per node at container creation? CRI-O already runs on the node and sits in the injection path (gomaxprocs_hooks_linux.go), so it could skip injection on smaller nodes itself. Then the MachineConfig never changes based on cluster state, the reboot churn goes away, and
getMaxCPUCountFromPoolplus thekubeClientplumbing would not be needed. Is there a reason the 128 threshold needs to live in MCO, or would moving that decision into CRI-O work for your case?