From 5ae09a372abdd1e593abc5ff5585f6c6ccea621c Mon Sep 17 00:00:00 2001 From: Cesar Wong Date: Wed, 24 Jun 2026 09:59:19 -0400 Subject: [PATCH 1/2] fix(nodepool): use Status.V1Beta2 instead of conversion-data annotation for MachineDeploymentComplete The conversion-data annotation is metadata, which is preserved from etcd on status-subresource writes. Since CAPI updates MachineDeployment status via the status subresource, the annotation is never refreshed by those writes. If the last main-resource v1beta2 write captured mid-rollout state, the annotation becomes permanently stale, causing MachineDeploymentComplete to return false forever and deadlocking the NodePool in UpdatingVersion/UpdatingConfig. Replace the annotation-based cross-check with a read of Status.V1Beta2.UpToDateReplicas and Status.V1Beta2.AvailableReplicas. These fields are part of the v1beta1 Status struct (not metadata), so they are correctly updated on every status-subresource write via the v1beta2-to-v1beta1 conversion layer. Co-Authored-By: Claude Opus 4.6 --- .../controllers/nodepool/capi_test.go | 123 +++++++----------- .../nodepool/nodepool_controller.go | 46 ++----- 2 files changed, 59 insertions(+), 110 deletions(-) diff --git a/hypershift-operator/controllers/nodepool/capi_test.go b/hypershift-operator/controllers/nodepool/capi_test.go index 1fe35dc09f60..eede34319950 100644 --- a/hypershift-operator/controllers/nodepool/capi_test.go +++ b/hypershift-operator/controllers/nodepool/capi_test.go @@ -27,7 +27,6 @@ import ( capiazure "sigs.k8s.io/cluster-api-provider-azure/api/v1beta1" capikubevirt "sigs.k8s.io/cluster-api-provider-kubevirt/api/v1alpha1" capiv1 "sigs.k8s.io/cluster-api/api/core/v1beta1" - "sigs.k8s.io/cluster-api/util/conversion" ctrl "sigs.k8s.io/controller-runtime" "sigs.k8s.io/controller-runtime/pkg/client" "sigs.k8s.io/controller-runtime/pkg/client/apiutil" @@ -3284,22 +3283,6 @@ func TestMachineDeploymentComplete(t *testing.T) { two := int32(2) three := int32(3) - conversionData := func(replicas, upToDate, available int32, observedGen int64) string { - data := map[string]any{ - "apiVersion": "cluster.x-k8s.io/v1beta2", - "kind": "MachineDeployment", - "spec": map[string]any{"replicas": replicas}, - "status": map[string]any{ - "observedGeneration": observedGen, - "replicas": replicas, - "upToDateReplicas": upToDate, - "availableReplicas": available, - }, - } - raw, _ := json.Marshal(data) - return string(raw) - } - testCases := []struct { name string md *capiv1.MachineDeployment @@ -3308,106 +3291,92 @@ func TestMachineDeploymentComplete(t *testing.T) { { name: "When all v1beta1 and v1beta2 fields agree it should return true", md: &capiv1.MachineDeployment{ - ObjectMeta: metav1.ObjectMeta{ - Generation: 2, - Annotations: map[string]string{ - conversion.DataAnnotation: conversionData(2, 2, 2, 2), + ObjectMeta: metav1.ObjectMeta{Generation: 2}, + Spec: capiv1.MachineDeploymentSpec{Replicas: &two}, + Status: capiv1.MachineDeploymentStatus{ + Replicas: 2, UpdatedReplicas: 2, AvailableReplicas: 2, ObservedGeneration: 2, + V1Beta2: &capiv1.MachineDeploymentV1Beta2Status{ + UpToDateReplicas: ptr.To(int32(2)), + AvailableReplicas: ptr.To(int32(2)), }, }, - Spec: capiv1.MachineDeploymentSpec{Replicas: &two}, - Status: capiv1.MachineDeploymentStatus{Replicas: 2, UpdatedReplicas: 2, AvailableReplicas: 2, ObservedGeneration: 2}, }, expected: true, }, { name: "When v1beta1 looks complete but v1beta2 upToDateReplicas disagrees it should return false", md: &capiv1.MachineDeployment{ - ObjectMeta: metav1.ObjectMeta{ - Generation: 2, - Annotations: map[string]string{ - conversion.DataAnnotation: conversionData(2, 0, 2, 2), - }, - }, - Spec: capiv1.MachineDeploymentSpec{Replicas: &two}, - Status: capiv1.MachineDeploymentStatus{Replicas: 2, UpdatedReplicas: 2, AvailableReplicas: 2, ObservedGeneration: 2}, - }, - expected: false, - }, - { - name: "When v1beta1 looks complete but v1beta2 replicas shows surge it should return false", - md: &capiv1.MachineDeployment{ - ObjectMeta: metav1.ObjectMeta{ - Generation: 2, - Annotations: map[string]string{ - conversion.DataAnnotation: conversionData(3, 2, 2, 2), + ObjectMeta: metav1.ObjectMeta{Generation: 2}, + Spec: capiv1.MachineDeploymentSpec{Replicas: &two}, + Status: capiv1.MachineDeploymentStatus{ + Replicas: 2, UpdatedReplicas: 2, AvailableReplicas: 2, ObservedGeneration: 2, + V1Beta2: &capiv1.MachineDeploymentV1Beta2Status{ + UpToDateReplicas: ptr.To(int32(0)), + AvailableReplicas: ptr.To(int32(2)), }, }, - Spec: capiv1.MachineDeploymentSpec{Replicas: &two}, - Status: capiv1.MachineDeploymentStatus{Replicas: 2, UpdatedReplicas: 2, AvailableReplicas: 2, ObservedGeneration: 2}, }, expected: false, }, { - name: "When v1beta1 looks complete but v1beta2 observedGeneration is stale it should return false", + name: "When v1beta1 looks complete but v1beta2 availableReplicas disagrees it should return false", md: &capiv1.MachineDeployment{ - ObjectMeta: metav1.ObjectMeta{ - Generation: 2, - Annotations: map[string]string{ - conversion.DataAnnotation: conversionData(2, 2, 2, 1), + ObjectMeta: metav1.ObjectMeta{Generation: 2}, + Spec: capiv1.MachineDeploymentSpec{Replicas: &two}, + Status: capiv1.MachineDeploymentStatus{ + Replicas: 2, UpdatedReplicas: 2, AvailableReplicas: 2, ObservedGeneration: 2, + V1Beta2: &capiv1.MachineDeploymentV1Beta2Status{ + UpToDateReplicas: ptr.To(int32(2)), + AvailableReplicas: ptr.To(int32(1)), }, }, - Spec: capiv1.MachineDeploymentSpec{Replicas: &two}, - Status: capiv1.MachineDeploymentStatus{Replicas: 2, UpdatedReplicas: 2, AvailableReplicas: 2, ObservedGeneration: 2}, }, expected: false, }, { - name: "When v1beta1 is not complete it should return false without checking conversion data", + name: "When v1beta1 is not complete it should return false without checking v1beta2", md: &capiv1.MachineDeployment{ - ObjectMeta: metav1.ObjectMeta{ - Generation: 2, - }, - Spec: capiv1.MachineDeploymentSpec{Replicas: &two}, - Status: capiv1.MachineDeploymentStatus{Replicas: 3, UpdatedReplicas: 1, AvailableReplicas: 2, ObservedGeneration: 2}, + ObjectMeta: metav1.ObjectMeta{Generation: 2}, + Spec: capiv1.MachineDeploymentSpec{Replicas: &two}, + Status: capiv1.MachineDeploymentStatus{Replicas: 3, UpdatedReplicas: 1, AvailableReplicas: 2, ObservedGeneration: 2}, }, expected: false, }, { - name: "When no conversion-data annotation exists it should fall back to v1beta1 only", + name: "When v1beta2 status is nil it should fall back to v1beta1 only", md: &capiv1.MachineDeployment{ - ObjectMeta: metav1.ObjectMeta{ - Generation: 1, - }, - Spec: capiv1.MachineDeploymentSpec{Replicas: &two}, - Status: capiv1.MachineDeploymentStatus{Replicas: 2, UpdatedReplicas: 2, AvailableReplicas: 2, ObservedGeneration: 1}, + ObjectMeta: metav1.ObjectMeta{Generation: 1}, + Spec: capiv1.MachineDeploymentSpec{Replicas: &two}, + Status: capiv1.MachineDeploymentStatus{Replicas: 2, UpdatedReplicas: 2, AvailableReplicas: 2, ObservedGeneration: 1}, }, expected: true, }, { - name: "When conversion-data annotation is malformed it should fall back to v1beta1 result", + name: "When v1beta1 replicas does not match spec it should return false", md: &capiv1.MachineDeployment{ - ObjectMeta: metav1.ObjectMeta{ - Generation: 1, - Annotations: map[string]string{ - conversion.DataAnnotation: "not-valid-json", + ObjectMeta: metav1.ObjectMeta{Generation: 2}, + Spec: capiv1.MachineDeploymentSpec{Replicas: &three}, + Status: capiv1.MachineDeploymentStatus{ + Replicas: 2, UpdatedReplicas: 2, AvailableReplicas: 2, ObservedGeneration: 2, + V1Beta2: &capiv1.MachineDeploymentV1Beta2Status{ + UpToDateReplicas: ptr.To(int32(2)), + AvailableReplicas: ptr.To(int32(2)), }, }, - Spec: capiv1.MachineDeploymentSpec{Replicas: &two}, - Status: capiv1.MachineDeploymentStatus{Replicas: 2, UpdatedReplicas: 2, AvailableReplicas: 2, ObservedGeneration: 1}, }, - expected: true, + expected: false, }, { - name: "When v1beta1 replicas does not match spec it should return false", + name: "When v1beta2 upToDateReplicas is nil it should return false", md: &capiv1.MachineDeployment{ - ObjectMeta: metav1.ObjectMeta{ - Generation: 2, - Annotations: map[string]string{ - conversion.DataAnnotation: conversionData(3, 2, 2, 2), + ObjectMeta: metav1.ObjectMeta{Generation: 2}, + Spec: capiv1.MachineDeploymentSpec{Replicas: &two}, + Status: capiv1.MachineDeploymentStatus{ + Replicas: 2, UpdatedReplicas: 2, AvailableReplicas: 2, ObservedGeneration: 2, + V1Beta2: &capiv1.MachineDeploymentV1Beta2Status{ + AvailableReplicas: ptr.To(int32(2)), }, }, - Spec: capiv1.MachineDeploymentSpec{Replicas: &three}, - Status: capiv1.MachineDeploymentStatus{Replicas: 2, UpdatedReplicas: 2, AvailableReplicas: 2, ObservedGeneration: 2}, }, expected: false, }, diff --git a/hypershift-operator/controllers/nodepool/nodepool_controller.go b/hypershift-operator/controllers/nodepool/nodepool_controller.go index 2a170c1137de..9905b155168c 100644 --- a/hypershift-operator/controllers/nodepool/nodepool_controller.go +++ b/hypershift-operator/controllers/nodepool/nodepool_controller.go @@ -2,7 +2,6 @@ package nodepool import ( "context" - "encoding/json" coreerrors "errors" "fmt" "os" @@ -44,7 +43,6 @@ import ( capiopenstackv1beta1 "sigs.k8s.io/cluster-api-provider-openstack/api/v1beta1" capiv1 "sigs.k8s.io/cluster-api/api/core/v1beta1" "sigs.k8s.io/cluster-api/util" - "sigs.k8s.io/cluster-api/util/conversion" "sigs.k8s.io/cluster-api/util/patch" ctrl "sigs.k8s.io/controller-runtime" "sigs.k8s.io/controller-runtime/pkg/builder" @@ -792,8 +790,8 @@ func defaultNodePoolGCPImage(specifiedArch string, releaseImage *releaseinfo.Rel // fields come from conversion. The converted v1beta1 fields (especially UpdatedReplicas, // which maps from deprecated.v1beta1.updatedReplicas rather than the native upToDateReplicas) // can transiently disagree with the v1beta2 native fields. To guard against this, when the -// v1beta1 fields indicate completion we cross-check against the authoritative v1beta2 status -// stored in the conversion-data annotation. +// v1beta1 fields indicate completion we cross-check against the v1beta2 status stored in the +// Status.V1Beta2 field, which is kept current on every status-subresource write. func MachineDeploymentComplete(deployment *capiv1.MachineDeployment) bool { newStatus := &deployment.Status v1beta1Complete := newStatus.UpdatedReplicas == *(deployment.Spec.Replicas) && @@ -803,39 +801,21 @@ func MachineDeploymentComplete(deployment *capiv1.MachineDeployment) bool { if !v1beta1Complete { return false } - return machineDeploymentCompleteFromConversionData(deployment) + return machineDeploymentCompleteFromV1Beta2Status(deployment) } -// machineDeploymentCompleteFromConversionData parses the v1beta2 conversion-data annotation -// and verifies that the native v1beta2 status also indicates completion. If the annotation -// is absent (e.g. CAPI < v1.11), returns true to preserve backwards compatibility. -func machineDeploymentCompleteFromConversionData(deployment *capiv1.MachineDeployment) bool { - raw, ok := deployment.Annotations[conversion.DataAnnotation] - if !ok { - return true - } - - var v1beta2MD struct { - Spec struct { - Replicas *int32 `json:"replicas"` - } `json:"spec"` - Status struct { - ObservedGeneration int64 `json:"observedGeneration"` - Replicas *int32 `json:"replicas"` - AvailableReplicas *int32 `json:"availableReplicas"` - UpToDateReplicas *int32 `json:"upToDateReplicas"` - } `json:"status"` - } - if err := json.Unmarshal([]byte(raw), &v1beta2MD); err != nil { - ctrl.Log.WithName("nodepool").Error(err, "Failed to unmarshal conversion-data annotation, falling back to v1beta1 status") +// machineDeploymentCompleteFromV1Beta2Status verifies that the native v1beta2 status fields +// also indicate completion. The v1beta1 Status.V1Beta2 field is populated by the v1beta2-to-v1beta1 +// conversion on every status-subresource write, so it is always current. +// If V1Beta2 is nil (e.g. CAPI < v1.11), returns true to preserve backwards compatibility. +func machineDeploymentCompleteFromV1Beta2Status(deployment *capiv1.MachineDeployment) bool { + v1beta2 := deployment.Status.V1Beta2 + if v1beta2 == nil { return true } - - desired := ptr.Deref(v1beta2MD.Spec.Replicas, 0) - return ptr.Deref(v1beta2MD.Status.UpToDateReplicas, 0) == desired && - ptr.Deref(v1beta2MD.Status.Replicas, 0) == desired && - ptr.Deref(v1beta2MD.Status.AvailableReplicas, 0) == desired && - v1beta2MD.Status.ObservedGeneration >= deployment.Generation + desired := ptr.Deref(deployment.Spec.Replicas, 0) + return ptr.Deref(v1beta2.UpToDateReplicas, 0) == desired && + ptr.Deref(v1beta2.AvailableReplicas, 0) == desired } // GetHostedClusterByName finds and return a HostedCluster object using the specified params. From d2cf0ff1570ff9643b52cfc476d864321712a23c Mon Sep 17 00:00:00 2001 From: Cesar Wong Date: Wed, 24 Jun 2026 10:14:04 -0400 Subject: [PATCH 2/2] fix: check nil v1beta2 replica fields explicitly instead of defaulting to zero Nil UpToDateReplicas/AvailableReplicas means "not yet set", not "zero replicas". Using ptr.Deref with a zero default would incorrectly match a desired count of zero, marking incomplete deployments as complete. Co-Authored-By: Claude Opus 4.6 --- .../controllers/nodepool/capi_test.go | 29 +++++++++++++++++++ .../nodepool/nodepool_controller.go | 7 +++-- 2 files changed, 34 insertions(+), 2 deletions(-) diff --git a/hypershift-operator/controllers/nodepool/capi_test.go b/hypershift-operator/controllers/nodepool/capi_test.go index eede34319950..043cd2662da7 100644 --- a/hypershift-operator/controllers/nodepool/capi_test.go +++ b/hypershift-operator/controllers/nodepool/capi_test.go @@ -3380,6 +3380,35 @@ func TestMachineDeploymentComplete(t *testing.T) { }, expected: false, }, + { + name: "When v1beta2 availableReplicas is nil it should return false", + md: &capiv1.MachineDeployment{ + ObjectMeta: metav1.ObjectMeta{Generation: 2}, + Spec: capiv1.MachineDeploymentSpec{Replicas: &two}, + Status: capiv1.MachineDeploymentStatus{ + Replicas: 2, UpdatedReplicas: 2, AvailableReplicas: 2, ObservedGeneration: 2, + V1Beta2: &capiv1.MachineDeploymentV1Beta2Status{ + UpToDateReplicas: ptr.To(int32(2)), + }, + }, + }, + expected: false, + }, + { + name: "When desired replicas is zero and v1beta2 fields are nil it should return false", + md: func() *capiv1.MachineDeployment { + zero := int32(0) + return &capiv1.MachineDeployment{ + ObjectMeta: metav1.ObjectMeta{Generation: 2}, + Spec: capiv1.MachineDeploymentSpec{Replicas: &zero}, + Status: capiv1.MachineDeploymentStatus{ + Replicas: 0, UpdatedReplicas: 0, AvailableReplicas: 0, ObservedGeneration: 2, + V1Beta2: &capiv1.MachineDeploymentV1Beta2Status{}, + }, + } + }(), + expected: false, + }, } for _, tc := range testCases { diff --git a/hypershift-operator/controllers/nodepool/nodepool_controller.go b/hypershift-operator/controllers/nodepool/nodepool_controller.go index 9905b155168c..6d7bdc3d6098 100644 --- a/hypershift-operator/controllers/nodepool/nodepool_controller.go +++ b/hypershift-operator/controllers/nodepool/nodepool_controller.go @@ -813,9 +813,12 @@ func machineDeploymentCompleteFromV1Beta2Status(deployment *capiv1.MachineDeploy if v1beta2 == nil { return true } + if v1beta2.UpToDateReplicas == nil || v1beta2.AvailableReplicas == nil { + return false + } desired := ptr.Deref(deployment.Spec.Replicas, 0) - return ptr.Deref(v1beta2.UpToDateReplicas, 0) == desired && - ptr.Deref(v1beta2.AvailableReplicas, 0) == desired + return *v1beta2.UpToDateReplicas == desired && + *v1beta2.AvailableReplicas == desired } // GetHostedClusterByName finds and return a HostedCluster object using the specified params.