From ba8a51b3287823f0016ff656ab6559651583d2dd Mon Sep 17 00:00:00 2001 From: Jeff Cantrill Date: Fri, 5 Jun 2026 20:03:47 -0400 Subject: [PATCH 1/4] LOG-9424: Fix API flooding when MultiCLF shares the same serviceAccount (#3295) * fix(auth): Change SCC Role naming to prevent conflicts when CLFs share service accounts When multiple ClusterLogForwarders in the same namespace share the same spec.serviceAccount, they were creating a shared Role named {sa}-scc. This caused both CLFs to continuously update the Role's ownerReferences, alternating between pointing to CLF #1 and CLF #2, flooding the API server with thousands of update requests. Changed the Role naming from {sa}-scc to {clfName}-{sa}-scc, making each CLF's Role unique. The RoleBinding now correctly references this new Role name format. Both resources keep their owner references for proper cleanup when a CLF is deleted. Also removed owner reference from the metadata-reader ClusterRoleBinding because cluster-scoped resources should not be owned by namespaced resources, following Kubernetes best practices. Resolves: LOG-9424 Co-Authored-By: Claude Sonnet 4.5 * feat(reconcile): Add cleanup logic for old SCC Role resources Add DeleteRole function to enable cleanup of Resources with the old naming scheme ({sa}-scc) during reconciliation. This ensures smooth migration from the previous implementation where multiple CLFs sharing a service account would conflict. The ReconcileRBAC function now deletes any old Role using the legacy naming scheme after creating the new one, preventing orphaned resources during upgrades. Co-Authored-By: Claude Sonnet 4.5 * test(auth): Update tests for new SCC RBAC resource naming scheme Update existing tests to reflect the new naming scheme where SCC Roles are named {clfName}-{sa}-scc instead of {sa}-scc. This ensures each CLF gets a unique Role when sharing a service account. Add test to verify that metadata-reader ClusterRoleBinding does not have owner references, ensuring compliance with Kubernetes best practices (cluster-scoped resources should not be owned by namespaced resources). Co-Authored-By: Claude Sonnet 4.5 * check role references before deleting --------- Co-authored-by: Claude Sonnet 4.5 --- internal/auth/rbac.go | 54 +++++++++++++++++++++++++++++--------- internal/auth/rbac_test.go | 19 +++++++++----- internal/reconcile/rbac.go | 12 +++++++++ 3 files changed, 66 insertions(+), 19 deletions(-) diff --git a/internal/auth/rbac.go b/internal/auth/rbac.go index de156454d7..8053bf327a 100644 --- a/internal/auth/rbac.go +++ b/internal/auth/rbac.go @@ -1,6 +1,7 @@ package auth import ( + "context" "fmt" "github.com/openshift/cluster-logging-operator/internal/reconcile" "github.com/openshift/cluster-logging-operator/internal/runtime" @@ -12,21 +13,50 @@ import ( // ReconcileRBAC reconciles the RBAC specifically for the service account and SCC func ReconcileRBAC(k8sClient client.Client, rbacName, saNamespace, saName string, owner metav1.OwnerReference) error { - desiredCRB := NewMetaDataReaderClusterRoleBinding(saNamespace, saName, owner) + desiredCRB := NewMetaDataReaderClusterRoleBinding(saNamespace, saName) if err := reconcile.ClusterRoleBinding(k8sClient, desiredCRB.Name, func() *rbacv1.ClusterRoleBinding { return desiredCRB }); err != nil { return err } - desiredSCCRole := NewServiceAccountSCCRole(saNamespace, saName, owner) + desiredSCCRole := NewServiceAccountSCCRole(saNamespace, rbacName, saName, owner) if err := reconcile.Role(k8sClient, desiredSCCRole); err != nil { return err } - desiredSCCRoleBinding := NewServiceAccountSCCRoleBinding(saNamespace, rbacName, desiredSCCRole.Name, saName, owner) - return reconcile.RoleBinding(k8sClient, desiredSCCRoleBinding) + desiredSCCRoleBinding := NewServiceAccountSCCRoleBinding(saNamespace, rbacName, saName, owner) + if err := reconcile.RoleBinding(k8sClient, desiredSCCRoleBinding); err != nil { + return err + } + + // Cleanup old resources with previous naming scheme + oldRoleName := fmt.Sprintf("%s-scc", saName) + + // List all RoleBindings in the namespace to check if any reference the old role + roleBindings := &rbacv1.RoleBindingList{} + if err := k8sClient.List(context.TODO(), roleBindings, client.InNamespace(saNamespace)); err != nil { + return err + } + + // Check if any RoleBinding references the old role + hasReferences := false + for _, rb := range roleBindings.Items { + if rb.RoleRef.Name == oldRoleName { + hasReferences = true + break + } + } + + // Only delete the old role if no RoleBindings reference it + if !hasReferences { + if err := reconcile.DeleteRole(k8sClient, saNamespace, oldRoleName); err != nil { + return err + } + } + + return nil } // NewMetaDataReaderClusterRoleBinding stubs a clusterrolebinding to allow reading of pod metadata (i.e. labels) -func NewMetaDataReaderClusterRoleBinding(saNamespace, saName string, owner metav1.OwnerReference) *rbacv1.ClusterRoleBinding { +func NewMetaDataReaderClusterRoleBinding(saNamespace, saName string) *rbacv1.ClusterRoleBinding { name := fmt.Sprintf("metadata-reader-%s-%s", saNamespace, saName) desired := runtime.NewClusterRoleBinding(name, rbacv1.RoleRef{ @@ -41,12 +71,11 @@ func NewMetaDataReaderClusterRoleBinding(saNamespace, saName string, owner metav }, ) - utils.AddOwnerRefToObject(desired, owner) return desired } -func NewServiceAccountSCCRole(namespace, name string, owner metav1.OwnerReference) *rbacv1.Role { - name = fmt.Sprintf("%s-scc", name) +func NewServiceAccountSCCRole(namespace, name, saName string, owner metav1.OwnerReference) *rbacv1.Role { + roleName := fmt.Sprintf("%s-%s-scc", name, saName) sccRule := rbacv1.PolicyRule{ APIGroups: []string{"security.openshift.io"}, ResourceNames: []string{sccName}, @@ -54,13 +83,14 @@ func NewServiceAccountSCCRole(namespace, name string, owner metav1.OwnerReferenc Verbs: []string{"use"}, } - desired := runtime.NewRole(namespace, name, sccRule) + desired := runtime.NewRole(namespace, roleName, sccRule) utils.AddOwnerRefToObject(desired, owner) return desired } -func NewServiceAccountSCCRoleBinding(namespace, name, roleName, saName string, owner metav1.OwnerReference) *rbacv1.RoleBinding { +func NewServiceAccountSCCRoleBinding(namespace, name, saName string, owner metav1.OwnerReference) *rbacv1.RoleBinding { + roleName := fmt.Sprintf("%s-%s-scc", name, saName) roleRef := rbacv1.RoleRef{ APIGroup: rbacv1.GroupName, Kind: "Role", @@ -73,8 +103,8 @@ func NewServiceAccountSCCRoleBinding(namespace, name, roleName, saName string, o Namespace: namespace, } - name = fmt.Sprintf("%s-scc", name) - desired := runtime.NewRoleBinding(namespace, name, roleRef, subject) + roleBindingName := fmt.Sprintf("%s-scc", name) + desired := runtime.NewRoleBinding(namespace, roleBindingName, roleRef, subject) utils.AddOwnerRefToObject(desired, owner) return desired diff --git a/internal/auth/rbac_test.go b/internal/auth/rbac_test.go index d14b706e7d..edfaa02506 100644 --- a/internal/auth/rbac_test.go +++ b/internal/auth/rbac_test.go @@ -11,7 +11,7 @@ import ( var _ = Describe("NewMetaDataReaderClusterRoleBinding", func() { It("should stub a well-formed clusterrolebinding", func() { - Expect(test.YAMLString(auth.NewMetaDataReaderClusterRoleBinding(constants.OpenshiftNS, "logcollector", metav1.OwnerReference{}))).To(MatchYAML( + Expect(test.YAMLString(auth.NewMetaDataReaderClusterRoleBinding(constants.OpenshiftNS, "logcollector"))).To(MatchYAML( `apiVersion: rbac.authorization.k8s.io/v1 kind: ClusterRoleBinding metadata: @@ -27,16 +27,21 @@ subjects: namespace: openshift-logging `)) }) + + It("should not have owner references", func() { + crb := auth.NewMetaDataReaderClusterRoleBinding(constants.OpenshiftNS, "logcollector") + Expect(crb.OwnerReferences).To(BeEmpty()) + }) }) var _ = Describe("ServiceAccount SCC Role & RoleBinding", func() { It("should stub a well-formed role", func() { - Expect(test.YAMLString(auth.NewServiceAccountSCCRole(constants.OpenshiftNS, "my-sa", metav1.OwnerReference{}))).To(MatchYAML( + Expect(test.YAMLString(auth.NewServiceAccountSCCRole(constants.OpenshiftNS, "my-clf", "my-sa", metav1.OwnerReference{}))).To(MatchYAML( `apiVersion: rbac.authorization.k8s.io/v1 kind: Role metadata: creationTimestamp: null - name: my-sa-scc + name: my-clf-my-sa-scc namespace: openshift-logging rules: - apiGroups: @@ -51,20 +56,20 @@ rules: }) It("should stub a well-formed roleBinding", func() { - Expect(test.YAMLString(auth.NewServiceAccountSCCRoleBinding(constants.OpenshiftNS, "my-sa", "customSA-scc", "customSA", metav1.OwnerReference{}))).To(MatchYAML( + Expect(test.YAMLString(auth.NewServiceAccountSCCRoleBinding(constants.OpenshiftNS, "my-clf", "my-sa", metav1.OwnerReference{}))).To(MatchYAML( `apiVersion: rbac.authorization.k8s.io/v1 kind: RoleBinding metadata: creationTimestamp: null - name: my-sa-scc + name: my-clf-scc namespace: openshift-logging roleRef: apiGroup: rbac.authorization.k8s.io kind: Role - name: customSA-scc + name: my-clf-my-sa-scc subjects: - kind: ServiceAccount - name: customSA + name: my-sa namespace: openshift-logging `)) }) diff --git a/internal/reconcile/rbac.go b/internal/reconcile/rbac.go index e9f4846bb8..9ca38c3a61 100644 --- a/internal/reconcile/rbac.go +++ b/internal/reconcile/rbac.go @@ -7,6 +7,7 @@ import ( log "github.com/ViaQ/logerr/v2/log/static" "github.com/openshift/cluster-logging-operator/internal/runtime" rbacv1 "k8s.io/api/rbac/v1" + apierrors "k8s.io/apimachinery/pkg/api/errors" "sigs.k8s.io/controller-runtime/pkg/client" "sigs.k8s.io/controller-runtime/pkg/controller/controllerutil" ) @@ -69,3 +70,14 @@ func DeleteClusterRoleBinding(k8sClient client.Client, name string) error { log.V(3).Info("Deleting", "object", object) return k8sClient.Delete(context.TODO(), object) } + +func DeleteRole(k8sClient client.Client, namespace, name string) error { + object := runtime.NewRole(namespace, name) + log.V(3).Info("Deleting Role", "namespace", namespace, "name", name) + err := k8sClient.Delete(context.TODO(), object) + // Ignore NotFound errors - resource is already deleted + if apierrors.IsNotFound(err) { + return nil + } + return err +} From 259462e76280cce1f33759b603d3454937141bbc Mon Sep 17 00:00:00 2001 From: Vitalii Parfonov Date: Thu, 18 Jun 2026 17:39:02 +0300 Subject: [PATCH 2/4] LOG-9424: Make legacy SCC role cleanup non-blocking The cleanup of old {sa}-scc Role resources introduced in #3308 uses k8sClient.List which goes through the namespace-scoped cache. When a CLF is created in a namespace the cache has not yet indexed, the List fails with "unknown namespace for the cache", blocking the entire reconciliation and preventing collector workloads from being created. Extract cleanup into a best-effort helper that logs errors instead of returning them. The old role will be cleaned up on the next successful reconciliation cycle. Co-Authored-By: Claude Opus 4.6 (1M context) --- internal/auth/rbac.go | 30 ++++++++++++++++-------------- 1 file changed, 16 insertions(+), 14 deletions(-) diff --git a/internal/auth/rbac.go b/internal/auth/rbac.go index 8053bf327a..02fe22f0c3 100644 --- a/internal/auth/rbac.go +++ b/internal/auth/rbac.go @@ -3,6 +3,8 @@ package auth import ( "context" "fmt" + + log "github.com/ViaQ/logerr/v2/log/static" "github.com/openshift/cluster-logging-operator/internal/reconcile" "github.com/openshift/cluster-logging-operator/internal/runtime" "github.com/openshift/cluster-logging-operator/internal/utils" @@ -27,32 +29,32 @@ func ReconcileRBAC(k8sClient client.Client, rbacName, saNamespace, saName string return err } - // Cleanup old resources with previous naming scheme + // Best-effort cleanup of old resources with previous naming scheme. + // Errors are logged but not returned to avoid blocking reconciliation + // (e.g., namespace-scoped cache may not know about newly created namespaces). + cleanupLegacySCCRole(k8sClient, saNamespace, saName) + + return nil +} + +func cleanupLegacySCCRole(k8sClient client.Client, saNamespace, saName string) { oldRoleName := fmt.Sprintf("%s-scc", saName) - // List all RoleBindings in the namespace to check if any reference the old role roleBindings := &rbacv1.RoleBindingList{} if err := k8sClient.List(context.TODO(), roleBindings, client.InNamespace(saNamespace)); err != nil { - return err + log.V(3).Info("skipping legacy SCC role cleanup: unable to list rolebindings", "namespace", saNamespace, "error", err) + return } - // Check if any RoleBinding references the old role - hasReferences := false for _, rb := range roleBindings.Items { if rb.RoleRef.Name == oldRoleName { - hasReferences = true - break + return } } - // Only delete the old role if no RoleBindings reference it - if !hasReferences { - if err := reconcile.DeleteRole(k8sClient, saNamespace, oldRoleName); err != nil { - return err - } + if err := reconcile.DeleteRole(k8sClient, saNamespace, oldRoleName); err != nil { + log.V(3).Info("skipping legacy SCC role cleanup: unable to delete old role", "namespace", saNamespace, "role", oldRoleName, "error", err) } - - return nil } // NewMetaDataReaderClusterRoleBinding stubs a clusterrolebinding to allow reading of pod metadata (i.e. labels) From 9ecfa7f5cabe3a03c410ee1f5976b47e52347f10 Mon Sep 17 00:00:00 2001 From: Vitalii Parfonov Date: Fri, 19 Jun 2026 00:12:17 +0300 Subject: [PATCH 3/4] LOG-9424: Increase AtLeastOnce test timeout to 5 minutes The AtLeastOnce delivery mode uses disk-backed buffers which can delay initial log delivery. The 3-minute poll timeout was too tight for slow CI nodes, causing flaky failures. Align with DefaultWaitForLogsTimeout (5 minutes) used by other e2e tests. Co-Authored-By: Claude Opus 4.6 (1M context) --- test/e2e/collection/tuning/at_least_once_test.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/e2e/collection/tuning/at_least_once_test.go b/test/e2e/collection/tuning/at_least_once_test.go index 63d61c69d5..6c89e23af9 100644 --- a/test/e2e/collection/tuning/at_least_once_test.go +++ b/test/e2e/collection/tuning/at_least_once_test.go @@ -133,7 +133,7 @@ var _ = Describe("[tuning] deliveryMode AtLeastOnce", func() { //wait for some logs from all streams to be received // Verify some logs from all streams to be received - Expect(wait.PollUntilContextTimeout(context.TODO(), 5*time.Second, 3*time.Minute, true, func(context.Context) (done bool, err error) { + Expect(wait.PollUntilContextTimeout(context.TODO(), 5*time.Second, 5*time.Minute, true, func(context.Context) (done bool, err error) { q, err := receiver.Query(utils.GetPtr(15 * time.Second)) if err != nil { log.V(0).Error(err, "The error from querying the receiver") From dd58405b220a3f3066aa80cc65edb5857e145187 Mon Sep 17 00:00:00 2001 From: Vitalii Parfonov Date: Mon, 22 Jun 2026 17:22:50 +0300 Subject: [PATCH 4/4] Trigger CI re-run with updated workflow