diff --git a/k8s/bases/infrastructure/cluster-security-exceptions/batch-workloads.yaml b/k8s/bases/infrastructure/cluster-security-exceptions/batch-workloads.yaml new file mode 100644 index 000000000..d65320403 --- /dev/null +++ b/k8s/bases/infrastructure/cluster-security-exceptions/batch-workloads.yaml @@ -0,0 +1,35 @@ +--- +# Exempt the health-probe controls (C-0018 readiness, C-0056 liveness) for +# run-to-completion batch workloads. A Job/CronJob pod runs to completion and +# never serves traffic, so readiness/liveness probes are semantically +# inapplicable — Kubescape nonetheless flags every Job/CronJob as missing them. +# +# Matched by workload KIND (not namespace) so it covers current and future batch +# pods cluster-wide — e.g. openbao `vault-snapshot-init`, kyverno +# `migrate-resources`, coroot `custom-cloud-pricing` — WITHOUT over-excepting any +# long-running Deployment/StatefulSet in those namespaces. This is deliberately +# narrower than a namespaceSelector: it lets the openbao StatefulSet (which has a +# readiness probe) keep being scanned for probes while excusing only its batch +# Job, honouring the health-probes.yaml principle of never excepting a namespace +# whose long-running workloads do configure probes. +apiVersion: kubescape.io/v1beta1 +kind: ClusterSecurityException +metadata: + name: batch-workloads +spec: + reason: >- + Readiness and liveness probes are inapplicable to run-to-completion batch + workloads (Jobs/CronJobs): the pod terminates on completion and never serves + traffic. Matched by workload kind so long-running workloads still require + probes. 
+ posture: + - controlID: C-0018 + action: ignore + - controlID: C-0056 + action: ignore + match: + resources: + - apiGroup: batch + kind: Job + - apiGroup: batch + kind: CronJob diff --git a/k8s/bases/infrastructure/cluster-security-exceptions/kustomization.yaml b/k8s/bases/infrastructure/cluster-security-exceptions/kustomization.yaml index 13479eced..feb833c4e 100644 --- a/k8s/bases/infrastructure/cluster-security-exceptions/kustomization.yaml +++ b/k8s/bases/infrastructure/cluster-security-exceptions/kustomization.yaml @@ -3,6 +3,7 @@ apiVersion: kustomize.config.k8s.io/v1beta1 kind: Kustomization resources: - admission-controllers.yaml + - batch-workloads.yaml - cilium-network-policies.yaml - controller-rbac.yaml - health-probes.yaml diff --git a/k8s/bases/infrastructure/controllers/kubescape/cluster-role.yaml b/k8s/bases/infrastructure/controllers/kubescape/cluster-role.yaml index 4f642880f..5b107771a 100644 --- a/k8s/bases/infrastructure/controllers/kubescape/cluster-role.yaml +++ b/k8s/bases/infrastructure/controllers/kubescape/cluster-role.yaml @@ -2,8 +2,10 @@ # Grants the Kubescape posture scanner (`kubescape` ServiceAccount) read access to the # SecurityException / ClusterSecurityException CRDs. # -# The scanner's CRD exceptions getter (kubescape image >= v4.0.9, see the HelmRelease -# `kubescape.image.tag` override) LISTs these CRDs to apply posture exceptions. But the +# The scanner's CRD exceptions getter (present since kubescape image v4.0.9, but only +# functional from v4.0.10 — v4.0.9 LISTed the CRDs at the wrong apiVersion; see the +# HelmRelease `kubescape.image.tag` override) LISTs these CRDs at the v1beta1 +# apiVersion to apply posture exceptions. But the # kubescape-operator chart (<= 1.40.2) only grants this RBAC to the operator + kubevuln # components — never to the posture scanner SA. 
Without it the getter's List() is # forbidden; the scanner's merged exceptions getter swallows that error and falls back to diff --git a/k8s/bases/infrastructure/controllers/kubescape/config-map-headlamp-exceptions.yaml b/k8s/bases/infrastructure/controllers/kubescape/config-map-headlamp-exceptions.yaml index da70b5a5d..bd3f8b5f8 100644 --- a/k8s/bases/infrastructure/controllers/kubescape/config-map-headlamp-exceptions.yaml +++ b/k8s/bases/infrastructure/controllers/kubescape/config-map-headlamp-exceptions.yaml @@ -11,11 +11,15 @@ # "No data to be shown" and its compliance view recomputes scores client-side # with zero exceptions, so every control the CRs except still renders as failing. # -# The CRs remain the source of truth (they drive the kubescape operator posture -# report and are what `ksail workload scan` should consume via #2264). This file -# is a presentation-layer mirror for the Headlamp dashboard ONLY; keep it in sync -# by hand when the CRs change, and DELETE it once the plugin is bumped to a release -# that reads the CRDs natively. +# The CRs remain the source of truth. With the scanner at kubescape >= v4.0.10 +# (see the HelmRelease `kubescape.image.tag` override) it now applies the +# CRs at scan time and writes status=passed/subStatus=w-exceptions into the stored +# workloadconfigurationscansummaries — which the plugin renders directly, so once a +# post-bump scan has run this mirror is redundant even without a group selection. +# This file is a presentation-layer FALLBACK for the Headlamp dashboard ONLY; keep +# it in sync by hand while it lives, and DELETE it (plus the manual step below) once +# the v4.0.10 scanner is confirmed marking excepted controls in prod, OR once the +# plugin is bumped to a release that reads the CRDs natively — whichever lands first. 
# # NOTE: the plugin only evaluates workload posture scans, so the two host/CIS # false-positive CRs (talos-cis-control-plane-false-positives, @@ -28,9 +32,12 @@ # regex matches the workload's kind/name/namespace. Values are JS RegExp, anchored # here for exactness. `namespace: ".*"` mirrors a cluster-wide (unscoped) CR. # -# IMPORTANT (manual step): the plugin applies ONE selected group at a time. On the -# plugin's Compliance page, pick the "platform-exceptions" group from the -# exceptions selector for these to take effect in the view. +# IMPORTANT (interim manual step, needed ONLY until a v4.0.10 scan has run): the +# v0.11.2 plugin applies ONE selected group at a time and recomputes scores +# client-side, so before the scanner marks the stored summaries a viewer must pick +# the "platform-exceptions" group from the Compliance page's exceptions selector for +# these to take effect. Once the v4.0.10 scanner writes the exception status this is +# no longer required (the plugin renders the summary status directly). 
apiVersion: v1 kind: ConfigMap metadata: @@ -116,6 +123,30 @@ data: } ] }, + { + "name": "batch-workloads", + "policyType": "postureExceptionPolicy", + "actions": [ + "alertOnly" + ], + "reason": "Readiness/liveness probes are inapplicable to run-to-completion Jobs/CronJobs; matched by workload kind so long-running workloads still require probes.", + "resources": [ + { + "designatorType": "Attributes", + "attributes": { + "kind": "^(Job|CronJob)$" + } + } + ], + "posturePolicies": [ + { + "controlID": "^C-0018$" + }, + { + "controlID": "^C-0056$" + } + ] + }, { "name": "infrastructure-privileged", "policyType": "postureExceptionPolicy", diff --git a/k8s/bases/infrastructure/controllers/kubescape/helm-release.yaml b/k8s/bases/infrastructure/controllers/kubescape/helm-release.yaml index b522c0965..9f93e8e9a 100644 --- a/k8s/bases/infrastructure/controllers/kubescape/helm-release.yaml +++ b/k8s/bases/infrastructure/controllers/kubescape/helm-release.yaml @@ -53,18 +53,23 @@ spec: # the in-cluster SecurityException / ClusterSecurityException CRDs # (k8s/bases/infrastructure/cluster-security-exceptions/) and emit posture-exception # match events (kubescape#2291). 
IMPORTANT: this flag alone does NOT make POSTURE - # exceptions apply — that needs TWO more things the chart (<= 1.40.2) does not give us: + # exceptions apply — that needs THREE more things the chart (<= 1.40.2) does not give us: # (1) the posture scanner reads those CRDs via a CRD exceptions getter that only - # exists in kubescape image >= v4.0.9 (kubescape#2322); chart 1.40.2 still pins - # the scanner to v4.0.8 → see the `kubescape.image.tag` override below; and - # (2) the scanner ServiceAccount needs RBAC to `list` those CRDs, which NO chart + # exists in kubescape image >= v4.0.9 (kubescape#2291/#2322); chart 1.40.2 still + # pins the scanner to v4.0.8 → see the `kubescape.image.tag` override below; + # (2) that getter must LIST the CRDs at the apiVersion the chart serves — v4.0.9 + # hard-coded kubescape.io/v1 while the chart's CRDs serve ONLY v1beta1, so its + # List() 404s and (being the secondary/merged source) is silently swallowed → + # still zero exceptions. kubescape#2366 (image >= v4.0.10) switches the getter to + # v1beta1, so the override below is at v4.0.10 for this reason; and + # (3) the scanner ServiceAccount needs RBAC to `list` those CRDs, which NO chart # version grants it (only operator/kubevuln get it) → see `cluster-role.yaml` # + `cluster-role-binding.yaml`. - # With v4.0.8 + no RBAC the scanner silently loads zero exceptions (its merged getter - # swallows the missing/forbidden CRD source and falls back to the offline = empty - # primary), so the dashboard reads "excluding 0 controls" even though the CRs are - # deployed and schema-valid. Exemptions stay a LAST RESORT behind real fixes — keep - # the cluster-security-exceptions set minimal and irreducible. 
+ # Missing any one of the three → the scanner silently loads zero exceptions (its merged + # getter swallows the missing/forbidden/wrong-version CRD source and falls back to the + # offline = empty primary), so the dashboard reads "excluding 0 controls" even though + # the CRs are deployed and schema-valid. Exemptions stay a LAST RESORT behind real + # fixes — keep the cluster-security-exceptions set minimal and irreducible. riskAcceptance: enable # Run posture/config scanning OFFLINE (air-gapped). This cluster has no # ARMO SaaS account, but the chart defaults to submit-mode: the scanner @@ -114,19 +119,22 @@ spec: memory: 1Gi kubescape: image: - # The posture scanner applies the SecurityException / ClusterSecurityException - # CRDs via its CRD exceptions getter, added in kubescape v4.0.9 - # (kubescape#2291 + #2322). Chart 1.40.2 still pins the scanner to v4.0.8, which - # has NO such getter, so posture exceptions are silently ignored no matter how the - # CRs are authored or what RBAC exists. Override to v4.0.9 so the getter is present; - # it is paired with `cluster-role.yaml` + `cluster-role-binding.yaml`, which grant - # the scanner SA the `list` - # RBAC the getter needs (the chart never grants it). The v4.0.9 merged getter still - # swallows a missing-RBAC/CRD source, so this bump cannot break scans on its own. - # Drop both once the chart bumps the scanner past v4.0.9 AND grants it the - # securityexceptions RBAC. (Same component-image-ahead-of-chart pattern as the - # nodeAgent override above.) - tag: v4.0.9 + # The posture scanner applies the SecurityException / ClusterSecurityException CRDs + # via its CRD exceptions getter (added in kubescape v4.0.9, kubescape#2291 + #2322). + # Chart 1.40.2 pins the scanner to v4.0.8, which has NO such getter, so posture + # exceptions are silently ignored no matter how the CRs are authored or what RBAC + # exists. 
But v4.0.9's getter hard-coded the CRD apiVersion to kubescape.io/v1 while + # the chart serves ONLY kubescape.io/v1beta1 — so its List() 404s and, as the + # secondary/merged source, is silently swallowed → still ZERO exceptions applied + # (dashboard "excluding 0 controls"). kubescape#2366 (first shipped in image v4.0.10) + # switches the getter to v1beta1, so v4.0.10 is the first scanner that actually reads + # our CRs offline. Paired with `cluster-role.yaml` + `cluster-role-binding.yaml`, + # which grant the scanner SA the `list` RBAC the getter needs (the chart never grants + # it). The merged getter still swallows a missing-RBAC/CRD source, so this bump cannot + # break scans on its own. Drop this override + the reader RBAC once the chart bumps + # the scanner to >= v4.0.10 AND grants it the securityexceptions RBAC. (Same + # component-image-ahead-of-chart pattern as the nodeAgent override above.) + tag: v4.0.10 # Hetzner CSI provisions a minimum 10Gi volume; match the PVC request # to avoid Helm upgrade failures from PVC shrink rejection. persistence: