diff --git a/docs/dr/alerting.md b/docs/dr/alerting.md index dbbec54f7..9b6680c62 100644 --- a/docs/dr/alerting.md +++ b/docs/dr/alerting.md @@ -90,6 +90,45 @@ stays quiet by design, exactly as the old Alertmanager did. container logs/traces, not host audit-log files, so the previous alloy-audit → Loki pipeline was removed. +## Kubescape runtime-detection alerts + +The Kubescape node-agent's **runtime-detection** alerts (rule violations, +malware) are the one signal that does **not** fit the Coroot model, because their +only first-class dashboard — the **Headlamp Kubescape plugin's "Runtime +Detection → Alerts" tab** — reads exclusively from a Prometheus **Alertmanager** +(`GET /api/v2/alerts`, filtered on `alertname="KubescapeRuleViolated"`). It +cannot read Kubescape's storage CRs and cannot query Coroot's Prometheus (a +metrics store, not an Alertmanager). So a **single, minimal Alertmanager** is +reintroduced *scoped to Kubescape* — not a re-adoption of the Prometheus stack. +It lives in the `kubescape` namespace, prod-only +(`providers/hetzner/infrastructure/controllers/alertmanager/`), ~10m CPU / 32Mi +RAM requested, ephemeral (no PVC). + +Each alert reaches **all three destinations** (the node-agent side is wired in +`providers/hetzner/infrastructure/controllers/kubescape/patches/`): + +| Destination | Path | +| --- | --- | +| **Headlamp plugin** | `nodeAgent.config.alertManagerExporterUrls` → the Alertmanager, which the plugin queries. | +| **Slack** | the Alertmanager's `slack_configs` receiver → the shared `${alertmanager_webhook_url}` incoming-webhook (same channel as Coroot/Flux). | +| **Coroot** | `nodeAgent.config.stdoutExporter` (on by default) → Coroot's eBPF log capture surfaces the alert in the **Logs** view (Coroot CE has no inbound alert receiver). 
| + +**One manual step (Headlamp).** The plugin's Alertmanager address is a +**per-user, per-browser** setting (stored in `localStorage`; there is no +declarative/Helm way to seed it — [headlamp#3979](https://github.com/kubernetes-sigs/headlamp/issues/3979)). +Each operator sets it **once** in Headlamp → *Settings → Plugins → Kubescape*, +in the `namespace/service:port` form the plugin validates: + +``` +kubescape/alertmanager:9093 +``` + +The plugin reaches it through the Kubernetes API server's Service proxy, so the +logged-in user needs `get`/`create` on `services/proxy` in the `kubescape` +namespace (satisfied by the admin binding). Until it is set, the tab shows +"Alertmanager URL is not configured" — the data source now exists, only the +per-user pointer is manual. + ## Dead-man's-switch (off-cluster heartbeat) In-cluster alerting cannot tell you the cluster is down — it's down too. A tiny diff --git a/k8s/providers/hetzner/infrastructure/controllers/alertmanager/cilium-network-policy.yaml b/k8s/providers/hetzner/infrastructure/controllers/alertmanager/cilium-network-policy.yaml new file mode 100644 index 000000000..1aa8b5d0f --- /dev/null +++ b/k8s/providers/hetzner/infrastructure/controllers/alertmanager/cilium-network-policy.yaml @@ -0,0 +1,57 @@ +# Adds the Alertmanager-specific flows on top of the namespace-wide +# `allow-kubescape` policy (bases/infrastructure/controllers/kubescape/) and the +# Kyverno-generated default-deny. Cilium composes policies ADDITIVELY (the +# effective allow-set is the UNION across every policy selecting the endpoint), so +# this only needs to add what the base does not already grant the Alertmanager pod: +# * the node-agent → Alertmanager:9093 push is intra-namespace, already allowed +# by allow-kubescape's intra-namespace rules — not repeated here; +# * DNS egress is already granted to every kubescape pod by allow-kubescape, but is repeated below so the toFQDNs rule stays self-contained. 
+apiVersion: cilium.io/v2 +kind: CiliumNetworkPolicy +metadata: + name: allow-alertmanager + namespace: kubescape +spec: + endpointSelector: + matchLabels: + app.kubernetes.io/name: alertmanager + ingress: + # The Headlamp Kubescape plugin reads GET /api/v2/alerts through the + # Kubernetes API server's Service proxy (/api/v1/namespaces/kubescape/ + # services/alertmanager:9093/proxy/...), so the connection to :9093 + # originates from the API server. Mirror the entity set the base + # allow-kubescape policy uses for the webhook/host-scanner ports. + - fromEntities: + - kube-apiserver + - remote-node + - host + toPorts: + - ports: + - port: "9093" + protocol: TCP + egress: + # Post alert notifications to the Slack incoming-webhook. Pinned by FQDN + # (not world:443) so a compromised pod cannot exfiltrate elsewhere — the same + # lockdown posture as allow-kubescape's toFQDNs list. + - toFQDNs: + - matchName: "hooks.slack.com" + toPorts: + - ports: + - port: "443" + protocol: TCP + # DNS resolution, so Cilium's L7 DNS proxy can learn hooks.slack.com's IPs and + # enforce the toFQDNs allow above (the base policy engages the proxy too, but + # keep this self-contained). matchPattern "*" lets any name be resolved; the only egress this policy itself opens is hooks.slack.com:443. + - toEndpoints: + - matchLabels: + k8s:io.kubernetes.pod.namespace: kube-system + k8s-app: kube-dns + toPorts: + - ports: + - port: "53" + protocol: UDP + - port: "53" + protocol: TCP + rules: + dns: + - matchPattern: "*" diff --git a/k8s/providers/hetzner/infrastructure/controllers/alertmanager/helm-release.yaml b/k8s/providers/hetzner/infrastructure/controllers/alertmanager/helm-release.yaml new file mode 100644 index 000000000..99fcbad84 --- /dev/null +++ b/k8s/providers/hetzner/infrastructure/controllers/alertmanager/helm-release.yaml @@ -0,0 +1,120 @@ +apiVersion: helm.toolkit.fluxcd.io/v2 +kind: HelmRelease +metadata: + name: alertmanager + namespace: kubescape + labels: + helm.toolkit.fluxcd.io/remediation: enabled +spec: + # WHY THIS EXISTS (prod-only). 
The platform's general alerting was migrated off + # the kube-prometheus-stack onto Coroot (docs/dr/alerting.md), so there is no + # other Alertmanager in the cluster. But the Kubescape node-agent's runtime-detection + # alerts have exactly one first-class dashboard — the Headlamp Kubescape + # plugin's "Runtime Detection > Alerts" tab — and that tab reads ONLY from a + # Prometheus Alertmanager `GET /api/v2/alerts` (filtered on + # alertname="KubescapeRuleViolated"); it cannot read Kubescape storage CRs and + # cannot query Coroot's Prometheus (a metrics store, not an Alertmanager). So a + # single, minimal Alertmanager is reintroduced here — scoped to the kubescape + # namespace and to runtime alerts only, NOT a general re-adoption of the + # Prometheus stack. One instance RECEIVES the node-agent's pushed alerts + # (POST /api/v2/alerts, wired via the nodeAgent.config.alertManagerExporterUrls + # patch in ../kubescape/patches/), SERVES them to the Headlamp plugin (GET), + # and ROUTES them to Slack (slack_configs → the shared incoming-webhook). The + # third destination, Coroot, is fed independently by the node-agent's stdout + # exporter (Coroot's eBPF log capture surfaces the alerts in its Logs view; + # Coroot CE has no inbound alert receiver). Prod-only because runtimeDetection + # is disabled on the docker/local overlay and Slack is a prod-only concern. + interval: 10m + timeout: 10m + chart: + spec: + chart: alertmanager + version: 1.40.1 + sourceRef: + kind: HelmRepository + name: prometheus-community + # https://github.com/prometheus-community/helm-charts/blob/main/charts/alertmanager/values.yaml + values: + # Deterministic in-cluster name so the node-agent exporter target and the + # Headlamp plugin setting can hard-code it: Service = alertmanager.kubescape.svc:9093 (entered in Headlamp as kubescape/alertmanager:9093). + fullnameOverride: alertmanager + replicaCount: 1 + # This Alertmanager never calls the Kubernetes API — don't mount the SA + # token (chart default is true). 
+ automountServiceAccountToken: false + # Ephemeral by design: this Alertmanager only relays transient runtime alerts + # (no silences or gossip cluster worth persisting), so drop the chart's PVC + # and back /alertmanager with an emptyDir. Keeps it a zero-storage add-on. + persistence: + enabled: false + # The chart default is `resources: {}` (no requests or limits at all); the kubescape + # namespace is excluded from auto-vpa, so pin an explicit, tiny block. + resources: + requests: + cpu: 10m + memory: 32Mi + limits: + cpu: 100m + memory: 128Mi + # The kubescape namespace is excluded from the platform's add-security-context + # mutation and the enforced PSS-restricted policy, so harden the container + # here instead of relying on Kyverno. readOnlyRootFilesystem is safe: the only + # path Alertmanager writes is --storage.path=/alertmanager, mounted as the + # emptyDir above; config + the webhook secret are read-only mounts. + podSecurityContext: + fsGroup: 65534 + runAsNonRoot: true + seccompProfile: + type: RuntimeDefault + securityContext: + runAsNonRoot: true + runAsUser: 65534 + runAsGroup: 65534 + readOnlyRootFilesystem: true + allowPrivilegeEscalation: false + capabilities: + drop: + - ALL + seccompProfile: + type: RuntimeDefault + # Mount the Slack incoming-webhook from a Secret so it never lands in the + # rendered alertmanager.yml ConfigMap; referenced below via slack_configs.api_url_file. + # Sibling path (NOT under /etc/alertmanager, which the chart already mounts + # the config ConfigMap onto) to avoid a nested volume mount. + extraSecretMounts: + - name: slack-webhook + mountPath: /etc/alertmanager-secrets + secretName: alertmanager-slack-webhook + readOnly: true + config: + enabled: true + global: + resolve_timeout: 5m + route: + receiver: slack + # The node-agent labels every alert with alertname + # (KubescapeRuleViolated / KubescapeMalwareDetected) + host, so group + # per alert type per node. A single catch-all route sends everything to Slack. 
+ group_by: + - alertname + - host + group_wait: 30s + group_interval: 5m + repeat_interval: 4h + receivers: + - name: slack + slack_configs: + # api_url_file reads the webhook from the mounted Secret (above), + # keeping it out of the ConfigMap. The file name MUST match the Secret's + # key (slack-webhook-url in secret.yaml) and the directory MUST match the extraSecretMounts mountPath — they are coupled. + - api_url_file: /etc/alertmanager-secrets/slack-webhook-url + # channel is optional for an incoming webhook (the channel is baked + # into the webhook URL); set here only for clarity in the UI. + channel: "#platform-alerts" + send_resolved: true + title: '[{{ .Status | toUpper }}] {{ .CommonLabels.alertname }} on {{ .CommonLabels.host }}' + # Render every annotation the node-agent attaches (keys vary by + # rule), one per line, for each grouped alert. + text: |- + {{ range .Alerts }}{{ range .Annotations.SortedPairs }}*{{ .Name }}:* {{ .Value }} + {{ end }}{{ end }} diff --git a/k8s/providers/hetzner/infrastructure/controllers/alertmanager/helm-repository.yaml b/k8s/providers/hetzner/infrastructure/controllers/alertmanager/helm-repository.yaml new file mode 100644 index 000000000..c4de7ce6d --- /dev/null +++ b/k8s/providers/hetzner/infrastructure/controllers/alertmanager/helm-repository.yaml @@ -0,0 +1,9 @@ +--- +apiVersion: source.toolkit.fluxcd.io/v1 +kind: HelmRepository +metadata: + name: prometheus-community + namespace: kubescape +spec: + interval: 1h + url: https://prometheus-community.github.io/helm-charts diff --git a/k8s/providers/hetzner/infrastructure/controllers/alertmanager/kustomization.yaml b/k8s/providers/hetzner/infrastructure/controllers/alertmanager/kustomization.yaml new file mode 100644 index 000000000..7481b9818 --- /dev/null +++ b/k8s/providers/hetzner/infrastructure/controllers/alertmanager/kustomization.yaml @@ -0,0 +1,8 @@ +--- +apiVersion: kustomize.config.k8s.io/v1beta1 +kind: Kustomization +resources: + - helm-repository.yaml + - helm-release.yaml + - secret.yaml + - 
cilium-network-policy.yaml diff --git a/k8s/providers/hetzner/infrastructure/controllers/alertmanager/secret.yaml b/k8s/providers/hetzner/infrastructure/controllers/alertmanager/secret.yaml new file mode 100644 index 000000000..e8a3eff5a --- /dev/null +++ b/k8s/providers/hetzner/infrastructure/controllers/alertmanager/secret.yaml @@ -0,0 +1,18 @@ +# Slack incoming-webhook the Alertmanager Slack receiver POSTs Kubescape +# runtime-detection alerts to. Reuses ${alertmanager_webhook_url} — the SAME +# per-cluster webhook Coroot's incidents and the Flux notification-controller +# already post to (injected by Flux substitution from the variables-cluster +# Secret) — so there is nothing new to provision, and runtime alerts land in the +# same Slack channel as the rest of the platform's alerting. The inline `:=` default (a reserved .invalid host that never resolves) +# keeps `ksail workload validate` (which has no SOPS access) building and leaves +# local/CI inert; this overlay is prod-only anyway (runtimeDetection is disabled +# on docker/local). Mounted read-only into the Alertmanager pod and referenced by +# slack_configs.api_url_file so the URL never lands in the rendered ConfigMap. +apiVersion: v1 +kind: Secret +metadata: + name: alertmanager-slack-webhook + namespace: kubescape +type: Opaque +stringData: + slack-webhook-url: ${alertmanager_webhook_url:=https://example.invalid/no-slack-configured} diff --git a/k8s/providers/hetzner/infrastructure/controllers/kubescape/patches/helm-release-patch.yaml b/k8s/providers/hetzner/infrastructure/controllers/kubescape/patches/helm-release-patch.yaml new file mode 100644 index 000000000..377a166fb --- /dev/null +++ b/k8s/providers/hetzner/infrastructure/controllers/kubescape/patches/helm-release-patch.yaml @@ -0,0 +1,33 @@ +apiVersion: helm.toolkit.fluxcd.io/v2 +kind: HelmRelease +metadata: + name: kubescape + namespace: kubescape +spec: + # Matches the base interval so this patch is a no-op on that field once merged. 
+ interval: 10m + values: + nodeAgent: + config: + # Fan the runtime-detection alerts out to their three destinations + # (prod-only — runtimeDetection is disabled on the docker/local overlay). + # + # 1. Headlamp + 2. Slack — via the in-cluster Alertmanager deployed in + # ../../alertmanager/. The node-agent POSTs alerts to it; the Headlamp + # Kubescape plugin's "Runtime Detection > Alerts" tab reads them back + # (GET /api/v2/alerts), and the Alertmanager Slack receiver forwards + # them to the shared incoming-webhook. The value is HOST:PORT ONLY + # (no scheme, no path) — the node-agent's Prometheus Alertmanager + # client appends /api/v2/alerts itself. + # 3. Coroot — via stdout. Coroot CE has no inbound alert receiver, but + # Coroot's own eBPF node-agent captures this pod's stdout, so the alerts show + # up in Coroot's Logs view (and its log-pattern inspection). This is + # the chart default (stdoutExporter: true); pinned here so the Coroot + # path is explicit and survives a chart-default change. + # + # The default httpExporterConfig (→ synchronizer:8089) is deliberately + # left untouched: it is what feeds the plugin's other Runtime Detection + # tabs (Application Profiles, Rules, …) from the Kubescape storage CRs. + alertManagerExporterUrls: + - alertmanager.kubescape.svc:9093 + stdoutExporter: true diff --git a/k8s/providers/hetzner/infrastructure/controllers/kustomization.yaml b/k8s/providers/hetzner/infrastructure/controllers/kustomization.yaml index ee182d967..71ae9be10 100644 --- a/k8s/providers/hetzner/infrastructure/controllers/kustomization.yaml +++ b/k8s/providers/hetzner/infrastructure/controllers/kustomization.yaml @@ -8,6 +8,13 @@ resources: # local/CI only): no VirtualMachines/DataVolumes exist on Hetzner and # virt-handler crash-looped (1000+ restarts) with zero VMs. - ../../../../bases/infrastructure/controllers/ + # Prod-only: a minimal Alertmanager scoped to Kubescape runtime-detection + # alerts. 
The Headlamp Kubescape plugin's "Runtime Detection > Alerts" tab + # reads ONLY from a Prometheus Alertmanager (GET /api/v2/alerts), which the + # Coroot migration removed from the cluster; this reintroduces one, fed by the + # node-agent (kubescape/patches/ below) and routing to Slack. See its + # helm-release.yaml header. Prod-only (runtimeDetection is off on docker/local). + - alertmanager/ # Prod-only: layers a cluster-wide mutual-auth (SPIRE mTLS) policy on top of # the base Cilium controller listed above. SPIRE is enabled only on Hetzner, # so the policy lives here, not in the base — see cilium/ for the rationale. @@ -70,6 +77,10 @@ patches: # eager startup OIDC discovery that needs a cluster-resolvable + TLS-trusted # issuer, which the local *.platform.lan host-file domain isn't — see file. - path: ksail-operator/patches/helm-release-patch.yaml + # Prod-only: point the Kubescape node-agent's runtime-detection alerts at the + # in-cluster Alertmanager (alertmanager/ above) + keep the stdout exporter on, + # fanning them out to the Headlamp plugin, Slack, and Coroot — see file. + - path: kubescape/patches/helm-release-patch.yaml # NB: the Coroot CR moved to the `infrastructure` layer (so the operator's # CRD is installed first), so its hetzner patch now lives in # ../coroot/patches/ and is applied by ../kustomization.yaml, not here.