From 07f7ad2afb392b5f157989aaf75e2ae77b42950e Mon Sep 17 00:00:00 2001 From: Nikolai Emil Damm Date: Sat, 4 Jul 2026 21:25:50 +0200 Subject: [PATCH 1/2] refactor: name patch fragments by intent and codify patch naming rules MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Patch fragments now follow the CR-folder naming convention: they live under a patches/ directory, hold one resource per file, and are named ‹verb›-‹purpose›.yaml — no Kind-led names, no redundant -patch suffix (the flux-kustomization prefix stays, per the Flux CR rule). Talos machine-config patches (talos/, talos-local/) are explicitly exempt from all naming and file-structure conventions. validate-naming.py enforces the new rules (checks 7-8); three no-op patches (two empty spec: {} placeholders and a Corefile patch identical to the base) are dropped. All five affected overlays render byte-identical before and after. Co-Authored-By: Claude Fable 5 --- AGENTS.md | 7 ++-- docs/TENANTS.md | 2 +- docs/dr/alerting.md | 2 +- docs/dr/spire-server-ha.md | 2 +- docs/wireguard-vpn-access.md | 2 +- .../controllers/cilium/helm-release.yaml | 2 +- .../vault-backup/persistent-volume-claim.yaml | 2 +- ...ase-patch.yaml => shrink-persistence.yaml} | 0 ...ease-patch.yaml => trust-platform-ca.yaml} | 0 k8s/providers/docker/apps/kustomization.yaml | 4 +-- ...> disable-encryption-and-mutual-auth.yaml} | 0 ...ease-patch.yaml => trust-platform-ca.yaml} | 0 ...h.yaml => disable-heavy-capabilities.yaml} | 0 ...ch.yaml => enable-software-emulation.yaml} | 0 .../controllers/kustomization.yaml | 12 +++---- ...h.yaml => allow-insecure-kubelet-tls.yaml} | 0 ...ease-patch.yaml => disable-helm-wait.yaml} | 0 .../apps/backstage/cilium-network-policy.yaml | 2 +- ...tch.yaml => enable-coroot-monitoring.yaml} | 0 .../headlamp/patches/helm-release-patch.yaml | 6 ---- ...patch.yaml => enable-user-namespaces.yaml} | 0 k8s/providers/hetzner/apps/kustomization.yaml | 13 ++++--- 
.../apps/umami/cilium-network-policy.yaml | 2 +- ...tch.yaml => enable-coroot-monitoring.yaml} | 2 +- ...nitor-patch.yaml => grant-pg-monitor.yaml} | 0 ...lux-kustomization-protect-wedding-db.yaml} | 0 ...patch.yaml => enable-user-namespaces.yaml} | 0 ...aml => enforce-wireguard-strict-mode.yaml} | 2 +- ...h.yaml => store-spire-data-on-hcloud.yaml} | 0 .../controllers/coredns/kustomization.yaml | 2 -- .../patches/corefile-config-map-patch.yaml | 30 ---------------- .../patches/helm-release-patch.yaml | 6 ---- ...lm-release-patch.yaml => enable-oidc.yaml} | 0 .../controllers/kustomization.yaml | 11 +++--- ...e-patch.yaml => store-data-on-hcloud.yaml} | 0 .../controllers/velero/config-map.yaml | 2 +- ...e-patch.yaml => enable-csi-snapshots.yaml} | 0 .../infrastructure/coroot/cluster.yaml | 2 +- .../{coroot-patch.yaml => enable-ha.yaml} | 0 .../hetzner/infrastructure/kustomization.yaml | 10 +++--- .../add-ascoachingogvaner-dk-listeners.yaml} | 0 .../attach-hcloud-load-balancer.yaml} | 0 .../store-vault-snapshots-on-hcloud.yaml} | 0 scripts/validate-naming.py | 35 ++++++++++++++++--- 44 files changed, 70 insertions(+), 90 deletions(-) rename k8s/providers/docker/apps/actual-budget/patches/{helm-release-patch.yaml => shrink-persistence.yaml} (100%) rename k8s/providers/docker/apps/headlamp/patches/{helm-release-patch.yaml => trust-platform-ca.yaml} (100%) rename k8s/providers/docker/infrastructure/controllers/cilium/patches/{helm-release-patch.yaml => disable-encryption-and-mutual-auth.yaml} (100%) rename k8s/providers/docker/infrastructure/controllers/flux-operator/patches/{helm-release-patch.yaml => trust-platform-ca.yaml} (100%) rename k8s/providers/docker/infrastructure/controllers/kubescape/patches/{helm-release-patch.yaml => disable-heavy-capabilities.yaml} (100%) rename k8s/providers/docker/infrastructure/controllers/kubevirt/patches/{kubevirt-cr-patch.yaml => enable-software-emulation.yaml} (100%) rename 
k8s/providers/docker/infrastructure/controllers/metrics-server/patches/{helm-release-patch.yaml => allow-insecure-kubelet-tls.yaml} (100%) rename k8s/providers/docker/infrastructure/controllers/tetragon/patches/{helm-release-patch.yaml => disable-helm-wait.yaml} (100%) rename k8s/providers/hetzner/apps/backstage/patches/{postgres-cluster-patch.yaml => enable-coroot-monitoring.yaml} (100%) delete mode 100644 k8s/providers/hetzner/apps/headlamp/patches/helm-release-patch.yaml rename k8s/providers/hetzner/apps/homepage/patches/{helm-release-patch.yaml => enable-user-namespaces.yaml} (100%) rename k8s/providers/hetzner/apps/umami/patches/{postgres-cluster-patch.yaml => enable-coroot-monitoring.yaml} (96%) rename k8s/providers/hetzner/apps/umami/patches/{grant-pg-monitor-patch.yaml => grant-pg-monitor.yaml} (100%) rename k8s/providers/hetzner/apps/wedding-app/patches/{flux-kustomization-patch.yaml => flux-kustomization-protect-wedding-db.yaml} (100%) rename k8s/providers/hetzner/apps/whoami/patches/{helm-release-patch.yaml => enable-user-namespaces.yaml} (100%) rename k8s/providers/hetzner/infrastructure/controllers/cilium/patches/{helm-release-patch.yaml => enforce-wireguard-strict-mode.yaml} (98%) rename k8s/providers/hetzner/infrastructure/controllers/cilium/patches/{spire-datastorage-patch.yaml => store-spire-data-on-hcloud.yaml} (100%) delete mode 100644 k8s/providers/hetzner/infrastructure/controllers/coredns/patches/corefile-config-map-patch.yaml delete mode 100644 k8s/providers/hetzner/infrastructure/controllers/flux-operator/patches/helm-release-patch.yaml rename k8s/providers/hetzner/infrastructure/controllers/ksail-operator/patches/{helm-release-patch.yaml => enable-oidc.yaml} (100%) rename k8s/providers/hetzner/infrastructure/controllers/openbao/patches/{helm-release-patch.yaml => store-data-on-hcloud.yaml} (100%) rename k8s/providers/hetzner/infrastructure/controllers/velero/patches/{helm-release-patch.yaml => enable-csi-snapshots.yaml} (100%) rename 
k8s/providers/hetzner/infrastructure/coroot/patches/{coroot-patch.yaml => enable-ha.yaml} (100%) rename k8s/providers/hetzner/infrastructure/{ascoachingogvaner-dk-listeners-patch.yaml => patches/add-ascoachingogvaner-dk-listeners.yaml} (100%) rename k8s/providers/hetzner/infrastructure/{gateway-patch.yaml => patches/attach-hcloud-load-balancer.yaml} (100%) rename k8s/providers/hetzner/infrastructure/{vault-snapshots-hcloud-patch.yaml => patches/store-vault-snapshots-on-hcloud.yaml} (100%) diff --git a/AGENTS.md b/AGENTS.md index 22747fd2e..3f9b232ef 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -206,18 +206,19 @@ You **cannot** decrypt existing secrets without the proper Age keys. For local d - **Never commit plaintext secrets** — all secrets must be SOPS-encrypted with the `.enc.yaml` suffix. - **Base files are immutable** — use Kustomize `patches:` in overlays; never edit `k8s/bases/` directly from a provider or cluster overlay. - **Flux dependency order** — `bootstrap` → `infrastructure-controllers` → `infrastructure` → `apps`. One prod-only side layer hangs off `infrastructure` without gating `apps`: `infrastructure-overprovisioning` (apply-only autoscaler buffer). Declarative GitHub org management runs as a normal **app** (`github-config`) consuming the `devantler-tech/.github` artifact, with its Crossplane provider in the `infrastructure` layer — see [`docs/github-management.md`](docs/github-management.md). -- **File & directory naming** — kebab-case folders, one resource per file, and filenames led by the resource Kind (CR folders excepted); enforced by the `naming` CI job. See [File and Directory Naming Conventions](#file-and-directory-naming-conventions) below. +- **File & directory naming** — kebab-case folders, one resource per file, and filenames led by the resource Kind (CR folders and `patches/` excepted — both name files by intent); Talos machine-config patches (`talos/`, `talos-local/`) are fully exempt. Enforced by the `naming` CI job. 
See [File and Directory Naming Conventions](#file-and-directory-naming-conventions) below. ### File and Directory Naming Conventions Enforced in CI by [`scripts/validate-naming.py`](scripts/validate-naming.py) (the `naming` job in `ci.yaml`); run it locally before any manifest PR. - **Directories are kebab-case**, named after the **application/component** *or* a **CR Kind in plural**. Co-locate a component's own CRs in its folder by default; break a CR out into a `‹kind-plural›/` folder only when it cannot live with its component (see the two reasons in the next section). `‹kind-plural›` is the **kebab-cased plural of the Kind** (`VerticalPodAutoscaler → vertical-pod-autoscalers/`, `LimitRange → limit-ranges/`) — a folder that groups ≥2 instances of one non-workload Kind under any other name is flagged. -- **One Kubernetes resource per file.** The only exception is a vendored upstream operator bundle, explicitly whitelisted in the validator (today `controllers/cdi/cdi-operator.yaml` and `controllers/kubevirt/kubevirt-operator.yaml`). +- **One Kubernetes resource per file** — patch fragments included. The only exception is a vendored upstream operator bundle, explicitly whitelisted in the validator (today `controllers/cdi/cdi-operator.yaml` and `controllers/kubevirt/kubevirt-operator.yaml`). - **Component-folder files are named after their resource Kind, kebab-cased**: `‹kind›.yaml` (e.g. `helm-release.yaml`, `http-route.yaml`, `cilium-network-policy.yaml`, `service-account.yaml`). When a folder holds more than one of a Kind, qualify each with a purpose: `‹kind›-‹purpose›.yaml` (e.g. `external-secret-db-backup.yaml`). The Kind→kebab map is acronym-aware: `HTTPRoute → http-route`, `OCIRepository → oci-repository`, `CiliumNetworkPolicy → cilium-network-policy`, `PodDisruptionBudget → pod-disruption-budget`. - **CR-folder files** omit the folder-implied Kind and are named `‹verb›-‹purpose›.yaml` (e.g. `restrict-tenant-secret-stores.yaml`). 
- A **Flux `Kustomization` CR** (`kustomize.toolkit.fluxcd.io`) is named `flux-kustomization*.yaml`; the `flux-` prefix disambiguates it from the kustomize **build** file, which must stay exactly `kustomization.yaml` (`kustomize.config.k8s.io`). -- **Patch fragments** (under a `patches/` directory or named `*-patch.yaml`) are overlay inputs, not deployed resources — they keep an intent-describing name and are exempt from the Kind-leads rule (but stay kebab-case, one-resource, and keep the `flux-kustomization` prefix where applicable). +- **Patch fragments** are overlay inputs, not deployed resources. They live under a `patches/` directory (a `*-patch.yaml` loose next to a kustomization is flagged as misplaced) and follow the **CR-folder naming convention**: an intent-describing `‹verb›-‹purpose›.yaml` (e.g. `enable-oidc.yaml`, `store-spire-data-on-hcloud.yaml`) that neither leads with the patched Kind nor carries a `-patch` suffix — the folder already says it's a patch. One-resource-per-file applies to them too; a patch on a Flux `Kustomization` CR keeps the `flux-kustomization` prefix (e.g. `flux-kustomization-protect-wedding-db.yaml`). +- **Talos machine-config patches** (`talos/`, `talos-local/`) are **exempt from all naming and file-structure conventions** — they are Talos config fragments, not Kubernetes manifests, and keep upstream Talos' file style. ### Infrastructure File Structure Convention diff --git a/docs/TENANTS.md b/docs/TENANTS.md index 6b16ffd98..c357049c6 100644 --- a/docs/TENANTS.md +++ b/docs/TENANTS.md @@ -191,7 +191,7 @@ Open the change as a PR; once merged, Flux reconciles the new tenant. 
A tenant's manifests ship from its own OCI artifact, but the **prod (hetzner) overlay** can layer a `Kustomization` `spec.patches` onto the tenant's platform-side Flux `Kustomization` -at `k8s/providers/hetzner/apps/‹tenant›/patches/kustomization-patch.yaml`, which Flux then +at `k8s/providers/hetzner/apps/‹tenant›/patches/flux-kustomization-‹purpose›.yaml`, which Flux then applies to the tenant's resources *after* pulling the artifact. This is a **narrow escape hatch**, not a place for tenant config. diff --git a/docs/dr/alerting.md b/docs/dr/alerting.md index dbbec54f7..2b7324721 100644 --- a/docs/dr/alerting.md +++ b/docs/dr/alerting.md @@ -33,7 +33,7 @@ pressure (the operator only exposes `priorityClassName` on the node-agent). Local/CI (docker provider) runs the same CR on the cluster's default storage class (ephemeral — losing telemetry on a restart is fine there). The `hcloud` PVC overrides and longer retention live in the hetzner overlay -(`k8s/providers/hetzner/infrastructure/controllers/coroot/patches/`), the same +(`k8s/providers/hetzner/infrastructure/coroot/patches/`), the same way OpenBao gets block storage. ## SSO diff --git a/docs/dr/spire-server-ha.md b/docs/dr/spire-server-ha.md index e7321d414..85d7b136c 100644 --- a/docs/dr/spire-server-ha.md +++ b/docs/dr/spire-server-ha.md @@ -39,7 +39,7 @@ Three independent blockers, each fatal to a blind change: i.e. **behind the very gate SPIRE bootstraps**. SPIRE down → its Postgres unreachable/uncertifiable → SPIRE can't start → loop. This is precisely the SPIRE↔Longhorn deadlock the prod overlay already engineered around by moving - the datastore to hcloud-csi (`cilium/patches/spire-datastorage-patch.yaml`, + the datastore to hcloud-csi (`cilium/patches/store-spire-data-on-hcloud.yaml`, 2026-06-06 prod outage), but Postgres is a *busier, multi-pod, multi-node* dependency than a single attached block device, so it is strictly harder to make safe. 
diff --git a/docs/wireguard-vpn-access.md b/docs/wireguard-vpn-access.md index 790238873..2cfd10835 100644 --- a/docs/wireguard-vpn-access.md +++ b/docs/wireguard-vpn-access.md @@ -28,7 +28,7 @@ Talos control planes run a WireGuard **server** (`wg0` = `10.200.0.1/24`, | --- | --- | --- | | Gateway | single Cilium `Gateway platform` (kube-system), HTTPS:443, wildcard `gateway-tls` | `k8s/bases/infrastructure/gateway/` | | Admin routes | all 7 HTTPRoutes bind that Gateway, `allowedRoutes.from: All` | per-controller `http-route.yaml` | -| LB | `cilium-gateway-platform` Service `type=LoadBalancer` → **Hetzner Cloud LB** | `gateway-patch.yaml` (hcloud annotations) | +| LB | `cilium-gateway-platform` Service `type=LoadBalancer` → **Hetzner Cloud LB** | `patches/attach-hcloud-load-balancer.yaml` (hcloud annotations) | | LB IPs | public `49.12.20.241` (+ IPv6) **and private `10.0.1.7`**; ports 80/443 (nodePorts 32269/30755); `externalTrafficPolicy: Cluster` | live `svc` status | | Public DNS | admin hostnames → **Cloudflare** (`188.114.96/97.1`) → proxied to the LB origin | `dig` | | Cilium | `kube-proxy-replacement=true`, `routing-mode=tunnel/vxlan`, `enable-ipv4-masquerade=true`, LB-IPAM enabled **but no `CiliumLoadBalancerIPPool`**, **`devices = enp7s0 eth1`** | live `cilium-config` | diff --git a/k8s/bases/infrastructure/controllers/cilium/helm-release.yaml b/k8s/bases/infrastructure/controllers/cilium/helm-release.yaml index 560f7454d..636a79e20 100644 --- a/k8s/bases/infrastructure/controllers/cilium/helm-release.yaml +++ b/k8s/bases/infrastructure/controllers/cilium/helm-release.yaml @@ -264,7 +264,7 @@ spec: # # The datastore StorageClass is intentionally NOT set here. On # hetzner it is pinned to hcloud-csi by the prod overlay - # (providers/hetzner/.../cilium/patches/spire-datastorage-patch.yaml) + # (providers/hetzner/.../cilium/patches/store-spire-data-on-hcloud.yaml) # — NOT the cluster-default longhorn. 
The reason is a circular # dependency: Longhorn's own control plane is pod-to-pod traffic # behind this very mTLS gate, so binding spire-server's PVC to diff --git a/k8s/bases/infrastructure/vault-backup/persistent-volume-claim.yaml b/k8s/bases/infrastructure/vault-backup/persistent-volume-claim.yaml index ab7ea72fa..22dcefa47 100644 --- a/k8s/bases/infrastructure/vault-backup/persistent-volume-claim.yaml +++ b/k8s/bases/infrastructure/vault-backup/persistent-volume-claim.yaml @@ -1,7 +1,7 @@ # Dedicated volume for OpenBao raft snapshots (written by the # vault-snapshot CronJob, newest 14 retained). Uses the cluster's default # StorageClass (the hetzner overlay pins it to hcloud — see -# k8s/providers/hetzner/infrastructure/vault-snapshots-hcloud-patch.yaml). +# k8s/providers/hetzner/infrastructure/patches/store-vault-snapshots-on-hcloud.yaml). # Snapshots on this PVC are the first-line restore source after OpenBao data # loss: the vault-config Job restores from the newest one automatically when # it finds an uninitialized cluster alongside a surviving openbao-unseal diff --git a/k8s/providers/docker/apps/actual-budget/patches/helm-release-patch.yaml b/k8s/providers/docker/apps/actual-budget/patches/shrink-persistence.yaml similarity index 100% rename from k8s/providers/docker/apps/actual-budget/patches/helm-release-patch.yaml rename to k8s/providers/docker/apps/actual-budget/patches/shrink-persistence.yaml diff --git a/k8s/providers/docker/apps/headlamp/patches/helm-release-patch.yaml b/k8s/providers/docker/apps/headlamp/patches/trust-platform-ca.yaml similarity index 100% rename from k8s/providers/docker/apps/headlamp/patches/helm-release-patch.yaml rename to k8s/providers/docker/apps/headlamp/patches/trust-platform-ca.yaml diff --git a/k8s/providers/docker/apps/kustomization.yaml b/k8s/providers/docker/apps/kustomization.yaml index 1d33175a3..a0f75bbfe 100644 --- a/k8s/providers/docker/apps/kustomization.yaml +++ b/k8s/providers/docker/apps/kustomization.yaml @@ 
-22,12 +22,12 @@ kind: Kustomization # kind: HelmRelease # name: actual-budget # namespace: actual-budget -# path: actual-budget/patches/helm-release-patch.yaml +# path: actual-budget/patches/shrink-persistence.yaml # - target: # kind: HelmRelease # name: headlamp # namespace: headlamp -# path: headlamp/patches/helm-release-patch.yaml +# path: headlamp/patches/trust-platform-ca.yaml # # Tenant RGD pilot (#1932 step 3) — opt-in like the apps above. Enabling it has # the KRO controller expand ascoachingogvaner's full control-plane skeleton from diff --git a/k8s/providers/docker/infrastructure/controllers/cilium/patches/helm-release-patch.yaml b/k8s/providers/docker/infrastructure/controllers/cilium/patches/disable-encryption-and-mutual-auth.yaml similarity index 100% rename from k8s/providers/docker/infrastructure/controllers/cilium/patches/helm-release-patch.yaml rename to k8s/providers/docker/infrastructure/controllers/cilium/patches/disable-encryption-and-mutual-auth.yaml diff --git a/k8s/providers/docker/infrastructure/controllers/flux-operator/patches/helm-release-patch.yaml b/k8s/providers/docker/infrastructure/controllers/flux-operator/patches/trust-platform-ca.yaml similarity index 100% rename from k8s/providers/docker/infrastructure/controllers/flux-operator/patches/helm-release-patch.yaml rename to k8s/providers/docker/infrastructure/controllers/flux-operator/patches/trust-platform-ca.yaml diff --git a/k8s/providers/docker/infrastructure/controllers/kubescape/patches/helm-release-patch.yaml b/k8s/providers/docker/infrastructure/controllers/kubescape/patches/disable-heavy-capabilities.yaml similarity index 100% rename from k8s/providers/docker/infrastructure/controllers/kubescape/patches/helm-release-patch.yaml rename to k8s/providers/docker/infrastructure/controllers/kubescape/patches/disable-heavy-capabilities.yaml diff --git a/k8s/providers/docker/infrastructure/controllers/kubevirt/patches/kubevirt-cr-patch.yaml 
b/k8s/providers/docker/infrastructure/controllers/kubevirt/patches/enable-software-emulation.yaml similarity index 100% rename from k8s/providers/docker/infrastructure/controllers/kubevirt/patches/kubevirt-cr-patch.yaml rename to k8s/providers/docker/infrastructure/controllers/kubevirt/patches/enable-software-emulation.yaml diff --git a/k8s/providers/docker/infrastructure/controllers/kustomization.yaml b/k8s/providers/docker/infrastructure/controllers/kustomization.yaml index 234078d4a..fb4e1b24c 100644 --- a/k8s/providers/docker/infrastructure/controllers/kustomization.yaml +++ b/k8s/providers/docker/infrastructure/controllers/kustomization.yaml @@ -75,10 +75,10 @@ components: # enough — no redundant `target:` selector. Matches the path-only style used by # the hetzner overlays. patches: - - path: cilium/patches/helm-release-patch.yaml - - path: metrics-server/patches/helm-release-patch.yaml - - path: flux-operator/patches/helm-release-patch.yaml + - path: cilium/patches/disable-encryption-and-mutual-auth.yaml + - path: metrics-server/patches/allow-insecure-kubelet-tls.yaml + - path: flux-operator/patches/trust-platform-ca.yaml # Opt-in patches — uncomment alongside the matching controller above: - # - path: kubescape/patches/helm-release-patch.yaml - # - path: tetragon/patches/helm-release-patch.yaml - # - path: kubevirt/patches/kubevirt-cr-patch.yaml + # - path: kubescape/patches/disable-heavy-capabilities.yaml + # - path: tetragon/patches/disable-helm-wait.yaml + # - path: kubevirt/patches/enable-software-emulation.yaml diff --git a/k8s/providers/docker/infrastructure/controllers/metrics-server/patches/helm-release-patch.yaml b/k8s/providers/docker/infrastructure/controllers/metrics-server/patches/allow-insecure-kubelet-tls.yaml similarity index 100% rename from k8s/providers/docker/infrastructure/controllers/metrics-server/patches/helm-release-patch.yaml rename to 
k8s/providers/docker/infrastructure/controllers/metrics-server/patches/allow-insecure-kubelet-tls.yaml diff --git a/k8s/providers/docker/infrastructure/controllers/tetragon/patches/helm-release-patch.yaml b/k8s/providers/docker/infrastructure/controllers/tetragon/patches/disable-helm-wait.yaml similarity index 100% rename from k8s/providers/docker/infrastructure/controllers/tetragon/patches/helm-release-patch.yaml rename to k8s/providers/docker/infrastructure/controllers/tetragon/patches/disable-helm-wait.yaml diff --git a/k8s/providers/hetzner/apps/backstage/cilium-network-policy.yaml b/k8s/providers/hetzner/apps/backstage/cilium-network-policy.yaml index 937e5d2f6..f4b8a2897 100644 --- a/k8s/providers/hetzner/apps/backstage/cilium-network-policy.yaml +++ b/k8s/providers/hetzner/apps/backstage/cilium-network-policy.yaml @@ -2,7 +2,7 @@ # CloudNativePG instances on 5432 for the declarative Postgres integration — the # cluster-agent discovers the DB pods via the coroot.com/postgres-scrape # annotations the Cluster's inheritedMetadata stamps onto them, then connects to -# scrape pg_stat_* (see patches/postgres-cluster-patch.yaml). +# scrape pg_stat_* (see patches/enable-coroot-monitoring.yaml). 
# # ADDITIVE to allow-backstage (Cilium unions ingress allows across policies): # this only adds the cross-namespace observability source on 5432 while diff --git a/k8s/providers/hetzner/apps/backstage/patches/postgres-cluster-patch.yaml b/k8s/providers/hetzner/apps/backstage/patches/enable-coroot-monitoring.yaml similarity index 100% rename from k8s/providers/hetzner/apps/backstage/patches/postgres-cluster-patch.yaml rename to k8s/providers/hetzner/apps/backstage/patches/enable-coroot-monitoring.yaml diff --git a/k8s/providers/hetzner/apps/headlamp/patches/helm-release-patch.yaml b/k8s/providers/hetzner/apps/headlamp/patches/helm-release-patch.yaml deleted file mode 100644 index 621e8f478..000000000 --- a/k8s/providers/hetzner/apps/headlamp/patches/helm-release-patch.yaml +++ /dev/null @@ -1,6 +0,0 @@ -apiVersion: helm.toolkit.fluxcd.io/v2 -kind: HelmRelease -metadata: - name: headlamp - namespace: headlamp -spec: {} diff --git a/k8s/providers/hetzner/apps/homepage/patches/helm-release-patch.yaml b/k8s/providers/hetzner/apps/homepage/patches/enable-user-namespaces.yaml similarity index 100% rename from k8s/providers/hetzner/apps/homepage/patches/helm-release-patch.yaml rename to k8s/providers/hetzner/apps/homepage/patches/enable-user-namespaces.yaml diff --git a/k8s/providers/hetzner/apps/kustomization.yaml b/k8s/providers/hetzner/apps/kustomization.yaml index 5f736a323..1877af5f1 100644 --- a/k8s/providers/hetzner/apps/kustomization.yaml +++ b/k8s/providers/hetzner/apps/kustomization.yaml @@ -40,13 +40,12 @@ patches: # removed with the app: an unreferenced patch fragment is schema-validated # standalone by `ksail workload validate`, and a partial HelmRelease (no # spec.interval) fails. Restore the patch from git history if re-enabling. 
- - path: headlamp/patches/helm-release-patch.yaml - - path: wedding-app/patches/flux-kustomization-patch.yaml + - path: wedding-app/patches/flux-kustomization-protect-wedding-db.yaml # Prod-only Coroot Postgres integration for the in-repo CNPG DBs (strategic # merge: inheritedMetadata scrape annotations + pg_stat_statements params, plus # for backstage a pg_monitor managed role). These self-identify, so no target. - - path: umami/patches/postgres-cluster-patch.yaml - - path: backstage/patches/postgres-cluster-patch.yaml + - path: umami/patches/enable-coroot-monitoring.yaml + - path: backstage/patches/enable-coroot-monitoring.yaml # umami's pg_monitor grant is a JSON6902 append onto its EXISTING managed role # (so it doesn't replace the list / desync the OpenBao-rotation enforcer); a # JSON6902 patch has no embedded identity, so it keeps an explicit target. @@ -56,7 +55,7 @@ patches: kind: Cluster name: umami-db namespace: umami - path: umami/patches/grant-pg-monitor-patch.yaml + path: umami/patches/grant-pg-monitor.yaml # User namespaces (hostUsers: false) — prod-only; see each patch file for the # full rationale. JSON6902 patches (a list of ops) have no embedded identity, # so they must keep an explicit target. 
whoami is the proven stateless canary @@ -66,9 +65,9 @@ patches: kind: HelmRelease name: whoami namespace: whoami - path: whoami/patches/helm-release-patch.yaml + path: whoami/patches/enable-user-namespaces.yaml - target: kind: HelmRelease name: homepage namespace: homepage - path: homepage/patches/helm-release-patch.yaml + path: homepage/patches/enable-user-namespaces.yaml diff --git a/k8s/providers/hetzner/apps/umami/cilium-network-policy.yaml b/k8s/providers/hetzner/apps/umami/cilium-network-policy.yaml index d902379b9..bd948a444 100644 --- a/k8s/providers/hetzner/apps/umami/cilium-network-policy.yaml +++ b/k8s/providers/hetzner/apps/umami/cilium-network-policy.yaml @@ -2,7 +2,7 @@ # CloudNativePG instances on 5432 for the declarative Postgres integration — # the cluster-agent discovers the DB pods via the coroot.com/postgres-scrape # annotations the Cluster's inheritedMetadata stamps onto them, then connects to -# scrape pg_stat_* (see patches/postgres-cluster-patch.yaml). +# scrape pg_stat_* (see patches/enable-coroot-monitoring.yaml). # # ADDITIVE, not a replacement for allow-umami: Cilium unions the ingress allows # across every policy selecting an endpoint, so this standalone policy only adds diff --git a/k8s/providers/hetzner/apps/umami/patches/postgres-cluster-patch.yaml b/k8s/providers/hetzner/apps/umami/patches/enable-coroot-monitoring.yaml similarity index 96% rename from k8s/providers/hetzner/apps/umami/patches/postgres-cluster-patch.yaml rename to k8s/providers/hetzner/apps/umami/patches/enable-coroot-monitoring.yaml index a01be9935..e3c00e63f 100644 --- a/k8s/providers/hetzner/apps/umami/patches/postgres-cluster-patch.yaml +++ b/k8s/providers/hetzner/apps/umami/patches/enable-coroot-monitoring.yaml @@ -1,7 +1,7 @@ # Prod-only Coroot Postgres integration for umami-db (strategic merge onto the # base Cluster). Coroot/SPIRE run only on Hetzner, so this lives in the hetzner # overlay alongside umami's other prod-only wiring. 
The pg_monitor grant on the -# `umami` role is a separate JSON6902 patch (grant-pg-monitor-patch.yaml) so it +# `umami` role is a separate JSON6902 patch (grant-pg-monitor.yaml) so it # appends to the existing managed role instead of replacing the list. # # inheritedMetadata.annotations — CNPG propagates these to the DB instance diff --git a/k8s/providers/hetzner/apps/umami/patches/grant-pg-monitor-patch.yaml b/k8s/providers/hetzner/apps/umami/patches/grant-pg-monitor.yaml similarity index 100% rename from k8s/providers/hetzner/apps/umami/patches/grant-pg-monitor-patch.yaml rename to k8s/providers/hetzner/apps/umami/patches/grant-pg-monitor.yaml diff --git a/k8s/providers/hetzner/apps/wedding-app/patches/flux-kustomization-patch.yaml b/k8s/providers/hetzner/apps/wedding-app/patches/flux-kustomization-protect-wedding-db.yaml similarity index 100% rename from k8s/providers/hetzner/apps/wedding-app/patches/flux-kustomization-patch.yaml rename to k8s/providers/hetzner/apps/wedding-app/patches/flux-kustomization-protect-wedding-db.yaml diff --git a/k8s/providers/hetzner/apps/whoami/patches/helm-release-patch.yaml b/k8s/providers/hetzner/apps/whoami/patches/enable-user-namespaces.yaml similarity index 100% rename from k8s/providers/hetzner/apps/whoami/patches/helm-release-patch.yaml rename to k8s/providers/hetzner/apps/whoami/patches/enable-user-namespaces.yaml diff --git a/k8s/providers/hetzner/infrastructure/controllers/cilium/patches/helm-release-patch.yaml b/k8s/providers/hetzner/infrastructure/controllers/cilium/patches/enforce-wireguard-strict-mode.yaml similarity index 98% rename from k8s/providers/hetzner/infrastructure/controllers/cilium/patches/helm-release-patch.yaml rename to k8s/providers/hetzner/infrastructure/controllers/cilium/patches/enforce-wireguard-strict-mode.yaml index d7cd4d15f..fe9222d33 100644 --- a/k8s/providers/hetzner/infrastructure/controllers/cilium/patches/helm-release-patch.yaml +++ 
b/k8s/providers/hetzner/infrastructure/controllers/cilium/patches/enforce-wireguard-strict-mode.yaml @@ -98,7 +98,7 @@ spec: # Safe on this topology (all verified live, 2026-06-19): # - Ingress: the Hetzner LB reaches nodes on their PRIVATE IPs # (load-balancer.hetzner.cloud/use-private-ip: "true", see - # ../../../gateway-patch.yaml) and clients hit the LB's public IP, not + # ../../../patches/attach-hcloud-load-balancer.yaml) and clients hit the LB's public IP, not # node public IPs. The private NIC stays a Cilium device, so the LB's # NodePort BPF is unaffected; only unused public-NIC NodePort BPF is shed. # - Egress: pod -> internet masquerade is the device-AGNOSTIC iptables rule diff --git a/k8s/providers/hetzner/infrastructure/controllers/cilium/patches/spire-datastorage-patch.yaml b/k8s/providers/hetzner/infrastructure/controllers/cilium/patches/store-spire-data-on-hcloud.yaml similarity index 100% rename from k8s/providers/hetzner/infrastructure/controllers/cilium/patches/spire-datastorage-patch.yaml rename to k8s/providers/hetzner/infrastructure/controllers/cilium/patches/store-spire-data-on-hcloud.yaml diff --git a/k8s/providers/hetzner/infrastructure/controllers/coredns/kustomization.yaml b/k8s/providers/hetzner/infrastructure/controllers/coredns/kustomization.yaml index a9c7aec42..d56e1caf0 100644 --- a/k8s/providers/hetzner/infrastructure/controllers/coredns/kustomization.yaml +++ b/k8s/providers/hetzner/infrastructure/controllers/coredns/kustomization.yaml @@ -4,5 +4,3 @@ kind: Kustomization resources: - ../../../../../bases/infrastructure/controllers/coredns/ - pod-disruption-budget.yaml -patches: - - path: patches/corefile-config-map-patch.yaml diff --git a/k8s/providers/hetzner/infrastructure/controllers/coredns/patches/corefile-config-map-patch.yaml b/k8s/providers/hetzner/infrastructure/controllers/coredns/patches/corefile-config-map-patch.yaml deleted file mode 100644 index ba71afdf3..000000000 --- 
a/k8s/providers/hetzner/infrastructure/controllers/coredns/patches/corefile-config-map-patch.yaml +++ /dev/null @@ -1,30 +0,0 @@ -apiVersion: v1 -kind: ConfigMap -metadata: - name: coredns - namespace: kube-system -data: - Corefile: | - .:53 { - errors - health { - lameduck 5s - } - ready - kubernetes cluster.local in-addr.arpa ip6.arpa { - pods insecure - fallthrough in-addr.arpa ip6.arpa - ttl 30 - } - prometheus :9153 - forward . /etc/resolv.conf { - max_concurrent 1000 - } - cache 30 { - disable success cluster.local - disable denial cluster.local - } - loop - reload - loadbalance - } diff --git a/k8s/providers/hetzner/infrastructure/controllers/flux-operator/patches/helm-release-patch.yaml b/k8s/providers/hetzner/infrastructure/controllers/flux-operator/patches/helm-release-patch.yaml deleted file mode 100644 index 243dcc0ce..000000000 --- a/k8s/providers/hetzner/infrastructure/controllers/flux-operator/patches/helm-release-patch.yaml +++ /dev/null @@ -1,6 +0,0 @@ -apiVersion: helm.toolkit.fluxcd.io/v2 -kind: HelmRelease -metadata: - name: flux-operator - namespace: flux-system -spec: {} diff --git a/k8s/providers/hetzner/infrastructure/controllers/ksail-operator/patches/helm-release-patch.yaml b/k8s/providers/hetzner/infrastructure/controllers/ksail-operator/patches/enable-oidc.yaml similarity index 100% rename from k8s/providers/hetzner/infrastructure/controllers/ksail-operator/patches/helm-release-patch.yaml rename to k8s/providers/hetzner/infrastructure/controllers/ksail-operator/patches/enable-oidc.yaml diff --git a/k8s/providers/hetzner/infrastructure/controllers/kustomization.yaml b/k8s/providers/hetzner/infrastructure/controllers/kustomization.yaml index ee182d967..6b7527169 100644 --- a/k8s/providers/hetzner/infrastructure/controllers/kustomization.yaml +++ b/k8s/providers/hetzner/infrastructure/controllers/kustomization.yaml @@ -54,22 +54,21 @@ components: - ../../../../bases/components/helmrelease-drift-detection - 
../../../../bases/components/helmrelease-flux-defaults patches: - - path: flux-operator/patches/helm-release-patch.yaml - - path: openbao/patches/helm-release-patch.yaml + - path: openbao/patches/store-data-on-hcloud.yaml # Prod-only fail-closed WireGuard encryption (egress strict mode) — see file. - - path: cilium/patches/helm-release-patch.yaml + - path: cilium/patches/enforce-wireguard-strict-mode.yaml # Prod-only: move the SPIRE server datastore off Longhorn onto hcloud-csi to # break the SPIRE <-> Longhorn <-> mTLS circular deadlock — see file. - - path: cilium/patches/spire-datastorage-patch.yaml + - path: cilium/patches/store-spire-data-on-hcloud.yaml # Prod-only: switch Velero to a MIXED backup model — CSI volume snapshots + # data movement for Longhorn-backed PVCs (→ R2), file-system (Kopia) backup # for openbao + any non-Longhorn PVC. The base tier stays all-FSB because the # docker/local-path provider cannot snapshot. - - path: velero/patches/helm-release-patch.yaml + - path: velero/patches/enable-csi-snapshots.yaml # Prod-only: enable the ksail-operator's native Dex OIDC. The operator does # eager startup OIDC discovery that needs a cluster-resolvable + TLS-trusted # issuer, which the local *.platform.lan host-file domain isn't — see file. - - path: ksail-operator/patches/helm-release-patch.yaml + - path: ksail-operator/patches/enable-oidc.yaml # NB: the Coroot CR moved to the `infrastructure` layer (so the operator's # CRD is installed first), so its hetzner patch now lives in # ../coroot/patches/ and is applied by ../kustomization.yaml, not here. 
diff --git a/k8s/providers/hetzner/infrastructure/controllers/openbao/patches/helm-release-patch.yaml b/k8s/providers/hetzner/infrastructure/controllers/openbao/patches/store-data-on-hcloud.yaml similarity index 100% rename from k8s/providers/hetzner/infrastructure/controllers/openbao/patches/helm-release-patch.yaml rename to k8s/providers/hetzner/infrastructure/controllers/openbao/patches/store-data-on-hcloud.yaml diff --git a/k8s/providers/hetzner/infrastructure/controllers/velero/config-map.yaml b/k8s/providers/hetzner/infrastructure/controllers/velero/config-map.yaml index 7b3a5a80b..89e2a7b0c 100644 --- a/k8s/providers/hetzner/infrastructure/controllers/velero/config-map.yaml +++ b/k8s/providers/hetzner/infrastructure/controllers/velero/config-map.yaml @@ -1,6 +1,6 @@ # Velero Volume Policy (prod/hetzner only) — routes each PVC to the right backup # method by StorageClass. Referenced by name from the daily-full schedule's -# spec.resourcePolicy (see patches/helm-release-patch.yaml). +# spec.resourcePolicy (see patches/enable-csi-snapshots.yaml). 
# # Why a policy instead of per-pod annotations: it is declarative (by # StorageClass, not volume name), and fail-safe — any volume that does NOT match diff --git a/k8s/providers/hetzner/infrastructure/controllers/velero/patches/helm-release-patch.yaml b/k8s/providers/hetzner/infrastructure/controllers/velero/patches/enable-csi-snapshots.yaml similarity index 100% rename from k8s/providers/hetzner/infrastructure/controllers/velero/patches/helm-release-patch.yaml rename to k8s/providers/hetzner/infrastructure/controllers/velero/patches/enable-csi-snapshots.yaml diff --git a/k8s/providers/hetzner/infrastructure/coroot/cluster.yaml b/k8s/providers/hetzner/infrastructure/coroot/cluster.yaml index f2a839eb2..f2b79b74f 100644 --- a/k8s/providers/hetzner/infrastructure/coroot/cluster.yaml +++ b/k8s/providers/hetzner/infrastructure/coroot/cluster.yaml @@ -9,7 +9,7 @@ # server at an external Postgres is therefore the ONLY supported way to run more # than one Coroot server pod — so this Cluster is what makes coroot-coroot HA. # spec.postgres + spec.replicas: 2 + the spread affinity live in -# patches/coroot-patch.yaml (this same hetzner overlay). +# patches/enable-ha.yaml (this same hetzner overlay). 
# # PROD-ONLY (lives in the hetzner overlay, not the shared base): the base Coroot # CR stays single-replica SQLite for local/CI, which have no CloudNativePG diff --git a/k8s/providers/hetzner/infrastructure/coroot/patches/coroot-patch.yaml b/k8s/providers/hetzner/infrastructure/coroot/patches/enable-ha.yaml similarity index 100% rename from k8s/providers/hetzner/infrastructure/coroot/patches/coroot-patch.yaml rename to k8s/providers/hetzner/infrastructure/coroot/patches/enable-ha.yaml diff --git a/k8s/providers/hetzner/infrastructure/kustomization.yaml b/k8s/providers/hetzner/infrastructure/kustomization.yaml index 71599f642..eaa989ba1 100644 --- a/k8s/providers/hetzner/infrastructure/kustomization.yaml +++ b/k8s/providers/hetzner/infrastructure/kustomization.yaml @@ -109,7 +109,7 @@ resources: # seeds apps/ascoachingogvaner/simply in OpenBao; keeping it in the tenant means # only the (isolated) tenant Kustomization is affected, never this `wait: true` # layer. The platform keeps only the shared Gateway HTTPS listener patch - # (ascoachingogvaner-dk-listeners-patch.yaml, below), which references the + # (patches/add-ascoachingogvaner-dk-listeners.yaml, below), which references the # tenant Secret cross-namespace and stays Ready (Gateway Programmed) regardless. components: # Platform-wide HelmRelease drift detection (mode: enabled) — covers the @@ -120,11 +120,11 @@ components: patches: # gateway-patch self-identifies (Gateway/platform/kube-system), so a path is # enough — no explicit target, matching the coroot patch below. - - path: gateway-patch.yaml + - path: patches/attach-hcloud-load-balancer.yaml # JSON6902 patches don't self-identify, so this one needs an explicit # target. Adds HTTPS listeners for the tenant-owned ascoachingogvaner.dk # domain — see the file header. 
- - path: ascoachingogvaner-dk-listeners-patch.yaml + - path: patches/add-ascoachingogvaner-dk-listeners.yaml target: group: gateway.networking.k8s.io kind: Gateway @@ -133,12 +133,12 @@ patches: # wiring. The CR it patches comes from ../../../bases/infrastructure/coroot/ # (the `infrastructure` layer); the strategic-merge patch matches on the # embedded coroot.com/v1 Coroot/coroot identity, so no explicit target. - - path: coroot/patches/coroot-patch.yaml + - path: coroot/patches/enable-ha.yaml # Prod-only: migrate the OpenBao raft-snapshot PVC from the default longhorn # StorageClass to hcloud so node-mobile snapshot Jobs can attach it reliably # (see the file header). Self-identifies via PersistentVolumeClaim/openbao/ # vault-snapshots, so no explicit target is needed. - - path: vault-snapshots-hcloud-patch.yaml + - path: patches/store-vault-snapshots-on-hcloud.yaml # Prod-only: add a generous memory `default` to the kube-system / kyverno # LimitRanges (which are CPU-only in the base). 
This seeds the memory limit the # vertical-pod-autoscalers/ VPAs scale ratio-preserving, so VPA right-sizes the memory diff --git a/k8s/providers/hetzner/infrastructure/ascoachingogvaner-dk-listeners-patch.yaml b/k8s/providers/hetzner/infrastructure/patches/add-ascoachingogvaner-dk-listeners.yaml similarity index 100% rename from k8s/providers/hetzner/infrastructure/ascoachingogvaner-dk-listeners-patch.yaml rename to k8s/providers/hetzner/infrastructure/patches/add-ascoachingogvaner-dk-listeners.yaml diff --git a/k8s/providers/hetzner/infrastructure/gateway-patch.yaml b/k8s/providers/hetzner/infrastructure/patches/attach-hcloud-load-balancer.yaml similarity index 100% rename from k8s/providers/hetzner/infrastructure/gateway-patch.yaml rename to k8s/providers/hetzner/infrastructure/patches/attach-hcloud-load-balancer.yaml diff --git a/k8s/providers/hetzner/infrastructure/vault-snapshots-hcloud-patch.yaml b/k8s/providers/hetzner/infrastructure/patches/store-vault-snapshots-on-hcloud.yaml similarity index 100% rename from k8s/providers/hetzner/infrastructure/vault-snapshots-hcloud-patch.yaml rename to k8s/providers/hetzner/infrastructure/patches/store-vault-snapshots-on-hcloud.yaml diff --git a/scripts/validate-naming.py b/scripts/validate-naming.py index 5f58c1cb5..034ab6a33 100644 --- a/scripts/validate-naming.py +++ b/scripts/validate-naming.py @@ -13,11 +13,23 @@ 4. Kustomize build files (kustomize.config.k8s.io) live only in kustomization.yaml. 5. In a component folder, a single-resource file's name leads with the kebab-cased Kind (.yaml or -.yaml). CR folders, patch - fragments (under patches/ or *-patch.yaml) and kustomization.yaml are exempt. + fragments (under patches/) and kustomization.yaml are exempt. 6. A folder that groups multiple instances of a single (non-workload) Kind is a CR folder and must be named the kebab-cased plural of that Kind (e.g. VerticalPodAutoscaler -> vertical-pod-autoscalers/). 
Organizational subfolders inside a known CR folder are exempt. + 7. Patch fragments live under a patches/ directory and never carry a + redundant -patch suffix (the directory already marks them): a -patch stem + inside patches/ is redundant, and one outside patches/ is a misplaced + fragment. + 8. Files under patches/ follow the CR-folder naming convention — an + intent-describing ‹verb›-‹purpose›.yaml — and must not lead with the + patched resource's Kind (a Flux Kustomization CR patch keeps the + flux-kustomization prefix per check 3). + +Talos machine-config patches (talos/, talos-local/) are exempt from all naming +and file-structure conventions; the walk covers k8s/ only, keeping them out of +scope by design. """ import os, re, sys @@ -78,7 +90,7 @@ def in_cr(r): return any(r == d or r.startswith(d + "/") for d in CR_DIR_PATHS) def is_patch(r): - return "/patches/" in r or os.path.basename(r).endswith("-patch.yaml") + return "/patches/" in r def docs_with_kind(path): """Return [(apiVersion, kind)] for every top-level document declaring a kind.""" @@ -98,6 +110,7 @@ def docs_with_kind(path): def main(): bad_dirs, multi, flux_bad, build_bad, kind_bad, cr_name_bad = [], [], [], [], [], [] + patch_suffix, patch_misplaced, patch_kind_bad = [], [], [] folder_kinds = {} # dirpath -> [kind, ...] 
for real single-resource files for dirpath, dirnames, filenames in os.walk(K8S): @@ -108,9 +121,12 @@ def main(): if not fn.endswith((".yaml", ".yml")): continue r = rel(os.path.join(dirpath, fn)) + stem = fn[:-9] if fn.endswith(".enc.yaml") else fn.rsplit(".", 1)[0] + if stem.endswith("-patch"): + (patch_suffix if is_patch(r) else patch_misplaced).append(r) docs = docs_with_kind(os.path.join(dirpath, fn)) if len(docs) == 0: - continue # patch fragment / non-resource + continue # JSON6902 patch fragment / non-resource if len(docs) > 1: if r not in ONE_RESOURCE_EXEMPT: multi.append((r, [k for _, k in docs])) @@ -118,7 +134,6 @@ def main(): api, kind = docs[0] if fn != "kustomization.yaml" and not is_patch(r): folder_kinds.setdefault(dirpath, []).append(kind) - stem = fn[:-9] if fn.endswith(".enc.yaml") else fn.rsplit(".", 1)[0] if kind == "Kustomization" and api.startswith("kustomize.toolkit.fluxcd.io"): if not fn.startswith("flux-kustomization"): flux_bad.append(r) @@ -127,9 +142,13 @@ def main(): if fn != "kustomization.yaml": build_bad.append(r) continue - if fn == "kustomization.yaml" or in_cr(r) or is_patch(r): + if fn == "kustomization.yaml" or in_cr(r): continue kb = kebab(kind) + if is_patch(r): + if stem == kb or stem.startswith(kb + "-"): + patch_kind_bad.append((r, kind, kb)) + continue if not (stem == kb or stem.startswith(kb + "-")): kind_bad.append((r, kind, kb)) @@ -157,6 +176,12 @@ def main(): lambda x: f"{x[0]} (kind {x[1]} -> expected {x[2]}.yaml or {x[2]}-.yaml)"), ("CR-grouping folder not named by Kind plural", cr_name_bad, lambda x: f"{x[0]} ({x[1]} grouping -> expected folder '{x[2]}/')"), + ("Patch fragments outside a patches/ directory", patch_misplaced, + lambda x: f"{x} (move into a patches/ folder and drop the -patch suffix)"), + ("Patch filenames with redundant -patch suffix", patch_suffix, + lambda x: f"{x} (the patches/ folder already marks it; name by intent)"), + ("Patch filename leads with the patched Kind instead of intent", 
patch_kind_bad, + lambda x: f"{x[0]} (kind {x[1]} -> name it -.yaml, not {x[2]}-*)"), ] problems = sum(len(items) for _, items, _ in groups) for title, items, fmt in groups: From be0625f183a53fdf37c1f7cd5c20e81768366621 Mon Sep 17 00:00:00 2001 From: Nikolai Emil Damm Date: Sat, 4 Jul 2026 22:08:23 +0200 Subject: [PATCH 2/2] refactor: apply intent naming and one-document-per-file to talos patches Talos machine-config patches now follow the same intent-naming convention as CR folders and k8s patches: -.yaml, one YAML document per file. The multi-document files are split (disk-encryption into its two VolumeConfigs; each role's ingress-firewall into one NetworkRuleConfig per file, preserving the consolidated rule COUNT and the ENOBUFS warning), all non-comment config lines are verified preserved doc-for-doc, and validate-naming.py check 9 enforces the rules in CI. ksail globs the talos*/ directories, so no config reference changes. Co-Authored-By: Claude Fable 5 --- AGENTS.md | 6 +- docs/dr/spire-server-ha.md | 2 +- docs/runtime-security.md | 6 +- docs/rwx-storage.md | 4 +- .../best-practices/verify-ksail-images.yaml | 2 +- .../image-verification.yaml | 2 +- .../restrict-storage-to-baseline-workers.yaml | 2 +- .../enforce-wireguard-strict-mode.yaml | 2 +- scripts/validate-naming.py | 53 ++++++- ...> disable-default-cni-and-kube-proxy.yaml} | 0 .../{oidc.yaml => enable-dex-oidc.yaml} | 0 ...auth.yaml => authenticate-ghcr-pulls.yaml} | 2 +- ...ult.yaml => block-ingress-by-default.yaml} | 0 ...> disable-default-cni-and-kube-proxy.yaml} | 0 .../{apparmor.yaml => enable-apparmor.yaml} | 0 ...logging.yaml => enable-audit-logging.yaml} | 0 .../{oidc.yaml => enable-dex-oidc.yaml} | 2 +- ...paces.yaml => enable-user-namespaces.yaml} | 0 ...ion.yaml => encrypt-ephemeral-volume.yaml} | 17 +-- talos/cluster/encrypt-state-volume.yaml | 21 +++ ...ubelet.yaml => evict-pods-before-oom.yaml} | 0 ...gc.yaml => gc-terminated-pods-sooner.yaml} | 2 +- ...ysctls.yaml => 
harden-kernel-sysctls.yaml} | 0 ...stname.yaml => use-platform-hostname.yaml} | 0 ...on.yaml => verify-first-party-images.yaml} | 2 +- .../allow-internal-node-ingress.yaml | 26 ++++ .../allow-internal-nodepod-ingress.yaml | 22 +++ .../allow-internal-udp-ingress.yaml | 20 +++ .../control-planes/allow-public-ingress.yaml | 27 ++++ talos/control-planes/ingress-firewall.yaml | 99 ------------- talos/workers/allow-apid-ingress.yaml | 15 ++ .../workers/allow-cilium-health-ingress.yaml | 11 ++ .../allow-cilium-mutual-auth-ingress.yaml | 22 +++ .../allow-cilium-wireguard-ingress.yaml | 14 ++ talos/workers/allow-cni-vxlan-ingress.yaml | 11 ++ talos/workers/allow-hubble-peer-ingress.yaml | 12 ++ talos/workers/allow-kubelet-ingress.yaml | 16 +++ .../workers/allow-node-exporter-ingress.yaml | 17 +++ talos/workers/allow-nodeport-ingress.yaml | 12 ++ talos/workers/ingress-firewall.yaml | 134 ------------------ .../{node-labels.yaml => label-nodes.yaml} | 0 .../{kubevirt.yaml => load-kvm-modules.yaml} | 0 ...longhorn.yaml => mount-longhorn-data.yaml} | 4 +- 43 files changed, 317 insertions(+), 270 deletions(-) rename talos-local/cluster/{cni.yaml => disable-default-cni-and-kube-proxy.yaml} (100%) rename talos-local/cluster/{oidc.yaml => enable-dex-oidc.yaml} (100%) rename talos/cluster/{registry-auth.yaml => authenticate-ghcr-pulls.yaml} (97%) rename talos/cluster/{ingress-firewall-default.yaml => block-ingress-by-default.yaml} (100%) rename talos/cluster/{cni.yaml => disable-default-cni-and-kube-proxy.yaml} (100%) rename talos/cluster/{apparmor.yaml => enable-apparmor.yaml} (100%) rename talos/cluster/{audit-logging.yaml => enable-audit-logging.yaml} (100%) rename talos/cluster/{oidc.yaml => enable-dex-oidc.yaml} (94%) rename talos/cluster/{user-namespaces.yaml => enable-user-namespaces.yaml} (100%) rename talos/cluster/{disk-encryption.yaml => encrypt-ephemeral-volume.yaml} (68%) create mode 100644 talos/cluster/encrypt-state-volume.yaml rename talos/cluster/{kubelet.yaml => 
evict-pods-before-oom.yaml} (100%) rename talos/cluster/{terminated-pod-gc.yaml => gc-terminated-pods-sooner.yaml} (95%) rename talos/cluster/{sysctls.yaml => harden-kernel-sysctls.yaml} (100%) rename talos/cluster/{hostname.yaml => use-platform-hostname.yaml} (100%) rename talos/cluster/{image-verification.yaml => verify-first-party-images.yaml} (98%) create mode 100644 talos/control-planes/allow-internal-node-ingress.yaml create mode 100644 talos/control-planes/allow-internal-nodepod-ingress.yaml create mode 100644 talos/control-planes/allow-internal-udp-ingress.yaml create mode 100644 talos/control-planes/allow-public-ingress.yaml delete mode 100644 talos/control-planes/ingress-firewall.yaml create mode 100644 talos/workers/allow-apid-ingress.yaml create mode 100644 talos/workers/allow-cilium-health-ingress.yaml create mode 100644 talos/workers/allow-cilium-mutual-auth-ingress.yaml create mode 100644 talos/workers/allow-cilium-wireguard-ingress.yaml create mode 100644 talos/workers/allow-cni-vxlan-ingress.yaml create mode 100644 talos/workers/allow-hubble-peer-ingress.yaml create mode 100644 talos/workers/allow-kubelet-ingress.yaml create mode 100644 talos/workers/allow-node-exporter-ingress.yaml create mode 100644 talos/workers/allow-nodeport-ingress.yaml delete mode 100644 talos/workers/ingress-firewall.yaml rename talos/workers/{node-labels.yaml => label-nodes.yaml} (100%) rename talos/workers/{kubevirt.yaml => load-kvm-modules.yaml} (100%) rename talos/workers/{longhorn.yaml => mount-longhorn-data.yaml} (89%) diff --git a/AGENTS.md b/AGENTS.md index 3f9b232ef..3fa39e494 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -206,7 +206,7 @@ You **cannot** decrypt existing secrets without the proper Age keys. For local d - **Never commit plaintext secrets** — all secrets must be SOPS-encrypted with the `.enc.yaml` suffix. - **Base files are immutable** — use Kustomize `patches:` in overlays; never edit `k8s/bases/` directly from a provider or cluster overlay. 
- **Flux dependency order** — `bootstrap` → `infrastructure-controllers` → `infrastructure` → `apps`. One prod-only side layer hangs off `infrastructure` without gating `apps`: `infrastructure-overprovisioning` (apply-only autoscaler buffer). Declarative GitHub org management runs as a normal **app** (`github-config`) consuming the `devantler-tech/.github` artifact, with its Crossplane provider in the `infrastructure` layer — see [`docs/github-management.md`](docs/github-management.md). -- **File & directory naming** — kebab-case folders, one resource per file, and filenames led by the resource Kind (CR folders and `patches/` excepted — both name files by intent); Talos machine-config patches (`talos/`, `talos-local/`) are fully exempt. Enforced by the `naming` CI job. See [File and Directory Naming Conventions](#file-and-directory-naming-conventions) below. +- **File & directory naming** — kebab-case folders, one resource per file, and filenames led by the resource Kind (CR folders and `patches/` excepted — both name files by intent). Talos machine-config patches (`talos/`, `talos-local/`) also hold one document per file with intent names; only the k8s-manifest-specific rules don't apply to them. Enforced by the `naming` CI job. See [File and Directory Naming Conventions](#file-and-directory-naming-conventions) below. ### File and Directory Naming Conventions @@ -218,7 +218,7 @@ Enforced in CI by [`scripts/validate-naming.py`](scripts/validate-naming.py) (th - **CR-folder files** omit the folder-implied Kind and are named `‹verb›-‹purpose›.yaml` (e.g. `restrict-tenant-secret-stores.yaml`). - A **Flux `Kustomization` CR** (`kustomize.toolkit.fluxcd.io`) is named `flux-kustomization*.yaml`; the `flux-` prefix disambiguates it from the kustomize **build** file, which must stay exactly `kustomization.yaml` (`kustomize.config.k8s.io`). - **Patch fragments** are overlay inputs, not deployed resources. 
They live under a `patches/` directory (a `*-patch.yaml` loose next to a kustomization is flagged as misplaced) and follow the **CR-folder naming convention**: an intent-describing `‹verb›-‹purpose›.yaml` (e.g. `enable-oidc.yaml`, `store-spire-data-on-hcloud.yaml`) that neither leads with the patched Kind nor carries a `-patch` suffix — the folder already says it's a patch. One-resource-per-file applies to them too; a patch on a Flux `Kustomization` CR keeps the `flux-kustomization` prefix (e.g. `flux-kustomization-protect-wedding-db.yaml`). -- **Talos machine-config patches** (`talos/`, `talos-local/`) are **exempt from all naming and file-structure conventions** — they are Talos config fragments, not Kubernetes manifests, and keep upstream Talos' file style. +- **Talos machine-config patches** (`talos/`, `talos-local/`) follow the same spirit: **one YAML document per file** and intent-describing `‹verb›-‹purpose›.yaml` names (e.g. `enable-apparmor.yaml`, `block-ingress-by-default.yaml`, `allow-kubelet-ingress.yaml`). They are Talos config fragments, not Kubernetes manifests, so the k8s-specific rules — Kind-led filenames, `patches/` placement, the `flux-kustomization` prefix — are the only parts that don't apply. Ingress-firewall rule files stay **one `NetworkRuleConfig` per file**, but keep the rule *count* low by consolidating ports into an existing rule when protocol + subnets match (see the ENOBUFS note in `talos/control-planes/allow-public-ingress.yaml`). ### Infrastructure File Structure Convention @@ -256,7 +256,7 @@ The platform uses a hierarchical kustomization structure: **base** configuration - **Workaround:** fork the repository and use your own Age keys; re-encrypt every `*.enc.yaml` with your key. ### CNI Configuration -- The Talos cluster starts with its default CNI disabled (via `talos-local/cluster/cni.yaml`). +- The Talos cluster starts with its default CNI disabled (via `talos-local/cluster/disable-default-cni-and-kube-proxy.yaml`). 
- Nodes stay `NotReady` until Cilium is installed by KSail. - This is expected — KSail handles CNI installation automatically. diff --git a/docs/dr/spire-server-ha.md b/docs/dr/spire-server-ha.md index 85d7b136c..acc34e5cc 100644 --- a/docs/dr/spire-server-ha.md +++ b/docs/dr/spire-server-ha.md @@ -125,7 +125,7 @@ SVIDs, or it deadlocks. Options, hardest constraint first: single most important safety prerequisite and **must land and be verified before any replica/datastore change.** (It is purely additive — safe to ship ahead.) - **Talos node firewall** already allows the SPIRE mesh-auth port 4250 - node-to-node (`talos/{workers,control-planes}/ingress-firewall.yaml`). Postgres + node-to-node (`talos/workers/allow-cilium-mutual-auth-ingress.yaml`, `talos/control-planes/allow-internal-node-ingress.yaml`). Postgres :5432 between nodes is intra-cluster pod traffic over the CNI, not a host port, so no Talos firewall change is expected — **verify** spire-db instances and spire-server can co-locate or cross nodes without a host-firewall drop. 
diff --git a/docs/runtime-security.md b/docs/runtime-security.md index c3ee5ffea..1084bd14b 100644 --- a/docs/runtime-security.md +++ b/docs/runtime-security.md @@ -24,13 +24,13 @@ they sit inside a wider set of controls: | Layer | Control | What it does at runtime | | --- | --- | --- | -| Kernel LSM | **AppArmor** ([`talos/cluster/apparmor.yaml`](../talos/cluster/apparmor.yaml)) | Confines container processes to a profile; default-deny for unexpected file/cap access | +| Kernel LSM | **AppArmor** ([`talos/cluster/enable-apparmor.yaml`](../talos/cluster/enable-apparmor.yaml)) | Confines container processes to a profile; default-deny for unexpected file/cap access | | Syscall filter | **seccomp `RuntimeDefault`** (mutated + enforced by [Kyverno](../k8s/bases/infrastructure/cluster-policies/best-practices/validate-pod-security.yaml)) | Blocks the dangerous-syscall tail every container gets by default | -| Kernel hardening | **sysctls** ([`talos/cluster/sysctls.yaml`](../talos/cluster/sysctls.yaml)) | `kptr_restrict`, `ptrace_scope`, unprivileged-eBPF off, etc. — shrinks the local-privesc surface | +| Kernel hardening | **sysctls** ([`talos/cluster/harden-kernel-sysctls.yaml`](../talos/cluster/harden-kernel-sysctls.yaml)) | `kptr_restrict`, `ptrace_scope`, unprivileged-eBPF off, etc. 
— shrinks the local-privesc surface | | Network | **Cilium + Hubble** | L3–L7 flow visibility and default-deny [CiliumNetworkPolicy](../k8s/bases/infrastructure/cluster-policies/best-practices/add-default-deny.yaml) per namespace | | Runtime detection | **Kubescape node-agent** | Learned-behaviour anomaly detection, correlated with config/CVE/compliance posture | | Runtime enforcement | **Tetragon** | Declarative kernel-hook policies that **terminate the offending process** (SIGKILL) on a policy match | -| Forensics | **API audit log** ([`talos/cluster/audit-logging.yaml`](../talos/cluster/audit-logging.yaml)) | Who-did-what record of control-plane mutations | +| Forensics | **API audit log** ([`talos/cluster/enable-audit-logging.yaml`](../talos/cluster/enable-audit-logging.yaml)) | Who-did-what record of control-plane mutations | This document focuses on the two middle-to-bottom rows — the eBPF sensors. diff --git a/docs/rwx-storage.md b/docs/rwx-storage.md index 149329d60..544a6b96d 100644 --- a/docs/rwx-storage.md +++ b/docs/rwx-storage.md @@ -70,11 +70,11 @@ hcloud volume create \ --server ``` -The Talos machine config patch (`talos/workers/longhorn.yaml`) handles mounting `/dev/sdb` at `/var/lib/longhorn`. +The Talos machine config patch (`talos/workers/mount-longhorn-data.yaml`) handles mounting `/dev/sdb` at `/var/lib/longhorn`. > **Verify the device path** after attaching: on Hetzner Cloud, the first attached volume > consistently appears as `/dev/sdb`. Confirm with `talosctl disks --nodes `. -> If the volume shows a different path, update `talos/workers/longhorn.yaml` accordingly. +> If the volume shows a different path, update `talos/workers/mount-longhorn-data.yaml` accordingly. 
## StorageClasses diff --git a/k8s/bases/infrastructure/cluster-policies/best-practices/verify-ksail-images.yaml b/k8s/bases/infrastructure/cluster-policies/best-practices/verify-ksail-images.yaml index 1ccb9f6b1..30dc7be26 100644 --- a/k8s/bases/infrastructure/cluster-policies/best-practices/verify-ksail-images.yaml +++ b/k8s/bases/infrastructure/cluster-policies/best-practices/verify-ksail-images.yaml @@ -3,7 +3,7 @@ # (ghcr.io/devantler-tech/*) — the third verification layer, complementing: # 1. Flux OCI *manifest* verification (apps' oci-repository.yaml verify.provider: # cosign) — gates the GitOps artifacts, not container images. -# 2. Talos ImageVerificationConfig (talos/cluster/image-verification.yaml) +# 2. Talos ImageVerificationConfig (talos/cluster/verify-first-party-images.yaml) # — gates the image bytes containerd pulls, with the SAME two signing # identities as below. Keep both files in sync when identities change. # This layer rejects a Pod spec referencing an unsigned/tampered first-party diff --git a/k8s/bases/infrastructure/cluster-security-exceptions/image-verification.yaml b/k8s/bases/infrastructure/cluster-security-exceptions/image-verification.yaml index 27c3e2438..efabc585b 100644 --- a/k8s/bases/infrastructure/cluster-security-exceptions/image-verification.yaml +++ b/k8s/bases/infrastructure/cluster-security-exceptions/image-verification.yaml @@ -3,7 +3,7 @@ # First-party images (ghcr.io/devantler-tech/*) ARE cosign-signed and verified # — by Flux on the OCI manifest artifacts (apps' oci-repository.yaml verify.provider: # cosign) and, at the node pull layer, by Talos ImageVerificationConfig -# (talos/cluster/image-verification.yaml). But third-party chart images +# (talos/cluster/verify-first-party-images.yaml). 
But third-party chart images # (Cilium, Longhorn, Coroot, registry.k8s.io, …) are not all signed, and there # is no admission-layer signature enforcement, so this cluster-wide control — # which scans every workload image ref — remains exempted. diff --git a/k8s/providers/hetzner/infrastructure/cluster-policies/restrict-storage-to-baseline-workers.yaml b/k8s/providers/hetzner/infrastructure/cluster-policies/restrict-storage-to-baseline-workers.yaml index cc5e5f581..834b49848 100644 --- a/k8s/providers/hetzner/infrastructure/cluster-policies/restrict-storage-to-baseline-workers.yaml +++ b/k8s/providers/hetzner/infrastructure/cluster-policies/restrict-storage-to-baseline-workers.yaml @@ -4,7 +4,7 @@ # nodes (autoscaler nodes are compute-only)"). # # WHY THIS EXISTS: autoscaler nodes boot from the SAME Talos worker config as -# the static workers (talos/workers/node-labels.yaml), so they inherit the +# the static workers (talos/workers/label-nodes.yaml), so they inherit the # node.longhorn.io/create-default-disk=true kubelet label. With # createDefaultDiskOnLabeledNodesOnly Longhorn then treats every autoscale # node as a storage node: it creates a default disk, and replicaAutoBalance: diff --git a/k8s/providers/hetzner/infrastructure/controllers/cilium/patches/enforce-wireguard-strict-mode.yaml b/k8s/providers/hetzner/infrastructure/controllers/cilium/patches/enforce-wireguard-strict-mode.yaml index fe9222d33..45bbee164 100644 --- a/k8s/providers/hetzner/infrastructure/controllers/cilium/patches/enforce-wireguard-strict-mode.yaml +++ b/k8s/providers/hetzner/infrastructure/controllers/cilium/patches/enforce-wireguard-strict-mode.yaml @@ -81,7 +81,7 @@ spec: # connection (cilium-agent -> peer cilium-agent on 4250) dials the peer's # PRIMARY (public) IP. 
The Talos ingress firewall only admits 4240/4250/8472 # from the private node CIDR (10.0.0.0/16, see - # talos/workers/ingress-firewall.yaml), so the public-sourced 4250 handshake + # talos/workers/allow-cilium-mutual-auth-ingress.yaml), so the public-sourced 4250 handshake # is dropped (`dial :4250: i/o timeout`) and Cilium black-holes every # cross-node pod flow the hetzner require-mutual-auth policy marks # `authentication.mode: required` once its auth-cache entry lapses. Observed diff --git a/scripts/validate-naming.py b/scripts/validate-naming.py index 034ab6a33..e94d97f74 100644 --- a/scripts/validate-naming.py +++ b/scripts/validate-naming.py @@ -26,10 +26,11 @@ intent-describing ‹verb›-‹purpose›.yaml — and must not lead with the patched resource's Kind (a Flux Kustomization CR patch keeps the flux-kustomization prefix per check 3). - -Talos machine-config patches (talos/, talos-local/) are exempt from all naming -and file-structure conventions; the walk covers k8s/ only, keeping them out of -scope by design. + 9. Talos machine-config patches (talos*/ at the repo root) hold ONE YAML + document per file, in kebab-case, intent-describing ‹verb›-‹purpose›.yaml + files (no -patch suffix, not led by a document kind). The k8s-specific + rules (Kind-led filenames, patches/ placement, flux-kustomization prefix) + do not apply to them. 
""" import os, re, sys @@ -108,6 +109,15 @@ def docs_with_kind(path): out.append((api or "", kind)) return out +def count_docs(path): + """Count YAML documents with any non-comment content (kind-less included).""" + with open(path, encoding="utf-8", errors="replace") as f: + text = f.read() + return sum( + 1 for chunk in re.split(r"(?m)^---[ \t]*$", text) + if any(l.strip() and not l.lstrip().startswith("#") for l in chunk.splitlines()) + ) + def main(): bad_dirs, multi, flux_bad, build_bad, kind_bad, cr_name_bad = [], [], [], [], [], [] patch_suffix, patch_misplaced, patch_kind_bad = [], [], [] @@ -152,6 +162,35 @@ def main(): if not (stem == kb or stem.startswith(kb + "-")): kind_bad.append((r, kind, kb)) + # Check 9: Talos machine-config patch dirs (talos*/ at the repo root) — + # kebab-case names, one YAML document per file, intent naming (no -patch + # suffix, not led by a document kind). + talos_multi, talos_kind_bad = [], [] + for talos_dir in sorted(d for d in os.listdir(ROOT) + if d.startswith("talos") and os.path.isdir(os.path.join(ROOT, d))): + for dirpath, dirnames, filenames in os.walk(os.path.join(ROOT, talos_dir)): + for dn in dirnames: + if not KEBAB.match(dn): + bad_dirs.append(rel(os.path.join(dirpath, dn))) + for fn in filenames: + if not fn.endswith((".yaml", ".yml")): + continue + path = os.path.join(dirpath, fn) + r = rel(path) + stem = fn.rsplit(".", 1)[0] + if not KEBAB.match(stem): + bad_dirs.append(r) + if stem.endswith("-patch"): + patch_suffix.append(r) + if count_docs(path) > 1: + talos_multi.append((r, count_docs(path))) + continue + docs = docs_with_kind(path) + if len(docs) == 1: + kb = kebab(docs[0][1]) + if stem == kb or stem.startswith(kb + "-"): + talos_kind_bad.append((r, docs[0][1], kb)) + # Check 6: a folder grouping >=2 instances of one non-workload Kind is a CR # folder and must be named the kebab-cased plural of that Kind. 
for folder, kinds in folder_kinds.items(): @@ -168,7 +207,7 @@ def main(): cr_name_bad.append((rfolder, kind, expected)) groups = [ - ("Directories not kebab-case", bad_dirs, lambda x: x), + ("Directories or filenames not kebab-case", bad_dirs, lambda x: x), ("Files with more than one resource", multi, lambda x: f"{x[0]} -> {x[1]}"), ("Flux Kustomization CRs not named flux-kustomization*.yaml", flux_bad, lambda x: x), ("Kustomize build files not named kustomization.yaml", build_bad, lambda x: x), @@ -182,6 +221,10 @@ def main(): lambda x: f"{x} (the patches/ folder already marks it; name by intent)"), ("Patch filename leads with the patched Kind instead of intent", patch_kind_bad, lambda x: f"{x[0]} (kind {x[1]} -> name it -.yaml, not {x[2]}-*)"), + ("Talos patch files with more than one YAML document", talos_multi, + lambda x: f"{x[0]} ({x[1]} documents -> split, one per file)"), + ("Talos patch filename leads with the document kind instead of intent", talos_kind_bad, + lambda x: f"{x[0]} (kind {x[1]} -> name it -.yaml, not {x[2]}-*)"), ] problems = sum(len(items) for _, items, _ in groups) for title, items, fmt in groups: diff --git a/talos-local/cluster/cni.yaml b/talos-local/cluster/disable-default-cni-and-kube-proxy.yaml similarity index 100% rename from talos-local/cluster/cni.yaml rename to talos-local/cluster/disable-default-cni-and-kube-proxy.yaml diff --git a/talos-local/cluster/oidc.yaml b/talos-local/cluster/enable-dex-oidc.yaml similarity index 100% rename from talos-local/cluster/oidc.yaml rename to talos-local/cluster/enable-dex-oidc.yaml diff --git a/talos/cluster/registry-auth.yaml b/talos/cluster/authenticate-ghcr-pulls.yaml similarity index 97% rename from talos/cluster/registry-auth.yaml rename to talos/cluster/authenticate-ghcr-pulls.yaml index 3bc0ae1b7..da40e8f0f 100644 --- a/talos/cluster/registry-auth.yaml +++ b/talos/cluster/authenticate-ghcr-pulls.yaml @@ -1,6 +1,6 @@ # Node-level ghcr.io registry auth for first-party image-signature 
verification. # -# talos/cluster/image-verification.yaml (ImageVerificationConfig) verifies the +# talos/cluster/verify-first-party-images.yaml (ImageVerificationConfig) verifies the # cosign signature of every ghcr.io/devantler-tech/* image at the containerd # PULL layer. That signature is a separate artifact stored in the SAME package # as the image, and Talos fetches it using THIS machine.registries config — diff --git a/talos/cluster/ingress-firewall-default.yaml b/talos/cluster/block-ingress-by-default.yaml similarity index 100% rename from talos/cluster/ingress-firewall-default.yaml rename to talos/cluster/block-ingress-by-default.yaml diff --git a/talos/cluster/cni.yaml b/talos/cluster/disable-default-cni-and-kube-proxy.yaml similarity index 100% rename from talos/cluster/cni.yaml rename to talos/cluster/disable-default-cni-and-kube-proxy.yaml diff --git a/talos/cluster/apparmor.yaml b/talos/cluster/enable-apparmor.yaml similarity index 100% rename from talos/cluster/apparmor.yaml rename to talos/cluster/enable-apparmor.yaml diff --git a/talos/cluster/audit-logging.yaml b/talos/cluster/enable-audit-logging.yaml similarity index 100% rename from talos/cluster/audit-logging.yaml rename to talos/cluster/enable-audit-logging.yaml diff --git a/talos/cluster/oidc.yaml b/talos/cluster/enable-dex-oidc.yaml similarity index 94% rename from talos/cluster/oidc.yaml rename to talos/cluster/enable-dex-oidc.yaml index 8b14b4509..6c5df200a 100644 --- a/talos/cluster/oidc.yaml +++ b/talos/cluster/enable-dex-oidc.yaml @@ -10,7 +10,7 @@ # NOTE: This file lives in the talos/ directory used by ksail.prod.yaml. # The issuer URL below targets production. If a dev environment is added # in the future (e.g. ksail.dev.yaml), create a separate talos-dev/ -# directory with a dev-specific oidc.yaml (issuer URL: +# directory with a dev-specific enable-dex-oidc.yaml (issuer URL: # https://dex.dev.platform.devantler.tech) and point the dev config's # distributionConfig to talos-dev. 
cluster: diff --git a/talos/cluster/user-namespaces.yaml b/talos/cluster/enable-user-namespaces.yaml similarity index 100% rename from talos/cluster/user-namespaces.yaml rename to talos/cluster/enable-user-namespaces.yaml diff --git a/talos/cluster/disk-encryption.yaml b/talos/cluster/encrypt-ephemeral-volume.yaml similarity index 68% rename from talos/cluster/disk-encryption.yaml rename to talos/cluster/encrypt-ephemeral-volume.yaml index 8fa558676..00b7326d1 100644 --- a/talos/cluster/disk-encryption.yaml +++ b/talos/cluster/encrypt-ephemeral-volume.yaml @@ -1,10 +1,10 @@ -# Encrypt STATE and EPHEMERAL partitions with nodeID-derived keys (LUKS2). +# Encrypt the EPHEMERAL partition with a nodeID-derived key (LUKS2). # # nodeID uses the Hetzner VM UUID to derive the encryption key, protecting -# against drive recovery/reuse (e.g. recycled cloud volumes, stolen disks). -# This is the strongest option available on Hetzner Cloud (no TPM/SecureBoot). +# against drive recovery/reuse (e.g. recycled cloud volumes, stolen disks) — +# same scheme as encrypt-state-volume.yaml. # -# EPHEMERAL uses lockToState so its data is irrecoverable if STATE is wiped, +# lockToState makes EPHEMERAL's data irrecoverable if STATE is wiped, # preventing orphaned workload data from surviving a node reset. # # ⚠️ DESTRUCTIVE: Only takes effect on empty/unformatted partitions. 
@@ -14,15 +14,6 @@ # Reference: https://docs.siderolabs.com/talos/v1.12/configure-your-talos-cluster/storage-and-disk-management/disk-encryption apiVersion: v1alpha1 kind: VolumeConfig -name: STATE -encryption: - provider: luks2 - keys: - - nodeID: {} - slot: 0 ---- -apiVersion: v1alpha1 -kind: VolumeConfig name: EPHEMERAL encryption: provider: luks2 diff --git a/talos/cluster/encrypt-state-volume.yaml b/talos/cluster/encrypt-state-volume.yaml new file mode 100644 index 000000000..bb5d4b35b --- /dev/null +++ b/talos/cluster/encrypt-state-volume.yaml @@ -0,0 +1,21 @@ +# Encrypt the STATE partition with a nodeID-derived key (LUKS2). +# +# nodeID uses the Hetzner VM UUID to derive the encryption key, protecting +# against drive recovery/reuse (e.g. recycled cloud volumes, stolen disks). +# This is the strongest option available on Hetzner Cloud (no TPM/SecureBoot). +# The EPHEMERAL partition is encrypted the same way in +# encrypt-ephemeral-volume.yaml (lockToState ties it to this partition). +# +# ⚠️ DESTRUCTIVE: Only takes effect on empty/unformatted partitions. 
+# Existing clusters require a rolling rebuild — see: +# https://docs.siderolabs.com/talos/v1.12/configure-your-talos-cluster/storage-and-disk-management/disk-encryption#going-from-unencrypted-to-encrypted-and-vice-versa +# +# Reference: https://docs.siderolabs.com/talos/v1.12/configure-your-talos-cluster/storage-and-disk-management/disk-encryption +apiVersion: v1alpha1 +kind: VolumeConfig +name: STATE +encryption: + provider: luks2 + keys: + - nodeID: {} + slot: 0 diff --git a/talos/cluster/kubelet.yaml b/talos/cluster/evict-pods-before-oom.yaml similarity index 100% rename from talos/cluster/kubelet.yaml rename to talos/cluster/evict-pods-before-oom.yaml diff --git a/talos/cluster/terminated-pod-gc.yaml b/talos/cluster/gc-terminated-pods-sooner.yaml similarity index 95% rename from talos/cluster/terminated-pod-gc.yaml rename to talos/cluster/gc-terminated-pods-sooner.yaml index 2dace5d68..0cc5393e7 100644 --- a/talos/cluster/terminated-pod-gc.yaml +++ b/talos/cluster/gc-terminated-pods-sooner.yaml @@ -19,7 +19,7 @@ # # cluster.* config only takes effect on control-plane nodes (where # kube-controller-manager runs), so this lives in talos/cluster/ alongside the -# other control-plane component tuning (see audit-logging.yaml). +# other control-plane component tuning (see enable-audit-logging.yaml). 
# # Reference: https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle/#pod-garbage-collection cluster: diff --git a/talos/cluster/sysctls.yaml b/talos/cluster/harden-kernel-sysctls.yaml similarity index 100% rename from talos/cluster/sysctls.yaml rename to talos/cluster/harden-kernel-sysctls.yaml diff --git a/talos/cluster/hostname.yaml b/talos/cluster/use-platform-hostname.yaml similarity index 100% rename from talos/cluster/hostname.yaml rename to talos/cluster/use-platform-hostname.yaml diff --git a/talos/cluster/image-verification.yaml b/talos/cluster/verify-first-party-images.yaml similarity index 98% rename from talos/cluster/image-verification.yaml rename to talos/cluster/verify-first-party-images.yaml index c3e39bf8c..ca6c55d3f 100644 --- a/talos/cluster/image-verification.yaml +++ b/talos/cluster/verify-first-party-images.yaml @@ -33,7 +33,7 @@ # omitted for now and can be added later once the first-party rules are proven. # # This does NOT satisfy kubescape C-0237 (see -# k8s/bases/infrastructure/security-exceptions/image-verification.yaml): that +# k8s/bases/infrastructure/cluster-security-exceptions/image-verification.yaml): that # control scans workload image refs at the cluster layer, cannot observe Talos # node-level verification, and still fails while unsigned third-party images run. # diff --git a/talos/control-planes/allow-internal-node-ingress.yaml b/talos/control-planes/allow-internal-node-ingress.yaml new file mode 100644 index 000000000..26ff5f286 --- /dev/null +++ b/talos/control-planes/allow-internal-node-ingress.yaml @@ -0,0 +1,26 @@ +# Cluster-internal TCP reachable only from the node CIDR, on control-plane +# nodes. Pairs with cluster/block-ingress-by-default.yaml.
+# etcd 2379-2380, cilium-health 4240, cilium-mutual-auth 4250, +# trustd 50001, NodePort range 30000-32767 +# cilium-mutual-auth 4250: cilium-agent agent-to-agent mTLS handshake for flows +# the require-mutual-auth CiliumClusterwideNetworkPolicy marks +# authentication.mode: required; the source is the peer node's host-network +# cilium-agent, i.e. the node CIDR. Without it the cross-node handshake times +# out and Cilium black-holes the authenticated flow cluster-wide. +# +# Keep the rule COUNT low — add ports to this rule when the subnet set +# matches, don't add new rules (see the ENOBUFS note in +# allow-public-ingress.yaml). +apiVersion: v1alpha1 +kind: NetworkRuleConfig +name: control-plane-internal-node-ingress +portSelector: + ports: + - 2379-2380 + - 4240 + - 4250 + - 30000-32767 + - 50001 + protocol: tcp +ingress: + - subnet: 10.0.0.0/16 diff --git a/talos/control-planes/allow-internal-nodepod-ingress.yaml b/talos/control-planes/allow-internal-nodepod-ingress.yaml new file mode 100644 index 000000000..0dede6967 --- /dev/null +++ b/talos/control-planes/allow-internal-nodepod-ingress.yaml @@ -0,0 +1,22 @@ +# Cluster-internal TCP reachable from the node CIDR AND the pod CIDR on +# control-plane nodes. Pairs with cluster/block-ingress-by-default.yaml. +# The node CIDR covers cross-node scrapes (Cilium masquerades pod->node-IP to +# the source node IP); same-node scrapes are NOT masqueraded, so kubelet / +# node-exporter see the raw pod IP and the pod CIDR must be allowed too. +# kubelet 10250, hubble-peer 4244, node-exporter 9100 +# +# Keep the rule COUNT low — add ports to this rule when the subnet set +# matches, don't add new rules (see the ENOBUFS note in +# allow-public-ingress.yaml). 
+apiVersion: v1alpha1 +kind: NetworkRuleConfig +name: control-plane-internal-nodepod-ingress +portSelector: + ports: + - 4244 + - 9100 + - 10250 + protocol: tcp +ingress: + - subnet: 10.0.0.0/16 + - subnet: 10.244.0.0/16 diff --git a/talos/control-planes/allow-internal-udp-ingress.yaml b/talos/control-planes/allow-internal-udp-ingress.yaml new file mode 100644 index 000000000..807793658 --- /dev/null +++ b/talos/control-planes/allow-internal-udp-ingress.yaml @@ -0,0 +1,20 @@ +# Cluster-internal UDP reachable only from the node CIDR, on control-plane +# nodes. Pairs with cluster/block-ingress-by-default.yaml. +# Cilium VXLAN 8472, Cilium WireGuard 51871 +# WireGuard transparent encryption (encryption.type: wireguard) tunnels +# VXLAN-encapsulated pod traffic between nodes through cilium_wg0; the default +# block would otherwise drop inter-node WireGuard and break pod-to-pod traffic. +# +# Keep the rule COUNT low — add ports to this rule when the subnet set +# matches, don't add new rules (see the ENOBUFS note in +# allow-public-ingress.yaml). +apiVersion: v1alpha1 +kind: NetworkRuleConfig +name: control-plane-internal-udp-ingress +portSelector: + ports: + - 8472 + - 51871 + protocol: udp +ingress: + - subnet: 10.0.0.0/16 diff --git a/talos/control-planes/allow-public-ingress.yaml b/talos/control-planes/allow-public-ingress.yaml new file mode 100644 index 000000000..07191fd45 --- /dev/null +++ b/talos/control-planes/allow-public-ingress.yaml @@ -0,0 +1,27 @@ +# World-reachable management endpoints on control-plane nodes: +# Kubernetes API (6443) + apid (50000). Pairs with +# cluster/block-ingress-by-default.yaml (NetworkDefaultActionConfig: block). +# +# Rules are CONSOLIDATED by protocol + subnet-set (all ports in one rule) on +# purpose — ADD PORTS TO THE MATCHING RULE, DON'T ADD NEW RULES.
Talos +# programs rules through a bare, un-tuned netlink conn (google/nftables +# v0.3.0), and a large rule count makes conn.Flush() fail with ENOBUFS; the +# NfTablesChainController then loops forever and the node hangs in STAGE +# "Booting" — apid/etcd/kubelet never start. Control planes carry the most +# rules and hit the threshold first: this bricked prod-control-plane-1 during +# the Talos v1.13.4 in-place upgrade. See google/nftables#103 and #235. +# +# Cluster network CIDR: 10.0.0.0/16 (from ksail.prod.yaml networkCidr) +# Pod CIDR: 10.244.0.0/16 +# Reference: https://docs.siderolabs.com/talos/v1.12/networking/ingress-firewall +apiVersion: v1alpha1 +kind: NetworkRuleConfig +name: control-plane-public-ingress +portSelector: + ports: + - 6443 + - 50000 + protocol: tcp +ingress: + - subnet: 0.0.0.0/0 + - subnet: ::/0 diff --git a/talos/control-planes/ingress-firewall.yaml b/talos/control-planes/ingress-firewall.yaml deleted file mode 100644 index 7395c71c3..000000000 --- a/talos/control-planes/ingress-firewall.yaml +++ /dev/null @@ -1,99 +0,0 @@ -# Ingress firewall rules for control-plane nodes. -# Requires NetworkDefaultActionConfig (ingress: block) from cluster/. -# -# Rules are CONSOLIDATED by ingress subnet — one NetworkRuleConfig per -# protocol + subnet-set, listing all its ports — instead of one rule per port. -# This keeps the rendered nftables ruleset small on purpose: Talos's -# NfTablesChainController programs these through a bare, un-tuned netlink conn -# (google/nftables v0.3.0), and a large rule count makes conn.Flush() fail with -# "netlink receive: recvmsg: no buffer space available" (ENOBUFS). When that -# happens the controller loops forever and the node hangs in STAGE "Booting" — -# apid/etcd/kubelet never start. 
Control-plane nodes carry the most rules -# (6443 + etcd 2379-2380 + trustd 50001 on top of the worker set), so they hit -# the threshold first: this bricked prod-control-plane-1 during the Talos -# v1.13.4 in-place upgrade (workers, with fewer rules, upgraded fine). The bug -# is latent on 1.13.3 too — 1.13.4 only tips it over via higher early-boot -# memory pressure. See google/nftables#103 and #235; the real upstream fix is -# Talos tuning the conn (SetReadBuffer / NoENOBUFS). -# -# Effective access is IDENTICAL to the previous per-port form — only the rule -# COUNT changes (12 -> 4). Port map (unchanged): -# world (ext. mgmt): apid 50000, kube-api 6443 -# node CIDR + pod CIDR: kubelet 10250, hubble-peer 4244, node-exporter 9100 -# node CIDR only (tcp): etcd 2379-2380, cilium-health 4240, -# cilium-mutual-auth 4250, trustd 50001, -# NodePort 30000-32767 -# node CIDR only (udp): cilium-vxlan 8472, cilium-wireguard 51871 -# -# Cluster network CIDR: 10.0.0.0/16 (from ksail.prod.yaml networkCidr) -# Pod CIDR: 10.244.0.0/16 -# Reference: https://docs.siderolabs.com/talos/v1.12/networking/ingress-firewall ---- -# World-reachable management endpoints: Kubernetes API (6443) + apid (50000). -apiVersion: v1alpha1 -kind: NetworkRuleConfig -name: control-plane-public-ingress -portSelector: - ports: - - 6443 - - 50000 - protocol: tcp -ingress: - - subnet: 0.0.0.0/0 - - subnet: ::/0 ---- -# Cluster-internal TCP reachable from the node CIDR AND the pod CIDR. -# The node CIDR covers cross-node scrapes (Cilium masquerades pod->node-IP to -# the source node IP); same-node scrapes are NOT masqueraded, so kubelet / -# node-exporter see the raw pod IP and the pod CIDR must be allowed too. 
-# kubelet 10250, hubble-peer 4244, node-exporter 9100 -apiVersion: v1alpha1 -kind: NetworkRuleConfig -name: control-plane-internal-nodepod-ingress -portSelector: - ports: - - 4244 - - 9100 - - 10250 - protocol: tcp -ingress: - - subnet: 10.0.0.0/16 - - subnet: 10.244.0.0/16 ---- -# Cluster-internal TCP reachable from the node CIDR only. -# etcd 2379-2380, cilium-health 4240, cilium-mutual-auth 4250, -# trustd 50001, NodePort range 30000-32767 -# cilium-mutual-auth 4250: cilium-agent agent-to-agent mTLS handshake for flows -# the require-mutual-auth CiliumClusterwideNetworkPolicy marks -# authentication.mode: required; the source is the peer node's host-network -# cilium-agent, i.e. the node CIDR. Without it the cross-node handshake times -# out and Cilium black-holes the authenticated flow cluster-wide. -apiVersion: v1alpha1 -kind: NetworkRuleConfig -name: control-plane-internal-node-ingress -portSelector: - ports: - - 2379-2380 - - 4240 - - 4250 - - 30000-32767 - - 50001 - protocol: tcp -ingress: - - subnet: 10.0.0.0/16 ---- -# Cluster-internal UDP reachable from the node CIDR only. -# Cilium VXLAN 8472, Cilium WireGuard 51871 -# WireGuard transparent encryption (encryption.type: wireguard) tunnels -# VXLAN-encapsulated pod traffic between nodes through cilium_wg0; the default -# block would otherwise drop inter-node WireGuard and break pod-to-pod traffic. -apiVersion: v1alpha1 -kind: NetworkRuleConfig -name: control-plane-internal-udp-ingress -portSelector: - ports: - - 8472 - - 51871 - protocol: udp -ingress: - - subnet: 10.0.0.0/16 diff --git a/talos/workers/allow-apid-ingress.yaml b/talos/workers/allow-apid-ingress.yaml new file mode 100644 index 000000000..e28555495 --- /dev/null +++ b/talos/workers/allow-apid-ingress.yaml @@ -0,0 +1,15 @@ +# apid (50000) open to all on worker nodes — KSail manages workers via +# public IPs. Pairs with cluster/block-ingress-by-default.yaml. 
+# Keep the rule COUNT low — consolidate ports into an existing rule when the +# subnet set matches (see the ENOBUFS note in +# ../control-planes/allow-public-ingress.yaml). +apiVersion: v1alpha1 +kind: NetworkRuleConfig +name: apid-ingress +portSelector: + ports: + - 50000 + protocol: tcp +ingress: + - subnet: 0.0.0.0/0 + - subnet: ::/0 diff --git a/talos/workers/allow-cilium-health-ingress.yaml b/talos/workers/allow-cilium-health-ingress.yaml new file mode 100644 index 000000000..de0a547cc --- /dev/null +++ b/talos/workers/allow-cilium-health-ingress.yaml @@ -0,0 +1,11 @@ +# Cilium health checks (4240) cluster-internal only on worker nodes. Pairs +# with cluster/block-ingress-by-default.yaml. +apiVersion: v1alpha1 +kind: NetworkRuleConfig +name: cilium-health-ingress +portSelector: + ports: + - 4240 + protocol: tcp +ingress: + - subnet: 10.0.0.0/16 diff --git a/talos/workers/allow-cilium-mutual-auth-ingress.yaml b/talos/workers/allow-cilium-mutual-auth-ingress.yaml new file mode 100644 index 000000000..ea47c96a6 --- /dev/null +++ b/talos/workers/allow-cilium-mutual-auth-ingress.yaml @@ -0,0 +1,22 @@ +# Cilium SPIRE mutual authentication (authentication.mutual.spire.enabled in +# the cilium HelmRelease). cilium-agents perform the agent-to-agent mTLS +# handshake on TCP 4250 (the mesh-auth listener) for every pod-to-pod flow +# the hetzner require-mutual-auth CiliumClusterwideNetworkPolicy marks +# `authentication.mode: required`. The source is the peer node's +# host-network cilium-agent, so the traffic arrives from the node CIDR. +# NetworkDefaultActionConfig: block drops inbound 4250 unless allowed here; +# without this rule the cross-node handshake times out +# (`dial tcp 10.0.1.x:4250: i/o timeout`) and Cilium black-holes the +# authenticated flow. 
That silently broke the coroot-heartbeat dead-man's +# switch: its coroot-prometheus health-gate only succeeded when the CronJob +# happened to land on Prometheus's node, so the external healthchecks.io +# ping was suppressed almost every minute. +apiVersion: v1alpha1 +kind: NetworkRuleConfig +name: cilium-mutual-auth-ingress +portSelector: + ports: + - 4250 + protocol: tcp +ingress: + - subnet: 10.0.0.0/16 diff --git a/talos/workers/allow-cilium-wireguard-ingress.yaml b/talos/workers/allow-cilium-wireguard-ingress.yaml new file mode 100644 index 000000000..b13fb20a9 --- /dev/null +++ b/talos/workers/allow-cilium-wireguard-ingress.yaml @@ -0,0 +1,14 @@ +# Cilium WireGuard transparent encryption (encryption.type: wireguard). +# Tunnels VXLAN-encapsulated pod traffic between nodes through cilium_wg0. +# Default port is UDP 51871; Talos's NetworkDefaultActionConfig: block +# would otherwise drop inter-node WireGuard packets and break pod-to-pod +# traffic the moment encryption is enabled. +apiVersion: v1alpha1 +kind: NetworkRuleConfig +name: cilium-wireguard-ingress +portSelector: + ports: + - 51871 + protocol: udp +ingress: + - subnet: 10.0.0.0/16 diff --git a/talos/workers/allow-cni-vxlan-ingress.yaml b/talos/workers/allow-cni-vxlan-ingress.yaml new file mode 100644 index 000000000..3c48418b3 --- /dev/null +++ b/talos/workers/allow-cni-vxlan-ingress.yaml @@ -0,0 +1,11 @@ +# Cilium VXLAN (8472) cluster-internal only on worker nodes. Pairs with +# cluster/block-ingress-by-default.yaml. +apiVersion: v1alpha1 +kind: NetworkRuleConfig +name: cni-vxlan-ingress +portSelector: + ports: + - 8472 + protocol: udp +ingress: + - subnet: 10.0.0.0/16 diff --git a/talos/workers/allow-hubble-peer-ingress.yaml b/talos/workers/allow-hubble-peer-ingress.yaml new file mode 100644 index 000000000..5061b6c6d --- /dev/null +++ b/talos/workers/allow-hubble-peer-ingress.yaml @@ -0,0 +1,12 @@ +# Hubble peer service (4244) reachable from the node CIDR and the pod CIDR +# on worker nodes. 
Pairs with cluster/block-ingress-by-default.yaml. +apiVersion: v1alpha1 +kind: NetworkRuleConfig +name: hubble-peer-ingress +portSelector: + ports: + - 4244 + protocol: tcp +ingress: + - subnet: 10.0.0.0/16 + - subnet: 10.244.0.0/16 diff --git a/talos/workers/allow-kubelet-ingress.yaml b/talos/workers/allow-kubelet-ingress.yaml new file mode 100644 index 000000000..53dc86268 --- /dev/null +++ b/talos/workers/allow-kubelet-ingress.yaml @@ -0,0 +1,16 @@ +# kubelet (10250) cluster-internal only on worker nodes. Pairs with +# cluster/block-ingress-by-default.yaml. +apiVersion: v1alpha1 +kind: NetworkRuleConfig +name: kubelet-ingress +portSelector: + ports: + - 10250 + protocol: tcp +ingress: + # Node CIDR covers cross-node scrapes: Cilium masquerades pod->node-IP + # traffic to the source node IP. Same-node scrapes (e.g. the local + # metrics-server replica hitting its own kubelet) are NOT masqueraded, so + # the kubelet sees the raw pod IP and the pod CIDR must be allowed too. + - subnet: 10.0.0.0/16 + - subnet: 10.244.0.0/16 diff --git a/talos/workers/allow-node-exporter-ingress.yaml b/talos/workers/allow-node-exporter-ingress.yaml new file mode 100644 index 000000000..3c6f31070 --- /dev/null +++ b/talos/workers/allow-node-exporter-ingress.yaml @@ -0,0 +1,17 @@ +# Node exporter (9100) cluster-internal only on worker nodes (Prometheus +# scraping). Pairs with cluster/block-ingress-by-default.yaml. +apiVersion: v1alpha1 +kind: NetworkRuleConfig +name: node-exporter-ingress +portSelector: + ports: + - 9100 + protocol: tcp +ingress: + # Node CIDR covers cross-node scrapes: Cilium masquerades pod->node-IP + # traffic to the source node IP. Same-node scrapes (the Prometheus pod + # hitting the hostNetwork node-exporter on its own node) are NOT + # masqueraded, so node-exporter sees the raw pod IP and the pod CIDR must + # be allowed too. 
+ - subnet: 10.0.0.0/16 + - subnet: 10.244.0.0/16 diff --git a/talos/workers/allow-nodeport-ingress.yaml b/talos/workers/allow-nodeport-ingress.yaml new file mode 100644 index 000000000..a040c6701 --- /dev/null +++ b/talos/workers/allow-nodeport-ingress.yaml @@ -0,0 +1,12 @@ +# NodePort range (30000-32767) cluster-internal only on worker nodes — the +# Hetzner Cloud LB reaches NodePorts over the private network. Pairs with +# cluster/block-ingress-by-default.yaml. +apiVersion: v1alpha1 +kind: NetworkRuleConfig +name: nodeport-ingress +portSelector: + ports: + - 30000-32767 + protocol: tcp +ingress: + - subnet: 10.0.0.0/16 diff --git a/talos/workers/ingress-firewall.yaml b/talos/workers/ingress-firewall.yaml deleted file mode 100644 index 21a8dcf2a..000000000 --- a/talos/workers/ingress-firewall.yaml +++ /dev/null @@ -1,134 +0,0 @@ -# Ingress firewall rules for worker nodes. -# Requires NetworkDefaultActionConfig (ingress: block) from cluster/. -# -# Policy summary (workers are more restrictive than control planes): -# - apid (50000): open to all (KSail manages workers via public IPs) -# - kubelet (10250): cluster-internal only -# - Cilium VXLAN (8472), WireGuard (51871), health (4240), Hubble peer (4244), mutual-auth (4250): cluster-internal only -# - NodePort range (30000-32767): cluster-internal (Hetzner Cloud LB) -# - Node exporter (9100): cluster-internal (Prometheus scraping) -# -# Cluster network CIDR: 10.0.0.0/16 (from ksail.prod.yaml networkCidr) -# -# Reference: https://docs.siderolabs.com/talos/v1.12/networking/ingress-firewall ---- -apiVersion: v1alpha1 -kind: NetworkRuleConfig -name: apid-ingress -portSelector: - ports: - - 50000 - protocol: tcp -ingress: - - subnet: 0.0.0.0/0 - - subnet: ::/0 ---- -apiVersion: v1alpha1 -kind: NetworkRuleConfig -name: kubelet-ingress -portSelector: - ports: - - 10250 - protocol: tcp -ingress: - # Node CIDR covers cross-node scrapes: Cilium masquerades pod->node-IP - # traffic to the source node IP. 
Same-node scrapes (e.g. the local - # metrics-server replica hitting its own kubelet) are NOT masqueraded, so - # the kubelet sees the raw pod IP and the pod CIDR must be allowed too. - - subnet: 10.0.0.0/16 - - subnet: 10.244.0.0/16 ---- -apiVersion: v1alpha1 -kind: NetworkRuleConfig -name: cni-vxlan-ingress -portSelector: - ports: - - 8472 - protocol: udp -ingress: - - subnet: 10.0.0.0/16 ---- -# Cilium WireGuard transparent encryption (encryption.type: wireguard). -# Tunnels VXLAN-encapsulated pod traffic between nodes through cilium_wg0. -# Default port is UDP 51871; Talos's NetworkDefaultActionConfig: block -# would otherwise drop inter-node WireGuard packets and break pod-to-pod -# traffic the moment encryption is enabled. -apiVersion: v1alpha1 -kind: NetworkRuleConfig -name: cilium-wireguard-ingress -portSelector: - ports: - - 51871 - protocol: udp -ingress: - - subnet: 10.0.0.0/16 ---- -apiVersion: v1alpha1 -kind: NetworkRuleConfig -name: cilium-health-ingress -portSelector: - ports: - - 4240 - protocol: tcp -ingress: - - subnet: 10.0.0.0/16 ---- -apiVersion: v1alpha1 -kind: NetworkRuleConfig -name: hubble-peer-ingress -portSelector: - ports: - - 4244 - protocol: tcp -ingress: - - subnet: 10.0.0.0/16 - - subnet: 10.244.0.0/16 ---- -# Cilium SPIRE mutual authentication (authentication.mutual.spire.enabled in -# the cilium HelmRelease). cilium-agents perform the agent-to-agent mTLS -# handshake on TCP 4250 (the mesh-auth listener) for every pod-to-pod flow -# the hetzner require-mutual-auth CiliumClusterwideNetworkPolicy marks -# `authentication.mode: required`. The source is the peer node's -# host-network cilium-agent, so the traffic arrives from the node CIDR. -# NetworkDefaultActionConfig: block drops inbound 4250 unless allowed here; -# without this rule the cross-node handshake times out -# (`dial tcp 10.0.1.x:4250: i/o timeout`) and Cilium black-holes the -# authenticated flow. 
That silently broke the coroot-heartbeat dead-man's -# switch: its coroot-prometheus health-gate only succeeded when the CronJob -# happened to land on Prometheus's node, so the external healthchecks.io -# ping was suppressed almost every minute. -apiVersion: v1alpha1 -kind: NetworkRuleConfig -name: cilium-mutual-auth-ingress -portSelector: - ports: - - 4250 - protocol: tcp -ingress: - - subnet: 10.0.0.0/16 ---- -apiVersion: v1alpha1 -kind: NetworkRuleConfig -name: nodeport-ingress -portSelector: - ports: - - 30000-32767 - protocol: tcp -ingress: - - subnet: 10.0.0.0/16 ---- -apiVersion: v1alpha1 -kind: NetworkRuleConfig -name: node-exporter-ingress -portSelector: - ports: - - 9100 - protocol: tcp -ingress: - # Node CIDR covers cross-node scrapes: Cilium masquerades pod->node-IP - # traffic to the source node IP. Same-node scrapes (the Prometheus pod - # hitting the hostNetwork node-exporter on its own node) are NOT - # masqueraded, so node-exporter sees the raw pod IP and the pod CIDR must - # be allowed too. - - subnet: 10.0.0.0/16 - - subnet: 10.244.0.0/16 diff --git a/talos/workers/node-labels.yaml b/talos/workers/label-nodes.yaml similarity index 100% rename from talos/workers/node-labels.yaml rename to talos/workers/label-nodes.yaml diff --git a/talos/workers/kubevirt.yaml b/talos/workers/load-kvm-modules.yaml similarity index 100% rename from talos/workers/kubevirt.yaml rename to talos/workers/load-kvm-modules.yaml diff --git a/talos/workers/longhorn.yaml b/talos/workers/mount-longhorn-data.yaml similarity index 89% rename from talos/workers/longhorn.yaml rename to talos/workers/mount-longhorn-data.yaml index 3a1746dc5..2c6b34aab 100644 --- a/talos/workers/longhorn.yaml +++ b/talos/workers/mount-longhorn-data.yaml @@ -8,9 +8,9 @@ # volumes and break any PVC backed by hcloud-csi. # # NOTE: node-labels for Longhorn (node.longhorn.io/create-default-disk=true) -# are set in node-labels.yaml. 
Talos strategic merge overwrites map keys, +# are set in label-nodes.yaml. Talos strategic merge overwrites map keys, # so all kubelet.extraArgs.node-labels values must live in a single patch -# file — see node-labels.yaml. +# file — see label-nodes.yaml. machine: kubelet: extraMounts: