From d90709e6d8616b03fd674d42eee6fcf1c16be9b6 Mon Sep 17 00:00:00 2001 From: Patrick Dillon Date: Mon, 2 Mar 2026 16:57:36 -0500 Subject: [PATCH 1/4] bootkube: enable konnectivity Enables kube-apiserver running on the bootstrap node to access the pod network, specifically to enable access to webhooks running in the cluster. Changes: * Adds a new static Konnectivity server pod running on the bootstrap node * Configures the bootstrap KAS to use its local Konnectivity server for outbound cluster traffic * Adds a daemonset deployed into the cluster to run a Konnectivity agent on every cluster node * Removes the daemonset automatically during bootstrap teardown Co-authored-by: Matthew Booth --- .../opt/openshift/egress-selector-config.yaml | 15 ++++ .../konnectivity-agent-certs-secret.yaml | 13 ++++ .../konnectivity-agent-daemonset.yaml | 58 ++++++++++++++ .../konnectivity-config-override.yaml | 5 ++ .../opt/openshift/konnectivity-namespace.yaml | 6 ++ .../openshift/konnectivity-server-pod.yaml | 53 +++++++++++++ .../files/usr/local/bin/bootkube.sh.template | 13 +++- .../files/usr/local/bin/konnectivity-certs.sh | 55 ++++++++++++++ .../usr/local/bin/konnectivity.sh.template | 76 +++++++++++++++++++ 9 files changed, 293 insertions(+), 1 deletion(-) create mode 100644 data/data/bootstrap/files/opt/openshift/egress-selector-config.yaml create mode 100644 data/data/bootstrap/files/opt/openshift/konnectivity-agent-certs-secret.yaml create mode 100644 data/data/bootstrap/files/opt/openshift/konnectivity-agent-daemonset.yaml create mode 100644 data/data/bootstrap/files/opt/openshift/konnectivity-config-override.yaml create mode 100644 data/data/bootstrap/files/opt/openshift/konnectivity-namespace.yaml create mode 100644 data/data/bootstrap/files/opt/openshift/konnectivity-server-pod.yaml create mode 100644 data/data/bootstrap/files/usr/local/bin/konnectivity-certs.sh create mode 100644 data/data/bootstrap/files/usr/local/bin/konnectivity.sh.template diff --git 
a/data/data/bootstrap/files/opt/openshift/egress-selector-config.yaml b/data/data/bootstrap/files/opt/openshift/egress-selector-config.yaml new file mode 100644 index 00000000000..b9518ebca87 --- /dev/null +++ b/data/data/bootstrap/files/opt/openshift/egress-selector-config.yaml @@ -0,0 +1,15 @@ +apiVersion: apiserver.k8s.io/v1beta1 +kind: EgressSelectorConfiguration +egressSelections: +- name: cluster + connection: + proxyProtocol: GRPC + transport: + uds: + udsName: /etc/kubernetes/config/konnectivity-server.socket +- name: controlplane + connection: + proxyProtocol: Direct +- name: etcd + connection: + proxyProtocol: Direct diff --git a/data/data/bootstrap/files/opt/openshift/konnectivity-agent-certs-secret.yaml b/data/data/bootstrap/files/opt/openshift/konnectivity-agent-certs-secret.yaml new file mode 100644 index 00000000000..4fe0d702b5e --- /dev/null +++ b/data/data/bootstrap/files/opt/openshift/konnectivity-agent-certs-secret.yaml @@ -0,0 +1,13 @@ +apiVersion: v1 +kind: Secret +metadata: + name: konnectivity-agent-certs + namespace: openshift-bootstrap-konnectivity + labels: + app: konnectivity-agent + openshift.io/bootstrap-only: "true" +type: Opaque +data: + tls.crt: ${KONNECTIVITY_AGENT_CERT_BASE64} + tls.key: ${KONNECTIVITY_AGENT_KEY_BASE64} + ca.crt: ${KONNECTIVITY_CA_CERT_BASE64} diff --git a/data/data/bootstrap/files/opt/openshift/konnectivity-agent-daemonset.yaml b/data/data/bootstrap/files/opt/openshift/konnectivity-agent-daemonset.yaml new file mode 100644 index 00000000000..10a9f19464e --- /dev/null +++ b/data/data/bootstrap/files/opt/openshift/konnectivity-agent-daemonset.yaml @@ -0,0 +1,58 @@ +apiVersion: apps/v1 +kind: DaemonSet +metadata: + name: konnectivity-agent + namespace: openshift-bootstrap-konnectivity + labels: + app: konnectivity-agent + openshift.io/bootstrap-only: "true" +spec: + selector: + matchLabels: + app: konnectivity-agent + template: + metadata: + labels: + app: konnectivity-agent + spec: + hostNetwork: true + dnsPolicy: 
Default + priorityClassName: system-node-critical + tolerations: + - operator: Exists + containers: + - name: konnectivity-agent + image: ${KONNECTIVITY_IMAGE} + command: + - /usr/bin/proxy-agent + args: + - --logtostderr=true + - --ca-cert=/etc/konnectivity/ca.crt + - --agent-cert=/etc/konnectivity/tls.crt + - --agent-key=/etc/konnectivity/tls.key + - --proxy-server-host=${BOOTSTRAP_NODE_IP} + - --proxy-server-port=8091 + - --health-server-port=2041 + - --agent-identifiers=default-route=true + - --keepalive-time=30s + - --probe-interval=5s + - --sync-interval=5s + - --sync-interval-cap=30s + livenessProbe: + httpGet: + path: /healthz + port: 2041 + initialDelaySeconds: 10 + periodSeconds: 10 + resources: + requests: + cpu: 40m + memory: 50Mi + volumeMounts: + - name: konnectivity-certs + mountPath: /etc/konnectivity + readOnly: true + volumes: + - name: konnectivity-certs + secret: + secretName: konnectivity-agent-certs diff --git a/data/data/bootstrap/files/opt/openshift/konnectivity-config-override.yaml b/data/data/bootstrap/files/opt/openshift/konnectivity-config-override.yaml new file mode 100644 index 00000000000..034779e03f1 --- /dev/null +++ b/data/data/bootstrap/files/opt/openshift/konnectivity-config-override.yaml @@ -0,0 +1,5 @@ +apiVersion: kubecontrolplane.config.openshift.io/v1 +kind: KubeAPIServerConfig +apiServerArguments: + egress-selector-config-file: + - "/etc/kubernetes/config/egress-selector-config.yaml" diff --git a/data/data/bootstrap/files/opt/openshift/konnectivity-namespace.yaml b/data/data/bootstrap/files/opt/openshift/konnectivity-namespace.yaml new file mode 100644 index 00000000000..cc668ac3364 --- /dev/null +++ b/data/data/bootstrap/files/opt/openshift/konnectivity-namespace.yaml @@ -0,0 +1,6 @@ +apiVersion: v1 +kind: Namespace +metadata: + name: openshift-bootstrap-konnectivity + labels: + openshift.io/bootstrap-only: "true" diff --git a/data/data/bootstrap/files/opt/openshift/konnectivity-server-pod.yaml 
b/data/data/bootstrap/files/opt/openshift/konnectivity-server-pod.yaml new file mode 100644 index 00000000000..ffcba0e9732 --- /dev/null +++ b/data/data/bootstrap/files/opt/openshift/konnectivity-server-pod.yaml @@ -0,0 +1,53 @@ +apiVersion: v1 +kind: Pod +metadata: + name: konnectivity-server + namespace: kube-system + labels: + app: konnectivity-server +spec: + hostNetwork: true + priorityClassName: system-node-critical + containers: + - name: konnectivity-server + image: ${KONNECTIVITY_IMAGE} + command: + - /usr/bin/proxy-server + args: + - --logtostderr=true + - --cluster-cert=/etc/konnectivity/server.crt + - --cluster-key=/etc/konnectivity/server.key + - --cluster-ca-cert=/etc/konnectivity/ca.crt + - --uds-name=/etc/kubernetes/bootstrap-configs/konnectivity-server.socket + - --server-port=0 + - --agent-port=8091 + - --health-port=2041 + - --mode=grpc + - --proxy-strategies=destHost,defaultRoute + - --keepalive-time=30s + - --frontend-keepalive-time=30s + livenessProbe: + httpGet: + path: /healthz + port: 2041 + initialDelaySeconds: 10 + periodSeconds: 10 + resources: + requests: + cpu: 40m + memory: 50Mi + volumeMounts: + - name: config-dir + mountPath: /etc/kubernetes/bootstrap-configs + - name: konnectivity-certs + mountPath: /etc/konnectivity + readOnly: true + volumes: + - name: config-dir + hostPath: + path: /etc/kubernetes/bootstrap-configs + type: DirectoryOrCreate + - name: konnectivity-certs + hostPath: + path: /opt/openshift/tls/konnectivity + type: Directory diff --git a/data/data/bootstrap/files/usr/local/bin/bootkube.sh.template b/data/data/bootstrap/files/usr/local/bin/bootkube.sh.template index 6aa5d7e253d..7191d0bc1a1 100755 --- a/data/data/bootstrap/files/usr/local/bin/bootkube.sh.template +++ b/data/data/bootstrap/files/usr/local/bin/bootkube.sh.template @@ -10,6 +10,8 @@ set -euoE pipefail ## -E option will cause functions to inherit trap . /usr/local/bin/bootstrap-cluster-gather.sh # shellcheck source=bootstrap-verify-api-server-urls.sh . 
/usr/local/bin/bootstrap-verify-api-server-urls.sh +# shellcheck source=konnectivity.sh.template +. /usr/local/bin/konnectivity.sh mkdir --parents /etc/kubernetes/{manifests,bootstrap-configs,bootstrap-manifests} @@ -245,6 +247,8 @@ then record_service_stage_success fi +konnectivity_setup + if [ ! -f kube-apiserver-bootstrap.done ] then record_service_stage_start "kube-apiserver-bootstrap" @@ -269,9 +273,12 @@ then --infra-config-file=/assets/manifests/cluster-infrastructure-02-config.yml \ --rendered-manifest-files=/assets/manifests \ --payload-version=$VERSION \ - --operand-kubernetes-version="${KUBERNETES_VERSION}" + --operand-kubernetes-version="${KUBERNETES_VERSION}" \ + --config-override-files=/assets/konnectivity-config-override.yaml cp kube-apiserver-bootstrap/config /etc/kubernetes/bootstrap-configs/kube-apiserver-config.yaml + # Copy egress selector config to bootstrap-configs where KAS can read it + cp /opt/openshift/egress-selector-config.yaml /etc/kubernetes/bootstrap-configs/egress-selector-config.yaml cp kube-apiserver-bootstrap/bootstrap-manifests/* bootstrap-manifests/ cp kube-apiserver-bootstrap/manifests/* manifests/ @@ -566,6 +573,8 @@ then record_service_stage_success fi +konnectivity_manifests + REQUIRED_PODS="openshift-kube-apiserver/kube-apiserver,openshift-kube-scheduler/openshift-kube-scheduler,openshift-kube-controller-manager/kube-controller-manager,openshift-cluster-version/cluster-version-operator" if [ "$BOOTSTRAP_INPLACE" = true ] then @@ -651,6 +660,8 @@ if [ ! 
-f api-int-dns-check.done ]; then fi fi +konnectivity_cleanup + # Workaround for https://github.com/opencontainers/runc/pull/1807 touch /opt/openshift/.bootkube.done echo "bootkube.service complete" diff --git a/data/data/bootstrap/files/usr/local/bin/konnectivity-certs.sh b/data/data/bootstrap/files/usr/local/bin/konnectivity-certs.sh new file mode 100644 index 00000000000..e71a299174e --- /dev/null +++ b/data/data/bootstrap/files/usr/local/bin/konnectivity-certs.sh @@ -0,0 +1,55 @@ +#!/usr/bin/env bash +set -euo pipefail + +# Generate Konnectivity certificates with a self-signed CA (1-day validity). +# These are needed for mTLS between the Konnectivity server and agents +# during the bootstrap phase. +# +# Usage: konnectivity-certs.sh + +BOOTSTRAP_NODE_IP="${1:?Usage: konnectivity-certs.sh }" + +KONNECTIVITY_CERT_DIR=/opt/openshift/tls/konnectivity +mkdir -p "${KONNECTIVITY_CERT_DIR}" + +echo "Generating Konnectivity certificates in ${KONNECTIVITY_CERT_DIR}..." + +# Generate self-signed Konnectivity CA +openssl req -x509 -newkey rsa:2048 -nodes \ + -keyout "${KONNECTIVITY_CERT_DIR}/ca.key" \ + -out "${KONNECTIVITY_CERT_DIR}/ca.crt" \ + -days 1 \ + -subj "/CN=konnectivity-signer/O=openshift" + +# Server certificate for agent endpoint (needs bootstrap IP as SAN) +openssl req -new -newkey rsa:2048 -nodes \ + -keyout "${KONNECTIVITY_CERT_DIR}/server.key" \ + -out "${KONNECTIVITY_CERT_DIR}/server.csr" \ + -subj "/CN=konnectivity-server/O=openshift" + +openssl x509 -req -in "${KONNECTIVITY_CERT_DIR}/server.csr" \ + -CA "${KONNECTIVITY_CERT_DIR}/ca.crt" \ + -CAkey "${KONNECTIVITY_CERT_DIR}/ca.key" \ + -CAcreateserial \ + -out "${KONNECTIVITY_CERT_DIR}/server.crt" \ + -days 1 \ + -extfile <(printf "extendedKeyUsage=serverAuth\nsubjectAltName=IP:%s" "${BOOTSTRAP_NODE_IP}") + +# Agent client certificate (shared by all agents) +openssl req -new -newkey rsa:2048 -nodes \ + -keyout "${KONNECTIVITY_CERT_DIR}/agent.key" \ + -out "${KONNECTIVITY_CERT_DIR}/agent.csr" \ + -subj 
"/CN=konnectivity-agent/O=openshift" + +openssl x509 -req -in "${KONNECTIVITY_CERT_DIR}/agent.csr" \ + -CA "${KONNECTIVITY_CERT_DIR}/ca.crt" \ + -CAkey "${KONNECTIVITY_CERT_DIR}/ca.key" \ + -CAcreateserial \ + -out "${KONNECTIVITY_CERT_DIR}/agent.crt" \ + -days 1 \ + -extfile <(printf "extendedKeyUsage=clientAuth") + +# Clean up CSR files +rm -f "${KONNECTIVITY_CERT_DIR}"/*.csr + +echo "Konnectivity certificates generated successfully." diff --git a/data/data/bootstrap/files/usr/local/bin/konnectivity.sh.template b/data/data/bootstrap/files/usr/local/bin/konnectivity.sh.template new file mode 100644 index 00000000000..7d3e2778fa8 --- /dev/null +++ b/data/data/bootstrap/files/usr/local/bin/konnectivity.sh.template @@ -0,0 +1,76 @@ +#!/usr/bin/env bash +# Konnectivity bootstrap functions. +# Sourced by bootkube.sh — do not execute directly. + +# konnectivity_setup detects the bootstrap node IP, generates certificates, +# and creates the konnectivity server static pod manifest. +konnectivity_setup() { +{{- if .BootstrapNodeIP }} + # Use explicitly configured bootstrap node IP + export BOOTSTRAP_NODE_IP="{{.BootstrapNodeIP}}" + echo "Using configured bootstrap node IP: ${BOOTSTRAP_NODE_IP}" +{{- else }} + # Detect bootstrap node IP at runtime using the default route source address. + # Konnectivity agents use this to connect back to the bootstrap server. + {{- if .UseIPv6ForNodeIP }} + export BOOTSTRAP_NODE_IP=$(ip -6 -j route get 2001:4860:4860::8888 | jq -r '.[0].prefsrc') + {{- else }} + export BOOTSTRAP_NODE_IP=$(ip -j route get 1.1.1.1 | jq -r '.[0].prefsrc') + {{- end }} + echo "Detected bootstrap node IP: ${BOOTSTRAP_NODE_IP}" +{{- end }} + + if [ ! -f konnectivity-certs.done ]; then + record_service_stage_start "konnectivity-certs" + /usr/local/bin/konnectivity-certs.sh "${BOOTSTRAP_NODE_IP}" + touch konnectivity-certs.done + record_service_stage_success + fi + + if [ ! 
-f konnectivity-server-bootstrap.done ]; then + record_service_stage_start "konnectivity-server-bootstrap" + echo "Creating Konnectivity server static pod manifest..." + export KONNECTIVITY_IMAGE=$(image_for apiserver-network-proxy) + envsubst < /opt/openshift/konnectivity-server-pod.yaml > /etc/kubernetes/manifests/konnectivity-server-pod.yaml + touch konnectivity-server-bootstrap.done + record_service_stage_success + fi +} + +# konnectivity_manifests creates the agent namespace, secret, and daemonset +# manifests for cluster deployment. +konnectivity_manifests() { + if [ ! -f konnectivity-agent-manifest.done ]; then + record_service_stage_start "konnectivity-agent-manifest" + echo "Creating Konnectivity agent manifests..." + + KONNECTIVITY_CERT_DIR=/opt/openshift/tls/konnectivity + + cp /opt/openshift/konnectivity-namespace.yaml manifests/konnectivity-namespace.yaml + + export KONNECTIVITY_AGENT_CERT_BASE64=$(base64 -w0 "${KONNECTIVITY_CERT_DIR}/agent.crt") + export KONNECTIVITY_AGENT_KEY_BASE64=$(base64 -w0 "${KONNECTIVITY_CERT_DIR}/agent.key") + export KONNECTIVITY_CA_CERT_BASE64=$(base64 -w0 "${KONNECTIVITY_CERT_DIR}/ca.crt") + envsubst < /opt/openshift/konnectivity-agent-certs-secret.yaml > manifests/konnectivity-agent-certs.yaml + + envsubst < /opt/openshift/konnectivity-agent-daemonset.yaml > manifests/konnectivity-agent-daemonset.yaml + + touch konnectivity-agent-manifest.done + record_service_stage_success + fi +} + +# konnectivity_cleanup removes bootstrap konnectivity resources by deleting +# the namespace (cascading to DaemonSet and Secret) and the server static pod. +konnectivity_cleanup() { + if [ ! -f konnectivity-cleanup.done ]; then + record_service_stage_start "konnectivity-cleanup" + echo "Cleaning up bootstrap konnectivity resources..." 
+ oc delete namespace openshift-bootstrap-konnectivity \ + --kubeconfig=/opt/openshift/auth/kubeconfig \ + --ignore-not-found=true + rm -f /etc/kubernetes/manifests/konnectivity-server-pod.yaml + touch konnectivity-cleanup.done + record_service_stage_success + fi +} From 5ed318a9e65e332cc9412bce4a7dd4fd563d8bda Mon Sep 17 00:00:00 2001 From: Patrick Dillon Date: Mon, 2 Mar 2026 17:00:31 -0500 Subject: [PATCH 2/4] pkg/gather: analyze konnectivity failures Adds error handling to report konnectivity specific failures when running gather bootstrap or analyze. --- .../usr/local/bin/konnectivity.sh.template | 1 + pkg/gather/service/analyze.go | 20 +++++++++++-------- pkg/gather/service/analyze_test.go | 8 +++++--- pkg/gather/service/entry.go | 2 +- 4 files changed, 19 insertions(+), 12 deletions(-) diff --git a/data/data/bootstrap/files/usr/local/bin/konnectivity.sh.template b/data/data/bootstrap/files/usr/local/bin/konnectivity.sh.template index 7d3e2778fa8..244a3c7c7b9 100644 --- a/data/data/bootstrap/files/usr/local/bin/konnectivity.sh.template +++ b/data/data/bootstrap/files/usr/local/bin/konnectivity.sh.template @@ -44,6 +44,7 @@ konnectivity_manifests() { record_service_stage_start "konnectivity-agent-manifest" echo "Creating Konnectivity agent manifests..." + export KONNECTIVITY_IMAGE=$(image_for apiserver-network-proxy) KONNECTIVITY_CERT_DIR=/opt/openshift/tls/konnectivity cp /opt/openshift/konnectivity-namespace.yaml manifests/konnectivity-namespace.yaml diff --git a/pkg/gather/service/analyze.go b/pkg/gather/service/analyze.go index 680dc5b6247..7c5e9e062ef 100644 --- a/pkg/gather/service/analyze.go +++ b/pkg/gather/service/analyze.go @@ -114,18 +114,22 @@ func checkReleaseImageDownload(a analysis) bool { return false } -// bootstrap-verify-api-servel-urls.sh is currently running as part of the bootkube service. -// And the verification of the API and API-Int URLs are the only stage where a failure is -// currently reported. 
So, here we are able to conclude that a failure corresponds to a -// failure to resolve either the API URL or API-Int URL or both. If that changes and if -// any other stage in the bootkube service starts reporting a failure, we need to revisit -// this. At that point verification of the URLs could be moved to its own service. func checkBootkubeService(a analysis) bool { if a.successful { return true } - // Note: Even when there is a stage failure, we are not returning false here. That is - // intentional because we donot want to report this as an error in the "analyze" output. + switch a.failingStage { + case "konnectivity-certs": + logrus.Error("The bootstrap machine failed to generate konnectivity certificates") + case "konnectivity-server-bootstrap": + logrus.Error("The bootstrap machine failed to start the konnectivity server") + case "konnectivity-agent-manifest": + logrus.Error("The bootstrap machine failed to create konnectivity agent manifests") + case "konnectivity-cleanup": + logrus.Error("The bootstrap machine failed to clean up konnectivity resources") + default: + logrus.Errorf("The bootkube service failed at stage %q", a.failingStage) + } a.logLastError() return true } diff --git a/pkg/gather/service/analyze_test.go b/pkg/gather/service/analyze_test.go index 4764783dc3a..2fa9156a21f 100644 --- a/pkg/gather/service/analyze_test.go +++ b/pkg/gather/service/analyze_test.go @@ -4,6 +4,7 @@ import ( "archive/tar" "bytes" "compress/gzip" + "fmt" "testing" "github.com/sirupsen/logrus" @@ -37,8 +38,9 @@ func failedReleaseImage() []logrus.Entry { } } -func failedURLChecks() []logrus.Entry { +func failedBootkubeStage(stage string) []logrus.Entry { return []logrus.Entry{ + {Level: logrus.ErrorLevel, Message: fmt.Sprintf("The bootkube service failed at stage %q", stage)}, {Level: logrus.InfoLevel, Message: "Line 1"}, {Level: logrus.InfoLevel, Message: "Line 2"}, {Level: logrus.InfoLevel, Message: "Line 3"}, @@ -104,7 +106,7 @@ func TestAnalyzeGatherBundle(t 
*testing.T) { "log-bundle/log-bundle-bootstrap/bootstrap/services/release-image.json": generateSuccessOutput("pull-release-image"), "log-bundle/bootstrap/services/bootkube.json": generateFailureOutput("check-api-url"), }, - expectedOutput: failedURLChecks(), + expectedOutput: failedBootkubeStage("check-api-url"), }, { name: "API-INT Server URL failed", @@ -112,7 +114,7 @@ func TestAnalyzeGatherBundle(t *testing.T) { "log-bundle/log-bundle-bootstrap/bootstrap/services/release-image.json": generateSuccessOutput("pull-release-image"), "log-bundle/bootstrap/services/bootkube.json": generateFailureOutput("check-api-int-url"), }, - expectedOutput: failedURLChecks(), + expectedOutput: failedBootkubeStage("check-api-int-url"), }, { name: "both release-image and API Server URLs failed", diff --git a/pkg/gather/service/entry.go b/pkg/gather/service/entry.go index 0fda98f1ee1..88c2f2bd1b6 100644 --- a/pkg/gather/service/entry.go +++ b/pkg/gather/service/entry.go @@ -10,7 +10,7 @@ type Entry struct { // present when the phase is an ending phase. Result Result `json:"result,omitempty"` // Stage is the name of the stage being executed. This is only present when the phase is either StageStart or StageEnd. - Stage string `json:"string,omitempty"` + Stage string `json:"stage,omitempty"` // PreCommand is the name of the pre-command being executed. This is only present when the phase is either // PreCommandStart or PreCommandEnd. PreCommand string `json:"preCommand,omitempty"` From 4fb358d163262cd1c093ce8ec92f760004f5ac22 Mon Sep 17 00:00:00 2001 From: Patrick Dillon Date: Tue, 24 Mar 2026 15:05:42 -0400 Subject: [PATCH 3/4] Manage konnectivity port 8091 bootstrap infrastructure Opens the port 8091 for use by konnectivity proxy during bootstrapping. Cleans up the port on bootstrap destroy. PowerVS does not seem to do any sort of bootstrap destroy cleanup, but all other platforms are handled. 
--- pkg/asset/manifests/aws/cluster.go | 17 +++-- pkg/asset/manifests/azure/cluster.go | 37 ++++++++++ .../manifests/ibmcloud/securitygroups.go | 20 ++++++ pkg/asset/manifests/powervs/securitygroups.go | 17 +++++ pkg/infrastructure/aws/clusterapi/aws.go | 69 +++++++++++-------- pkg/infrastructure/azure/azure.go | 21 ++++++ .../gcp/clusterapi/firewallrules.go | 22 +++++- .../openstack/preprovision/securitygroups.go | 2 + 8 files changed, 171 insertions(+), 34 deletions(-) diff --git a/pkg/asset/manifests/aws/cluster.go b/pkg/asset/manifests/aws/cluster.go index a7edba6fae4..4c13784efbb 100644 --- a/pkg/asset/manifests/aws/cluster.go +++ b/pkg/asset/manifests/aws/cluster.go @@ -18,10 +18,12 @@ import ( "github.com/openshift/installer/pkg/types/network" ) -// BootstrapSSHDescription is the description for the -// ingress rule that provides SSH access to the bootstrap node -// & identifies the rule for removal during bootstrap destroy. -const BootstrapSSHDescription = "Bootstrap SSH Access" +// Bootstrap ingress rule descriptions identify rules for removal +// during bootstrap destroy. +const ( + BootstrapSSHDescription = "Bootstrap SSH Access" + BootstrapKonnectivityDescription = "Bootstrap Konnectivity" +) // GenerateClusterAssets generates the manifests for the cluster-api. 
func GenerateClusterAssets(ic *installconfig.InstallConfig, clusterID *installconfig.ClusterID) (*capiutils.GenerateClusterAssetsOutput, error) { @@ -145,6 +147,13 @@ func GenerateClusterAssets(ic *installconfig.InstallConfig, clusterID *installco ToPort: 10259, SourceSecurityGroupRoles: []capa.SecurityGroupRole{"controlplane", "node"}, }, + { + Description: BootstrapKonnectivityDescription, + Protocol: capa.SecurityGroupProtocolTCP, + FromPort: 8091, + ToPort: 8091, + SourceSecurityGroupRoles: []capa.SecurityGroupRole{"controlplane", "node"}, + }, { Description: BootstrapSSHDescription, Protocol: capa.SecurityGroupProtocolTCP, diff --git a/pkg/asset/manifests/azure/cluster.go b/pkg/asset/manifests/azure/cluster.go index 2a9d4e264ac..897ec6b05f3 100644 --- a/pkg/asset/manifests/azure/cluster.go +++ b/pkg/asset/manifests/azure/cluster.go @@ -65,6 +65,17 @@ func GenerateClusterAssets(installConfig *installconfig.InstallConfig, clusterID Destination: ptr.To("*"), Action: capz.SecurityRuleActionAllow, }, + { + Name: "konnectivity_in", + Protocol: capz.SecurityGroupProtocolTCP, + Direction: capz.SecurityRuleDirectionInbound, + Priority: 230, + SourcePorts: ptr.To("*"), + DestinationPorts: ptr.To("8091"), + Source: ptr.To("*"), + Destination: ptr.To("*"), + Action: capz.SecurityRuleActionAllow, + }, } // If we are using Internal publishing, we need a security rule for each CIDR @@ -97,6 +108,19 @@ func GenerateClusterAssets(installConfig *installconfig.InstallConfig, clusterID Action: capz.SecurityRuleActionAllow, }) securityRulePriority += 10 + + securityRules = append(securityRules, capz.SecurityRule{ + Name: fmt.Sprintf("konnectivity_in_ipv4_%02d", i), + Protocol: capz.SecurityGroupProtocolTCP, + Direction: capz.SecurityRuleDirectionInbound, + SourcePorts: ptr.To("*"), + DestinationPorts: ptr.To("8091"), + Source: ptr.To(addressFamilySubnets.GetIPv4Subnets()[i].String()), + Destination: ptr.To("*"), + Priority: securityRulePriority, + Action: 
capz.SecurityRuleActionAllow, + }) + securityRulePriority += 10 } } if addressFamilySubnets.IPv6Count() > 0 && !installConfig.Config.PublicAPI() { @@ -126,6 +150,19 @@ func GenerateClusterAssets(installConfig *installconfig.InstallConfig, clusterID Action: capz.SecurityRuleActionAllow, }) securityRulePriority += 10 + + securityRules = append(securityRules, capz.SecurityRule{ + Name: fmt.Sprintf("konnectivity_in_ipv6_%02d", i), + Protocol: capz.SecurityGroupProtocolTCP, + Direction: capz.SecurityRuleDirectionInbound, + SourcePorts: ptr.To("*"), + DestinationPorts: ptr.To("8091"), + Source: ptr.To(addressFamilySubnets.GetIPv6Subnets()[i].String()), + Destination: ptr.To("*"), + Priority: securityRulePriority, + Action: capz.SecurityRuleActionAllow, + }) + securityRulePriority += 10 } } if len(securityRules) == 0 { diff --git a/pkg/asset/manifests/ibmcloud/securitygroups.go b/pkg/asset/manifests/ibmcloud/securitygroups.go index 24f9a020f2e..93cd0691a75 100644 --- a/pkg/asset/manifests/ibmcloud/securitygroups.go +++ b/pkg/asset/manifests/ibmcloud/securitygroups.go @@ -472,6 +472,8 @@ func buildBootstrapSecurityGroup(infraID string, allSubnets []capibmcloud.Subnet }) } } + clusterWideSGNamePtr := ptr.To(fmt.Sprintf("%s-%s", infraID, clusterWideSGNameSuffix)) + return capibmcloud.VPCSecurityGroup{ Name: bootstrapSGNamePtr, Rules: []*capibmcloud.VPCSecurityGroupRule{ @@ -488,6 +490,24 @@ func buildBootstrapSecurityGroup(infraID string, allSubnets []capibmcloud.Subnet Remotes: remotes, }, }, + { + // Konnectivity + Action: capibmcloud.VPCSecurityGroupRuleActionAllow, + Direction: capibmcloud.VPCSecurityGroupRuleDirectionInbound, + Source: &capibmcloud.VPCSecurityGroupRulePrototype{ + PortRange: &capibmcloud.VPCSecurityGroupPortRange{ + MaximumPort: 8091, + MinimumPort: 8091, + }, + Protocol: capibmcloud.VPCSecurityGroupRuleProtocolTCP, + Remotes: []capibmcloud.VPCSecurityGroupRuleRemote{ + { + RemoteType: capibmcloud.VPCSecurityGroupRuleRemoteTypeSG, + SecurityGroupName: 
clusterWideSGNamePtr, + }, + }, + }, + }, }, } } diff --git a/pkg/asset/manifests/powervs/securitygroups.go b/pkg/asset/manifests/powervs/securitygroups.go index bf474d3f28a..69bd0067e11 100644 --- a/pkg/asset/manifests/powervs/securitygroups.go +++ b/pkg/asset/manifests/powervs/securitygroups.go @@ -50,6 +50,23 @@ func buildControlPlaneSecurityGroup(infraID string) capibmcloud.VPCSecurityGroup }, }, }, + { + // Konnectivity + Action: capibmcloud.VPCSecurityGroupRuleActionAllow, + Direction: capibmcloud.VPCSecurityGroupRuleDirectionInbound, + Source: &capibmcloud.VPCSecurityGroupRulePrototype{ + PortRange: &capibmcloud.VPCSecurityGroupPortRange{ + MaximumPort: 8091, + MinimumPort: 8091, + }, + Protocol: capibmcloud.VPCSecurityGroupRuleProtocolTCP, + Remotes: []capibmcloud.VPCSecurityGroupRuleRemote{ + { + RemoteType: capibmcloud.VPCSecurityGroupRuleRemoteTypeAny, + }, + }, + }, + }, { Action: capibmcloud.VPCSecurityGroupRuleActionAllow, Direction: capibmcloud.VPCSecurityGroupRuleDirectionInbound, diff --git a/pkg/infrastructure/aws/clusterapi/aws.go b/pkg/infrastructure/aws/clusterapi/aws.go index 9be200ca39f..79e1ba51cb3 100644 --- a/pkg/infrastructure/aws/clusterapi/aws.go +++ b/pkg/infrastructure/aws/clusterapi/aws.go @@ -503,34 +503,34 @@ func (p *Provider) DestroyBootstrap(ctx context.Context, in clusterapi.Bootstrap startTime := time.Now() untilTime := startTime.Add(timeout) timezone, _ := untilTime.Zone() - logrus.Debugf("Waiting up to %v (until %v %s) for bootstrap SSH rule to be destroyed...", timeout, untilTime.Format(time.Kitchen), timezone) + logrus.Debugf("Waiting up to %v (until %v %s) for bootstrap rules to be destroyed...", timeout, untilTime.Format(time.Kitchen), timezone) if err := wait.PollUntilContextTimeout(ctx, 15*time.Second, timeout, true, func(ctx context.Context) (bool, error) { - if err := removeSSHRule(ctx, in.Client, in.Metadata.InfraID); err != nil { + if err := removeBootstrapRules(ctx, in.Client, in.Metadata.InfraID); err != nil { // 
If the cluster object has been modified between Get and Update, k8s client will refuse to update it. // In that case, we need to retry. if k8serrors.IsConflict(err) { - logrus.Debugf("AWSCluster update conflict during SSH rule removal: %v", err) + logrus.Debugf("AWSCluster update conflict during bootstrap rule removal: %v", err) return false, nil } - return true, fmt.Errorf("failed to remove bootstrap SSH rule: %w", err) + return true, fmt.Errorf("failed to remove bootstrap rules: %w", err) } - return isSSHRuleGone(ctx, ec2Client, sgID) + return isBootstrapRuleGone(ctx, ec2Client, sgID) }, ); err != nil { if wait.Interrupted(err) { - return fmt.Errorf("bootstrap ssh rule was not removed within %v: %w", timeout, err) + return fmt.Errorf("bootstrap rules were not removed within %v: %w", timeout, err) } - return fmt.Errorf("unable to remove bootstrap ssh rule: %w", err) + return fmt.Errorf("unable to remove bootstrap rules: %w", err) } - logrus.Debugf("Completed removing bootstrap SSH rule after %v", time.Since(startTime)) + logrus.Debugf("Completed removing bootstrap rules after %v", time.Since(startTime)) return nil } -// removeSSHRule removes the SSH rule for accessing the bootstrap node -// by removing the rule from the cluster spec and updating the object. -func removeSSHRule(ctx context.Context, cl k8sClient.Client, infraID string) error { +// removeBootstrapRules removes bootstrap-only rules (SSH and konnectivity) +// by removing them from the cluster spec and updating the object. 
+func removeBootstrapRules(ctx context.Context, cl k8sClient.Client, infraID string) error { awsCluster := &capa.AWSCluster{} key := k8sClient.ObjectKey{ Name: infraID, @@ -545,6 +545,9 @@ func removeSSHRule(ctx context.Context, cl k8sClient.Client, infraID string) err if strings.EqualFold(rule.Description, awsmanifest.BootstrapSSHDescription) { continue } + if strings.EqualFold(rule.Description, awsmanifest.BootstrapKonnectivityDescription) { + continue + } postBootstrapRules = append(postBootstrapRules, rule) } @@ -555,14 +558,15 @@ func removeSSHRule(ctx context.Context, cl k8sClient.Client, infraID string) err if err := cl.Update(ctx, awsCluster); err != nil { return fmt.Errorf("failed to update AWSCluster during bootstrap destroy: %w", err) } - logrus.Debug("Updated AWSCluster to remove bootstrap SSH rule") + logrus.Debug("Updated AWSCluster to remove bootstrap rules") } return nil } -// isSSHRuleGone checks that the Public SSH rule has been removed from the security group. -func isSSHRuleGone(ctx context.Context, client *ec2.Client, sgID string) (bool, error) { +// isBootstrapRuleGone checks that the bootstrap-only rules (SSH and konnectivity) +// have been removed from the security group. +func isBootstrapRuleGone(ctx context.Context, client *ec2.Client, sgID string) (bool, error) { sgs, err := awsconfig.DescribeSecurityGroups(ctx, client, []string{sgID}) if err != nil { return false, fmt.Errorf("error getting security group: %w", err) @@ -578,22 +582,29 @@ func isSSHRuleGone(ctx context.Context, client *ec2.Client, sgID string) (bool, sg := sgs[0] for _, rule := range sg.IpPermissions { - if ptr.Deref(rule.ToPort, 0) != 22 { - continue - } - // Check IPv4 rules - for _, source := range rule.IpRanges { - if source.CidrIp != nil && *source.CidrIp == "0.0.0.0/0" { - ruleDesc := ptr.Deref(source.Description, "[no description]") - logrus.Debugf("Found ingress rule %s with source cidr %s. 
Still waiting for deletion...", ruleDesc, *source.CidrIp) - return false, nil + port := ptr.Deref(rule.ToPort, 0) + switch port { + case 22: + // Check IPv4 rules + for _, source := range rule.IpRanges { + if source.CidrIp != nil && *source.CidrIp == "0.0.0.0/0" { + ruleDesc := ptr.Deref(source.Description, "[no description]") + logrus.Debugf("Found ingress rule %s with source cidr %s. Still waiting for deletion...", ruleDesc, *source.CidrIp) + return false, nil + } } - } - // Check IPv6 rules - for _, source := range rule.Ipv6Ranges { - if source.CidrIpv6 != nil && *source.CidrIpv6 == "::/0" { - ruleDesc := ptr.Deref(source.Description, "[no description]") - logrus.Debugf("Found ingress rule %s with source cidr %s. Still waiting for deletion...", ruleDesc, *source.CidrIpv6) + // Check IPv6 rules + for _, source := range rule.Ipv6Ranges { + if source.CidrIpv6 != nil && *source.CidrIpv6 == "::/0" { + ruleDesc := ptr.Deref(source.Description, "[no description]") + logrus.Debugf("Found ingress rule %s with source cidr %s. Still waiting for deletion...", ruleDesc, *source.CidrIpv6) + return false, nil + } + } + case 8091: + // Check konnectivity rule (uses security group sources) + if len(rule.UserIdGroupPairs) > 0 { + logrus.Debugf("Found konnectivity ingress rule on port 8091. Still waiting for deletion...") return false, nil } } diff --git a/pkg/infrastructure/azure/azure.go b/pkg/infrastructure/azure/azure.go index be6ce23f383..7a1833e64aa 100644 --- a/pkg/infrastructure/azure/azure.go +++ b/pkg/infrastructure/azure/azure.go @@ -693,6 +693,7 @@ func (p *Provider) PostDestroy(ctx context.Context, in clusterapi.PostDestroyerI } securityGroupName := fmt.Sprintf("%s-nsg", in.Metadata.InfraID) sshRuleName := fmt.Sprintf("%s_ssh_in", in.Metadata.InfraID) + konnectivityRuleName := fmt.Sprintf("%s_konnectivity_in", in.Metadata.InfraID) // See if a security group rule exists with the name ${InfraID}_ssh_in. // If it does, this is a private cluster. 
If it does not, this is a @@ -734,6 +735,26 @@ func (p *Provider) PostDestroy(ctx context.Context, in clusterapi.PostDestroyerI return fmt.Errorf("failed to delete inbound nat rule: %w", err) } } + + // Remove konnectivity security group rule used during bootstrap. + _, err = networkClientFactory.NewSecurityRulesClient().Get(ctx, + resourceGroupName, + securityGroupName, + konnectivityRuleName, + nil, + ) + if err == nil { + err = deleteSecurityGroupRule(ctx, &securityGroupInput{ + resourceGroupName: resourceGroupName, + securityGroupName: securityGroupName, + securityRuleName: konnectivityRuleName, + securityRulePort: "8091", + networkClientFactory: networkClientFactory, + }) + if err != nil { + return fmt.Errorf("failed to delete konnectivity security rule: %w", err) + } + } return nil } diff --git a/pkg/infrastructure/gcp/clusterapi/firewallrules.go b/pkg/infrastructure/gcp/clusterapi/firewallrules.go index f5b2952ec33..fa8fcca61bf 100644 --- a/pkg/infrastructure/gcp/clusterapi/firewallrules.go +++ b/pkg/infrastructure/gcp/clusterapi/firewallrules.go @@ -319,7 +319,22 @@ func createBootstrapFirewallRules(ctx context.Context, in clusterapi.InfraReadyI machineCIDR := in.InstallConfig.Config.Networking.MachineNetwork[0].CIDR.String() srcRanges = []string{machineCIDR} } - return addFirewallRule(ctx, svc, firewallName, network, projectID, getBootstrapSSHPorts(), srcTags, targetTags, srcRanges) + if err := addFirewallRule(ctx, svc, firewallName, network, projectID, getBootstrapSSHPorts(), srcTags, targetTags, srcRanges); err != nil { + return err + } + + // Konnectivity is only needed during bootstrap + workerTag := fmt.Sprintf("%s-worker", in.InfraID) + firewallName = fmt.Sprintf("%s-bootstrap-in-konnectivity", in.InfraID) + srcTags = []string{workerTag, bootstrapTag} + targetTags = []string{bootstrapTag} + konnectivityPorts := []*compute.FirewallAllowed{ + { + IPProtocol: "tcp", + Ports: []string{"8091"}, + }, + } + return addFirewallRule(ctx, svc, firewallName, 
network, projectID, konnectivityPorts, srcTags, targetTags, nil) } // removeBootstrapFirewallRules removes the rules created for the bootstrap node. @@ -334,6 +349,11 @@ func removeBootstrapFirewallRules(ctx context.Context, infraID, projectID string } firewallName := fmt.Sprintf("%s-bootstrap-in-ssh", infraID) + if err := deleteFirewallRule(ctx, svc, firewallName, projectID); err != nil { + return err + } + + firewallName = fmt.Sprintf("%s-bootstrap-in-konnectivity", infraID) return deleteFirewallRule(ctx, svc, firewallName, projectID) } diff --git a/pkg/infrastructure/openstack/preprovision/securitygroups.go b/pkg/infrastructure/openstack/preprovision/securitygroups.go index efbf14c0eff..a31d200ca0d 100644 --- a/pkg/infrastructure/openstack/preprovision/securitygroups.go +++ b/pkg/infrastructure/openstack/preprovision/securitygroups.go @@ -138,6 +138,7 @@ func SecurityGroups(ctx context.Context, installConfig *installconfig.InstallCon serviceIKENat = service{udp, 4500, 4500} serviceInternal = service{tcp | udp, 9000, 9999} serviceKCM = service{tcp, 10257, 10257} + serviceKonnectivity = service{tcp, 8091, 8091} serviceKubeScheduler = service{tcp, 10259, 10259} serviceKubelet = service{tcp, 10250, 10250} serviceMCS = service{tcp, 22623, 22623} @@ -234,6 +235,7 @@ func SecurityGroups(ctx context.Context, installConfig *installconfig.InstallCon addMasterRules(serviceDNS, ipVersion, CIDRs) addMasterRules(serviceETCD, ipVersion, CIDRs) addMasterRules(serviceKCM, ipVersion, CIDRs) + addBootstrapRules(serviceKonnectivity, ipVersion, CIDRs) addMasterRules(serviceKubeScheduler, ipVersion, CIDRs) addMasterRules(serviceMCS, ipVersion, CIDRs) addMasterRules(serviceOVNDB, ipVersion, CIDRs) From f3ac64e69ca57e4f3b0a7ed1d6beca830892b6ed Mon Sep 17 00:00:00 2001 From: Patrick Dillon Date: Wed, 25 Mar 2026 15:54:40 -0400 Subject: [PATCH 4/4] bootkube: gate konnectivity This gates the deployment of konnectivity so that it happens only when either the CRDCompatibilityRequirementOperator or
the ClusterAPIMachineManagement feature gate is enabled. Once either feature gate is GA, we can remove the feature gate check entirely. We also need to skip konnectivity when bootstrap-in-place is specified, because in that case there is no separate control plane node for the bootstrap to proxy to. --- .../bootstrap/files/usr/local/bin/bootkube.sh.template | 4 ++++ .../files/usr/local/bin/konnectivity.sh.template | 9 +++++++++ pkg/asset/ignition/bootstrap/common.go | 5 +++++ 3 files changed, 18 insertions(+) diff --git a/data/data/bootstrap/files/usr/local/bin/bootkube.sh.template b/data/data/bootstrap/files/usr/local/bin/bootkube.sh.template index 7191d0bc1a1..5ef66762520 100755 --- a/data/data/bootstrap/files/usr/local/bin/bootkube.sh.template +++ b/data/data/bootstrap/files/usr/local/bin/bootkube.sh.template @@ -274,11 +274,15 @@ then --rendered-manifest-files=/assets/manifests \ --payload-version=$VERSION \ --operand-kubernetes-version="${KUBERNETES_VERSION}" \ +{{- if .KonnectivityEnabled }} --config-override-files=/assets/konnectivity-config-override.yaml +{{- end }} cp kube-apiserver-bootstrap/config /etc/kubernetes/bootstrap-configs/kube-apiserver-config.yaml +{{- if .KonnectivityEnabled }} # Copy egress selector config to bootstrap-configs where KAS can read it cp /opt/openshift/egress-selector-config.yaml /etc/kubernetes/bootstrap-configs/egress-selector-config.yaml +{{- end }} cp kube-apiserver-bootstrap/bootstrap-manifests/* bootstrap-manifests/ cp kube-apiserver-bootstrap/manifests/* manifests/ diff --git a/data/data/bootstrap/files/usr/local/bin/konnectivity.sh.template b/data/data/bootstrap/files/usr/local/bin/konnectivity.sh.template index 244a3c7c7b9..ee2cb9e615b 100644 --- a/data/data/bootstrap/files/usr/local/bin/konnectivity.sh.template +++ b/data/data/bootstrap/files/usr/local/bin/konnectivity.sh.template @@ -5,6 +5,7 @@ # konnectivity_setup detects the bootstrap node IP, generates certificates, # and creates the konnectivity server static pod manifest.
konnectivity_setup() { +{{- if .KonnectivityEnabled }} {{- if .BootstrapNodeIP }} # Use explicitly configured bootstrap node IP export BOOTSTRAP_NODE_IP="{{.BootstrapNodeIP}}" @@ -35,11 +36,14 @@ konnectivity_setup() { touch konnectivity-server-bootstrap.done record_service_stage_success fi +{{- end }} + : } # konnectivity_manifests creates the agent namespace, secret, and daemonset # manifests for cluster deployment. konnectivity_manifests() { +{{- if .KonnectivityEnabled }} if [ ! -f konnectivity-agent-manifest.done ]; then record_service_stage_start "konnectivity-agent-manifest" echo "Creating Konnectivity agent manifests..." @@ -59,11 +63,14 @@ konnectivity_manifests() { touch konnectivity-agent-manifest.done record_service_stage_success fi +{{- end }} + : } # konnectivity_cleanup removes bootstrap konnectivity resources by deleting # the namespace (cascading to DaemonSet and Secret) and the server static pod. konnectivity_cleanup() { +{{- if .KonnectivityEnabled }} if [ ! -f konnectivity-cleanup.done ]; then record_service_stage_start "konnectivity-cleanup" echo "Cleaning up bootstrap konnectivity resources..." 
@@ -74,4 +81,6 @@ konnectivity_cleanup() { touch konnectivity-cleanup.done record_service_stage_success fi +{{- end }} + : } diff --git a/pkg/asset/ignition/bootstrap/common.go b/pkg/asset/ignition/bootstrap/common.go index ef83f40d6f7..68cb117acf1 100644 --- a/pkg/asset/ignition/bootstrap/common.go +++ b/pkg/asset/ignition/bootstrap/common.go @@ -26,6 +26,7 @@ import ( "k8s.io/utils/ptr" configv1 "github.com/openshift/api/config/v1" + "github.com/openshift/api/features" "github.com/openshift/installer/data" "github.com/openshift/installer/pkg/asset" "github.com/openshift/installer/pkg/asset/ignition" @@ -99,6 +100,7 @@ type bootstrapTemplateData struct { Invoker string ClusterDomain string OSImageStream types.OSImageStream + KonnectivityEnabled bool } // platformTemplateData is the data to use to replace values in bootstrap @@ -397,6 +399,8 @@ func (a *Common) getTemplateData(dependencies asset.Parents, bootstrapInPlace bo } pullSecret = merged } + konnectivityFeatureGateEnabled := (installConfig.Config.Enabled(features.FeatureGateCRDCompatibilityRequirementOperator) || + installConfig.Config.Enabled(features.FeatureGateClusterAPIMachineManagement)) return &bootstrapTemplateData{ AdditionalTrustBundle: installConfig.Config.AdditionalTrustBundle, @@ -422,6 +426,7 @@ func (a *Common) getTemplateData(dependencies asset.Parents, bootstrapInPlace bo Invoker: openshiftInstallInvoker, ClusterDomain: installConfig.Config.ClusterDomain(), OSImageStream: installConfig.Config.OSImageStream, + KonnectivityEnabled: konnectivityFeatureGateEnabled && !bootstrapInPlace, } }