diff --git a/data/data/bootstrap/files/opt/openshift/egress-selector-config.yaml b/data/data/bootstrap/files/opt/openshift/egress-selector-config.yaml new file mode 100644 index 00000000000..b9518ebca87 --- /dev/null +++ b/data/data/bootstrap/files/opt/openshift/egress-selector-config.yaml @@ -0,0 +1,15 @@ +apiVersion: apiserver.k8s.io/v1beta1 +kind: EgressSelectorConfiguration +egressSelections: +- name: cluster + connection: + proxyProtocol: GRPC + transport: + uds: + udsName: /etc/kubernetes/config/konnectivity-server.socket +- name: controlplane + connection: + proxyProtocol: Direct +- name: etcd + connection: + proxyProtocol: Direct diff --git a/data/data/bootstrap/files/opt/openshift/konnectivity-agent-certs-secret.yaml b/data/data/bootstrap/files/opt/openshift/konnectivity-agent-certs-secret.yaml new file mode 100644 index 00000000000..4fe0d702b5e --- /dev/null +++ b/data/data/bootstrap/files/opt/openshift/konnectivity-agent-certs-secret.yaml @@ -0,0 +1,13 @@ +apiVersion: v1 +kind: Secret +metadata: + name: konnectivity-agent-certs + namespace: openshift-bootstrap-konnectivity + labels: + app: konnectivity-agent + openshift.io/bootstrap-only: "true" +type: Opaque +data: + tls.crt: ${KONNECTIVITY_AGENT_CERT_BASE64} + tls.key: ${KONNECTIVITY_AGENT_KEY_BASE64} + ca.crt: ${KONNECTIVITY_CA_CERT_BASE64} diff --git a/data/data/bootstrap/files/opt/openshift/konnectivity-agent-daemonset.yaml b/data/data/bootstrap/files/opt/openshift/konnectivity-agent-daemonset.yaml new file mode 100644 index 00000000000..10a9f19464e --- /dev/null +++ b/data/data/bootstrap/files/opt/openshift/konnectivity-agent-daemonset.yaml @@ -0,0 +1,58 @@ +apiVersion: apps/v1 +kind: DaemonSet +metadata: + name: konnectivity-agent + namespace: openshift-bootstrap-konnectivity + labels: + app: konnectivity-agent + openshift.io/bootstrap-only: "true" +spec: + selector: + matchLabels: + app: konnectivity-agent + template: + metadata: + labels: + app: konnectivity-agent + spec: + hostNetwork: true + 
dnsPolicy: Default + priorityClassName: system-node-critical + tolerations: + - operator: Exists + containers: + - name: konnectivity-agent + image: ${KONNECTIVITY_IMAGE} + command: + - /usr/bin/proxy-agent + args: + - --logtostderr=true + - --ca-cert=/etc/konnectivity/ca.crt + - --agent-cert=/etc/konnectivity/tls.crt + - --agent-key=/etc/konnectivity/tls.key + - --proxy-server-host=${BOOTSTRAP_NODE_IP} + - --proxy-server-port=8091 + - --health-server-port=2041 + - --agent-identifiers=default-route=true + - --keepalive-time=30s + - --probe-interval=5s + - --sync-interval=5s + - --sync-interval-cap=30s + livenessProbe: + httpGet: + path: /healthz + port: 2041 + initialDelaySeconds: 10 + periodSeconds: 10 + resources: + requests: + cpu: 40m + memory: 50Mi + volumeMounts: + - name: konnectivity-certs + mountPath: /etc/konnectivity + readOnly: true + volumes: + - name: konnectivity-certs + secret: + secretName: konnectivity-agent-certs diff --git a/data/data/bootstrap/files/opt/openshift/konnectivity-config-override.yaml b/data/data/bootstrap/files/opt/openshift/konnectivity-config-override.yaml new file mode 100644 index 00000000000..034779e03f1 --- /dev/null +++ b/data/data/bootstrap/files/opt/openshift/konnectivity-config-override.yaml @@ -0,0 +1,5 @@ +apiVersion: kubecontrolplane.config.openshift.io/v1 +kind: KubeAPIServerConfig +apiServerArguments: + egress-selector-config-file: + - "/etc/kubernetes/config/egress-selector-config.yaml" diff --git a/data/data/bootstrap/files/opt/openshift/konnectivity-namespace.yaml b/data/data/bootstrap/files/opt/openshift/konnectivity-namespace.yaml new file mode 100644 index 00000000000..cc668ac3364 --- /dev/null +++ b/data/data/bootstrap/files/opt/openshift/konnectivity-namespace.yaml @@ -0,0 +1,6 @@ +apiVersion: v1 +kind: Namespace +metadata: + name: openshift-bootstrap-konnectivity + labels: + openshift.io/bootstrap-only: "true" diff --git a/data/data/bootstrap/files/opt/openshift/konnectivity-server-pod.yaml 
b/data/data/bootstrap/files/opt/openshift/konnectivity-server-pod.yaml new file mode 100644 index 00000000000..ffcba0e9732 --- /dev/null +++ b/data/data/bootstrap/files/opt/openshift/konnectivity-server-pod.yaml @@ -0,0 +1,53 @@ +apiVersion: v1 +kind: Pod +metadata: + name: konnectivity-server + namespace: kube-system + labels: + app: konnectivity-server +spec: + hostNetwork: true + priorityClassName: system-node-critical + containers: + - name: konnectivity-server + image: ${KONNECTIVITY_IMAGE} + command: + - /usr/bin/proxy-server + args: + - --logtostderr=true + - --cluster-cert=/etc/konnectivity/server.crt + - --cluster-key=/etc/konnectivity/server.key + - --cluster-ca-cert=/etc/konnectivity/ca.crt + - --uds-name=/etc/kubernetes/bootstrap-configs/konnectivity-server.socket + - --server-port=0 + - --agent-port=8091 + - --health-port=2041 + - --mode=grpc + - --proxy-strategies=destHost,defaultRoute + - --keepalive-time=30s + - --frontend-keepalive-time=30s + livenessProbe: + httpGet: + path: /healthz + port: 2041 + initialDelaySeconds: 10 + periodSeconds: 10 + resources: + requests: + cpu: 40m + memory: 50Mi + volumeMounts: + - name: config-dir + mountPath: /etc/kubernetes/bootstrap-configs + - name: konnectivity-certs + mountPath: /etc/konnectivity + readOnly: true + volumes: + - name: config-dir + hostPath: + path: /etc/kubernetes/bootstrap-configs + type: DirectoryOrCreate + - name: konnectivity-certs + hostPath: + path: /opt/openshift/tls/konnectivity + type: Directory diff --git a/data/data/bootstrap/files/usr/local/bin/bootkube.sh.template b/data/data/bootstrap/files/usr/local/bin/bootkube.sh.template index 6aa5d7e253d..5ef66762520 100755 --- a/data/data/bootstrap/files/usr/local/bin/bootkube.sh.template +++ b/data/data/bootstrap/files/usr/local/bin/bootkube.sh.template @@ -10,6 +10,8 @@ set -euoE pipefail ## -E option will cause functions to inherit trap . /usr/local/bin/bootstrap-cluster-gather.sh # shellcheck source=bootstrap-verify-api-server-urls.sh . 
/usr/local/bin/bootstrap-verify-api-server-urls.sh +# shellcheck source=konnectivity.sh.template +. /usr/local/bin/konnectivity.sh mkdir --parents /etc/kubernetes/{manifests,bootstrap-configs,bootstrap-manifests} @@ -245,6 +247,8 @@ then record_service_stage_success fi +konnectivity_setup + if [ ! -f kube-apiserver-bootstrap.done ] then record_service_stage_start "kube-apiserver-bootstrap" @@ -269,9 +273,16 @@ then --infra-config-file=/assets/manifests/cluster-infrastructure-02-config.yml \ --rendered-manifest-files=/assets/manifests \ --payload-version=$VERSION \ - --operand-kubernetes-version="${KUBERNETES_VERSION}" + --operand-kubernetes-version="${KUBERNETES_VERSION}" \ +{{- if .KonnectivityEnabled }} + --config-override-files=/assets/konnectivity-config-override.yaml +{{- end }} cp kube-apiserver-bootstrap/config /etc/kubernetes/bootstrap-configs/kube-apiserver-config.yaml +{{- if .KonnectivityEnabled }} + # Copy egress selector config to bootstrap-configs where KAS can read it + cp /opt/openshift/egress-selector-config.yaml /etc/kubernetes/bootstrap-configs/egress-selector-config.yaml +{{- end }} cp kube-apiserver-bootstrap/bootstrap-manifests/* bootstrap-manifests/ cp kube-apiserver-bootstrap/manifests/* manifests/ @@ -566,6 +577,8 @@ then record_service_stage_success fi +konnectivity_manifests + REQUIRED_PODS="openshift-kube-apiserver/kube-apiserver,openshift-kube-scheduler/openshift-kube-scheduler,openshift-kube-controller-manager/kube-controller-manager,openshift-cluster-version/cluster-version-operator" if [ "$BOOTSTRAP_INPLACE" = true ] then @@ -651,6 +664,8 @@ if [ ! 
-f api-int-dns-check.done ]; then fi fi +konnectivity_cleanup + # Workaround for https://github.com/opencontainers/runc/pull/1807 touch /opt/openshift/.bootkube.done echo "bootkube.service complete" diff --git a/data/data/bootstrap/files/usr/local/bin/konnectivity-certs.sh b/data/data/bootstrap/files/usr/local/bin/konnectivity-certs.sh new file mode 100644 index 00000000000..e71a299174e --- /dev/null +++ b/data/data/bootstrap/files/usr/local/bin/konnectivity-certs.sh @@ -0,0 +1,55 @@ +#!/usr/bin/env bash +set -euo pipefail + +# Generate Konnectivity certificates with a self-signed CA (1-day validity). +# These are needed for mTLS between the Konnectivity server and agents +# during the bootstrap phase. +# +# Usage: konnectivity-certs.sh BOOTSTRAP_NODE_IP + +BOOTSTRAP_NODE_IP="${1:?Usage: konnectivity-certs.sh }" + +KONNECTIVITY_CERT_DIR=/opt/openshift/tls/konnectivity +mkdir -p "${KONNECTIVITY_CERT_DIR}" + +echo "Generating Konnectivity certificates in ${KONNECTIVITY_CERT_DIR}..." + +# Generate self-signed Konnectivity CA +openssl req -x509 -newkey rsa:2048 -nodes \ + -keyout "${KONNECTIVITY_CERT_DIR}/ca.key" \ + -out "${KONNECTIVITY_CERT_DIR}/ca.crt" \ + -days 1 \ + -subj "/CN=konnectivity-signer/O=openshift" + +# Server certificate for agent endpoint (needs bootstrap IP as SAN) +openssl req -new -newkey rsa:2048 -nodes \ + -keyout "${KONNECTIVITY_CERT_DIR}/server.key" \ + -out "${KONNECTIVITY_CERT_DIR}/server.csr" \ + -subj "/CN=konnectivity-server/O=openshift" + +openssl x509 -req -in "${KONNECTIVITY_CERT_DIR}/server.csr" \ + -CA "${KONNECTIVITY_CERT_DIR}/ca.crt" \ + -CAkey "${KONNECTIVITY_CERT_DIR}/ca.key" \ + -CAcreateserial \ + -out "${KONNECTIVITY_CERT_DIR}/server.crt" \ + -days 1 \ + -extfile <(printf "extendedKeyUsage=serverAuth\nsubjectAltName=IP:%s" "${BOOTSTRAP_NODE_IP}") + +# Agent client certificate (shared by all agents) +openssl req -new -newkey rsa:2048 -nodes \ + -keyout "${KONNECTIVITY_CERT_DIR}/agent.key" \ + -out "${KONNECTIVITY_CERT_DIR}/agent.csr" \ + -subj
"/CN=konnectivity-agent/O=openshift" + +openssl x509 -req -in "${KONNECTIVITY_CERT_DIR}/agent.csr" \ + -CA "${KONNECTIVITY_CERT_DIR}/ca.crt" \ + -CAkey "${KONNECTIVITY_CERT_DIR}/ca.key" \ + -CAcreateserial \ + -out "${KONNECTIVITY_CERT_DIR}/agent.crt" \ + -days 1 \ + -extfile <(printf "extendedKeyUsage=clientAuth") + +# Clean up CSR files +rm -f "${KONNECTIVITY_CERT_DIR}"/*.csr + +echo "Konnectivity certificates generated successfully." diff --git a/data/data/bootstrap/files/usr/local/bin/konnectivity.sh.template b/data/data/bootstrap/files/usr/local/bin/konnectivity.sh.template new file mode 100644 index 00000000000..ee2cb9e615b --- /dev/null +++ b/data/data/bootstrap/files/usr/local/bin/konnectivity.sh.template @@ -0,0 +1,86 @@ +#!/usr/bin/env bash +# Konnectivity bootstrap functions. +# Sourced by bootkube.sh — do not execute directly. + +# konnectivity_setup detects the bootstrap node IP, generates certificates, +# and creates the konnectivity server static pod manifest. +konnectivity_setup() { +{{- if .KonnectivityEnabled }} +{{- if .BootstrapNodeIP }} + # Use explicitly configured bootstrap node IP + export BOOTSTRAP_NODE_IP="{{.BootstrapNodeIP}}" + echo "Using configured bootstrap node IP: ${BOOTSTRAP_NODE_IP}" +{{- else }} + # Detect bootstrap node IP at runtime using the default route source address. + # Konnectivity agents use this to connect back to the bootstrap server. + {{- if .UseIPv6ForNodeIP }} + export BOOTSTRAP_NODE_IP=$(ip -6 -j route get 2001:4860:4860::8888 | jq -r '.[0].prefsrc') + {{- else }} + export BOOTSTRAP_NODE_IP=$(ip -j route get 1.1.1.1 | jq -r '.[0].prefsrc') + {{- end }} + echo "Detected bootstrap node IP: ${BOOTSTRAP_NODE_IP}" +{{- end }} + + if [ ! -f konnectivity-certs.done ]; then + record_service_stage_start "konnectivity-certs" + /usr/local/bin/konnectivity-certs.sh "${BOOTSTRAP_NODE_IP}" + touch konnectivity-certs.done + record_service_stage_success + fi + + if [ ! 
-f konnectivity-server-bootstrap.done ]; then + record_service_stage_start "konnectivity-server-bootstrap" + echo "Creating Konnectivity server static pod manifest..." + export KONNECTIVITY_IMAGE=$(image_for apiserver-network-proxy) + envsubst < /opt/openshift/konnectivity-server-pod.yaml > /etc/kubernetes/manifests/konnectivity-server-pod.yaml + touch konnectivity-server-bootstrap.done + record_service_stage_success + fi +{{- end }} + : +} + +# konnectivity_manifests creates the agent namespace, secret, and daemonset +# manifests for cluster deployment. +konnectivity_manifests() { +{{- if .KonnectivityEnabled }} + if [ ! -f konnectivity-agent-manifest.done ]; then + record_service_stage_start "konnectivity-agent-manifest" + echo "Creating Konnectivity agent manifests..." + + export KONNECTIVITY_IMAGE=$(image_for apiserver-network-proxy) + KONNECTIVITY_CERT_DIR=/opt/openshift/tls/konnectivity + + cp /opt/openshift/konnectivity-namespace.yaml manifests/konnectivity-namespace.yaml + + export KONNECTIVITY_AGENT_CERT_BASE64=$(base64 -w0 "${KONNECTIVITY_CERT_DIR}/agent.crt") + export KONNECTIVITY_AGENT_KEY_BASE64=$(base64 -w0 "${KONNECTIVITY_CERT_DIR}/agent.key") + export KONNECTIVITY_CA_CERT_BASE64=$(base64 -w0 "${KONNECTIVITY_CERT_DIR}/ca.crt") + envsubst < /opt/openshift/konnectivity-agent-certs-secret.yaml > manifests/konnectivity-agent-certs.yaml + + envsubst < /opt/openshift/konnectivity-agent-daemonset.yaml > manifests/konnectivity-agent-daemonset.yaml + + touch konnectivity-agent-manifest.done + record_service_stage_success + fi +{{- end }} + : +} + +# konnectivity_cleanup removes bootstrap konnectivity resources by deleting +# the namespace (cascading to DaemonSet and Secret) and the server static pod. +konnectivity_cleanup() { +{{- if .KonnectivityEnabled }} + if [ ! -f konnectivity-cleanup.done ]; then + record_service_stage_start "konnectivity-cleanup" + echo "Cleaning up bootstrap konnectivity resources..." 
+ oc delete namespace openshift-bootstrap-konnectivity \ + --kubeconfig=/opt/openshift/auth/kubeconfig \ + --ignore-not-found=true + rm -f /etc/kubernetes/manifests/konnectivity-server-pod.yaml + touch konnectivity-cleanup.done + record_service_stage_success + fi +{{- end }} + : +} diff --git a/pkg/asset/ignition/bootstrap/common.go b/pkg/asset/ignition/bootstrap/common.go index ef83f40d6f7..68cb117acf1 100644 --- a/pkg/asset/ignition/bootstrap/common.go +++ b/pkg/asset/ignition/bootstrap/common.go @@ -26,6 +26,7 @@ import ( "k8s.io/utils/ptr" configv1 "github.com/openshift/api/config/v1" + "github.com/openshift/api/features" "github.com/openshift/installer/data" "github.com/openshift/installer/pkg/asset" "github.com/openshift/installer/pkg/asset/ignition" @@ -99,6 +100,7 @@ type bootstrapTemplateData struct { Invoker string ClusterDomain string OSImageStream types.OSImageStream + KonnectivityEnabled bool } // platformTemplateData is the data to use to replace values in bootstrap @@ -397,6 +399,8 @@ func (a *Common) getTemplateData(dependencies asset.Parents, bootstrapInPlace bo } pullSecret = merged } + konnectivityFeatureGateEnabled := (installConfig.Config.Enabled(features.FeatureGateCRDCompatibilityRequirementOperator) || + installConfig.Config.Enabled(features.FeatureGateClusterAPIMachineManagement)) return &bootstrapTemplateData{ AdditionalTrustBundle: installConfig.Config.AdditionalTrustBundle, @@ -422,6 +426,7 @@ func (a *Common) getTemplateData(dependencies asset.Parents, bootstrapInPlace bo Invoker: openshiftInstallInvoker, ClusterDomain: installConfig.Config.ClusterDomain(), OSImageStream: installConfig.Config.OSImageStream, + KonnectivityEnabled: konnectivityFeatureGateEnabled && !bootstrapInPlace, } } diff --git a/pkg/asset/manifests/aws/cluster.go b/pkg/asset/manifests/aws/cluster.go index a7edba6fae4..4c13784efbb 100644 --- a/pkg/asset/manifests/aws/cluster.go +++ b/pkg/asset/manifests/aws/cluster.go @@ -18,10 +18,12 @@ import ( 
"github.com/openshift/installer/pkg/types/network" ) -// BootstrapSSHDescription is the description for the -// ingress rule that provides SSH access to the bootstrap node -// & identifies the rule for removal during bootstrap destroy. -const BootstrapSSHDescription = "Bootstrap SSH Access" +// Bootstrap ingress rule descriptions identify rules for removal +// during bootstrap destroy. +const ( + BootstrapSSHDescription = "Bootstrap SSH Access" + BootstrapKonnectivityDescription = "Bootstrap Konnectivity" +) // GenerateClusterAssets generates the manifests for the cluster-api. func GenerateClusterAssets(ic *installconfig.InstallConfig, clusterID *installconfig.ClusterID) (*capiutils.GenerateClusterAssetsOutput, error) { @@ -145,6 +147,13 @@ func GenerateClusterAssets(ic *installconfig.InstallConfig, clusterID *installco ToPort: 10259, SourceSecurityGroupRoles: []capa.SecurityGroupRole{"controlplane", "node"}, }, + { + Description: BootstrapKonnectivityDescription, + Protocol: capa.SecurityGroupProtocolTCP, + FromPort: 8091, + ToPort: 8091, + SourceSecurityGroupRoles: []capa.SecurityGroupRole{"controlplane", "node"}, + }, { Description: BootstrapSSHDescription, Protocol: capa.SecurityGroupProtocolTCP, diff --git a/pkg/asset/manifests/azure/cluster.go b/pkg/asset/manifests/azure/cluster.go index 2a9d4e264ac..897ec6b05f3 100644 --- a/pkg/asset/manifests/azure/cluster.go +++ b/pkg/asset/manifests/azure/cluster.go @@ -65,6 +65,17 @@ func GenerateClusterAssets(installConfig *installconfig.InstallConfig, clusterID Destination: ptr.To("*"), Action: capz.SecurityRuleActionAllow, }, + { + Name: "konnectivity_in", + Protocol: capz.SecurityGroupProtocolTCP, + Direction: capz.SecurityRuleDirectionInbound, + Priority: 230, + SourcePorts: ptr.To("*"), + DestinationPorts: ptr.To("8091"), + Source: ptr.To("*"), + Destination: ptr.To("*"), + Action: capz.SecurityRuleActionAllow, + }, } // If we are using Internal publishing, we need a security rule for each CIDR @@ -97,6 +108,19 @@ 
func GenerateClusterAssets(installConfig *installconfig.InstallConfig, clusterID Action: capz.SecurityRuleActionAllow, }) securityRulePriority += 10 + + securityRules = append(securityRules, capz.SecurityRule{ + Name: fmt.Sprintf("konnectivity_in_ipv4_%02d", i), + Protocol: capz.SecurityGroupProtocolTCP, + Direction: capz.SecurityRuleDirectionInbound, + SourcePorts: ptr.To("*"), + DestinationPorts: ptr.To("8091"), + Source: ptr.To(addressFamilySubnets.GetIPv4Subnets()[i].String()), + Destination: ptr.To("*"), + Priority: securityRulePriority, + Action: capz.SecurityRuleActionAllow, + }) + securityRulePriority += 10 } } if addressFamilySubnets.IPv6Count() > 0 && !installConfig.Config.PublicAPI() { @@ -126,6 +150,19 @@ func GenerateClusterAssets(installConfig *installconfig.InstallConfig, clusterID Action: capz.SecurityRuleActionAllow, }) securityRulePriority += 10 + + securityRules = append(securityRules, capz.SecurityRule{ + Name: fmt.Sprintf("konnectivity_in_ipv6_%02d", i), + Protocol: capz.SecurityGroupProtocolTCP, + Direction: capz.SecurityRuleDirectionInbound, + SourcePorts: ptr.To("*"), + DestinationPorts: ptr.To("8091"), + Source: ptr.To(addressFamilySubnets.GetIPv6Subnets()[i].String()), + Destination: ptr.To("*"), + Priority: securityRulePriority, + Action: capz.SecurityRuleActionAllow, + }) + securityRulePriority += 10 } } if len(securityRules) == 0 { diff --git a/pkg/asset/manifests/ibmcloud/securitygroups.go b/pkg/asset/manifests/ibmcloud/securitygroups.go index 24f9a020f2e..93cd0691a75 100644 --- a/pkg/asset/manifests/ibmcloud/securitygroups.go +++ b/pkg/asset/manifests/ibmcloud/securitygroups.go @@ -472,6 +472,8 @@ func buildBootstrapSecurityGroup(infraID string, allSubnets []capibmcloud.Subnet }) } } + clusterWideSGNamePtr := ptr.To(fmt.Sprintf("%s-%s", infraID, clusterWideSGNameSuffix)) + return capibmcloud.VPCSecurityGroup{ Name: bootstrapSGNamePtr, Rules: []*capibmcloud.VPCSecurityGroupRule{ @@ -488,6 +490,24 @@ func 
buildBootstrapSecurityGroup(infraID string, allSubnets []capibmcloud.Subnet Remotes: remotes, }, }, + { + // Konnectivity + Action: capibmcloud.VPCSecurityGroupRuleActionAllow, + Direction: capibmcloud.VPCSecurityGroupRuleDirectionInbound, + Source: &capibmcloud.VPCSecurityGroupRulePrototype{ + PortRange: &capibmcloud.VPCSecurityGroupPortRange{ + MaximumPort: 8091, + MinimumPort: 8091, + }, + Protocol: capibmcloud.VPCSecurityGroupRuleProtocolTCP, + Remotes: []capibmcloud.VPCSecurityGroupRuleRemote{ + { + RemoteType: capibmcloud.VPCSecurityGroupRuleRemoteTypeSG, + SecurityGroupName: clusterWideSGNamePtr, + }, + }, + }, + }, }, } } diff --git a/pkg/asset/manifests/powervs/securitygroups.go b/pkg/asset/manifests/powervs/securitygroups.go index bf474d3f28a..69bd0067e11 100644 --- a/pkg/asset/manifests/powervs/securitygroups.go +++ b/pkg/asset/manifests/powervs/securitygroups.go @@ -50,6 +50,23 @@ func buildControlPlaneSecurityGroup(infraID string) capibmcloud.VPCSecurityGroup }, }, }, + { + // Konnectivity + Action: capibmcloud.VPCSecurityGroupRuleActionAllow, + Direction: capibmcloud.VPCSecurityGroupRuleDirectionInbound, + Source: &capibmcloud.VPCSecurityGroupRulePrototype{ + PortRange: &capibmcloud.VPCSecurityGroupPortRange{ + MaximumPort: 8091, + MinimumPort: 8091, + }, + Protocol: capibmcloud.VPCSecurityGroupRuleProtocolTCP, + Remotes: []capibmcloud.VPCSecurityGroupRuleRemote{ + { + RemoteType: capibmcloud.VPCSecurityGroupRuleRemoteTypeAny, + }, + }, + }, + }, { Action: capibmcloud.VPCSecurityGroupRuleActionAllow, Direction: capibmcloud.VPCSecurityGroupRuleDirectionInbound, diff --git a/pkg/gather/service/analyze.go b/pkg/gather/service/analyze.go index 680dc5b6247..7c5e9e062ef 100644 --- a/pkg/gather/service/analyze.go +++ b/pkg/gather/service/analyze.go @@ -114,18 +114,22 @@ func checkReleaseImageDownload(a analysis) bool { return false } -// bootstrap-verify-api-servel-urls.sh is currently running as part of the bootkube service. 
-// And the verification of the API and API-Int URLs are the only stage where a failure is -// currently reported. So, here we are able to conclude that a failure corresponds to a -// failure to resolve either the API URL or API-Int URL or both. If that changes and if -// any other stage in the bootkube service starts reporting a failure, we need to revisit -// this. At that point verification of the URLs could be moved to its own service. func checkBootkubeService(a analysis) bool { if a.successful { return true } - // Note: Even when there is a stage failure, we are not returning false here. That is - // intentional because we donot want to report this as an error in the "analyze" output. + switch a.failingStage { + case "konnectivity-certs": + logrus.Error("The bootstrap machine failed to generate konnectivity certificates") + case "konnectivity-server-bootstrap": + logrus.Error("The bootstrap machine failed to start the konnectivity server") + case "konnectivity-agent-manifest": + logrus.Error("The bootstrap machine failed to create konnectivity agent manifests") + case "konnectivity-cleanup": + logrus.Error("The bootstrap machine failed to clean up konnectivity resources") + default: + logrus.Errorf("The bootkube service failed at stage %q", a.failingStage) + } a.logLastError() return true } diff --git a/pkg/gather/service/analyze_test.go b/pkg/gather/service/analyze_test.go index 4764783dc3a..2fa9156a21f 100644 --- a/pkg/gather/service/analyze_test.go +++ b/pkg/gather/service/analyze_test.go @@ -4,6 +4,7 @@ import ( "archive/tar" "bytes" "compress/gzip" + "fmt" "testing" "github.com/sirupsen/logrus" @@ -37,8 +38,9 @@ func failedReleaseImage() []logrus.Entry { } } -func failedURLChecks() []logrus.Entry { +func failedBootkubeStage(stage string) []logrus.Entry { return []logrus.Entry{ + {Level: logrus.ErrorLevel, Message: fmt.Sprintf("The bootkube service failed at stage %q", stage)}, {Level: logrus.InfoLevel, Message: "Line 1"}, {Level: logrus.InfoLevel, 
Message: "Line 2"}, {Level: logrus.InfoLevel, Message: "Line 3"}, @@ -104,7 +106,7 @@ func TestAnalyzeGatherBundle(t *testing.T) { "log-bundle/log-bundle-bootstrap/bootstrap/services/release-image.json": generateSuccessOutput("pull-release-image"), "log-bundle/bootstrap/services/bootkube.json": generateFailureOutput("check-api-url"), }, - expectedOutput: failedURLChecks(), + expectedOutput: failedBootkubeStage("check-api-url"), }, { name: "API-INT Server URL failed", @@ -112,7 +114,7 @@ func TestAnalyzeGatherBundle(t *testing.T) { "log-bundle/log-bundle-bootstrap/bootstrap/services/release-image.json": generateSuccessOutput("pull-release-image"), "log-bundle/bootstrap/services/bootkube.json": generateFailureOutput("check-api-int-url"), }, - expectedOutput: failedURLChecks(), + expectedOutput: failedBootkubeStage("check-api-int-url"), }, { name: "both release-image and API Server URLs failed", diff --git a/pkg/gather/service/entry.go b/pkg/gather/service/entry.go index 0fda98f1ee1..88c2f2bd1b6 100644 --- a/pkg/gather/service/entry.go +++ b/pkg/gather/service/entry.go @@ -10,7 +10,7 @@ type Entry struct { // present when the phase is an ending phase. Result Result `json:"result,omitempty"` // Stage is the name of the stage being executed. This is only present when the phase is either StageStart or StageEnd. - Stage string `json:"string,omitempty"` + Stage string `json:"stage,omitempty"` // PreCommand is the name of the pre-command being executed. This is only present when the phase is either // PreCommandStart or PreCommandEnd. 
PreCommand string `json:"preCommand,omitempty"` diff --git a/pkg/infrastructure/aws/clusterapi/aws.go b/pkg/infrastructure/aws/clusterapi/aws.go index 9be200ca39f..79e1ba51cb3 100644 --- a/pkg/infrastructure/aws/clusterapi/aws.go +++ b/pkg/infrastructure/aws/clusterapi/aws.go @@ -503,34 +503,34 @@ func (p *Provider) DestroyBootstrap(ctx context.Context, in clusterapi.Bootstrap startTime := time.Now() untilTime := startTime.Add(timeout) timezone, _ := untilTime.Zone() - logrus.Debugf("Waiting up to %v (until %v %s) for bootstrap SSH rule to be destroyed...", timeout, untilTime.Format(time.Kitchen), timezone) + logrus.Debugf("Waiting up to %v (until %v %s) for bootstrap rules to be destroyed...", timeout, untilTime.Format(time.Kitchen), timezone) if err := wait.PollUntilContextTimeout(ctx, 15*time.Second, timeout, true, func(ctx context.Context) (bool, error) { - if err := removeSSHRule(ctx, in.Client, in.Metadata.InfraID); err != nil { + if err := removeBootstrapRules(ctx, in.Client, in.Metadata.InfraID); err != nil { // If the cluster object has been modified between Get and Update, k8s client will refuse to update it. // In that case, we need to retry. 
if k8serrors.IsConflict(err) { - logrus.Debugf("AWSCluster update conflict during SSH rule removal: %v", err) + logrus.Debugf("AWSCluster update conflict during bootstrap rule removal: %v", err) return false, nil } - return true, fmt.Errorf("failed to remove bootstrap SSH rule: %w", err) + return true, fmt.Errorf("failed to remove bootstrap rules: %w", err) } - return isSSHRuleGone(ctx, ec2Client, sgID) + return isBootstrapRuleGone(ctx, ec2Client, sgID) }, ); err != nil { if wait.Interrupted(err) { - return fmt.Errorf("bootstrap ssh rule was not removed within %v: %w", timeout, err) + return fmt.Errorf("bootstrap rules were not removed within %v: %w", timeout, err) } - return fmt.Errorf("unable to remove bootstrap ssh rule: %w", err) + return fmt.Errorf("unable to remove bootstrap rules: %w", err) } - logrus.Debugf("Completed removing bootstrap SSH rule after %v", time.Since(startTime)) + logrus.Debugf("Completed removing bootstrap rules after %v", time.Since(startTime)) return nil } -// removeSSHRule removes the SSH rule for accessing the bootstrap node -// by removing the rule from the cluster spec and updating the object. -func removeSSHRule(ctx context.Context, cl k8sClient.Client, infraID string) error { +// removeBootstrapRules removes bootstrap-only rules (SSH and konnectivity) +// by removing them from the cluster spec and updating the object. 
+func removeBootstrapRules(ctx context.Context, cl k8sClient.Client, infraID string) error { awsCluster := &capa.AWSCluster{} key := k8sClient.ObjectKey{ Name: infraID, @@ -545,6 +545,9 @@ func removeSSHRule(ctx context.Context, cl k8sClient.Client, infraID string) err if strings.EqualFold(rule.Description, awsmanifest.BootstrapSSHDescription) { continue } + if strings.EqualFold(rule.Description, awsmanifest.BootstrapKonnectivityDescription) { + continue + } postBootstrapRules = append(postBootstrapRules, rule) } @@ -555,14 +558,15 @@ func removeSSHRule(ctx context.Context, cl k8sClient.Client, infraID string) err if err := cl.Update(ctx, awsCluster); err != nil { return fmt.Errorf("failed to update AWSCluster during bootstrap destroy: %w", err) } - logrus.Debug("Updated AWSCluster to remove bootstrap SSH rule") + logrus.Debug("Updated AWSCluster to remove bootstrap rules") } return nil } -// isSSHRuleGone checks that the Public SSH rule has been removed from the security group. -func isSSHRuleGone(ctx context.Context, client *ec2.Client, sgID string) (bool, error) { +// isBootstrapRuleGone checks that the bootstrap-only rules (SSH and konnectivity) +// have been removed from the security group. +func isBootstrapRuleGone(ctx context.Context, client *ec2.Client, sgID string) (bool, error) { sgs, err := awsconfig.DescribeSecurityGroups(ctx, client, []string{sgID}) if err != nil { return false, fmt.Errorf("error getting security group: %w", err) @@ -578,22 +582,29 @@ func isSSHRuleGone(ctx context.Context, client *ec2.Client, sgID string) (bool, sg := sgs[0] for _, rule := range sg.IpPermissions { - if ptr.Deref(rule.ToPort, 0) != 22 { - continue - } - // Check IPv4 rules - for _, source := range rule.IpRanges { - if source.CidrIp != nil && *source.CidrIp == "0.0.0.0/0" { - ruleDesc := ptr.Deref(source.Description, "[no description]") - logrus.Debugf("Found ingress rule %s with source cidr %s. 
Still waiting for deletion...", ruleDesc, *source.CidrIp) - return false, nil + port := ptr.Deref(rule.ToPort, 0) + switch port { + case 22: + // Check IPv4 rules + for _, source := range rule.IpRanges { + if source.CidrIp != nil && *source.CidrIp == "0.0.0.0/0" { + ruleDesc := ptr.Deref(source.Description, "[no description]") + logrus.Debugf("Found ingress rule %s with source cidr %s. Still waiting for deletion...", ruleDesc, *source.CidrIp) + return false, nil + } } - } - // Check IPv6 rules - for _, source := range rule.Ipv6Ranges { - if source.CidrIpv6 != nil && *source.CidrIpv6 == "::/0" { - ruleDesc := ptr.Deref(source.Description, "[no description]") - logrus.Debugf("Found ingress rule %s with source cidr %s. Still waiting for deletion...", ruleDesc, *source.CidrIpv6) + // Check IPv6 rules + for _, source := range rule.Ipv6Ranges { + if source.CidrIpv6 != nil && *source.CidrIpv6 == "::/0" { + ruleDesc := ptr.Deref(source.Description, "[no description]") + logrus.Debugf("Found ingress rule %s with source cidr %s. Still waiting for deletion...", ruleDesc, *source.CidrIpv6) + return false, nil + } + } + case 8091: + // Check konnectivity rule (uses security group sources) + if len(rule.UserIdGroupPairs) > 0 { + logrus.Debugf("Found konnectivity ingress rule on port 8091. Still waiting for deletion...") return false, nil } } diff --git a/pkg/infrastructure/azure/azure.go b/pkg/infrastructure/azure/azure.go index be6ce23f383..7a1833e64aa 100644 --- a/pkg/infrastructure/azure/azure.go +++ b/pkg/infrastructure/azure/azure.go @@ -693,6 +693,7 @@ func (p *Provider) PostDestroy(ctx context.Context, in clusterapi.PostDestroyerI } securityGroupName := fmt.Sprintf("%s-nsg", in.Metadata.InfraID) sshRuleName := fmt.Sprintf("%s_ssh_in", in.Metadata.InfraID) + konnectivityRuleName := fmt.Sprintf("%s_konnectivity_in", in.Metadata.InfraID) // See if a security group rule exists with the name ${InfraID}_ssh_in. // If it does, this is a private cluster. 
If it does not, this is a @@ -734,6 +735,26 @@ func (p *Provider) PostDestroy(ctx context.Context, in clusterapi.PostDestroyerI return fmt.Errorf("failed to delete inbound nat rule: %w", err) } } + + // Remove konnectivity security group rule used during bootstrap. + _, err = networkClientFactory.NewSecurityRulesClient().Get(ctx, + resourceGroupName, + securityGroupName, + konnectivityRuleName, + nil, + ) + if err == nil { + err = deleteSecurityGroupRule(ctx, &securityGroupInput{ + resourceGroupName: resourceGroupName, + securityGroupName: securityGroupName, + securityRuleName: konnectivityRuleName, + securityRulePort: "8091", + networkClientFactory: networkClientFactory, + }) + if err != nil { + return fmt.Errorf("failed to delete konnectivity security rule: %w", err) + } + } return nil } diff --git a/pkg/infrastructure/gcp/clusterapi/firewallrules.go b/pkg/infrastructure/gcp/clusterapi/firewallrules.go index f5b2952ec33..fa8fcca61bf 100644 --- a/pkg/infrastructure/gcp/clusterapi/firewallrules.go +++ b/pkg/infrastructure/gcp/clusterapi/firewallrules.go @@ -319,7 +319,22 @@ func createBootstrapFirewallRules(ctx context.Context, in clusterapi.InfraReadyI machineCIDR := in.InstallConfig.Config.Networking.MachineNetwork[0].CIDR.String() srcRanges = []string{machineCIDR} } - return addFirewallRule(ctx, svc, firewallName, network, projectID, getBootstrapSSHPorts(), srcTags, targetTags, srcRanges) + if err := addFirewallRule(ctx, svc, firewallName, network, projectID, getBootstrapSSHPorts(), srcTags, targetTags, srcRanges); err != nil { + return err + } + + // Konnectivity is only needed during bootstrap + workerTag := fmt.Sprintf("%s-worker", in.InfraID) + firewallName = fmt.Sprintf("%s-bootstrap-in-konnectivity", in.InfraID) + srcTags = []string{workerTag, bootstrapTag} + targetTags = []string{bootstrapTag} + konnectivityPorts := []*compute.FirewallAllowed{ + { + IPProtocol: "tcp", + Ports: []string{"8091"}, + }, + } + return addFirewallRule(ctx, svc, firewallName, 
network, projectID, konnectivityPorts, srcTags, targetTags, nil) } // removeBootstrapFirewallRules removes the rules created for the bootstrap node. @@ -334,6 +349,11 @@ func removeBootstrapFirewallRules(ctx context.Context, infraID, projectID string } firewallName := fmt.Sprintf("%s-bootstrap-in-ssh", infraID) + if err := deleteFirewallRule(ctx, svc, firewallName, projectID); err != nil { + return err + } + + firewallName = fmt.Sprintf("%s-bootstrap-in-konnectivity", infraID) return deleteFirewallRule(ctx, svc, firewallName, projectID) } diff --git a/pkg/infrastructure/openstack/preprovision/securitygroups.go b/pkg/infrastructure/openstack/preprovision/securitygroups.go index efbf14c0eff..a31d200ca0d 100644 --- a/pkg/infrastructure/openstack/preprovision/securitygroups.go +++ b/pkg/infrastructure/openstack/preprovision/securitygroups.go @@ -138,6 +138,7 @@ func SecurityGroups(ctx context.Context, installConfig *installconfig.InstallCon serviceIKENat = service{udp, 4500, 4500} serviceInternal = service{tcp | udp, 9000, 9999} serviceKCM = service{tcp, 10257, 10257} + serviceKonnectivity = service{tcp, 8091, 8091} serviceKubeScheduler = service{tcp, 10259, 10259} serviceKubelet = service{tcp, 10250, 10250} serviceMCS = service{tcp, 22623, 22623} @@ -234,6 +235,7 @@ func SecurityGroups(ctx context.Context, installConfig *installconfig.InstallCon addMasterRules(serviceDNS, ipVersion, CIDRs) addMasterRules(serviceETCD, ipVersion, CIDRs) addMasterRules(serviceKCM, ipVersion, CIDRs) + addBootstrapRules(serviceKonnectivity, ipVersion, CIDRs) addMasterRules(serviceKubeScheduler, ipVersion, CIDRs) addMasterRules(serviceMCS, ipVersion, CIDRs) addMasterRules(serviceOVNDB, ipVersion, CIDRs)