diff --git a/jobs/competitive-test.yml b/jobs/competitive-test.yml index 4f2c6a08f2..f97f937d63 100644 --- a/jobs/competitive-test.yml +++ b/jobs/competitive-test.yml @@ -48,6 +48,9 @@ parameters: - name: ssh_key_enabled type: boolean default: true +- name: skip_publish + type: boolean + default: false jobs: - job: ${{ parameters.cloud }} @@ -89,14 +92,15 @@ jobs: engine: ${{ parameters.engine }} regions: ${{ parameters.regions }} engine_input: ${{ parameters.engine_input }} - - template: /steps/publish-results.yml - parameters: - cloud: ${{ parameters.cloud }} - topology: ${{ parameters.topology }} - engine: ${{ parameters.engine }} - regions: ${{ parameters.regions }} - engine_input: ${{ parameters.engine_input }} - credential_type: ${{ parameters.credential_type }} + - ${{ if not(parameters.skip_publish) }}: + - template: /steps/publish-results.yml + parameters: + cloud: ${{ parameters.cloud }} + topology: ${{ parameters.topology }} + engine: ${{ parameters.engine }} + regions: ${{ parameters.regions }} + engine_input: ${{ parameters.engine_input }} + credential_type: ${{ parameters.credential_type }} - template: /steps/cleanup-resources.yml parameters: cloud: ${{ parameters.cloud }} diff --git a/modules/python/clusterloader2/clustermesh-scale/config/config.yaml b/modules/python/clusterloader2/clustermesh-scale/config/config.yaml new file mode 100644 index 0000000000..6eace02220 --- /dev/null +++ b/modules/python/clusterloader2/clustermesh-scale/config/config.yaml @@ -0,0 +1,105 @@ +name: clustermesh-scale-test + +# Workload: deploy a small fixed number of pods on this cluster (no churn, +# no traffic). Measurement modules under modules/measurements/ run the actual +# scale-test instrumentation (cilium agent/operator CPU+memory, kube-apiserver +# health, mesh-specific PromQL) so each per-cluster JSONL row carries the data +# needed for cross-cluster comparison in Kusto. 
The workload is deliberately +# trivial — fan-out, attribution, and metric coverage are what we're testing +# in Phase 1; richer workloads land per scenario in Phase 2+. + +{{$namespaces := DefaultParam .CL2_NAMESPACES 1}} +{{$deploymentsPerNamespace := DefaultParam .CL2_DEPLOYMENTS_PER_NAMESPACE 2}} +{{$replicasPerDeployment := DefaultParam .CL2_REPLICAS_PER_DEPLOYMENT 2}} +{{$operationTimeout := DefaultParam .CL2_OPERATION_TIMEOUT "15m"}} +{{$apiServerCallsPerSecond := DefaultParam .CL2_API_SERVER_CALLS_PER_SECOND 5}} + +namespace: + number: {{$namespaces}} + prefix: clustermesh-scale + deleteStaleNamespaces: true + deleteAutomanagedNamespaces: true + enableExistingNamespaces: false + deleteNamespaceTimeout: 20m + +tuningSets: + - name: Sequence + parallelismLimitedLoad: + parallelismLimit: 1 + - name: DeploymentCreateQps + qpsLoad: + qps: {{$apiServerCallsPerSecond}} + +steps: + # ----- Start measurements ----- + # control-plane.yaml owns PodStartupLatency + APIResponsivenessPrometheus + + # apiserver CPU/mem queries; cilium.yaml owns cilium-agent + cilium-operator + # CPU/mem; clustermesh-metrics.yaml owns mesh-specific PromQL (remote-cluster + # connectivity, kvstore event rate, identity count, etc.). All three are + # gathered later (see "Gather measurements" below), after the workload is + # created but before it is deleted, so teardown is outside the window. 
+ - module: + path: /modules/measurements/control-plane.yaml + params: + action: start + group: clustermesh-scale-test + + - module: + path: /modules/measurements/cilium.yaml + params: + action: start + + - module: + path: /modules/measurements/clustermesh-metrics.yaml + params: + action: start + + - module: + path: /modules/clustermesh.yaml + params: + actionName: create + tuningSet: DeploymentCreateQps + + - module: + path: /modules/scale-test.yaml + params: + actionName: create + namespaces: {{$namespaces}} + deploymentsPerNamespace: {{$deploymentsPerNamespace}} + replicasPerDeployment: {{$replicasPerDeployment}} + tuningSet: DeploymentCreateQps + operationTimeout: {{$operationTimeout}} + + # ----- Gather measurements ----- + # Mirror the start block above. Order matches network-scale convention. + - module: + path: /modules/measurements/control-plane.yaml + params: + action: gather + group: clustermesh-scale-test + + - module: + path: /modules/measurements/cilium.yaml + params: + action: gather + + - module: + path: /modules/measurements/clustermesh-metrics.yaml + params: + action: gather + + - module: + path: /modules/scale-test.yaml + params: + actionName: delete + namespaces: {{$namespaces}} + deploymentsPerNamespace: {{$deploymentsPerNamespace}} + replicasPerDeployment: {{$replicasPerDeployment}} + tuningSet: DeploymentCreateQps + operationTimeout: {{$operationTimeout}} + + - module: + path: /modules/clustermesh.yaml + params: + actionName: delete + tuningSet: DeploymentCreateQps diff --git a/modules/python/clusterloader2/clustermesh-scale/config/event-throughput.yaml b/modules/python/clusterloader2/clustermesh-scale/config/event-throughput.yaml new file mode 100644 index 0000000000..439fdc4e71 --- /dev/null +++ b/modules/python/clusterloader2/clustermesh-scale/config/event-throughput.yaml @@ -0,0 +1,166 @@ +name: clustermesh-event-throughput + +# Scale scenario #1: Cross-Cluster Event Throughput. 
+# +# Goal (scale testing.txt line 42-54): determine max sustainable and burst +# event rates for endpoints, services, and identities propagating across +# the mesh; measure events/sec processed and time-to-convergence proxy. +# +# Sequence (every cluster runs this in parallel; CL2 fan-out lives in +# steps/engine/.../execute.yml): +# +# 1. Start measurements (control-plane, cilium, clustermesh-metrics + +# scenario-specific clustermesh-throughput + etcd-metrics). +# 2. Deploy PodMonitor scraping clustermesh-apiserver. +# 3. Create N pods + N global Services per cluster at a controlled QPS. +# 4. Warmup sleep — let initial create-flurry settle into steady state. +# 5. Burst rolling-restart of every Deployment (closes the "burst" +# coverage gap from scale testing.txt line 52). +# 6. Settle sleep — let kvstore queues drain and propagation latency +# histograms accumulate steady-state samples. +# 7. Gather all measurements. +# 8. Tear down the workload + PodMonitor. + +{{$namespaces := DefaultParam .CL2_NAMESPACES 5}} +{{$deploymentsPerNamespace := DefaultParam .CL2_DEPLOYMENTS_PER_NAMESPACE 4}} +{{$replicasPerDeployment := DefaultParam .CL2_REPLICAS_PER_DEPLOYMENT 10}} +{{$operationTimeout := DefaultParam .CL2_OPERATION_TIMEOUT "20m"}} +{{$apiServerCallsPerSecond := DefaultParam .CL2_API_SERVER_CALLS_PER_SECOND 20}} +{{$warmupDuration := DefaultParam .CL2_WARMUP_DURATION "30s"}} +{{$holdDuration := DefaultParam .CL2_HOLD_DURATION "2m"}} +{{$restartGeneration := DefaultParam .CL2_RESTART_GENERATION 1}} + +namespace: + number: {{$namespaces}} + prefix: clustermesh-et + deleteStaleNamespaces: true + deleteAutomanagedNamespaces: true + enableExistingNamespaces: false + deleteNamespaceTimeout: 20m + +tuningSets: + - name: Sequence + parallelismLimitedLoad: + parallelismLimit: 1 + - name: DeploymentCreateQps + qpsLoad: + qps: {{$apiServerCallsPerSecond}} + +steps: + # ----- Start measurements ----- + - module: + path: /modules/measurements/control-plane.yaml + params: + 
action: start + group: clustermesh-event-throughput + + - module: + path: /modules/measurements/cilium.yaml + params: + action: start + + - module: + path: /modules/measurements/clustermesh-metrics.yaml + params: + action: start + + - module: + path: /modules/measurements/clustermesh-throughput.yaml + params: + action: start + + - module: + path: /modules/measurements/etcd-metrics.yaml + params: + action: start + + - module: + path: /modules/clustermesh.yaml + params: + actionName: create + tuningSet: DeploymentCreateQps + + # ----- Workload: create ----- + - module: + path: /modules/event-throughput-workload.yaml + params: + actionName: create + generation: 0 + namespaces: {{$namespaces}} + deploymentsPerNamespace: {{$deploymentsPerNamespace}} + replicasPerDeployment: {{$replicasPerDeployment}} + tuningSet: DeploymentCreateQps + operationTimeout: {{$operationTimeout}} + + # ----- Warmup: let the create-flurry settle into steady state ----- + - name: Warmup before burst + measurements: + - Identifier: WarmupSleep + Method: Sleep + Params: + duration: {{$warmupDuration}} + + # ----- Burst: rolling-restart of every Deployment ----- + - module: + path: /modules/event-throughput-workload.yaml + params: + actionName: restart + generation: {{$restartGeneration}} + namespaces: {{$namespaces}} + deploymentsPerNamespace: {{$deploymentsPerNamespace}} + replicasPerDeployment: {{$replicasPerDeployment}} + tuningSet: DeploymentCreateQps + operationTimeout: {{$operationTimeout}} + + # ----- Settle: let kvstore queues drain post-burst ----- + - name: Settle after burst + measurements: + - Identifier: SettleSleep + Method: Sleep + Params: + duration: {{$holdDuration}} + + # ----- Gather measurements ----- + - module: + path: /modules/measurements/control-plane.yaml + params: + action: gather + group: clustermesh-event-throughput + + - module: + path: /modules/measurements/cilium.yaml + params: + action: gather + + - module: + path: /modules/measurements/clustermesh-metrics.yaml + 
params: + action: gather + + - module: + path: /modules/measurements/clustermesh-throughput.yaml + params: + action: gather + + - module: + path: /modules/measurements/etcd-metrics.yaml + params: + action: gather + + # ----- Workload: delete ----- + - module: + path: /modules/event-throughput-workload.yaml + params: + actionName: delete + generation: {{$restartGeneration}} + namespaces: {{$namespaces}} + deploymentsPerNamespace: {{$deploymentsPerNamespace}} + replicasPerDeployment: {{$replicasPerDeployment}} + tuningSet: DeploymentCreateQps + operationTimeout: {{$operationTimeout}} + + - module: + path: /modules/clustermesh.yaml + params: + actionName: delete + tuningSet: DeploymentCreateQps diff --git a/modules/python/clusterloader2/clustermesh-scale/config/modules/clustermesh.yaml b/modules/python/clusterloader2/clustermesh-scale/config/modules/clustermesh.yaml new file mode 100644 index 0000000000..175387b2ae --- /dev/null +++ b/modules/python/clusterloader2/clustermesh-scale/config/modules/clustermesh.yaml @@ -0,0 +1,26 @@ +## ClusterMesh module: deploys a PodMonitor for clustermesh-apiserver so the +## CL2-spawned Prometheus picks up at least one mesh-side metric per cluster. +## Phase 1 exit criteria require this — see plan.md Phase 1 line 318. 
+ +{{$tuningSet := DefaultParam .tuningSet "DeploymentCreateQps"}} +{{$interval := DefaultParam .interval "15s"}} +{{ $replicasPerNamespace := 1 }} + +{{if eq .actionName "create"}} + {{ $replicasPerNamespace = 1 }} +{{else}} + {{ $replicasPerNamespace = 0 }} +{{end}} + +steps: + - name: {{.actionName}} ClusterMesh Pod Monitor + phases: + - namespaceList: + - "monitoring" + replicasPerNamespace: {{$replicasPerNamespace}} + tuningSet: {{$tuningSet}} + objectBundle: + - objectTemplatePath: "modules/clustermesh/podmonitor.yaml" + basename: clustermesh-apiserver + templateFillMap: + Interval: {{$interval}} diff --git a/modules/python/clusterloader2/clustermesh-scale/config/modules/clustermesh/podmonitor.yaml b/modules/python/clusterloader2/clustermesh-scale/config/modules/clustermesh/podmonitor.yaml new file mode 100644 index 0000000000..f667f9e94a --- /dev/null +++ b/modules/python/clusterloader2/clustermesh-scale/config/modules/clustermesh/podmonitor.yaml @@ -0,0 +1,35 @@ +apiVersion: monitoring.coreos.com/v1 +kind: PodMonitor +metadata: + name: clustermesh-apiserver + namespace: monitoring +spec: + # Cilium clustermesh-apiserver exposes metrics on port 9963 (apiserver) and + # 9964 (kvstoremesh sidecar) when Prometheus integration is enabled. AKS + # managed Cilium uses the same upstream defaults. If a future preview + # changes these, override via __address__ relabel below. + selector: + matchLabels: + k8s-app: clustermesh-apiserver + namespaceSelector: + matchNames: + - kube-system + podMetricsEndpoints: + - interval: {{.Interval}} + honorLabels: true + path: /metrics + relabelings: + - sourceLabels: [__address__] + action: replace + targetLabel: __address__ + regex: (.+?)(\:\d+)? + replacement: $1:9963 + - interval: {{.Interval}} + honorLabels: true + path: /metrics + relabelings: + - sourceLabels: [__address__] + action: replace + targetLabel: __address__ + regex: (.+?)(\:\d+)? 
+ replacement: $1:9964 diff --git a/modules/python/clusterloader2/clustermesh-scale/config/modules/event-throughput-deployment.yaml b/modules/python/clusterloader2/clustermesh-scale/config/modules/event-throughput-deployment.yaml new file mode 100644 index 0000000000..06d677b1b0 --- /dev/null +++ b/modules/python/clusterloader2/clustermesh-scale/config/modules/event-throughput-deployment.yaml @@ -0,0 +1,42 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: {{.Name}} + labels: + group: {{.Group}} + app: {{.Name}} +spec: + replicas: {{.Replicas}} + selector: + matchLabels: + name: {{.Name}} + template: + metadata: + labels: + name: {{.Name}} + group: {{.Group}} + app: {{.Name}} + annotations: + # Bumping RestartGeneration in the pod template forces a rolling + # restart on the next CL2 apply — the canonical Kubernetes pattern + # for triggering deployment rollouts without changing image. This + # drives the burst event flurry for scale-scenario #1. + restart-generation: "{{.RestartGeneration}}" + spec: + containers: + - name: pause + image: mcr.microsoft.com/oss/kubernetes/pause:3.6 + # pause:3.6 is the Kubernetes pause container — it literally sleeps + # forever and consumes single-digit CPU shares + ~few MB. The + # earlier 50m CPU / 50Mi memory limits caused per-node CPU + # overcommit (~160% of allocatable on Standard_D4s_v4) at + # 100 pods/node, which starves the kubelet+CNI sandbox setup and + # leaves a few stragglers stuck Pending → CL2 timeout. Tighter + # limits here mirror what real pause-pod e2e fixtures use. 
+ resources: + requests: + cpu: 1m + memory: 5Mi + limits: + cpu: 5m + memory: 20Mi diff --git a/modules/python/clusterloader2/clustermesh-scale/config/modules/event-throughput-service.yaml b/modules/python/clusterloader2/clustermesh-scale/config/modules/event-throughput-service.yaml new file mode 100644 index 0000000000..7c795f65c3 --- /dev/null +++ b/modules/python/clusterloader2/clustermesh-scale/config/modules/event-throughput-service.yaml @@ -0,0 +1,27 @@ +apiVersion: v1 +kind: Service +metadata: + name: {{.Name}} + labels: + group: {{.Group}} + app: {{.Name}} + annotations: + # Modern annotation (Cilium >= 1.13). The clustermesh-apiserver fans + # this service's endpoints out to all peer clusters, exercising the + # service-propagation path that scale-scenario #1 measures. + service.cilium.io/global: "true" + # Legacy annotation (pre-1.13). Applied defensively because the AKS + # managed Cilium build version is not yet verified by us. Cilium + # ignores annotations it does not understand, so carrying both is safe. + io.cilium/global-service: "true" +spec: + selector: + name: {{.Name}} + ports: + - port: 80 + targetPort: 80 + protocol: TCP + # Headless: backends are advertised across the mesh by clustermesh-apiserver + # rather than routed through a per-cluster ClusterIP. Reduces noise from + # ClusterIP allocation under high churn. + clusterIP: None diff --git a/modules/python/clusterloader2/clustermesh-scale/config/modules/event-throughput-workload.yaml b/modules/python/clusterloader2/clustermesh-scale/config/modules/event-throughput-workload.yaml new file mode 100644 index 0000000000..0e0a3e36bd --- /dev/null +++ b/modules/python/clusterloader2/clustermesh-scale/config/modules/event-throughput-workload.yaml @@ -0,0 +1,73 @@ +name: clustermesh-event-throughput-workload + +# Workload module for scale-scenario #1: Cross-Cluster Event Throughput. 
+# +# Exercises three flavors of cross-cluster events on every cluster in parallel: +# +# create — bring N pods + N global Services up at a controlled QPS. +# Drives endpoint+identity creation events into the local +# clustermesh-apiserver, which fans out N*(M-1) writes across +# the mesh on every other peer's etcd. +# restart — bump a pod-template annotation so the Deployment triggers a +# rolling restart. Closes the "burst creation/deletion" gap from +# scale testing.txt line 52 — measures peak event-flurry capacity +# when an entire cluster's pods churn over within seconds. +# delete — set replicasPerNamespace to 0; drives the symmetric delete-event +# throughput number. + +{{$actionName := .actionName}} +{{$generation := DefaultParam .generation 0}} +{{$namespaces := .namespaces}} +{{$deploymentsPerNamespace := .deploymentsPerNamespace}} +{{$replicasPerDeployment := .replicasPerDeployment}} +{{$tuningSet := .tuningSet}} +{{$operationTimeout := .operationTimeout}} + +# delete = bring object count to 0; create/restart keep configured count. +{{$replicasInPhase := $deploymentsPerNamespace}} +{{if eq $actionName "delete"}}{{$replicasInPhase = 0}}{{end}} + +steps: + # Per-action WaitForControlledPodsRunning lifecycle: start (registers + # watcher with apiVersion+kind so CL2 knows which controllers to track), + # then create/restart/delete the workload, then gather. Using a per-action + # Identifier keeps the create/restart/delete invocations from clobbering + # each other's metric state across the three module calls in + # event-throughput.yaml. 
+ - name: Start tracking event-throughput pods to be {{$actionName}}{{if eq $actionName "restart"}}ed{{else}}d{{end}} + measurements: + - Identifier: WaitForControlledPodsRunning-{{$actionName}} + Method: WaitForControlledPodsRunning + Params: + action: start + apiVersion: apps/v1 + kind: Deployment + checkIfPodsAreUpdated: true + labelSelector: group = clustermesh-event-throughput + operationTimeout: {{$operationTimeout}} + + - name: {{$actionName}} event-throughput workload + phases: + - namespaceRange: + min: 1 + max: {{$namespaces}} + replicasPerNamespace: {{$replicasInPhase}} + tuningSet: {{$tuningSet}} + objectBundle: + - basename: et + objectTemplatePath: /modules/event-throughput-deployment.yaml + templateFillMap: + Replicas: {{$replicasPerDeployment}} + Group: clustermesh-event-throughput + RestartGeneration: {{$generation}} + - basename: et + objectTemplatePath: /modules/event-throughput-service.yaml + templateFillMap: + Group: clustermesh-event-throughput + + - name: Wait for event-throughput pods to be {{$actionName}}{{if eq $actionName "restart"}}ed{{else}}d{{end}} + measurements: + - Identifier: WaitForControlledPodsRunning-{{$actionName}} + Method: WaitForControlledPodsRunning + Params: + action: gather diff --git a/modules/python/clusterloader2/clustermesh-scale/config/modules/measurements/cilium.yaml b/modules/python/clusterloader2/clustermesh-scale/config/modules/measurements/cilium.yaml new file mode 100644 index 0000000000..4d27607347 --- /dev/null +++ b/modules/python/clusterloader2/clustermesh-scale/config/modules/measurements/cilium.yaml @@ -0,0 +1,226 @@ +{{$action := .action}} # start, gather + +{{$suffix := DefaultParam .suffix ""}} + +steps: + - name: {{$action}} Additional Cilium Measurements + measurements: + - Identifier: CiliumAvgCPUUsage{{$suffix}} + Method: GenericPrometheusQuery + Params: + action: {{$action}} + metricName: Cilium Average CPU Usage {{$suffix}} + metricVersion: v1 + unit: cpu + enableViolations: true + queries: + - name: Perc99 + query: quantile(0.99, 
avg_over_time(rate(cilium_process_cpu_seconds_total[1m])[%v:])) + - name: Perc90 + query: quantile(0.90, avg_over_time(rate(cilium_process_cpu_seconds_total[1m])[%v:])) + - name: Perc50 + query: quantile(0.50, avg_over_time(rate(cilium_process_cpu_seconds_total[1m])[%v:])) + - Identifier: CiliumMaxCPUUsage{{$suffix}} + Method: GenericPrometheusQuery + Params: + action: {{$action}} + metricName: Cilium Max CPU Usage {{$suffix}} + metricVersion: v1 + unit: cpu + enableViolations: true + queries: + - name: Perc99 + query: quantile(0.99, max_over_time(rate(cilium_process_cpu_seconds_total[1m])[%v:])) + - name: Perc90 + query: quantile(0.90, max_over_time(rate(cilium_process_cpu_seconds_total[1m])[%v:])) + - name: Perc50 + query: quantile(0.50, max_over_time(rate(cilium_process_cpu_seconds_total[1m])[%v:])) + - Identifier: CiliumAvgMemUsage{{$suffix}} + Method: GenericPrometheusQuery + Params: + action: {{$action}} + metricName: Cilium Avg Memory Usage {{$suffix}} + metricVersion: v1 + unit: MB + enableViolations: true + queries: + - name: Perc99 + query: quantile(0.99, avg_over_time(cilium_process_resident_memory_bytes[%v:]) / 1024 / 1024) + - name: Perc90 + query: quantile(0.90, avg_over_time(cilium_process_resident_memory_bytes[%v:]) / 1024 / 1024) + - name: Perc50 + query: quantile(0.5, avg_over_time(cilium_process_resident_memory_bytes[%v:]) / 1024 / 1024) + - Identifier: CiliumMaxMemUsage{{$suffix}} + Method: GenericPrometheusQuery + Params: + action: {{$action}} + metricName: Cilium Max Memory Usage {{$suffix}} + metricVersion: v1 + unit: MB + enableViolations: true + queries: + - name: Perc99 + query: quantile(0.99, max_over_time(cilium_process_resident_memory_bytes[%v:]) / 1024 / 1024) + - name: Perc90 + query: quantile(0.90, max_over_time(cilium_process_resident_memory_bytes[%v:]) / 1024 / 1024) + - name: Perc50 + query: quantile(0.5, max_over_time(cilium_process_resident_memory_bytes[%v:]) / 1024 / 1024) + - Identifier: CiliumOperatorAvgCPUUsage{{$suffix}} + 
Method: GenericPrometheusQuery + Params: + action: {{$action}} + metricName: Cilium Operator Avg CPU Usage {{$suffix}} + metricVersion: v1 + unit: cpu + enableViolations: true + queries: + - name: Perc99 + query: quantile(0.99, avg_over_time(rate(cilium_operator_process_cpu_seconds_total[1m])[%v:])) + - name: Perc90 + query: quantile(0.90, avg_over_time(rate(cilium_operator_process_cpu_seconds_total[1m])[%v:])) + - name: Perc50 + query: quantile(0.50, avg_over_time(rate(cilium_operator_process_cpu_seconds_total[1m])[%v:])) + - Identifier: CiliumOperatorMaxCPUUsage{{$suffix}} + Method: GenericPrometheusQuery + Params: + action: {{$action}} + metricName: Cilium Operator Max CPU Usage {{$suffix}} + metricVersion: v1 + unit: cpu + enableViolations: true + queries: + - name: Perc99 + query: quantile(0.99, max_over_time(rate(cilium_operator_process_cpu_seconds_total[1m])[%v:])) + - name: Perc90 + query: quantile(0.90, max_over_time(rate(cilium_operator_process_cpu_seconds_total[1m])[%v:])) + - name: Perc50 + query: quantile(0.50, max_over_time(rate(cilium_operator_process_cpu_seconds_total[1m])[%v:])) + - Identifier: CiliumOperatorMaxMemUsage{{$suffix}} + Method: GenericPrometheusQuery + Params: + action: {{$action}} + metricName: Cilium Operator Max Memory Usage {{$suffix}} + metricVersion: v1 + unit: MB + enableViolations: true + queries: + - name: Perc99 + query: quantile(0.99, max_over_time(cilium_operator_process_resident_memory_bytes[%v:]) / 1024 / 1024) + - name: Perc90 + query: quantile(0.90, max_over_time(cilium_operator_process_resident_memory_bytes[%v:]) / 1024 / 1024) + - name: Perc50 + query: quantile(0.5, max_over_time(cilium_operator_process_resident_memory_bytes[%v:]) / 1024 / 1024) + - Identifier: CiliumOperatorAvgMemUsage{{$suffix}} + Method: GenericPrometheusQuery + Params: + action: {{$action}} + metricName: Cilium Operator Avg Memory Usage {{$suffix}} + metricVersion: v1 + unit: MB + enableViolations: true + queries: + - name: Perc99 + query: 
quantile(0.99, avg_over_time(cilium_operator_process_resident_memory_bytes[%v:]) / 1024 / 1024) + - name: Perc90 + query: quantile(0.90, avg_over_time(cilium_operator_process_resident_memory_bytes[%v:]) / 1024 / 1024) + - name: Perc50 + query: quantile(0.5, avg_over_time(cilium_operator_process_resident_memory_bytes[%v:]) / 1024 / 1024) + - Identifier: CiliumContainerFsAvgWrittenBytes{{$suffix}} + Method: GenericPrometheusQuery + Params: + action: {{$action}} + metricName: Cilium Container FS Average Written Bytes {{$suffix}} + metricVersion: v1 + unit: bytes/s + enableViolations: true + queries: + - name: Perc99 + query: quantile(0.99, avg_over_time(rate(container_fs_writes_bytes_total{container="cilium-agent"}[1m])[%v:])) + - name: Perc90 + query: quantile(0.90, avg_over_time(rate(container_fs_writes_bytes_total{container="cilium-agent"}[1m])[%v:])) + - name: Perc50 + query: quantile(0.50, avg_over_time(rate(container_fs_writes_bytes_total{container="cilium-agent"}[1m])[%v:])) + - Identifier: CiliumContainerFsMaxWrittenBytes{{$suffix}} + Method: GenericPrometheusQuery + Params: + action: {{$action}} + metricName: Cilium Container FS Max Written Bytes {{$suffix}} + metricVersion: v1 + unit: bytes/s + enableViolations: true + queries: + - name: Perc99 + query: quantile(0.99, max_over_time(rate(container_fs_writes_bytes_total{container="cilium-agent"}[1m])[%v:])) + - name: Perc90 + query: quantile(0.90, max_over_time(rate(container_fs_writes_bytes_total{container="cilium-agent"}[1m])[%v:])) + - name: Perc50 + query: quantile(0.50, max_over_time(rate(container_fs_writes_bytes_total{container="cilium-agent"}[1m])[%v:])) + # NOTE: FS write latency (avg/max) was intentionally dropped from this + # scenario. 
The query (rate(container_fs_write_seconds_total) / rate( + # container_fs_writes_total) for container="cilium-agent") returns no + # samples here because cilium-agent in the clustermesh scenario does + # almost all I/O via in-kernel bpf maps, not container fs — the write-op + # rate is genuinely ~0, so the division yields no result. Written-bytes + # rates (above) still produce useful data and remain the FS signal. + + # --------------------------------------------------------------------- + # Network usage (spec line 38, 134: "CPU/memory/network per + # component"). cAdvisor exposes container_network_*_bytes_total per + # pod. We pin to pod="cilium-.*" instead of container="cilium-agent" + # because cAdvisor reports network counters at the pod-sandbox level + # (container="POD"), not the per-container level — so a + # container="cilium-agent" filter would return empty. + # --------------------------------------------------------------------- + - Identifier: CiliumContainerNetworkTransmitBytes{{$suffix}} + Method: GenericPrometheusQuery + Params: + action: {{$action}} + metricName: Cilium Network Transmit Bytes {{$suffix}} + metricVersion: v1 + unit: bytes/s + enableViolations: false + queries: + - name: Perc99 + query: quantile(0.99, max_over_time(rate(container_network_transmit_bytes_total{pod=~"cilium-.*",namespace="kube-system"}[1m])[%v:])) + - name: Perc50 + query: quantile(0.50, avg_over_time(rate(container_network_transmit_bytes_total{pod=~"cilium-.*",namespace="kube-system"}[1m])[%v:])) + - Identifier: CiliumContainerNetworkReceiveBytes{{$suffix}} + Method: GenericPrometheusQuery + Params: + action: {{$action}} + metricName: Cilium Network Receive Bytes {{$suffix}} + metricVersion: v1 + unit: bytes/s + enableViolations: false + queries: + - name: Perc99 + query: quantile(0.99, max_over_time(rate(container_network_receive_bytes_total{pod=~"cilium-.*",namespace="kube-system"}[1m])[%v:])) + - name: Perc50 + query: quantile(0.50, 
avg_over_time(rate(container_network_receive_bytes_total{pod=~"cilium-.*",namespace="kube-system"}[1m])[%v:])) + + - Identifier: CiliumContainerRestarts{{$suffix}} + Method: GenericPrometheusQuery + Params: + action: {{$action}} + metricName: Cilium Container Restarts {{$suffix}} + metricVersion: v1 + unit: "#" + enableViolations: true + queries: + - name: Perc99 + query: quantile(0.99, max_over_time(increase(kube_pod_container_status_restarts_total{container="cilium-agent"}[%v])[%v:])) + - name: Perc90 + query: quantile(0.90, max_over_time(increase(kube_pod_container_status_restarts_total{container="cilium-agent"}[%v])[%v:])) + - name: Perc50 + query: quantile(0.50, max_over_time(increase(kube_pod_container_status_restarts_total{container="cilium-agent"}[%v])[%v:])) + # - Identifier: AvgCiliumHubbleMetricsCardinality{{$suffix}} + # Method: GenericPrometheusQuery + # Params: + # action: {{$action}} + # metricName: Average Cilium Hubble Metrics Cardinality {{$suffix}} + # metricVersion: v1 + # unit: "#" + # enableViolations: true + # queries: + # - name: Avg + # query: count({__name__=~"hubble_.*"}) diff --git a/modules/python/clusterloader2/clustermesh-scale/config/modules/measurements/clustermesh-metrics.yaml b/modules/python/clusterloader2/clustermesh-scale/config/modules/measurements/clustermesh-metrics.yaml new file mode 100644 index 0000000000..18d0a2a85c --- /dev/null +++ b/modules/python/clusterloader2/clustermesh-scale/config/modules/measurements/clustermesh-metrics.yaml @@ -0,0 +1,192 @@ +{{$action := .action}} # start, gather + +{{$suffix := DefaultParam .suffix ""}} + +# ClusterMesh-specific Prometheus measurements. +# +# All metrics here are upstream Cilium clustermesh-apiserver / cilium-agent +# metrics, scraped via the PodMonitor deployed by config/modules/clustermesh.yaml. 
+# If AKS managed Cilium does not expose a given metric, GenericPrometheusQuery +# returns empty data items (CL2 logs a warning, the run continues) — refine +# query strings once we have a live mesh to inspect. + +steps: + - name: {{$action}} ClusterMesh Measurements + measurements: + # --------------------------------------------------------------------- + # Mesh health: how many remote clusters are connected from this cluster's + # perspective. In an N-cluster mesh, this gauge should reach (N-1) on every + # cluster. Capturing percentile shape across the run window flags drops. + # --------------------------------------------------------------------- + - Identifier: ClusterMeshRemoteClustersConnected{{$suffix}} + Method: GenericPrometheusQuery + Params: + action: {{$action}} + metricName: ClusterMesh Remote Clusters Connected {{$suffix}} + metricVersion: v1 + unit: "#" + enableViolations: false + queries: + - name: Perc99 + query: quantile(0.99, avg_over_time(cilium_clustermesh_remote_clusters[%v:])) + - name: Perc50 + query: quantile(0.50, avg_over_time(cilium_clustermesh_remote_clusters[%v:])) + - name: Min + query: min_over_time(min(cilium_clustermesh_remote_clusters)[%v:]) + + # --------------------------------------------------------------------- + # Mesh failure counter: cumulative remote-cluster connection failures. + # Healthy runs should keep this at 0; we track the max increase observed + # over the run to surface flapping links during scale-up. 
+ # --------------------------------------------------------------------- + - Identifier: ClusterMeshRemoteClusterFailures{{$suffix}} + Method: GenericPrometheusQuery + Params: + action: {{$action}} + metricName: ClusterMesh Remote Cluster Failures {{$suffix}} + metricVersion: v1 + unit: "#" + enableViolations: false + queries: + - name: MaxIncrease + query: max(max_over_time(cilium_clustermesh_remote_cluster_failures[%v:]) - min_over_time(cilium_clustermesh_remote_cluster_failures[%v:])) + + # --------------------------------------------------------------------- + # Cross-cluster event throughput — the headline metric for scale scenario + # #1 (Cross-Cluster Event Throughput) and #2 (Pod Churn). Rate of kvstore + # events queued per second on this cluster. + # --------------------------------------------------------------------- + - Identifier: ClusterMeshKvstoreEventsRate{{$suffix}} + Method: GenericPrometheusQuery + Params: + action: {{$action}} + metricName: ClusterMesh Kvstore Events Rate {{$suffix}} + metricVersion: v1 + unit: events/s + enableViolations: false + queries: + - name: Perc99 + query: quantile(0.99, max_over_time(rate(cilium_kvstoremesh_kvstore_events_queue_seconds_count[1m])[%v:])) + - name: Perc90 + query: quantile(0.90, max_over_time(rate(cilium_kvstoremesh_kvstore_events_queue_seconds_count[1m])[%v:])) + - name: Perc50 + query: quantile(0.50, avg_over_time(rate(cilium_kvstoremesh_kvstore_events_queue_seconds_count[1m])[%v:])) + + # --------------------------------------------------------------------- + # Per-type event rate breakdown (spec line 131: "Event rate (per + # type)"). The kvstoremesh kvstore-events histogram carries a + # `scope` label tagging which kvstore key family the event touched. + # We split into the three families spec line 5 calls out: endpoints, + # services, identities. 
Cilium 1.18 uses these scope values: + # identities/v1 — security identities + # services/v1 — global Service objects + # ip/v1 — endpoint IP-to-identity mappings (endpoints) + # nodes/v1 — node tunnel / IPAM advertisements + # serviceexports/v1 — MCS-API ServiceExport objects + # lease — leader election + # cilium/.heartbeat — kvstore liveness heartbeat + # cilium/syncedcanaries — initial-sync barrier markers + # --------------------------------------------------------------------- + - Identifier: ClusterMeshKvstoreEventsRateIdentities{{$suffix}} + Method: GenericPrometheusQuery + Params: + action: {{$action}} + metricName: ClusterMesh Kvstore Events Rate Identities {{$suffix}} + metricVersion: v1 + unit: events/s + enableViolations: false + queries: + - name: Perc99 + query: quantile(0.99, max_over_time(sum(rate(cilium_kvstoremesh_kvstore_events_queue_seconds_count{scope="identities/v1"}[1m]))[%v:])) + - name: Perc50 + query: quantile(0.50, avg_over_time(sum(rate(cilium_kvstoremesh_kvstore_events_queue_seconds_count{scope="identities/v1"}[1m]))[%v:])) + - Identifier: ClusterMeshKvstoreEventsRateServices{{$suffix}} + Method: GenericPrometheusQuery + Params: + action: {{$action}} + metricName: ClusterMesh Kvstore Events Rate Services {{$suffix}} + metricVersion: v1 + unit: events/s + enableViolations: false + queries: + - name: Perc99 + query: quantile(0.99, max_over_time(sum(rate(cilium_kvstoremesh_kvstore_events_queue_seconds_count{scope="services/v1"}[1m]))[%v:])) + - name: Perc50 + query: quantile(0.50, avg_over_time(sum(rate(cilium_kvstoremesh_kvstore_events_queue_seconds_count{scope="services/v1"}[1m]))[%v:])) + - Identifier: ClusterMeshKvstoreEventsRateEndpoints{{$suffix}} + Method: GenericPrometheusQuery + Params: + action: {{$action}} + metricName: ClusterMesh Kvstore Events Rate Endpoints {{$suffix}} + metricVersion: v1 + unit: events/s + enableViolations: false + queries: + - name: Perc99 + query: quantile(0.99, 
max_over_time(sum(rate(cilium_kvstoremesh_kvstore_events_queue_seconds_count{scope="ip/v1"}[1m]))[%v:])) + - name: Perc50 + query: quantile(0.50, avg_over_time(sum(rate(cilium_kvstoremesh_kvstore_events_queue_seconds_count{scope="ip/v1"}[1m]))[%v:])) + + # --------------------------------------------------------------------- + # Cross-cluster propagation latency proxy: p99 of kvstore operation + # duration. This is the closest upstream metric to "how long does it take + # for a change in cluster A to be visible in cluster B" without injecting + # synthetic probes. + # --------------------------------------------------------------------- + - Identifier: ClusterMeshKvstoreOperationDuration{{$suffix}} + Method: GenericPrometheusQuery + Params: + action: {{$action}} + metricName: ClusterMesh Kvstore Operation Duration {{$suffix}} + metricVersion: v1 + unit: s + enableViolations: false + queries: + - name: Perc99 + query: histogram_quantile(0.99, sum(rate(cilium_kvstoremesh_kvstore_operations_duration_seconds_bucket[1m])) by (le)) + - name: Perc90 + query: histogram_quantile(0.90, sum(rate(cilium_kvstoremesh_kvstore_operations_duration_seconds_bucket[1m])) by (le)) + - name: Perc50 + query: histogram_quantile(0.50, sum(rate(cilium_kvstoremesh_kvstore_operations_duration_seconds_bucket[1m])) by (le)) + + # --------------------------------------------------------------------- + # Watch queue depth (saturation signal — spec line 37 "Key signals: + # ... Watch queue depth"). cilium_kvstoremesh_kvstore_sync_queue_size + # is a gauge: number of items currently waiting to be processed by + # the kvstoremesh sync loop. A persistently positive or growing value + # is the saturation indicator (event ingest > drain rate). 
+ # --------------------------------------------------------------------- + - Identifier: ClusterMeshKvstoreSyncQueueSize{{$suffix}} + Method: GenericPrometheusQuery + Params: + action: {{$action}} + metricName: ClusterMesh Kvstore Sync Queue Size {{$suffix}} + metricVersion: v1 + unit: "#" + enableViolations: false + queries: + - name: Max + query: max(max_over_time(cilium_kvstoremesh_kvstore_sync_queue_size[%v:])) + - name: Perc99 + query: quantile(0.99, max_over_time(cilium_kvstoremesh_kvstore_sync_queue_size[%v:])) + - name: Perc50 + query: quantile(0.50, avg_over_time(cilium_kvstoremesh_kvstore_sync_queue_size[%v:])) + + # --------------------------------------------------------------------- + # Identity propagation: cilium identity count. Under cross-cluster pod + # churn (scenarios #1, #2, #3), this should track the global identity + # set converging across clusters. Divergence flags propagation lag. + # --------------------------------------------------------------------- + - Identifier: ClusterMeshIdentityCount{{$suffix}} + Method: GenericPrometheusQuery + Params: + action: {{$action}} + metricName: ClusterMesh Identity Count {{$suffix}} + metricVersion: v1 + unit: "#" + enableViolations: false + queries: + - name: Perc99 + query: quantile(0.99, max_over_time(cilium_identity[%v:])) + - name: Perc50 + query: quantile(0.50, avg_over_time(cilium_identity[%v:])) diff --git a/modules/python/clusterloader2/clustermesh-scale/config/modules/measurements/clustermesh-throughput.yaml b/modules/python/clusterloader2/clustermesh-scale/config/modules/measurements/clustermesh-throughput.yaml new file mode 100644 index 0000000000..c0dd5f92c6 --- /dev/null +++ b/modules/python/clusterloader2/clustermesh-scale/config/modules/measurements/clustermesh-throughput.yaml @@ -0,0 +1,78 @@ +{{$action := .action}} # start, gather + +{{$suffix := DefaultParam .suffix ""}} + +# Scenario #1 (Cross-Cluster Event Throughput) — extra measurements layered +# on top of the always-on 
clustermesh-metrics.yaml. These are specifically +# tuned to the event-throughput workload's create/restart/delete sequence, +# and are scoped to this scenario because they only make sense when the +# workload is actively churning kvstore writes. + +steps: + - name: {{$action}} ClusterMesh Event Throughput Measurements + measurements: + # --------------------------------------------------------------------- + # Backlog detection: the headline saturation signal. If the rate of + # events queued exceeds the rate at which the local agent drains them, + # the system is over-saturated. A sustained positive value over the + # measurement window is the failure mode scale testing.txt line 14 + # ("upper bounds — effective QPS limit") is asking us to find. + # --------------------------------------------------------------------- + - Identifier: ClusterMeshEventBacklog{{$suffix}} + Method: GenericPrometheusQuery + Params: + action: {{$action}} + metricName: ClusterMesh Event Backlog Rate {{$suffix}} + metricVersion: v1 + unit: events/s + enableViolations: false + queries: + # Wrap each side in sum() to drop labels — the two metrics carry + # non-identical label sets (e.g. sync_errors_total has a per-cluster + # `source_cluster` label that events_queue_seconds_count doesn't). + # Without sum(), PromQL's binary `-` returns an empty vector when + # operand label sets don't align. sum() collapses both to a single + # series so the subtraction is well-defined. + - name: Perc99 + query: quantile(0.99, max_over_time((sum(rate(cilium_kvstoremesh_kvstore_events_queue_seconds_count[1m])) - sum(rate(cilium_kvstoremesh_kvstore_sync_errors_total[1m])))[%v:])) + - name: MaxBurst + query: max(max_over_time(rate(cilium_kvstoremesh_kvstore_events_queue_seconds_count[30s])[%v:])) + + # --------------------------------------------------------------------- + # Global services gauge: one row per cluster of how many global services + # this cluster's clustermesh-apiserver has accepted. 
With the workload + # creating N global Services per cluster across M clusters, every cluster + # should observe roughly N*M global services. Divergence flags either + # scrape failures or service-propagation lag. + # --------------------------------------------------------------------- + - Identifier: ClusterMeshGlobalServices{{$suffix}} + Method: GenericPrometheusQuery + Params: + action: {{$action}} + metricName: ClusterMesh Global Services {{$suffix}} + metricVersion: v1 + unit: "#" + enableViolations: false + queries: + - name: Max + query: max(max_over_time(cilium_clustermesh_global_services[%v:])) + - name: Perc50 + query: quantile(0.50, avg_over_time(cilium_clustermesh_global_services[%v:])) + + # --------------------------------------------------------------------- + # Explicit p95 split for kvstore operation latency. clustermesh-metrics.yaml + # already emits p50/p90/p99; for scenario #1 we also surface p95 so the + # scaling-curve dashboard has a smoother percentile gradient when plotting + # latency vs cluster count. 
+ # --------------------------------------------------------------------- + - Identifier: ClusterMeshKvstoreOperationDurationP95{{$suffix}} + Method: GenericPrometheusQuery + Params: + action: {{$action}} + metricName: ClusterMesh Kvstore Operation Duration P95 {{$suffix}} + metricVersion: v1 + unit: s + enableViolations: false + queries: + - name: Perc95 + query: histogram_quantile(0.95, sum(rate(cilium_kvstoremesh_kvstore_operations_duration_seconds_bucket[1m])) by (le)) diff --git a/modules/python/clusterloader2/clustermesh-scale/config/modules/measurements/control-plane.yaml b/modules/python/clusterloader2/clustermesh-scale/config/modules/measurements/control-plane.yaml new file mode 100644 index 0000000000..47504cbf89 --- /dev/null +++ b/modules/python/clusterloader2/clustermesh-scale/config/modules/measurements/control-plane.yaml @@ -0,0 +1,86 @@ +{{$action := .action}} # start, gather + +# Feature gates +{{$podStartupLatencyThreshold := DefaultParam .CL2_POD_STARTUP_LATENCY_THRESHOLD "15s"}} +{{$ENABLE_VIOLATIONS_FOR_API_CALL_PROMETHEUS_SIMPLE := DefaultParam .CL2_ENABLE_VIOLATIONS_FOR_API_CALL_PROMETHEUS_SIMPLE true}} +{{$PROMETHEUS_SCRAPE_KUBE_PROXY := DefaultParam .PROMETHEUS_SCRAPE_KUBE_PROXY true}} +{{$NETWORK_LATENCY_THRESHOLD := DefaultParam .CL2_NETWORK_LATENCY_THRESHOLD "0s"}} +{{$ENABLE_IN_CLUSTER_NETWORK_LATENCY := DefaultParam .CL2_ENABLE_IN_CLUSTER_NETWORK_LATENCY true}} + +{{$suffix := DefaultParam .suffix ""}} + +steps: + - name: {{$action}} Additional Measurements + measurements: + - Identifier: APIResponsivenessPrometheus{{$suffix}} + Method: APIResponsivenessPrometheus + Params: + action: {{$action}} + enableViolations: {{$ENABLE_VIOLATIONS_FOR_API_CALL_PROMETHEUS_SIMPLE}} + useSimpleLatencyQuery: true + - Identifier: PodStartupLatency{{$suffix}} + Method: PodStartupLatency + Params: + action: {{$action}} + labelSelector: group = {{.group}} + threshold: {{$podStartupLatencyThreshold}} + - Identifier: ApiserverAvgCPUUsage{{$suffix}} + 
Method: GenericPrometheusQuery + Params: + action: {{$action}} + metricName: Apiserver Average CPU Usage {{$suffix}} + metricVersion: v1 + unit: cpu + enableViolations: true + queries: + - name: Perc99 + query: quantile(0.99, avg_over_time(rate(process_cpu_seconds_total{endpoint="apiserver"}[1m])[%v:])) + - name: Perc90 + query: quantile(0.90, avg_over_time(rate(process_cpu_seconds_total{endpoint="apiserver"}[1m])[%v:])) + - name: Perc50 + query: quantile(0.50, avg_over_time(rate(process_cpu_seconds_total{endpoint="apiserver"}[1m])[%v:])) + - Identifier: ApiserverMaxCPUUsage{{$suffix}} + Method: GenericPrometheusQuery + Params: + action: {{$action}} + metricName: Apiserver Max CPU Usage {{$suffix}} + metricVersion: v1 + unit: cpu + enableViolations: true + queries: + - name: Perc99 + query: quantile(0.99, max_over_time(rate(process_cpu_seconds_total{endpoint="apiserver"}[1m])[%v:])) + - name: Perc90 + query: quantile(0.90, max_over_time(rate(process_cpu_seconds_total{endpoint="apiserver"}[1m])[%v:])) + - name: Perc50 + query: quantile(0.50, max_over_time(rate(process_cpu_seconds_total{endpoint="apiserver"}[1m])[%v:])) + - Identifier: ApiserverAvgMemUsage{{$suffix}} + Method: GenericPrometheusQuery + Params: + action: {{$action}} + metricName: Apiserver Average Memory Usage {{$suffix}} + metricVersion: v1 + unit: MB + enableViolations: true + queries: + - name: Perc99 + query: quantile(0.99, avg_over_time(process_resident_memory_bytes{endpoint="apiserver"}[%v:]) / 1024 / 1024) + - name: Perc90 + query: quantile(0.90, avg_over_time(process_resident_memory_bytes{endpoint="apiserver"}[%v:]) / 1024 / 1024) + - name: Perc50 + query: quantile(0.5, avg_over_time(process_resident_memory_bytes{endpoint="apiserver"}[%v:]) / 1024 / 1024) + - Identifier: ApiserverMaxMemUsage{{$suffix}} + Method: GenericPrometheusQuery + Params: + action: {{$action}} + metricName: Apiserver Max Memory Usage {{$suffix}} + metricVersion: v1 + unit: MB + enableViolations: true + queries: + - name: 
Perc99 + query: quantile(0.99, max_over_time(process_resident_memory_bytes{endpoint="apiserver"}[%v:]) / 1024 / 1024) + - name: Perc90 + query: quantile(0.90, max_over_time(process_resident_memory_bytes{endpoint="apiserver"}[%v:]) / 1024 / 1024) + - name: Perc50 + query: quantile(0.5, max_over_time(process_resident_memory_bytes{endpoint="apiserver"}[%v:]) / 1024 / 1024) diff --git a/modules/python/clusterloader2/clustermesh-scale/config/modules/measurements/etcd-metrics.yaml b/modules/python/clusterloader2/clustermesh-scale/config/modules/measurements/etcd-metrics.yaml new file mode 100644 index 0000000000..129891204d --- /dev/null +++ b/modules/python/clusterloader2/clustermesh-scale/config/modules/measurements/etcd-metrics.yaml @@ -0,0 +1,158 @@ +{{$action := .action}} # start, gather + +{{$suffix := DefaultParam .suffix ""}} + +# Etcd-internal measurements for the embedded etcd inside each cluster's +# clustermesh-apiserver pod. +# +# Spec coverage (scale testing.txt): +# - line 34: "Metrics: Cilium, clustermesh-apiserver, etcd" +# - line 134: "etcd metrics (watch count, compactions, latency)" +# +# Source: the etcd container in the clustermesh-apiserver pod is launched +# with `--listen-metrics-urls=http://0.0.0.0:9963` and `--metrics=basic`. +# Our PodMonitor (modules/clustermesh/podmonitor.yaml, port 9963 endpoint) +# already scrapes that target — we just hadn't been querying the metrics. +# +# `--metrics=basic` only emits the etcd_debugging_* family (despite the +# name, these ARE the basic-tier metrics; the "extensive" tier adds +# etcd_disk_wal_fsync_*, etcd_network_peer_*, etcd_mvcc_db_total_size_in_bytes, +# etc., which AKS-managed Cilium does not enable). Queries below pick the +# best basic-tier proxies for each spec-required signal. + +steps: + - name: {{$action}} ClusterMesh Etcd Measurements + measurements: + # --------------------------------------------------------------------- + # Watch count (spec line 134 "watch count"). 
Total watchers currently + # registered against this cluster's clustermesh-apiserver etcd. Each + # remote cluster's kvstoremesh maintains watchers for endpoints, + # services, and identities, so this scales with mesh size and traffic. + # Slow-watcher count is the back-pressure signal: a non-zero value + # means watchers can't keep up with the event stream. + # --------------------------------------------------------------------- + - Identifier: ClusterMeshEtcdWatchCount{{$suffix}} + Method: GenericPrometheusQuery + Params: + action: {{$action}} + metricName: ClusterMesh Etcd Watch Count {{$suffix}} + metricVersion: v1 + unit: "#" + enableViolations: false + queries: + - name: Max + query: max(max_over_time(etcd_debugging_mvcc_watcher_total[%v:])) + - name: Perc99 + query: quantile(0.99, max_over_time(etcd_debugging_mvcc_watcher_total[%v:])) + - name: Perc50 + query: quantile(0.50, avg_over_time(etcd_debugging_mvcc_watcher_total[%v:])) + + - Identifier: ClusterMeshEtcdSlowWatchers{{$suffix}} + Method: GenericPrometheusQuery + Params: + action: {{$action}} + metricName: ClusterMesh Etcd Slow Watchers {{$suffix}} + metricVersion: v1 + unit: "#" + enableViolations: false + queries: + - name: Max + query: max(max_over_time(etcd_debugging_mvcc_slow_watcher_total[%v:])) + - name: Perc99 + query: quantile(0.99, max_over_time(etcd_debugging_mvcc_slow_watcher_total[%v:])) + + # --------------------------------------------------------------------- + # Pending events: events queued for delivery to watchers but not yet + # consumed. A growing value over the run window is the etcd-side + # equivalent of the kvstoremesh sync queue depth — back-pressure from + # the consumer side. 
+ # --------------------------------------------------------------------- + - Identifier: ClusterMeshEtcdPendingEvents{{$suffix}} + Method: GenericPrometheusQuery + Params: + action: {{$action}} + metricName: ClusterMesh Etcd Pending Events {{$suffix}} + metricVersion: v1 + unit: "#" + enableViolations: false + queries: + - name: Max + query: max(max_over_time(etcd_debugging_mvcc_pending_events_total[%v:])) + - name: Perc99 + query: quantile(0.99, max_over_time(etcd_debugging_mvcc_pending_events_total[%v:])) + + # --------------------------------------------------------------------- + # Compactions (spec line 134 "compactions"). Auto-compaction is + # enabled with `--auto-compaction-retention=1` (1-hour retention). Two + # signals: how long a compaction takes (latency) and how many keys + # were removed (work done). + # --------------------------------------------------------------------- + - Identifier: ClusterMeshEtcdCompactionDuration{{$suffix}} + Method: GenericPrometheusQuery + Params: + action: {{$action}} + metricName: ClusterMesh Etcd Compaction Duration {{$suffix}} + metricVersion: v1 + unit: ms + enableViolations: false + queries: + - name: Perc99 + query: histogram_quantile(0.99, sum(rate(etcd_debugging_mvcc_db_compaction_total_duration_milliseconds_bucket[%v])) by (le)) + - name: Perc50 + query: histogram_quantile(0.50, sum(rate(etcd_debugging_mvcc_db_compaction_total_duration_milliseconds_bucket[%v])) by (le)) + + - Identifier: ClusterMeshEtcdCompactionKeys{{$suffix}} + Method: GenericPrometheusQuery + Params: + action: {{$action}} + metricName: ClusterMesh Etcd Compacted Keys {{$suffix}} + metricVersion: v1 + unit: "#" + enableViolations: false + queries: + - name: TotalIncrease + query: max(max_over_time(etcd_debugging_mvcc_db_compaction_keys_total[%v:])) - min(min_over_time(etcd_debugging_mvcc_db_compaction_keys_total[%v:])) + + # --------------------------------------------------------------------- + # Disk-write latency (spec line 134 "latency"). 
With --metrics=basic + # we don't have etcd_disk_wal_fsync_duration_seconds; the closest + # available proxy is etcd_debugging_disk_backend_commit_write_duration + # (how long it takes to commit a write txn to the bbolt backend). + # Together with rebalance/spill durations, this characterizes etcd's + # disk subsystem performance under load. + # --------------------------------------------------------------------- + - Identifier: ClusterMeshEtcdBackendWriteDuration{{$suffix}} + Method: GenericPrometheusQuery + Params: + action: {{$action}} + metricName: ClusterMesh Etcd Backend Write Duration {{$suffix}} + metricVersion: v1 + unit: s + enableViolations: false + queries: + - name: Perc99 + query: histogram_quantile(0.99, sum(rate(etcd_debugging_disk_backend_commit_write_duration_seconds_bucket[1m])) by (le)) + - name: Perc90 + query: histogram_quantile(0.90, sum(rate(etcd_debugging_disk_backend_commit_write_duration_seconds_bucket[1m])) by (le)) + - name: Perc50 + query: histogram_quantile(0.50, sum(rate(etcd_debugging_disk_backend_commit_write_duration_seconds_bucket[1m])) by (le)) + + # --------------------------------------------------------------------- + # MVCC store size proxy. With --metrics=basic we don't get + # etcd_mvcc_db_total_size_in_bytes; etcd_debugging_mvcc_keys_total is + # the key count and etcd_debugging_mvcc_total_put_size_in_bytes is the + # cumulative bytes written. Together they bound the working set. 
+ # --------------------------------------------------------------------- + - Identifier: ClusterMeshEtcdMvccKeys{{$suffix}} + Method: GenericPrometheusQuery + Params: + action: {{$action}} + metricName: ClusterMesh Etcd MVCC Keys {{$suffix}} + metricVersion: v1 + unit: "#" + enableViolations: false + queries: + - name: Max + query: max(max_over_time(etcd_debugging_mvcc_keys_total[%v:])) + - name: Perc50 + query: quantile(0.50, avg_over_time(etcd_debugging_mvcc_keys_total[%v:])) diff --git a/modules/python/clusterloader2/clustermesh-scale/config/modules/scale-test-deployment.yaml b/modules/python/clusterloader2/clustermesh-scale/config/modules/scale-test-deployment.yaml new file mode 100644 index 0000000000..9ceffc8595 --- /dev/null +++ b/modules/python/clusterloader2/clustermesh-scale/config/modules/scale-test-deployment.yaml @@ -0,0 +1,27 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: {{.Name}} + labels: + group: {{.Group}} +spec: + replicas: {{.Replicas}} + selector: + matchLabels: + name: {{.Name}} + template: + metadata: + labels: + name: {{.Name}} + group: {{.Group}} + spec: + containers: + - name: pause + image: mcr.microsoft.com/oss/kubernetes/pause:3.6 + resources: + requests: + cpu: 1m + memory: 5Mi + limits: + cpu: 5m + memory: 20Mi diff --git a/modules/python/clusterloader2/clustermesh-scale/config/modules/scale-test.yaml b/modules/python/clusterloader2/clustermesh-scale/config/modules/scale-test.yaml new file mode 100644 index 0000000000..5fd806c60b --- /dev/null +++ b/modules/python/clusterloader2/clustermesh-scale/config/modules/scale-test.yaml @@ -0,0 +1,57 @@ +name: clustermesh-scale-test-module + +# Trivial pod deployment module: creates or deletes +# namespaces x deploymentsPerNamespace x replicasPerDeployment +# pause-image pods on the target cluster. No traffic, no churn, no policies. 
+ +{{$actionName := .actionName}} +{{$namespaces := .namespaces}} +{{$deploymentsPerNamespace := .deploymentsPerNamespace}} +{{$replicasPerDeployment := .replicasPerDeployment}} +{{$tuningSet := .tuningSet}} +{{$operationTimeout := .operationTimeout}} + +{{$totalDeployments := MultiplyInt $namespaces $deploymentsPerNamespace}} + +steps: + # Register a fresh WaitForControlledPodsRunning watcher BEFORE the + # create/delete phase. Without this, the second invocation of this module + # (actionName=delete) errors with "metric WaitForControlledPodsRunning has + # not been started" — CL2 closes the metric after the first `gather`, so + # each invocation needs its own start. We use a per-action Identifier + # ("...-create" / "...-delete") so the start and gather pair cleanly even + # if the runtime ever caches metrics by Identifier across invocations. + - name: Start tracking pods to be {{$actionName}}d + measurements: + - Identifier: WaitForControlledPodsRunning-{{$actionName}} + Method: WaitForControlledPodsRunning + Params: + action: start + # CL2 needs apiVersion+kind to know which controllers to track on + # start; we deploy Deployment objects (see scale-test-deployment.yaml). 
+ apiVersion: apps/v1 + kind: Deployment + checkIfPodsAreUpdated: true + labelSelector: group = clustermesh-scale-test + operationTimeout: {{$operationTimeout}} + + - name: {{$actionName}} deployments + phases: + - namespaceRange: + min: 1 + max: {{$namespaces}} + replicasPerNamespace: {{$deploymentsPerNamespace}} + tuningSet: {{$tuningSet}} + objectBundle: + - basename: scale-test + objectTemplatePath: /modules/scale-test-deployment.yaml + templateFillMap: + Replicas: {{$replicasPerDeployment}} + Group: clustermesh-scale-test + + - name: Wait for deployments to be {{$actionName}}d + measurements: + - Identifier: WaitForControlledPodsRunning-{{$actionName}} + Method: WaitForControlledPodsRunning + Params: + action: gather diff --git a/modules/python/clusterloader2/clustermesh-scale/scale.py b/modules/python/clusterloader2/clustermesh-scale/scale.py new file mode 100644 index 0000000000..35047f122a --- /dev/null +++ b/modules/python/clusterloader2/clustermesh-scale/scale.py @@ -0,0 +1,258 @@ +""" +ClusterMesh scale-test harness. + +Single-cluster invocation. The Telescope pipeline fans out by calling this +script once per fleet member (driven by `az fleet clustermeshprofile list-members` +in steps/topology/clustermesh-scale/execute-clusterloader2.yml). Each invocation +emits one JSONL with a `cluster` attribution column so concatenated results from +N clusters are queryable per-cluster downstream. + +Phase 1 is intentionally trivial: deploy a small fixed number of pods, no churn, +no fortio, no network policies. The goal of Phase 1 is to prove the multi-cluster +harness + topology + aggregation works end-to-end. Real measurements +(cross-cluster event throughput, identity propagation, etc.) come in plan.md +Phase 2 by adding measurement modules to config/modules/measurements/ and new +parameters to configure/collect. 
+""" +import argparse +import json +import os +from datetime import datetime, timezone + +from clusterloader2.utils import parse_xml_to_json, run_cl2_command, process_cl2_reports + + +def configure_clusterloader2( + namespaces, + deployments_per_namespace, + replicas_per_deployment, + operation_timeout, + override_file, +): + with open(override_file, "w", encoding="utf-8") as f: + # Prometheus stack — keep the Cilium-scrape flags ON so the + # cilium/control-plane/clustermesh measurement modules have data to + # query. The base memory REQUEST is set via the --prometheus-memory-request + # CLI flag in execute_clusterloader2 (the CL2_PROMETHEUS_MEMORY_REQUEST + # overrides key is not honored by this CL2 image). Memory LIMIT below + # IS honored as an overrides key and must be >= the request to satisfy + # k8s admission. + f.write("CL2_PROMETHEUS_TOLERATE_MASTER: true\n") + f.write("CL2_PROMETHEUS_MEMORY_LIMIT: 2Gi\n") + # Pin Prometheus to the dedicated `prompool` node (label + # prometheus=true is set in azure-2.tfvars extra_node_pool). Without + # this, prometheus-k8s lands on the default workload pool and + # competes with the 200 event-throughput pods for CPU/memory, + # causing per-node overcommit and Pending workload pods. + f.write('CL2_PROMETHEUS_NODE_SELECTOR: "prometheus: \\"true\\""\n') + f.write("CL2_PROMETHEUS_SCRAPE_CILIUM_AGENT: true\n") + f.write("CL2_PROMETHEUS_SCRAPE_CILIUM_OPERATOR: true\n") + f.write("CL2_POD_STARTUP_LATENCY_THRESHOLD: 3m\n") + # APIResponsivenessPrometheus default SLO (perc99 ≤ 1s) is tuned for + # production-scale clusters in steady state; on Phase-1 dev clusters + # the kube-apiserver hits multi-second perc99 during the Prometheus + # stack bring-up (mutatingwebhookconfigurations APPLY, + # customresourcedefinitions POST/PUT). The metric is still recorded + # — we just stop CL2 from failing the test on threshold breaches. 
+ f.write("CL2_ENABLE_VIOLATIONS_FOR_API_CALL_PROMETHEUS_SIMPLE: false\n") + + # Topology knobs — trivial defaults for Phase 1 vertical slice. + f.write(f"CL2_NAMESPACES: {namespaces}\n") + f.write(f"CL2_DEPLOYMENTS_PER_NAMESPACE: {deployments_per_namespace}\n") + f.write(f"CL2_REPLICAS_PER_DEPLOYMENT: {replicas_per_deployment}\n") + f.write(f"CL2_OPERATION_TIMEOUT: {operation_timeout}\n") + + with open(override_file, "r", encoding="utf-8") as f: + print(f"Content of file {override_file}:\n{f.read()}") + + +def execute_clusterloader2( + cl2_image, + cl2_config_dir, + cl2_report_dir, + cl2_config_file, + kubeconfig, + provider, +): + run_cl2_command( + kubeconfig, + cl2_image, + cl2_config_dir, + cl2_report_dir, + provider, + cl2_config_file=cl2_config_file, + overrides=True, + enable_prometheus=True, + tear_down_prometheus=False, + scrape_kubelets=True, + scrape_ksm=True, + scrape_metrics_server=True, + # CL2 default is 10Gi which doesn't fit a Standard_D4s_v4 / 16GB node + # after k8s + Cilium overhead. Override via the CLI flag rather than + # `CL2_PROMETHEUS_MEMORY_REQUEST` overrides.yaml key — that key is not + # honored by this CL2 image (verified via prometheus-operator log + # showing PrometheusMemoryRequest:10Gi at runtime). Pair this with + # CL2_PROMETHEUS_MEMORY_LIMIT in the overrides file so request <= limit. + prometheus_memory_request="1Gi", + ) + + +def collect_clusterloader2( + cl2_report_dir, + cloud_info, + run_id, + run_url, + result_file, + test_type, + start_timestamp, + cluster_name, + cluster_count, + mesh_size, + namespaces, + deployments_per_namespace, + replicas_per_deployment, + trigger_reason="", +): + details = parse_xml_to_json(os.path.join(cl2_report_dir, "junit.xml"), indent=2) + json_data = json.loads(details) + testsuites = json_data["testsuites"] + + if testsuites: + status = "success" if testsuites[0]["failures"] == 0 else "failure" + else: + raise Exception(f"No testsuites found in the report! 
Raw data: {details}") + + template = { + "timestamp": datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ"), + "status": status, + "group": None, + "measurement": None, + "result": None, + "test_details": { + "trigger_reason": trigger_reason, + # Cluster attribution — every row emitted for this run is tagged + # with the cluster it came from, so downstream Kusto queries can + # group/filter by cluster across an N-cluster mesh test. + "cluster": cluster_name, + # mesh_size is the configured target N (from pipeline matrix); + # cluster_count is what was actually discovered at run time. Querying + # `mesh_size != cluster_count` in Kusto surfaces partial-mesh runs + # (e.g., a Fleet member that failed to join) without needing a join + # to control-plane logs. + "mesh_size": mesh_size, + "cluster_count": cluster_count, + "namespaces": namespaces, + "deployments_per_namespace": deployments_per_namespace, + "replicas_per_deployment": replicas_per_deployment, + "pods_per_cluster": namespaces * deployments_per_namespace * replicas_per_deployment, + "details": ( + testsuites[0]["testcases"][0].get("failure", None) + if testsuites[0].get("testcases") + else None + ), + }, + "cloud_info": cloud_info, + "run_id": run_id, + "run_url": run_url, + "test_type": test_type, + "start_timestamp": start_timestamp, + # parameters (top-level for Kusto column convenience) + "cluster": cluster_name, + "mesh_size": mesh_size, + "cluster_count": cluster_count, + "namespaces": namespaces, + "deployments_per_namespace": deployments_per_namespace, + "replicas_per_deployment": replicas_per_deployment, + } + content = process_cl2_reports(cl2_report_dir, template) + + os.makedirs(os.path.dirname(result_file), exist_ok=True) + with open(result_file, "w", encoding="utf-8") as f: + f.write(content) + + +def main(): + parser = argparse.ArgumentParser(description="ClusterMesh scale-test harness.") + subparsers = parser.add_subparsers(dest="command") + + # configure + pc = 
subparsers.add_parser("configure", help="Write CL2 overrides file") + pc.add_argument("--namespaces", type=int, required=True) + pc.add_argument("--deployments-per-namespace", type=int, required=True) + pc.add_argument("--replicas-per-deployment", type=int, required=True) + pc.add_argument("--operation-timeout", type=str, default="15m") + pc.add_argument("--cl2_override_file", type=str, required=True, + help="Path to the overrides of CL2 config file") + + # execute + pe = subparsers.add_parser("execute", help="Run CL2 against a single cluster") + pe.add_argument("--cl2-image", type=str, required=True) + pe.add_argument("--cl2-config-dir", type=str, required=True) + pe.add_argument("--cl2-report-dir", type=str, required=True) + pe.add_argument("--cl2-config-file", type=str, required=True) + pe.add_argument("--kubeconfig", type=str, required=True) + pe.add_argument("--provider", type=str, required=True) + + # collect + pco = subparsers.add_parser("collect", help="Collect results for one cluster") + pco.add_argument("--cl2_report_dir", type=str, required=True) + pco.add_argument("--cloud_info", type=str, default="") + pco.add_argument("--run_id", type=str, required=True) + pco.add_argument("--run_url", type=str, default="") + pco.add_argument("--result_file", type=str, required=True) + pco.add_argument("--test_type", type=str, default="default-config") + pco.add_argument("--start_timestamp", type=str, required=True) + pco.add_argument("--cluster-name", type=str, required=True, + help="Fleet member / AKS cluster identity for attribution") + pco.add_argument("--cluster-count", type=int, required=True, + help="Total clusters in the mesh for this run (N)") + pco.add_argument("--mesh-size", type=int, required=True, + help="Configured target cluster count from the pipeline matrix; " + "compared against --cluster-count to detect partial-mesh runs") + pco.add_argument("--namespaces", type=int, required=True) + pco.add_argument("--deployments-per-namespace", type=int, 
required=True) + pco.add_argument("--replicas-per-deployment", type=int, required=True) + pco.add_argument("--trigger_reason", type=str, default="") + + args = parser.parse_args() + + if args.command == "configure": + configure_clusterloader2( + args.namespaces, + args.deployments_per_namespace, + args.replicas_per_deployment, + args.operation_timeout, + args.cl2_override_file, + ) + elif args.command == "execute": + execute_clusterloader2( + args.cl2_image, + args.cl2_config_dir, + args.cl2_report_dir, + args.cl2_config_file, + args.kubeconfig, + args.provider, + ) + elif args.command == "collect": + collect_clusterloader2( + args.cl2_report_dir, + args.cloud_info, + args.run_id, + args.run_url, + args.result_file, + args.test_type, + args.start_timestamp, + args.cluster_name, + args.cluster_count, + args.mesh_size, + args.namespaces, + args.deployments_per_namespace, + args.replicas_per_deployment, + args.trigger_reason, + ) + else: + parser.print_help() + + +if __name__ == "__main__": + main() diff --git a/modules/python/clusterloader2/utils.py b/modules/python/clusterloader2/utils.py index 50deb2ed85..f0cec83046 100644 --- a/modules/python/clusterloader2/utils.py +++ b/modules/python/clusterloader2/utils.py @@ -25,7 +25,8 @@ def run_cl2_command(kubeconfig, cl2_image, cl2_config_dir, cl2_report_dir, provider, cl2_config_file="config.yaml", overrides=False, enable_prometheus=False, tear_down_prometheus=True, enable_exec_service=False, scrape_kubelets=False, - scrape_containerd=False, scrape_ksm=False, scrape_metrics_server=False): + scrape_containerd=False, scrape_ksm=False, scrape_metrics_server=False, + prometheus_memory_request=None): docker_client = DockerClient() command = f"""--provider={provider} --v=2 @@ -42,6 +43,14 @@ def run_cl2_command(kubeconfig, cl2_image, cl2_config_dir, cl2_report_dir, provi if scrape_containerd: command += f" --prometheus-scrape-containerd={scrape_containerd}" + if prometheus_memory_request: + # CL2 default is 10Gi. 
Smaller-than-default node SKUs (e.g. AKS + # Standard_D4s_v4 with 16GB) can't schedule the pod with the default + # request, and the resource-quota / limit ratio in the bundled + # prometheus manifests is rejected by k8s admission. Optional + # parameter — None preserves CL2 default for existing callers. + command += f" --prometheus-memory-request={prometheus_memory_request}" + if overrides: command += " --testoverrides=/root/perf-tests/clusterloader2/config/overrides.yaml" diff --git a/modules/python/tests/mock_data/clustermesh-scale/report/mesh-1/GenericPrometheusQuery_PodStartupLatency_clustermesh-scale-test_2026-04-28T15:00:00Z.json b/modules/python/tests/mock_data/clustermesh-scale/report/mesh-1/GenericPrometheusQuery_PodStartupLatency_clustermesh-scale-test_2026-04-28T15:00:00Z.json new file mode 100644 index 0000000000..3100934955 --- /dev/null +++ b/modules/python/tests/mock_data/clustermesh-scale/report/mesh-1/GenericPrometheusQuery_PodStartupLatency_clustermesh-scale-test_2026-04-28T15:00:00Z.json @@ -0,0 +1,29 @@ +{ + "version": "v1", + "dataItems": [ + { + "labels": { + "Metric": "Perc99" + }, + "data": { + "value": 1.2 + } + }, + { + "labels": { + "Metric": "Perc90" + }, + "data": { + "value": 0.8 + } + }, + { + "labels": { + "Metric": "Perc50" + }, + "data": { + "value": 0.4 + } + } + ] +} diff --git a/modules/python/tests/mock_data/clustermesh-scale/report/mesh-1/junit.xml b/modules/python/tests/mock_data/clustermesh-scale/report/mesh-1/junit.xml new file mode 100644 index 0000000000..34a14e3425 --- /dev/null +++ b/modules/python/tests/mock_data/clustermesh-scale/report/mesh-1/junit.xml @@ -0,0 +1,9 @@ + + + + + + + + + diff --git a/modules/python/tests/mock_data/clustermesh-scale/report/mesh-2/GenericPrometheusQuery_PodStartupLatency_clustermesh-scale-test_2026-04-28T15:00:30Z.json b/modules/python/tests/mock_data/clustermesh-scale/report/mesh-2/GenericPrometheusQuery_PodStartupLatency_clustermesh-scale-test_2026-04-28T15:00:30Z.json new file mode 
100644 index 0000000000..dbfb9aacc8 --- /dev/null +++ b/modules/python/tests/mock_data/clustermesh-scale/report/mesh-2/GenericPrometheusQuery_PodStartupLatency_clustermesh-scale-test_2026-04-28T15:00:30Z.json @@ -0,0 +1,29 @@ +{ + "version": "v1", + "dataItems": [ + { + "labels": { + "Metric": "Perc99" + }, + "data": { + "value": 1.5 + } + }, + { + "labels": { + "Metric": "Perc90" + }, + "data": { + "value": 1.0 + } + }, + { + "labels": { + "Metric": "Perc50" + }, + "data": { + "value": 0.5 + } + } + ] +} diff --git a/modules/python/tests/mock_data/clustermesh-scale/report/mesh-2/junit.xml b/modules/python/tests/mock_data/clustermesh-scale/report/mesh-2/junit.xml new file mode 100644 index 0000000000..ee983d20bc --- /dev/null +++ b/modules/python/tests/mock_data/clustermesh-scale/report/mesh-2/junit.xml @@ -0,0 +1,9 @@ + + + + + + + + + diff --git a/modules/python/tests/mock_data/clustermesh-scale/report/mesh-fail/GenericPrometheusQuery_PodStartupLatency_clustermesh-scale-test_2026-04-28T15:01:00Z.json b/modules/python/tests/mock_data/clustermesh-scale/report/mesh-fail/GenericPrometheusQuery_PodStartupLatency_clustermesh-scale-test_2026-04-28T15:01:00Z.json new file mode 100644 index 0000000000..868c276002 --- /dev/null +++ b/modules/python/tests/mock_data/clustermesh-scale/report/mesh-fail/GenericPrometheusQuery_PodStartupLatency_clustermesh-scale-test_2026-04-28T15:01:00Z.json @@ -0,0 +1,13 @@ +{ + "version": "v1", + "dataItems": [ + { + "labels": { + "Metric": "Perc99" + }, + "data": { + "value": 99.9 + } + } + ] +} diff --git a/modules/python/tests/mock_data/clustermesh-scale/report/mesh-fail/junit.xml b/modules/python/tests/mock_data/clustermesh-scale/report/mesh-fail/junit.xml new file mode 100644 index 0000000000..a9eb1b2c7f --- /dev/null +++ b/modules/python/tests/mock_data/clustermesh-scale/report/mesh-fail/junit.xml @@ -0,0 +1,8 @@ + + + + timeout waiting for deployments to become ready in cluster mesh-fail + + + + diff --git 
a/modules/python/tests/test_clustermesh_scale.py b/modules/python/tests/test_clustermesh_scale.py new file mode 100644 index 0000000000..0b9dd7510e --- /dev/null +++ b/modules/python/tests/test_clustermesh_scale.py @@ -0,0 +1,413 @@ +"""Unit tests for the clustermesh-scale CL2 harness. + +Target module: modules/python/clusterloader2/clustermesh-scale/scale.py. +Mirrors tests/test_network_scale.py — the module is loaded via importlib because +the ``clustermesh-scale`` directory contains a hyphen and is not a valid Python +package name. + +The key invariant under test is multi-cluster attribution: when collect_clusterloader2 +is called once per cluster (as the pipeline's collect.yml does), the resulting JSONL +rows must each carry distinct cluster identity while sharing run-level fields. Without +this, downstream Kusto queries cannot group/filter by cluster across the mesh. +""" +import importlib.util +import json +import os +import sys +import tempfile +import unittest +from pathlib import Path +from unittest.mock import patch + +MODULE_PATH = ( + Path(__file__).resolve().parents[1] + / "clusterloader2" + / "clustermesh-scale" + / "scale.py" +) +MODULE_SPEC = importlib.util.spec_from_file_location( + "clusterloader2_clustermesh_scale", MODULE_PATH +) +if MODULE_SPEC is None or MODULE_SPEC.loader is None: + raise ImportError(f"Unable to load module from {MODULE_PATH}") +clustermesh_scale_module = importlib.util.module_from_spec(MODULE_SPEC) +MODULE_SPEC.loader.exec_module(clustermesh_scale_module) + +configure_clusterloader2 = clustermesh_scale_module.configure_clusterloader2 +collect_clusterloader2 = clustermesh_scale_module.collect_clusterloader2 +main = clustermesh_scale_module.main + +MOCK_REPORT_ROOT = os.path.join( + os.path.dirname(__file__), "mock_data", "clustermesh-scale", "report" +) + + +class TestConfigureClustermeshScale(unittest.TestCase): + """configure_clusterloader2 writes the CL2 overrides file the pipeline expects.""" + + def 
test_overrides_file_contents(self): + """Every CL2_* knob the config template reads must appear in the overrides file.""" + with tempfile.NamedTemporaryFile( + delete=False, mode="w+", encoding="utf-8" + ) as tmp: + tmp_path = tmp.name + + try: + configure_clusterloader2( + namespaces=2, + deployments_per_namespace=3, + replicas_per_deployment=4, + operation_timeout="20m", + override_file=tmp_path, + ) + + with open(tmp_path, "r", encoding="utf-8") as f: + content = f.read() + + # Prometheus knobs — scrape Cilium agent/operator so measurement + # modules have data. Memory LIMIT honored via overrides; the + # REQUEST is set via the --prometheus-memory-request CLI flag in + # execute_clusterloader2 (CL2_PROMETHEUS_MEMORY_REQUEST is not a + # real overrides key for this CL2 image). NODE_SELECTOR pins the + # Prometheus pod to the dedicated `prompool` node defined in + # azure-2.tfvars (label prometheus=true). + self.assertIn("CL2_PROMETHEUS_TOLERATE_MASTER: true", content) + self.assertIn("CL2_PROMETHEUS_MEMORY_LIMIT: 2Gi", content) + self.assertIn('CL2_PROMETHEUS_NODE_SELECTOR: "prometheus: \\"true\\""', content) + self.assertIn("CL2_PROMETHEUS_SCRAPE_CILIUM_AGENT: true", content) + self.assertIn("CL2_PROMETHEUS_SCRAPE_CILIUM_OPERATOR: true", content) + self.assertIn("CL2_POD_STARTUP_LATENCY_THRESHOLD: 3m", content) + self.assertIn("CL2_ENABLE_VIOLATIONS_FOR_API_CALL_PROMETHEUS_SIMPLE: false", content) + self.assertNotIn("CL2_PROMETHEUS_MEMORY_REQUEST", content) + self.assertNotIn("CL2_PROMETHEUS_MEMORY_LIMIT_FACTOR", content) + self.assertNotIn("CL2_PROMETHEUS_MEMORY_SCALE_FACTOR", content) + self.assertNotIn("CL2_PROMETHEUS_CPU_SCALE_FACTOR", content) + + # Topology knobs round-tripped from arguments. 
+ self.assertIn("CL2_NAMESPACES: 2", content) + self.assertIn("CL2_DEPLOYMENTS_PER_NAMESPACE: 3", content) + self.assertIn("CL2_REPLICAS_PER_DEPLOYMENT: 4", content) + self.assertIn("CL2_OPERATION_TIMEOUT: 20m", content) + finally: + os.remove(tmp_path) + + def test_overrides_file_timeout_passthrough(self): + """Caller-provided operation_timeout flows through unchanged (no clamping).""" + with tempfile.NamedTemporaryFile( + delete=False, mode="w+", encoding="utf-8" + ) as tmp: + tmp_path = tmp.name + try: + configure_clusterloader2( + namespaces=1, + deployments_per_namespace=1, + replicas_per_deployment=1, + operation_timeout="45m", + override_file=tmp_path, + ) + with open(tmp_path, "r", encoding="utf-8") as f: + self.assertIn("CL2_OPERATION_TIMEOUT: 45m", f.read()) + finally: + os.remove(tmp_path) + + +class TestCollectSingleCluster(unittest.TestCase): + """collect_clusterloader2 emits one JSONL row per call, tagged with cluster identity.""" + + def _collect(self, *, cluster_name, cluster_count=2, mesh_size=2, + test_type="unit-test", report_subdir="mesh-1"): + fd, result_file = tempfile.mkstemp(suffix=".jsonl") + os.close(fd) + collect_clusterloader2( + cl2_report_dir=os.path.join(MOCK_REPORT_ROOT, report_subdir), + cloud_info=json.dumps({"cloud": "azure", "region": "eastus2"}), + run_id="test-run-123", + run_url="http://example.com/run123", + result_file=result_file, + test_type=test_type, + start_timestamp="2026-04-28T15:00:00Z", + cluster_name=cluster_name, + cluster_count=cluster_count, + mesh_size=mesh_size, + namespaces=2, + deployments_per_namespace=3, + replicas_per_deployment=4, + trigger_reason="Manual", + ) + return result_file + + def test_collect_creates_result_file(self): + """collect_clusterloader2 writes a non-empty JSONL with run-level fields.""" + result_file = self._collect(cluster_name="mesh-1") + try: + self.assertTrue(os.path.exists(result_file)) + with open(result_file, "r", encoding="utf-8") as f: + content = f.read() + 
self.assertGreater(len(content), 0) + 
lines = content.strip().split("\n") + self.assertGreaterEqual(len(lines), 1) + row = json.loads(lines[0]) + self.assertEqual(row["status"], "success") + self.assertEqual(row["run_id"], "test-run-123") + self.assertEqual(row["test_type"], "unit-test") + self.assertEqual(row["start_timestamp"], "2026-04-28T15:00:00Z") + finally: + if os.path.exists(result_file): + os.remove(result_file) + + def test_collect_attributes_cluster_identity(self): + """Cluster identity is propagated to BOTH top-level and test_details, per Kusto schema.""" + result_file = self._collect(cluster_name="mesh-1", cluster_count=2) + try: + with open(result_file, "r", encoding="utf-8") as f: + row = json.loads(f.read().strip().split("\n")[0]) + self.assertEqual(row["cluster"], "mesh-1") + self.assertEqual(row["cluster_count"], 2) + self.assertEqual(row["test_details"]["cluster"], "mesh-1") + self.assertEqual(row["test_details"]["cluster_count"], 2) + finally: + if os.path.exists(result_file): + os.remove(result_file) + + def test_collect_computes_pods_per_cluster(self): + """pods_per_cluster = namespaces * deployments * replicas (2 * 3 * 4 = 24).""" + result_file = self._collect(cluster_name="mesh-1") + try: + with open(result_file, "r", encoding="utf-8") as f: + row = json.loads(f.read().strip().split("\n")[0]) + self.assertEqual(row["test_details"]["pods_per_cluster"], 24) + self.assertEqual(row["namespaces"], 2) + self.assertEqual(row["deployments_per_namespace"], 3) + self.assertEqual(row["replicas_per_deployment"], 4) + finally: + if os.path.exists(result_file): + os.remove(result_file) + + def test_collect_emits_mesh_size_independent_of_cluster_count(self): + """mesh_size (configured target) and cluster_count (observed) must be distinct fields. + + Querying ``mesh_size != cluster_count`` in Kusto is how we surface + partial-mesh runs — a Fleet member that failed to join would manifest + as a smaller observed cluster_count than the configured mesh_size. 
+ Both fields must be present at top level AND in test_details. + """ + result_file = self._collect(cluster_name="mesh-1", cluster_count=4, mesh_size=5) + try: + with open(result_file, "r", encoding="utf-8") as f: + row = json.loads(f.read().strip().split("\n")[0]) + self.assertEqual(row["mesh_size"], 5) + self.assertEqual(row["cluster_count"], 4) + self.assertEqual(row["test_details"]["mesh_size"], 5) + self.assertEqual(row["test_details"]["cluster_count"], 4) + self.assertNotEqual(row["mesh_size"], row["cluster_count"]) + finally: + if os.path.exists(result_file): + os.remove(result_file) + + def test_collect_propagates_test_type(self): + """test_type tags every JSONL row so Kusto can filter scenario flavors. + + Scale-scenario #1 (event-throughput) and the default-config Phase-1 + smoke run share one results table; downstream dashboards filter on + ``test_type == 'event-throughput'`` to scope the scaling-curve view + to the right workload. Regression-guards that the field flows through + unmodified. + """ + result_file = self._collect(cluster_name="mesh-1", test_type="event-throughput") + try: + with open(result_file, "r", encoding="utf-8") as f: + row = json.loads(f.read().strip().split("\n")[0]) + self.assertEqual(row["test_type"], "event-throughput") + finally: + if os.path.exists(result_file): + os.remove(result_file) + + +class TestCollectMultiCluster(unittest.TestCase): + """The multi-cluster aggregation invariant — the reason this scenario exists. + + collect.yml calls scale.py once per cluster and concatenates per-cluster JSONL + files into a single TEST_RESULTS_FILE. The resulting stream MUST have: + * one logical row per cluster + * each row's `cluster` field distinct + * `cluster_count` consistent across rows + * `run_id` consistent across rows (same pipeline run) + Without this, downstream Kusto cannot group/filter by cluster. 
+ """ + + def _collect(self, *, cluster_name, report_subdir): + fd, result_file = tempfile.mkstemp(suffix=f".{cluster_name}.jsonl") + os.close(fd) + collect_clusterloader2( + cl2_report_dir=os.path.join(MOCK_REPORT_ROOT, report_subdir), + cloud_info=json.dumps({"cloud": "azure"}), + run_id="multi-cluster-run", + run_url="http://example.com/multi", + result_file=result_file, + test_type="unit-test", + start_timestamp="2026-04-28T15:00:00Z", + cluster_name=cluster_name, + cluster_count=2, + mesh_size=2, + namespaces=1, + deployments_per_namespace=1, + replicas_per_deployment=1, + trigger_reason="", + ) + return result_file + + def test_two_clusters_aggregate_with_distinct_attribution(self): + """Aggregating per-cluster JSONLs yields rows with distinct cluster identity.""" + f1 = self._collect(cluster_name="mesh-1", report_subdir="mesh-1") + f2 = self._collect(cluster_name="mesh-2", report_subdir="mesh-2") + try: + # Mirror what collect.yml does: cat per-cluster files into one stream. + aggregated = "" + for path in (f1, f2): + with open(path, "r", encoding="utf-8") as f: + aggregated += f.read() + + rows = [json.loads(line) for line in aggregated.strip().split("\n") if line] + # Each per-cluster collect emits at least one row (overall testsuite line). + self.assertGreaterEqual(len(rows), 2) + + clusters_seen = {row["cluster"] for row in rows} + self.assertEqual(clusters_seen, {"mesh-1", "mesh-2"}) + + # Run-level fields must be identical across all rows. + run_ids = {row["run_id"] for row in rows} + cluster_counts = {row["cluster_count"] for row in rows} + mesh_sizes = {row["mesh_size"] for row in rows} + self.assertEqual(run_ids, {"multi-cluster-run"}) + self.assertEqual(cluster_counts, {2}) + # mesh_size is a run-level constant — it must be identical across + # every per-cluster row in the aggregated stream. 
+ self.assertEqual(mesh_sizes, {2}) + finally: + for path in (f1, f2): + if os.path.exists(path): + os.remove(path) + + +class TestCollectFailureStatus(unittest.TestCase): + """A junit.xml with failures>0 must produce status=failure (no silent green).""" + + def test_failure_in_junit_propagates_to_status(self): + """A junit testsuite with failures>0 must surface as status=failure in the JSONL.""" + fd, result_file = tempfile.mkstemp(suffix=".jsonl") + os.close(fd) + try: + collect_clusterloader2( + cl2_report_dir=os.path.join(MOCK_REPORT_ROOT, "mesh-fail"), + cloud_info="", + run_id="fail-run", + run_url="", + result_file=result_file, + test_type="unit-test", + start_timestamp="2026-04-28T15:00:00Z", + cluster_name="mesh-fail", + cluster_count=2, + mesh_size=2, + namespaces=1, + deployments_per_namespace=1, + replicas_per_deployment=1, + trigger_reason="", + ) + with open(result_file, "r", encoding="utf-8") as f: + row = json.loads(f.read().strip().split("\n")[0]) + self.assertEqual(row["status"], "failure") + self.assertEqual(row["cluster"], "mesh-fail") + details = row["test_details"]["details"] + self.assertIsNotNone(details) + self.assertIn("timeout", json.dumps(details).lower()) + finally: + if os.path.exists(result_file): + os.remove(result_file) + + +class TestMainArgumentParsing(unittest.TestCase): + """main() dispatches subcommands to the right function with the right args.""" + + @patch.object(clustermesh_scale_module, "configure_clusterloader2") + def test_configure_command_parsing(self, mock_configure): + """`configure` subcommand wires CLI args through to configure_clusterloader2.""" + test_args = [ + "clustermesh-scale/scale.py", + "configure", + "--namespaces", "2", + "--deployments-per-namespace", "3", + "--replicas-per-deployment", "4", + "--operation-timeout", "20m", + "--cl2_override_file", "/tmp/overrides.yaml", + ] + with patch.object(sys, "argv", test_args): + main() + mock_configure.assert_called_once_with(2, 3, 4, "20m", "/tmp/overrides.yaml") + + 
@patch.object(clustermesh_scale_module, "execute_clusterloader2") + def test_execute_command_parsing(self, mock_execute): + """`execute` subcommand wires CLI args through to execute_clusterloader2.""" + test_args = [ + "clustermesh-scale/scale.py", + "execute", + "--cl2-image", "ghcr.io/azure/clusterloader2:v20250513", + "--cl2-config-dir", "/path/to/config", + "--cl2-report-dir", "/path/to/report", + "--cl2-config-file", "config.yaml", + "--kubeconfig", "/path/to/kubeconfig", + "--provider", "aks", + ] + with patch.object(sys, "argv", test_args): + main() + mock_execute.assert_called_once_with( + "ghcr.io/azure/clusterloader2:v20250513", + "/path/to/config", + "/path/to/report", + "config.yaml", + "/path/to/kubeconfig", + "aks", + ) + + @patch.object(clustermesh_scale_module, "collect_clusterloader2") + def test_collect_command_parsing(self, mock_collect): + """`collect` subcommand wires CLI args through to collect_clusterloader2.""" + test_args = [ + "clustermesh-scale/scale.py", + "collect", + "--cl2_report_dir", "/path/to/report", + "--cloud_info", "{}", + "--run_id", "abc", + "--run_url", "http://example.com", + "--result_file", "/tmp/results.jsonl", + "--test_type", "default-config", + "--start_timestamp", "2026-04-28T15:00:00Z", + "--cluster-name", "mesh-1", + "--cluster-count", "2", + "--mesh-size", "2", + "--namespaces", "1", + "--deployments-per-namespace", "1", + "--replicas-per-deployment", "1", + "--trigger_reason", "Manual", + ] + with patch.object(sys, "argv", test_args): + main() + mock_collect.assert_called_once_with( + "/path/to/report", + "{}", + "abc", + "http://example.com", + "/tmp/results.jsonl", + "default-config", + "2026-04-28T15:00:00Z", + "mesh-1", + 2, + 2, + 1, + 1, + 1, + "Manual", + ) + + +if __name__ == "__main__": + unittest.main() diff --git a/modules/terraform/azure/aks-cli/main.tf b/modules/terraform/azure/aks-cli/main.tf index 47395fcab6..687ca04e5b 100644 --- a/modules/terraform/azure/aks-cli/main.tf +++ 
b/modules/terraform/azure/aks-cli/main.tf @@ -53,6 +53,12 @@ locals { try(var.subnets_map[var.aks_cli_config.subnet_name], null) ) + pod_subnet_id = ( + try(var.aks_cli_config.pod_subnet_name, null) == null ? + null : + try(var.subnets_map[var.aks_cli_config.pod_subnet_name], null) + ) + api_server_subnet_id = ( var.aks_cli_config.api_server_subnet_name == null ? null : @@ -118,6 +124,14 @@ locals { ) ) + pod_subnet_id_parameter = (local.pod_subnet_id == null ? + "" : + format( + "%s %s", + "--pod-subnet-id", local.pod_subnet_id, + ) + ) + managed_identity_parameter = (var.aks_cli_config.managed_identity_name == null ? "--enable-managed-identity" : format( @@ -193,6 +207,7 @@ locals { local.kms_parameters, local.disk_encryption_parameters, local.subnet_id_parameter, + local.pod_subnet_id_parameter, local.managed_identity_parameter, local.kubelet_identity_parameter, local.api_server_vnet_integration_parameter, diff --git a/modules/terraform/azure/fleet/main.tf b/modules/terraform/azure/fleet/main.tf new file mode 100644 index 0000000000..559050996e --- /dev/null +++ b/modules/terraform/azure/fleet/main.tf @@ -0,0 +1,336 @@ +# ============================================================================= +# Fleet + ClusterMesh Profile submodule +# +# Mirrors Steps 4-6 of fleet-setup-script.sh: +# Step 4: az fleet create +# Step 5: az fleet member create --labels mesh=true (per cluster) +# Step 6: az fleet clustermeshprofile create --selector mesh=true +# az fleet clustermeshprofile apply +# +# Design decisions: +# - Fleet resource: azapi_resource. There is no stable azurerm resource that +# covers managed Fleet with the shape we need, and the clustermeshprofile +# lives under the same ARM parent, so keeping Fleet in azapi keeps the +# parent_id references simple. +# - Fleet members: terraform_data + local-exec wrapping +# `az fleet member create --labels`. 
Member labels (needed by the +# clustermeshprofile selector) are first-class in the Fleet ARM API but +# the azapi resource body shape is currently rejected for this field; +# az CLI is the supported surface today. +# - ClusterMeshProfile create/apply: terraform_data + local-exec, wrapping +# `az fleet clustermeshprofile create` and `apply`. The ARM resource type +# is still private-preview — az CLI (v2.0.4+ private .whl) is currently +# the only path. Create and destroy commands are stored inside +# terraform_data.input so the destroy-time provisioner can reference +# self.input. (destroy-time provisioners can't read vars/locals). +# Same pattern as modules/terraform/azure/aks-cli/main.tf:271-318. +# ============================================================================= + +locals { + fleet_enabled = var.fleet_enabled + + members_by_name = { for m in var.members : m.member_name => m } + + # Construct AKS resource IDs from known inputs. aks-cli does not emit outputs. + # The depends_on chain on the fleet module instance ensures AKS exists before + # these IDs are referenced by the member create call. + aks_resource_id = { + for m in var.members : + m.member_name => format( + "/subscriptions/%s/resourceGroups/%s/providers/Microsoft.ContainerService/managedClusters/%s", + var.subscription_id, + var.resource_group_name, + m.aks_name, + ) + } +} + +# ----------------------------------------------------------------------------- +# Step 4: Fleet resource +# ----------------------------------------------------------------------------- +resource "azapi_resource" "fleet" { + count = local.fleet_enabled ? 
1 : 0 + + type = "Microsoft.ContainerService/fleets@2025-03-01" + name = var.fleet_name + parent_id = "/subscriptions/${var.subscription_id}/resourceGroups/${var.resource_group_name}" + location = var.location + tags = var.tags + + body = { + properties = {} + } +} + +# ----------------------------------------------------------------------------- +# Step 5: Fleet members (one per AKS cluster), labeled for the mesh selector. +# +# Implemented via local-exec for two reasons: +# 1. Mirrors the source script exactly (`az fleet member create --labels mesh=true`). +# 2. The Fleet member ARM API rejects azapi-style bodies for the `labels` field; +# az CLI is the supported surface for this resource shape today. +# +# Same pattern as the clustermeshprofile below: command stored in +# terraform_data.input so destroy-time provisioner can reference self.input.*. +# ----------------------------------------------------------------------------- +locals { + member_create_command = { + for m in var.members : m.member_name => join(" ", [ + "az fleet member create", + "--subscription", var.subscription_id, + "--resource-group", var.resource_group_name, + "--fleet-name", var.fleet_name, + "--name", m.member_name, + "--member-cluster-id", local.aks_resource_id[m.member_name], + "--labels", "${var.member_label_key}=${var.member_label_value}", + "--output", "none", + ]) + } + + member_destroy_command = { + for m in var.members : m.member_name => join(" ", [ + "az fleet member delete", + "--subscription", var.subscription_id, + "--resource-group", var.resource_group_name, + "--fleet-name", var.fleet_name, + "--name", m.member_name, + "--yes", + "--output", "none", + ]) + } + + # Re-label members during destroy so the clustermeshprofile's + # `${member_label_key}=${member_label_value}` selector no longer matches — + # this is the only way out of the Fleet API's chicken-and-egg between + # `member delete` (rejects with MemberBelongsToClusterMesh while attached) + # and `clustermeshprofile 
delete` (rejects with + # CannotDeleteClusterMeshProfileWithMembers while members exist). The + # value `detaching` is intentionally non-matching; `az fleet member update + # --labels` REPLACES the labels map (it's not additive), so this also + # drops the original mesh=true label. + member_relabel_command = { + for m in var.members : m.member_name => join(" ", [ + "az fleet member update", + "--subscription", var.subscription_id, + "--resource-group", var.resource_group_name, + "--fleet-name", var.fleet_name, + "--name", m.member_name, + "--labels", "${var.member_label_key}=detaching", + "--output", "none", + ]) + } +} + +resource "terraform_data" "member" { + for_each = local.fleet_enabled ? local.members_by_name : {} + + depends_on = [azapi_resource.fleet] + + input = { + create_command = local.member_create_command[each.value.member_name] + destroy_command = local.member_destroy_command[each.value.member_name] + } + + # Bash retry loop. The Fleet RP can lag behind the AKS RP by 30-60s after + # a fresh AKS create; without retry, `az fleet member create` returns + # DependentResourceNotFound. Additionally, the AKS cluster can be in + # `Updating` state for several minutes after the Network Contributor role + # assignment on the VNet (granted in modules/terraform/azure/main.tf for the + # clustermesh-apiserver internal LB) — `az fleet member create` rejects + # with `ManagedClusterNotInExpectedState` until reconciliation finishes. + # 60 x 20s = 20 min covers slow Azure days; the happy path exits on the + # first attempt (~5s). + provisioner "local-exec" { + interpreter = ["bash", "-c"] + command = <<-EOT + set -euo pipefail + cmd='${self.input.create_command}' + max=60 + delay=20 + for i in $(seq 1 $max); do + echo "[$i/$max] $cmd" + if eval "$cmd"; then + exit 0 + fi + if [ "$i" -lt "$max" ]; then + echo "Fleet RP not ready yet, retrying in $${delay}s..." 
+ sleep "$delay" + fi + done + echo "az fleet member create failed after $max attempts" >&2 + exit 1 + EOT + } + + provisioner "local-exec" { + when = destroy + interpreter = ["bash", "-c"] + command = "${self.input.destroy_command} || true" + } +} + +# ----------------------------------------------------------------------------- +# Step 6: ClusterMesh profile (create + apply) via local-exec. +# +# Both the create and the destroy commands are stored inside +# terraform_data.input so the destroy provisioner can reference self.input.* +# (destroy-time provisioners cannot reference var.* or local.*). +# +# Destroy ordering: this resource depends on every fleet member, so on destroy +# Terraform tears down the profile BEFORE the members (and before the AKS +# clusters downstream). That matches the source-of-truth teardown: detach the +# mesh before the clusters disappear, else extension reconciliation hangs. +# ----------------------------------------------------------------------------- +locals { + cmp_create_command = local.fleet_enabled ? join(" ", [ + "az fleet clustermeshprofile create", + "--subscription", var.subscription_id, + "--resource-group", var.resource_group_name, + "--fleet-name", var.fleet_name, + "--name", var.cmp_name, + "--selector", "${var.member_label_key}=${var.member_label_value}", + "--output", "none", + ]) : "true" + + cmp_apply_command = local.fleet_enabled ? join(" ", [ + "az fleet clustermeshprofile apply", + "--subscription", var.subscription_id, + "--resource-group", var.resource_group_name, + "--fleet-name", var.fleet_name, + "--name", var.cmp_name, + "--output", "none", + ]) : "true" + + cmp_destroy_command = local.fleet_enabled ? 
join(" ", [ + "az fleet clustermeshprofile delete", + "--subscription", var.subscription_id, + "--resource-group", var.resource_group_name, + "--fleet-name", var.fleet_name, + "--name", var.cmp_name, + "--yes", + "--output", "none", + ]) : "true" + + # Returns the count of fleet members CURRENTLY APPLIED to the profile (i.e. + # in the profile's reconciled member set, not just selector-matched). Used + # by the destroy provisioner to wait for relabel+apply to drain the set + # before attempting the profile delete. + cmp_list_applied_count_command = local.fleet_enabled ? join(" ", [ + "az fleet clustermeshprofile list-members", + "--subscription", var.subscription_id, + "--resource-group", var.resource_group_name, + "--fleet-name", var.fleet_name, + "--name", var.cmp_name, + "--query", "'length(@)'", + "--output", "tsv", + ]) : "echo 0" +} + +resource "terraform_data" "clustermeshprofile" { + count = local.fleet_enabled ? 1 : 0 + + depends_on = [ + terraform_data.member, + ] + + input = { + create_command = local.cmp_create_command + apply_command = local.cmp_apply_command + delete_command = local.cmp_destroy_command + # `list-members` (default mode) returns members APPLIED to the profile — + # the same set the profile-delete API checks. We poll its count to know + # when the relabel+apply reconcile has actually drained membership. + list_applied_count_command = local.cmp_list_applied_count_command + # Pre-built per-member `az fleet member update --labels` commands. Joined + # with newlines and embedded in self.input because destroy provisioners + # can only access self.input.* (not var.* / local.*). + member_relabel_commands = local.fleet_enabled ? join("\n", values(local.member_relabel_command)) : "" + } + + # create + apply are two separate az calls. Use bash with `set -euo pipefail` + # so any failure aborts the chain. 
+ provisioner "local-exec" { + interpreter = ["bash", "-c"] + command = "set -euo pipefail; ${self.input.create_command}; ${self.input.apply_command}" + } + + # Destroy-time: Fleet's API has a chicken-and-egg between member-delete + # and clustermeshprofile-delete: + # - `az fleet member delete` rejects with `MemberBelongsToClusterMesh` + # while the member is still selected by any clustermeshprofile. + # - `az fleet clustermeshprofile delete` rejects with + # `CannotDeleteClusterMeshProfileWithMembers` while any member is + # still in the profile. + # The az fleet 2.0.4 extension exposes no first-class detach/remove-member + # command. The way out is to UPDATE each member's labels to a value that + # the profile selector no longer matches (the profile selects on + # `${var.member_label_key}=${var.member_label_value}` from create-time), + # then re-`apply` the profile so it reconciles to an empty member set, + # then delete the profile. After that the per-member destroy provisioner + # on terraform_data.member runs successfully (members are no longer + # attached to any profile). + # + # All steps are best-effort (`|| true` / `exit 0` at the end) so a + # partial-state teardown still progresses to RG cleanup. + provisioner "local-exec" { + when = destroy + interpreter = ["bash", "-c"] + command = <<-EOT + set -uo pipefail + # 1. Relabel every member off the profile's selector. After this, a + # subsequent `apply` will reconcile the profile's member set to empty. + printf '%s\n' "${self.input.member_relabel_commands}" | while IFS= read -r cmd; do + [ -n "$cmd" ] || continue + echo "[relabel-member] $cmd" + eval "$cmd" || true + done + + # 2. Issue an apply to start the reconcile. apply is async on the Fleet + # RP — `az fleet clustermeshprofile apply` returns when the LRO is + # accepted, but membership reconciliation (including draining the old + # applied set) can lag behind by several minutes. 
+ echo "[apply-profile] ${self.input.apply_command}" + eval "${self.input.apply_command}" || true + + # 3. Poll the profile's APPLIED member count until it reaches 0. Re-issue + # `apply` periodically as a nudge in case the first one was a no-op + # (e.g. Fleet RP hadn't yet observed the relabeled members). + # Budget: 120 x 5s = 10 min. + drained=false + for i in $(seq 1 120); do + count=$(eval "${self.input.list_applied_count_command}" 2>/dev/null | tr -d '[:space:]') + echo "[poll-members] attempt $i/120: applied count='$count'" + if [ "$count" = "0" ]; then + drained=true + break + fi + # Re-apply every minute (every 12 polls) to push Fleet RP if the + # initial apply didn't pick up the relabel. + if [ "$i" -gt 1 ] && [ $((i % 12)) -eq 0 ]; then + echo "[apply-profile] (nudge) ${self.input.apply_command}" + eval "${self.input.apply_command}" || true + fi + sleep 5 + done + if [ "$drained" != "true" ]; then + echo "[poll-members] timed out waiting for applied set to drain; will still attempt delete" + fi + + # 4. Delete the profile. Brief retry as a backstop in case there's still + # propagation lag between list-members showing 0 and delete being allowed. + echo "[delete-profile] ${self.input.delete_command}" + for i in $(seq 1 30); do + if eval "${self.input.delete_command}"; then + echo "[delete-profile] succeeded on attempt $i" + exit 0 + fi + if [ "$i" -lt 30 ]; then + echo "[delete-profile] retry $i/30 in 5s" + sleep 5 + fi + done + echo "[delete-profile] gave up after 30 attempts; downstream cleanup will proceed" + exit 0 + EOT + } +} diff --git a/modules/terraform/azure/fleet/outputs.tf b/modules/terraform/azure/fleet/outputs.tf new file mode 100644 index 0000000000..04c5ff508e --- /dev/null +++ b/modules/terraform/azure/fleet/outputs.tf @@ -0,0 +1,14 @@ +output "fleet_name" { + description = "Name of the Fleet resource (empty when fleet_enabled=false)." + value = var.fleet_enabled ? 
var.fleet_name : "" +} + +output "cmp_name" { + description = "Name of the ClusterMesh profile (empty when fleet_enabled=false)." + value = var.fleet_enabled ? var.cmp_name : "" +} + +output "member_names" { + description = "List of fleet member names created." + value = var.fleet_enabled ? [for m in var.members : m.member_name] : [] +} diff --git a/modules/terraform/azure/fleet/variables.tf b/modules/terraform/azure/fleet/variables.tf new file mode 100644 index 0000000000..ee4820e779 --- /dev/null +++ b/modules/terraform/azure/fleet/variables.tf @@ -0,0 +1,57 @@ +variable "fleet_enabled" { + description = "Whether to create the Fleet, members, and clustermeshprofile." + type = bool + default = false +} + +variable "resource_group_name" { + description = "Resource group that contains the Fleet and the member AKS clusters." + type = string +} + +variable "location" { + description = "Azure region for the Fleet resource." + type = string +} + +variable "subscription_id" { + description = "Azure subscription GUID (used to construct AKS resource IDs and CLI calls)." + type = string +} + +variable "fleet_name" { + description = "Name of the Azure Fleet Manager resource." + type = string +} + +variable "cmp_name" { + description = "Name of the Fleet ClusterMesh Profile." + type = string +} + +variable "member_label_key" { + description = "Label key set on fleet members and used as the clustermeshprofile selector." + type = string + default = "mesh" +} + +variable "member_label_value" { + description = "Label value set on fleet members and used as the clustermeshprofile selector." + type = string + default = "true" +} + +variable "members" { + description = "List of fleet members. aks_name identifies the AKS cluster in the same resource group; member_name is the Fleet-side name (intentionally may differ from aks_name)." 
+ type = list(object({ + member_name = string + aks_name = string + })) + default = [] +} + +variable "tags" { + description = "Tags applied to the Fleet resource." + type = map(string) + default = {} +} diff --git a/modules/terraform/azure/fleet/versions.tf b/modules/terraform/azure/fleet/versions.tf new file mode 100644 index 0000000000..71a8e66c18 --- /dev/null +++ b/modules/terraform/azure/fleet/versions.tf @@ -0,0 +1,9 @@ +terraform { + required_version = ">=1.5.6" + required_providers { + azapi = { + source = "Azure/azapi" + version = "2.8.0" + } + } +} diff --git a/modules/terraform/azure/main.tf b/modules/terraform/azure/main.tf index ea48654f41..2d04ad1bf4 100644 --- a/modules/terraform/azure/main.tf +++ b/modules/terraform/azure/main.tf @@ -320,3 +320,94 @@ module "virtual_machine" { # Ensure AKS cluster is created before VM tries to look it up for RBAC depends_on = [module.aks, module.aks-cli, module.azapi] } + +# ============================================================================= +# ClusterMesh add-ons (vnet-peering + fleet + clustermeshprofile). +# +# Both are no-ops unless explicitly enabled in their *_config variable. Used +# today only by the clustermesh-scale scenario. +# ============================================================================= + +data "azurerm_client_config" "current" {} + +module "vnet_peering" { + source = "./vnet-peering" + + peering_enabled = try(var.vnet_peering_config.enabled, false) + resource_group_name = local.run_id + vnet_role_to_id = { for role in keys(local.network_config_map) : role => module.virtual_network[role].vnet_id } + vnet_role_to_name = { for role, nw in local.network_config_map : role => nw.vnet_name } + + depends_on = [module.virtual_network] +} + +# ----------------------------------------------------------------------------- +# Network Contributor on each member's VNet for the AKS control-plane identity. 
+# +# Required so AKS cloud-controller-manager can provision the +# clustermesh-apiserver internal LoadBalancer Service. `az aks create` +# auto-grants the cluster identity Network Contributor on the *node subnet*, +# but LB provisioning on that subnet additionally needs VNet-level read. +# Without this grant the Service stays at EXTERNAL-IP=<pending>, the +# `cilium clustermesh status` CLI fails with "unable to derive service IPs +# automatically", and the per-agent `cilium-clustermesh` secret is never +# populated → cilium-dbg reports "ClusterMesh: 0/0 remote clusters ready". +# +# Mirrors fleet-setup-script.sh Step 3 (the reference manual setup script). +# Gated on fleet_config.enabled so non-clustermesh scenarios are unaffected. +# ----------------------------------------------------------------------------- +locals { + clustermesh_member_roles = try(var.fleet_config.enabled, false) ? { + for m in try(var.fleet_config.members, []) : m.aks_role => m.aks_role + } : {} +} + +data "azurerm_kubernetes_cluster" "clustermesh_member" { + for_each = local.clustermesh_member_roles + + name = local.aks_cli_config_map[each.key].aks_name + resource_group_name = local.run_id + + # aks-cli creates the cluster via local-exec; depends_on defers the data + # read until apply time when the cluster actually exists. 
+ depends_on = [module.aks-cli] +} + +resource "azurerm_role_assignment" "clustermesh_vnet_contributor" { + for_each = local.clustermesh_member_roles + + scope = module.virtual_network[each.key].vnet_id + role_definition_name = "Network Contributor" + principal_id = data.azurerm_kubernetes_cluster.clustermesh_member[each.key].identity[0].principal_id +} + +module "fleet" { + source = "./fleet" + + fleet_enabled = try(var.fleet_config.enabled, false) + resource_group_name = local.run_id + location = local.region + subscription_id = data.azurerm_client_config.current.subscription_id + fleet_name = try(var.fleet_config.fleet_name, "") + cmp_name = try(var.fleet_config.cmp_name, "") + member_label_key = try(var.fleet_config.member_label_key, "mesh") + member_label_value = try(var.fleet_config.member_label_value, "true") + members = [ + for m in try(var.fleet_config.members, []) : { + member_name = m.member_name + aks_name = local.aks_cli_config_map[m.aks_role].aks_name + } + ] + tags = local.tags + + # AKS clusters must exist before we join them as fleet members and apply the + # mesh profile. Peering must exist too — apply reaches the mesh-apiserver LB + # endpoints cross-cluster, which requires peering (separate-VNet mode). + # Network Contributor on each VNet must exist before clustermeshprofile apply + # so cloud-controller-manager can provision the apiserver internal LB. 
+ depends_on = [ + module.aks-cli, + module.vnet_peering, + azurerm_role_assignment.clustermesh_vnet_contributor, + ] +} diff --git a/modules/terraform/azure/variables.tf b/modules/terraform/azure/variables.tf index 0c57fc6869..deb028690d 100644 --- a/modules/terraform/azure/variables.tf +++ b/modules/terraform/azure/variables.tf @@ -472,6 +472,7 @@ variable "aks_cli_config_list" { managed_identity_name = optional(string, null) subnet_name = optional(string, null) + pod_subnet_name = optional(string, null) kubernetes_version = optional(string, null) aks_custom_headers = optional(list(string), []) use_custom_configurations = optional(bool, false) @@ -586,3 +587,32 @@ variable "disk_encryption_set_config_list" { } } + +# ============================================================================= +# ClusterMesh additions (optional; used by the clustermesh-scale scenario). +# Both default to disabled so existing scenarios are unaffected. +# ============================================================================= + +variable "vnet_peering_config" { + description = "Pairwise VNet peering across all VNets in network_config_list. Keys are stable src_role-dst_role so adding a cluster does not churn existing peerings." + type = object({ + enabled = optional(bool, false) + }) + default = {} +} + +variable "fleet_config" { + description = "Azure Fleet + ClusterMesh profile. When enabled, provisions a Fleet resource, one member per entry in members (labeled member_label_key=member_label_value), and creates+applies a clustermeshprofile via local-exec against the private-preview az fleet CLI (see modules/terraform/azure/fleet/)." 
+ type = object({ + enabled = optional(bool, false) + fleet_name = optional(string, "") + cmp_name = optional(string, "") + member_label_key = optional(string, "mesh") + member_label_value = optional(string, "true") + members = optional(list(object({ + member_name = string + aks_role = string + })), []) + }) + default = {} +} diff --git a/modules/terraform/azure/vnet-peering/main.tf b/modules/terraform/azure/vnet-peering/main.tf new file mode 100644 index 0000000000..20ffa88fbf --- /dev/null +++ b/modules/terraform/azure/vnet-peering/main.tf @@ -0,0 +1,40 @@ +# ============================================================================= +# VNet peering submodule — pairwise mesh +# +# Mirrors Step 3b in fleet-setup-script.sh (SHARED_VNET=false mode): +# creates az network vnet peering create in both directions for every ordered +# pair (src, dst) with src != dst, over the VNets in var.vnet_role_to_id. +# +# for_each keys are the stable string "${src_role}->${dst_role}", so adding a +# new cluster role does NOT churn peerings that already exist between other pairs. +# ============================================================================= + +locals { + peering_pairs = var.peering_enabled ? 
{ + for pair in flatten([ + for src_role, src_id in var.vnet_role_to_id : [ + for dst_role, dst_id in var.vnet_role_to_id : { + key = "${src_role}->${dst_role}" + src_role = src_role + dst_role = dst_role + src_id = src_id + dst_id = dst_id + src_name = var.vnet_role_to_name[src_role] + } if src_role != dst_role + ] + ]) : pair.key => pair + } : {} +} + +resource "azurerm_virtual_network_peering" "peering" { + for_each = local.peering_pairs + + name = "${each.value.src_name}-to-${each.value.dst_role}" + resource_group_name = var.resource_group_name + virtual_network_name = each.value.src_name + remote_virtual_network_id = each.value.dst_id + allow_virtual_network_access = true + allow_forwarded_traffic = false + allow_gateway_transit = false + use_remote_gateways = false +} diff --git a/modules/terraform/azure/vnet-peering/outputs.tf b/modules/terraform/azure/vnet-peering/outputs.tf new file mode 100644 index 0000000000..d8f9d9f69e --- /dev/null +++ b/modules/terraform/azure/vnet-peering/outputs.tf @@ -0,0 +1,4 @@ +output "peering_keys" { + description = "List of peering keys (src_role->dst_role) that were created." + value = keys(azurerm_virtual_network_peering.peering) +} diff --git a/modules/terraform/azure/vnet-peering/variables.tf b/modules/terraform/azure/vnet-peering/variables.tf new file mode 100644 index 0000000000..7aabadcf7b --- /dev/null +++ b/modules/terraform/azure/vnet-peering/variables.tf @@ -0,0 +1,22 @@ +variable "peering_enabled" { + description = "Whether to create pairwise VNet peerings between all VNets in vnet_role_to_id." + type = bool + default = false +} + +variable "vnet_role_to_id" { + description = "Map of network role => VNet resource ID. Every pair (a, b) with a != b gets two peerings (a->b and b->a)." + type = map(string) + default = {} +} + +variable "vnet_role_to_name" { + description = "Map of network role => VNet name. Used to name the peering resource on the source VNet." 
+ type = map(string) + default = {} +} + +variable "resource_group_name" { + description = "Resource group containing all VNets." + type = string +} diff --git a/pipelines/perf-eval/Network Benchmark/clustermesh-scale.yml b/pipelines/perf-eval/Network Benchmark/clustermesh-scale.yml new file mode 100644 index 0000000000..caaedc0ea0 --- /dev/null +++ b/pipelines/perf-eval/Network Benchmark/clustermesh-scale.yml @@ -0,0 +1,69 @@ +trigger: none + +pool: AKS-Telescope-Airlock + +schedules: + - cron: "0 4 * * 0" + displayName: Weekly Sunday 4am clustermesh scale test + branches: + include: + - main + always: false + +variables: + SCENARIO_TYPE: perf-eval + SCENARIO_NAME: clustermesh-scale + OWNER: aks + +stages: + - stage: azure_eastus2euap + dependsOn: [] + jobs: + - template: /jobs/competitive-test.yml + parameters: + cloud: azure + regions: + - eastus2euap + engine: clusterloader2 + engine_input: + image: "ghcr.io/azure/clusterloader2:v20250513" + install: false + operation_timeout: 15m + topology: clustermesh-scale + terraform_input_file_mapping: + - eastus2euap: "scenarios/perf-eval/clustermesh-scale/terraform-inputs/azure-2.tfvars" + matrix: + n2: + cluster_count: 2 + mesh_size: 2 + cl2_config_file: config.yaml + test_type: default-config + namespaces: 1 + deployments_per_namespace: 2 + replicas_per_deployment: 2 + hold_duration: 30s + warmup_duration: 10s + restart_count: 0 + api_server_calls_per_second: 5 + trigger_reason: ${{ variables['Build.Reason'] }} + n2_event_throughput: + cluster_count: 2 + mesh_size: 2 + cl2_config_file: event-throughput.yaml + test_type: event-throughput + namespaces: 5 + deployments_per_namespace: 4 + replicas_per_deployment: 10 + hold_duration: 2m + warmup_duration: 30s + restart_count: 1 + api_server_calls_per_second: 20 + trigger_reason: ${{ variables['Build.Reason'] }} + max_parallel: 1 + timeout_in_minutes: 120 + credential_type: service_connection + ssh_key_enabled: false + # Iteration-only: skip uploading results to the 
telescope blob while + # we're still stabilizing the clustermesh-scale pipeline. Flip to + # false (or remove) once results are meaningful. + skip_publish: true diff --git a/pipelines/system/new-pipeline-test.yml b/pipelines/system/new-pipeline-test.yml index 63d55f02d9..38ea068658 100644 --- a/pipelines/system/new-pipeline-test.yml +++ b/pipelines/system/new-pipeline-test.yml @@ -1,25 +1,66 @@ trigger: none +pool: AKS-Telescope-Airlock + +schedules: + - cron: "0 4 * * 0" + displayName: Weekly Sunday 4am clustermesh scale test + branches: + include: + - main + always: false + variables: - SCENARIO_TYPE: - SCENARIO_NAME: + SCENARIO_TYPE: perf-eval + SCENARIO_NAME: clustermesh-scale + OWNER: aks stages: - - stage: # format: [_]+ (e.g. azure_eastus2, aws_eastus_westus) + - stage: azure_eastus2euap dependsOn: [] jobs: - - template: /jobs/competitive-test.yml # must keep as is + - template: /jobs/competitive-test.yml parameters: - cloud: # e.g. azure, aws - regions: # list of regions - - region1 # e.g. eastus2 - topology: # e.g. cluster-autoscaler - engine: # e.g. clusterloader2 - matrix: # list of test parameters to customize the provisioned resources - : - : - : - max_parallel: # required - credential_type: service_connection # required + cloud: azure + regions: + - eastus2euap + engine: clusterloader2 + engine_input: + image: "ghcr.io/azure/clusterloader2:v20250513" + install: false + operation_timeout: 15m + topology: clustermesh-scale + terraform_input_file_mapping: + - eastus2euap: "scenarios/perf-eval/clustermesh-scale/terraform-inputs/azure-2.tfvars" + matrix: + # Mirror pipelines/perf-eval/Network Benchmark/clustermesh-scale.yml + # so dev runs use the same matrix-var plumbing as production. + # Auto-exported as uppercase env vars (NAMESPACES, MESH_SIZE, etc.) + # by AzDO and consumed in steps/engine/clusterloader2/clustermesh-scale/execute.yml. + # + # Production clustermesh-scale.yml also has an `n2` trivial-vertical-slice + # entry. 
We don't run it in dev — n2_event_throughput already exercises + # the full plumbing and per-run cost (full Fleet/AKS lifecycle ~15-20 min) + # makes a second axis expensive during iteration. + n2_event_throughput: + cluster_count: 2 + mesh_size: 2 + cl2_config_file: event-throughput.yaml + test_type: event-throughput + namespaces: 5 + deployments_per_namespace: 4 + replicas_per_deployment: 10 + hold_duration: 2m + warmup_duration: 30s + restart_count: 1 + api_server_calls_per_second: 20 + trigger_reason: ${{ variables['Build.Reason'] }} + max_parallel: 1 + timeout_in_minutes: 120 + credential_type: service_connection ssh_key_enabled: false - timeout_in_minutes: 60 # if not specified, default is 60 + # Iteration-only: skip uploading results to the telescope blob while + # we're still stabilizing the clustermesh-scale pipeline. Mirrors the + # same flag in pipelines/perf-eval/Network Benchmark/clustermesh-scale.yml. + # Flip to false (or remove) once results are meaningful. + skip_publish: true diff --git a/scenarios/perf-eval/clustermesh-scale/terraform-inputs/azure-2.tfvars b/scenarios/perf-eval/clustermesh-scale/terraform-inputs/azure-2.tfvars new file mode 100644 index 0000000000..535bdba5a7 --- /dev/null +++ b/scenarios/perf-eval/clustermesh-scale/terraform-inputs/azure-2.tfvars @@ -0,0 +1,179 @@ +scenario_type = "perf-eval" +scenario_name = "clustermesh-scale" +deletion_delay = "4h" +owner = "aks" + +# ============================================================================= +# ClusterMesh Scale Test — 2 cluster tier +# +# Mirrors fleet-setup-script.sh with SHARED_VNET=false (separate VNets + peering). 
+# - 2 VNets (one per cluster) at 10.<N>.0.0/16 +# - Per-cluster node subnet (10.<N>.0.0/24, 254 IPs) + pod subnet (10.<N>.4.0/22, 1022 IPs) +# - 2 AKS clusters with Cilium + ACNS, Azure CNI w/ pod subnet (not overlay) +# - Pairwise VNet peering between the two VNets (both directions) +# - Fleet + 2 fleet members (label mesh=true) + clustermeshprofile +# +# Pod subnet sizing: /22 (1022 IPs) is the floor for any Phase 2 scenario in +# this tier. Math: ~70 baseline pods (kube-system + AKS add-ons across 2 nodes) +# + 200 workload pods (event-throughput n2 tier: 5 ns x 4 dep x 10 replicas) +# = ~270 pods/cluster, plus headroom for future churn-stress / HA scenarios +# without re-touching the network plan. /24 (254 IPs) was insufficient. +# Larger tiers (n5/n10/n20 in Phase 3) will get their own tfvars files with +# subnets sized for their cluster + pod counts. +# +# Naming: +# VNet role : mesh-1, mesh-2 (one VNet per role) +# AKS role : mesh-1, mesh-2 (one AKS per role) +# AKS cluster name : clustermesh-1, clustermesh-2 +# Fleet member name : mesh-1, mesh-2 (intentionally != cluster name) +# Fleet name : clustermesh-flt +# Profile name : clustermesh-cmp +# ============================================================================= + +network_config_list = [ + { + role = "mesh-1" + vnet_name = "clustermesh-1-vnet" + vnet_address_space = "10.1.0.0/16" + subnet = [ + { + name = "clustermesh-1-node" + address_prefix = "10.1.0.0/24" + }, + { + name = "clustermesh-1-pod" + address_prefix = "10.1.4.0/22" + } + ] + network_security_group_name = "" + nic_public_ip_associations = [] + nsr_rules = [] + }, + { + role = "mesh-2" + vnet_name = "clustermesh-2-vnet" + vnet_address_space = "10.2.0.0/16" + subnet = [ + { + name = "clustermesh-2-node" + address_prefix = "10.2.0.0/24" + }, + { + name = "clustermesh-2-pod" + address_prefix = "10.2.4.0/22" + } + ] + network_security_group_name = "" + nic_public_ip_associations = [] + nsr_rules = [] + } +] + +aks_cli_config_list = [ + { + role = 
"mesh-1" + aks_name = "clustermesh-1" + sku_tier = "Standard" + subnet_name = "clustermesh-1-node" + pod_subnet_name = "clustermesh-1-pod" + use_aks_preview_cli_extension = true + + optional_parameters = [ + { name = "generate-ssh-keys", value = "" }, + { name = "network-plugin", value = "azure" }, + { name = "network-dataplane", value = "cilium" }, + { name = "enable-acns", value = "" }, + # AKS default is 30 pods/node. Phase-2 event-throughput workload runs + # 5ns x 4dep x 10 replicas = 200 pods per cluster; with 2 default-pool + # nodes that's 100/node, so we need ≥110 to leave headroom for Cilium + # agent, ACNS daemons, monitoring stack, and kube-system pods. Azure + # CNI with pod subnet supports up to 250. + { name = "max-pods", value = "110" }, + ] + + # Default pool sizing: D4s_v5 (4 vCPU / 16GB) is enough for the workload + # pods alone. Prometheus is pinned to prompool below — without that + # split, Prometheus's 1Gi+ memory request co-tenanting on default-pool + # nodes caused per-node CPU overcommit (~160% allocatable) and left + # workload pods stuck Pending. + default_node_pool = { + name = "default" + node_count = 2 + auto_scaling_enabled = false + vm_size = "Standard_D4s_v5" + } + # Dedicated Prometheus node, labeled `prometheus=true`. CL2 is + # configured (in modules/python/clusterloader2/clustermesh-scale/scale.py + # via CL2_PROMETHEUS_NODE_SELECTOR) to schedule the prometheus-k8s pod + # only on this label, so it doesn't compete with workload pods. Mirrors + # the `prompool` pattern from + # scenarios/perf-eval/cnl-azurecni-overlay-cilium/terraform-inputs/azure.tfvars. + # D8s_v3 (8 vCPU / 32GB) is sized for our 1Gi-request Prometheus with + # ample headroom — much smaller than #1053's D32s_v5 because our + # workload spec is also much smaller. 
+ extra_node_pool = [ + { + name = "prompool" + node_count = 1 + auto_scaling_enabled = false + vm_size = "Standard_D8s_v3" + optional_parameters = [ + { name = "labels", value = "prometheus=true" }, + ] + }, + ] + }, + { + role = "mesh-2" + aks_name = "clustermesh-2" + sku_tier = "Standard" + subnet_name = "clustermesh-2-node" + pod_subnet_name = "clustermesh-2-pod" + use_aks_preview_cli_extension = true + + optional_parameters = [ + { name = "generate-ssh-keys", value = "" }, + { name = "network-plugin", value = "azure" }, + { name = "network-dataplane", value = "cilium" }, + { name = "enable-acns", value = "" }, + { name = "max-pods", value = "110" }, + ] + + default_node_pool = { + name = "default" + node_count = 2 + auto_scaling_enabled = false + vm_size = "Standard_D4s_v5" + } + extra_node_pool = [ + { + name = "prompool" + node_count = 1 + auto_scaling_enabled = false + vm_size = "Standard_D8s_v3" + optional_parameters = [ + { name = "labels", value = "prometheus=true" }, + ] + }, + ] + } +] + +# ============================================================================= +# Fleet + ClusterMesh (new vars in this scenario) +# ============================================================================= +vnet_peering_config = { + enabled = true +} + +fleet_config = { + enabled = true + fleet_name = "clustermesh-flt" + cmp_name = "clustermesh-cmp" + member_label_key = "mesh" + member_label_value = "true" + members = [ + { member_name = "mesh-1", aks_role = "mesh-1" }, + { member_name = "mesh-2", aks_role = "mesh-2" } + ] +} diff --git a/scenarios/perf-eval/clustermesh-scale/terraform-test-inputs/azure-2.json b/scenarios/perf-eval/clustermesh-scale/terraform-test-inputs/azure-2.json new file mode 100644 index 0000000000..b2a8243a56 --- /dev/null +++ b/scenarios/perf-eval/clustermesh-scale/terraform-test-inputs/azure-2.json @@ -0,0 +1,4 @@ +{ + "run_id": "cmesh2test", + "region": "westus2" +} diff --git 
a/scenarios/perf-eval/clustermesh-scale/vendor/fleet-2.0.4-py3-none-any.whl b/scenarios/perf-eval/clustermesh-scale/vendor/fleet-2.0.4-py3-none-any.whl new file mode 100644 index 0000000000..68bf9f5746 Binary files /dev/null and b/scenarios/perf-eval/clustermesh-scale/vendor/fleet-2.0.4-py3-none-any.whl differ diff --git a/steps/engine/clusterloader2/clustermesh-scale/collect.yml b/steps/engine/clusterloader2/clustermesh-scale/collect.yml new file mode 100644 index 0000000000..6a879a2c58 --- /dev/null +++ b/steps/engine/clusterloader2/clustermesh-scale/collect.yml @@ -0,0 +1,88 @@ +parameters: + - name: cloud + type: string + default: "" + - name: engine_input + type: object + default: {} + - name: region + type: string + +steps: + - template: /steps/cloud/${{ parameters.cloud }}/collect-cloud-info.yml + parameters: + region: ${{ parameters.region }} + + - script: | + set -eo pipefail + set -x + + # Re-export matrix vars under CL2_*/MESH_SIZE/TEST_TYPE names that scale.py + # collect expects. Same workaround as execute.yml — matrix-var `$()` + # macros don't expand reliably in `env:` blocks. + export CL2_NAMESPACES="$NAMESPACES" + export CL2_DEPLOYMENTS_PER_NAMESPACE="$DEPLOYMENTS_PER_NAMESPACE" + export CL2_REPLICAS_PER_DEPLOYMENT="$REPLICAS_PER_DEPLOYMENT" + export MESH_SIZE="${MESH_SIZE:-$CLUSTERMESH_COUNT}" + export TEST_TYPE="${TEST_TYPE:-default-config}" + export TRIGGER_REASON="${TRIGGER_REASON:-$BUILD_REASON}" + + clusters=$(cat "$HOME/.kube/clustermesh-clusters.json") + cluster_count=$(echo "$clusters" | jq 'length') + + # Aggregate every per-cluster JSONL into a single TEST_RESULTS_FILE. + # Each line carries `cluster: <role>` so downstream Kusto queries can + # group/filter by cluster across the mesh. + mkdir -p "$(dirname "$TEST_RESULTS_FILE")" + : > "$TEST_RESULTS_FILE" + + for row in $(echo "$clusters" | jq -c '.[]'); do + role=$(echo "$row" | jq -r '.role') + report_dir="${CL2_REPORT_DIR}/${role}" + + if [ ! 
-d "$report_dir" ]; then + echo "##vso[task.logissue type=warning;] $role: missing report dir $report_dir, skipping" + continue + fi + + # If CL2 errored out before producing junit.xml (e.g. prometheus stack + # setup timeout), skip aggregation for this cluster — scale.py collect + # would crash on the missing file. The execute step already logged a + # warning per-cluster; we don't want to also abort the whole pipeline + # at collect time when partial data may be useful. + if [ ! -f "$report_dir/junit.xml" ]; then + echo "##vso[task.logissue type=warning;] $role: $report_dir/junit.xml not found (CL2 likely failed); skipping collect for this cluster" + continue + fi + + per_cluster_result="${TEST_RESULTS_FILE%.*}.${role}.${TEST_RESULTS_FILE##*.}" + + PYTHONPATH=$PYTHONPATH:$(pwd) python3 $PYTHON_SCRIPT_FILE collect \ + --cl2_report_dir "$report_dir" \ + --cloud_info "${CLOUD_INFO:-}" \ + --run_id "$RUN_ID" \ + --run_url "$RUN_URL" \ + --result_file "$per_cluster_result" \ + --start_timestamp "$START_TIME" \ + --cluster-name "$role" \ + --cluster-count "$cluster_count" \ + --mesh-size "$MESH_SIZE" \ + --test_type "$TEST_TYPE" \ + --namespaces "$CL2_NAMESPACES" \ + --deployments-per-namespace "$CL2_DEPLOYMENTS_PER_NAMESPACE" \ + --replicas-per-deployment "$CL2_REPLICAS_PER_DEPLOYMENT" \ + --trigger_reason "${TRIGGER_REASON:-}" + + cat "$per_cluster_result" >> "$TEST_RESULTS_FILE" + done + + echo "Aggregated results from $cluster_count clusters into $TEST_RESULTS_FILE" + wc -l "$TEST_RESULTS_FILE" || true + workingDirectory: modules/python + env: + CLOUD: ${{ parameters.cloud }} + RUN_URL: $(RUN_URL) + PYTHON_SCRIPT_FILE: $(Pipeline.Workspace)/s/modules/python/clusterloader2/clustermesh-scale/scale.py + CL2_REPORT_DIR: $(Pipeline.Workspace)/s/modules/python/clusterloader2/clustermesh-scale/results + BUILD_REASON: $(Build.Reason) + displayName: "Collect + aggregate results across clustermesh clusters" diff --git 
a/steps/engine/clusterloader2/clustermesh-scale/execute.yml b/steps/engine/clusterloader2/clustermesh-scale/execute.yml
new file mode 100644
index 0000000000..cd82bc2d70
--- /dev/null
+++ b/steps/engine/clusterloader2/clustermesh-scale/execute.yml
@@ -0,0 +1,206 @@
+parameters:
+  - name: cloud
+    type: string
+    default: ""
+  - name: engine_input
+    type: object
+    default: {}
+  - name: region
+    type: string
+
+steps:
+  - script: |
+      echo "Set the start time for test execution"
+      startTimestamp=$(date -u +"%Y-%m-%dT%H:%M:%SZ")
+      echo "Start: $startTimestamp"
+      echo "##vso[task.setvariable variable=START_TIME]$startTimestamp"
+    displayName: set up timestamp variable
+
+  - script: |
+      set -eo pipefail
+      set -x
+
+      # Matrix variables (namespaces, mesh_size, deployments_per_namespace,
+      # replicas_per_deployment, hold_duration, warmup_duration, restart_count,
+      # api_server_calls_per_second, test_type) are auto-exported by AzDO to
+      # the script as UPPERCASE env vars (e.g. NAMESPACES, MESH_SIZE). Re-export
+      # them under the CL2_* names that scale.py and the CL2 yaml templates
+      # (config.yaml / event-throughput.yaml) consume.
+      #
+      # Why this re-export rather than `env: CL2_NAMESPACES: $(namespaces)` in
+      # the YAML: AzDO's `$()` runtime macro does not expand matrix variables
+      # in `env:` block values (see prior failed run with literal '$(namespaces)'
+      # reaching python). Same pattern as
+      # steps/engine/clusterloader2/network-scale/execute.yml which references
+      # the auto-exported names directly.
+      export CL2_NAMESPACES="$NAMESPACES"
+      export CL2_DEPLOYMENTS_PER_NAMESPACE="$DEPLOYMENTS_PER_NAMESPACE"
+      export CL2_REPLICAS_PER_DEPLOYMENT="$REPLICAS_PER_DEPLOYMENT"
+      export CL2_API_SERVER_CALLS_PER_SECOND="$API_SERVER_CALLS_PER_SECOND"
+      export CL2_HOLD_DURATION="$HOLD_DURATION"
+      export CL2_WARMUP_DURATION="$WARMUP_DURATION"
+      export CL2_RESTART_GENERATION="$RESTART_COUNT"
+
+      # Same discovery pattern as topology/clustermesh-scale/validate-resources.yml.
+      # We re-run it here rather than relying on a step variable so this engine
+      # file can be invoked independently.
+      clusters=$(az resource list \
+        --resource-type Microsoft.ContainerService/managedClusters \
+        --location "$REGION" \
+        --query "[?tags.run_id=='${RUN_ID}' && starts_with(tags.role, 'mesh-')].{name:name, rg:resourceGroup, role:tags.role}" \
+        -o json)
+
+      cluster_count=$(echo "$clusters" | jq 'length')
+      if [ "$cluster_count" -lt 2 ]; then
+        echo "##vso[task.logissue type=error;] Expected >=2 clustermesh clusters, found $cluster_count"
+        exit 1
+      fi
+
+      echo "Running CL2 across $cluster_count clusters"
+      mkdir -p "$HOME/.kube"
+      echo "$clusters" > "$HOME/.kube/clustermesh-clusters.json"
+      echo "##vso[task.setvariable variable=CLUSTERMESH_COUNT]$cluster_count"
+
+      # CL2 overrides are written once — params are identical for every cluster
+      # in this run (the per-cluster variation is which kubeconfig CL2 hits).
+      PYTHONPATH=$PYTHONPATH:$(pwd) python3 $PYTHON_SCRIPT_FILE configure \
+        --namespaces "$CL2_NAMESPACES" \
+        --deployments-per-namespace "$CL2_DEPLOYMENTS_PER_NAMESPACE" \
+        --replicas-per-deployment "$CL2_REPLICAS_PER_DEPLOYMENT" \
+        --operation-timeout "${CL2_OPERATION_TIMEOUT:-15m}" \
+        --cl2_override_file "${CL2_CONFIG_DIR}/overrides.yaml"
+
+      # Per-cluster CL2 fan-out — sequential. Each invocation writes its own
+      # report dir at ${CL2_REPORT_DIR}/<role>/, so collect.yml can iterate the
+      # same way and tag results with --cluster-name.
+      failures=0
+      for row in $(echo "$clusters" | jq -c '.[]'); do
+        name=$(echo "$row" | jq -r '.name')
+        rg=$(echo "$row" | jq -r '.rg')
+        role=$(echo "$row" | jq -r '.role')
+
+        echo "===================================================================="
+        echo " Running CL2 on $role ($name)"
+        echo "===================================================================="
+
+        kubeconfig="$HOME/.kube/$role.config"
+        KUBECONFIG="$kubeconfig" az aks get-credentials \
+          --resource-group "$rg" --name "$name" --overwrite-existing --only-show-errors
+
+        report_dir="${CL2_REPORT_DIR}/${role}"
+        mkdir -p "$report_dir"
+
+        cl2_passed=0
+        # Run CL2; collect outcome WITHOUT failing the bash script (so we can
+        # also inspect junit.xml for internal test failures even when CL2 exits
+        # 0). Treat as "passed" only if BOTH:
+        #   (a) junit.xml exists (CL2 actually completed and wrote a report)
+        #   (b) junit.xml has zero <failure> / <error> elements
+        # Without (b) we'd silently green-light runs where measurements failed
+        # — e.g. PodMonitor template substitution producing "", which
+        # k8s admission rejects but CL2 still writes junit with <failure> tags.
+        PYTHONPATH=$PYTHONPATH:$(pwd) python3 $PYTHON_SCRIPT_FILE execute \
+          --cl2-image "${CL2_IMAGE}" \
+          --cl2-config-dir "${CL2_CONFIG_DIR}" \
+          --cl2-report-dir "$report_dir" \
+          --cl2-config-file "${CL2_CONFIG_FILE}" \
+          --kubeconfig "$kubeconfig" \
+          --provider "${CLOUD}" \
+          || true
+        if [ -f "$report_dir/junit.xml" ]; then
+          # Count failure/error attrs from the first element carrying them (normally <testsuite> — TODO confirm).
+          junit_failures=$(grep -oE 'failures="[0-9]+"' "$report_dir/junit.xml" | head -1 | grep -oE '[0-9]+' || true)
+          junit_errors=$(grep -oE 'errors="[0-9]+"' "$report_dir/junit.xml" | head -1 | grep -oE '[0-9]+' || true)
+          junit_failures=${junit_failures:-0}  # no match => 0; "|| true" above never appends a second value under pipefail
+          junit_errors=${junit_errors:-0}
+          if [ "$junit_failures" -eq 0 ] && [ "$junit_errors" -eq 0 ]; then
+            cl2_passed=1
+          else
+            echo "##vso[task.logissue type=warning;] $role: junit.xml reports failures=$junit_failures errors=$junit_errors"
+          fi
+        fi
+
+        if [ "$cl2_passed" -eq 1 ]; then
+          echo " $role: CL2 run succeeded"
+        fi
+
+        # Always-on log capture (spec line 35: "Logs: clustermesh-apiserver,
+        # agent watchers"). Files land in $report_dir/logs/ so they are
+        # uploaded alongside junit.xml + measurement results when the
+        # publish step runs. The same files double as immediate
+        # diagnostics for failed runs (see FAILURE DIAG block below).
+        log_dir="$report_dir/logs"
+        mkdir -p "$log_dir"
+        echo "------- $role: capturing pod logs to $log_dir -------"
+        # clustermesh-apiserver: all three containers (apiserver / etcd /
+        # kvstoremesh) — bounded tail, single pod expected.
+        for c in apiserver etcd kvstoremesh; do
+          KUBECONFIG="$kubeconfig" kubectl -n kube-system logs \
+            -l k8s-app=clustermesh-apiserver -c "$c" --tail=4000 \
+            > "$log_dir/clustermesh-apiserver-$c.log" 2>&1 || true
+        done
+        # cilium-agent: one pod per node — keep tail small to bound size.
+        KUBECONFIG="$kubeconfig" kubectl -n kube-system logs \
+          -l k8s-app=cilium --tail=1000 --prefix=true \
+          > "$log_dir/cilium-agent.log" 2>&1 || true
+        # cilium-operator: low-volume control plane.
+        KUBECONFIG="$kubeconfig" kubectl -n kube-system logs \
+          -l io.cilium/app=operator --tail=2000 --prefix=true \
+          > "$log_dir/cilium-operator.log" 2>&1 || true
+
+        if [ "$cl2_passed" -ne 1 ]; then
+          # Dump enough state to distinguish prometheus-stack scheduling
+          # failures from CL2 logic failures. Prometheus is the most common
+          # culprit here — its pod requests 10Gi by default, doesn't fit on
+          # Standard_D4s_v4. If the pod is Pending with FailedScheduling, the
+          # describe events make that obvious.
+          #
+          # Note: scale.py passes tear_down_prometheus=False so the stack
+          # survives this dump (otherwise CL2 would clean up before we look).
+          echo "------- $role: CL2 FAILURE DIAG -------"
+          echo "------- node allocatable / requested capacity -------"
+          KUBECONFIG="$kubeconfig" kubectl get nodes -o wide 2>&1 || true
+          KUBECONFIG="$kubeconfig" kubectl describe nodes 2>&1 | grep -A 4 "Allocatable\|Allocated resources" | head -40 || true
+
+          echo "------- monitoring/* pods -------"
+          KUBECONFIG="$kubeconfig" kubectl -n monitoring get pods -o wide 2>&1 || true
+
+          echo "------- monitoring statefulsets -------"
+          KUBECONFIG="$kubeconfig" kubectl -n monitoring get statefulset -o wide 2>&1 || true
+
+          echo "------- Prometheus CR (operator input) -------"
+          KUBECONFIG="$kubeconfig" kubectl -n monitoring get prometheus -o yaml 2>&1 | head -80 || true
+
+          echo "------- prometheus-k8s pod describe -------"
+          KUBECONFIG="$kubeconfig" kubectl -n monitoring describe pod -l app.kubernetes.io/name=prometheus 2>&1 | tail -60 || true
+
+          echo "------- prometheus-operator logs (tail 60) -------"
+          KUBECONFIG="$kubeconfig" kubectl -n monitoring logs -l app.kubernetes.io/name=prometheus-operator --tail=60 2>&1 || true
+
+          echo "------- monitoring namespace events (recent) -------"
+          KUBECONFIG="$kubeconfig" kubectl -n monitoring get events --sort-by='.lastTimestamp' 2>&1 | tail -30 || true
+          echo "------- end CL2 FAILURE DIAG -------"
+
+          echo "##vso[task.logissue type=warning;] $role: CL2 run failed (junit missing or has failures/errors at $report_dir/junit.xml; continuing other clusters)"
+          failures=$((failures + 1))
+        fi
+      done
+
+      if [ "$failures" -gt 0 ]; then
+        echo "##vso[task.logissue type=error;] CL2 failed on $failures cluster(s)"
+        exit 1
+      fi
+    workingDirectory: modules/python
+    env:
+      ${{ if eq(parameters.cloud, 'azure') }}:
+        CLOUD: aks
+      ${{ else }}:
+        CLOUD: ${{ parameters.cloud }}
+      REGION: ${{ parameters.region }}
+      PYTHON_SCRIPT_FILE: $(Pipeline.Workspace)/s/modules/python/clusterloader2/clustermesh-scale/scale.py
+      CL2_IMAGE: ${{ parameters.engine_input.image }}
+      CL2_CONFIG_DIR: $(Pipeline.Workspace)/s/modules/python/clusterloader2/clustermesh-scale/config
+      CL2_CONFIG_FILE: $(cl2_config_file)
+      CL2_REPORT_DIR: $(Pipeline.Workspace)/s/modules/python/clusterloader2/clustermesh-scale/results
+      CL2_OPERATION_TIMEOUT: ${{ parameters.engine_input.operation_timeout }}
+    displayName: "Run CL2 across all clustermesh clusters"
diff --git a/steps/setup-tests.yml b/steps/setup-tests.yml
index d790917dca..ed7840dc4c 100644
--- a/steps/setup-tests.yml
+++ b/steps/setup-tests.yml
@@ -72,6 +72,46 @@ steps:
       region: ${{ parameters.region }}
       credential_type: ${{ parameters.credential_type }}
 
+  - script: |
+      # Install the Azure Fleet preview CLI extension required by the
+      # clustermesh-scale scenario. The Fleet ClusterMeshProfile API surface
+      # is private-preview and only the bundled wheel exposes the
+      # `az fleet clustermeshprofile` and `az fleet member create --labels`
+      # commands invoked by terraform local-exec at provision time.
+      #
+      # The wheel is vendored in-repo at scenarios/perf-eval/clustermesh-scale/vendor/.
+      set -euo pipefail
+      whl="$(Pipeline.Workspace)/s/scenarios/perf-eval/$(SCENARIO_NAME)/vendor/fleet-2.0.4-py3-none-any.whl"
+      if [ ! -f "$whl" ]; then
+        echo "##vso[task.logissue type=error;] Vendored fleet wheel not found at $whl"
+        exit 1
+      fi
+      az extension remove --name fleet --only-show-errors 2>/dev/null || true
+      az extension add --source "$whl" --yes --only-show-errors
+      az fleet --help >/dev/null
+      az fleet clustermeshprofile --help >/dev/null
+      echo "Fleet preview CLI installed from $whl"
+    displayName: "Install Fleet preview CLI (clustermesh scenarios)"
+    # A custom condition replaces the implicit succeeded(); keep it so this step is skipped after an earlier failure.
+    condition: and(succeeded(), startsWith(variables['SCENARIO_NAME'], 'clustermesh'))
+
+  - script: |
+      # Install cilium-cli on the runner for richer ClusterMesh diagnostics.
+      # `cilium clustermesh status --context <ctx>` reports per-remote-cluster
+      # connection state, endpoint counts, and version skew — info that the
+      # in-pod `cilium-dbg status` doesn't expose. Used by topology
+      # validate-resources.yml on each cluster context.
+      set -euo pipefail
+      CILIUM_CLI_VERSION=v0.16.20
+      CLI_ARCH=amd64
+      curl -sSL --fail --remote-name-all \
+        "https://github.com/cilium/cilium-cli/releases/download/${CILIUM_CLI_VERSION}/cilium-linux-${CLI_ARCH}.tar.gz"{,.sha256sum}
+      sha256sum --check "cilium-linux-${CLI_ARCH}.tar.gz.sha256sum"  # abort (set -e) on a corrupted or tampered download
+      sudo tar xzvfC "cilium-linux-${CLI_ARCH}.tar.gz" /usr/local/bin
+      rm "cilium-linux-${CLI_ARCH}.tar.gz"{,.sha256sum}
+      cilium version --client
+    displayName: "Install cilium-cli (clustermesh scenarios)"
+    condition: and(succeeded(), startsWith(variables['SCENARIO_NAME'], 'clustermesh'))
+
+  - script: |
       if [ -n "${TEST_MODULES_DIR}" ]; then
         test_modules_directory=$(Pipeline.Workspace)/s/${TEST_MODULES_DIR}
diff --git a/steps/topology/clustermesh-scale/collect-clusterloader2.yml b/steps/topology/clustermesh-scale/collect-clusterloader2.yml
new file mode 100644
index 0000000000..29f6c86b38
--- /dev/null
+++ b/steps/topology/clustermesh-scale/collect-clusterloader2.yml
@@ -0,0 +1,18 @@
+parameters:
+  - name: cloud
+    type: string
+    default: ""
+  - name: engine_input
+    type: object
+    default: {}
+  - name: regions
+    type: object
+    default: {}
+
+steps:
+  - template: /steps/set-run-id.yml
+  - template: /steps/engine/clusterloader2/clustermesh-scale/collect.yml
+    parameters:
+      cloud: ${{ parameters.cloud }}
+      engine_input: ${{ parameters.engine_input }}
+      region: ${{ parameters.regions[0] }}
diff --git a/steps/topology/clustermesh-scale/execute-clusterloader2.yml b/steps/topology/clustermesh-scale/execute-clusterloader2.yml
new file mode 100644
index 0000000000..eb1f53f7a4
--- /dev/null
+++ b/steps/topology/clustermesh-scale/execute-clusterloader2.yml
@@ -0,0 +1,17 @@
+parameters:
+  - name: cloud
+    type: string
+    default: ""
+  - name: engine_input
+    type: object
+    default: {}
+  - name: regions
+    type: object
+    default: {}
+
+steps:
+  - template: /steps/engine/clusterloader2/clustermesh-scale/execute.yml
+    parameters:
+      cloud: ${{ parameters.cloud }}
+      engine_input: ${{ parameters.engine_input }}
+      region: ${{ parameters.regions[0] }}
diff --git a/steps/topology/clustermesh-scale/validate-resources.yml b/steps/topology/clustermesh-scale/validate-resources.yml
new file mode 100644
index 0000000000..bfd47a11c6
--- /dev/null
+++ b/steps/topology/clustermesh-scale/validate-resources.yml
@@ -0,0 +1,402 @@
+parameters:
+  - name: cloud
+    type: string
+  - name: engine
+    type: string
+  - name: regions
+    type: object
+
+steps:
+  # -----------------------------------------------------------------------------
+  # Per-cluster validation: enumerate every fleet member, fetch its kubeconfig,
+  # assert nodes are Ready, cilium agent is Running, and the cluster reports
+  # mesh state Connected to all (N-1) remote clusters.
+  #
+  # Cluster discovery uses the same tag-based pattern as
+  # /steps/cloud/azure/update-kubeconfig.yml — clusters are tagged
+  # role=mesh-N at terraform-apply time.
+  # -----------------------------------------------------------------------------
+  - script: |
+      set -euo pipefail
+      set -x
+
+      region=${{ parameters.regions[0] }}
+
+      # JSON list of {name, rg, role} for every clustermesh AKS cluster in this run.
+      clusters=$(az resource list \
+        --resource-type Microsoft.ContainerService/managedClusters \
+        --location "$region" \
+        --query "[?tags.run_id=='${RUN_ID}' && starts_with(tags.role, 'mesh-')].{name:name, rg:resourceGroup, role:tags.role}" \
+        -o json)
+
+      count=$(echo "$clusters" | jq 'length')
+      if [ "$count" -lt 2 ]; then
+        echo "##vso[task.logissue type=error;] Expected >=2 clustermesh AKS clusters tagged run_id=${RUN_ID}, found $count"
+        exit 1
+      fi
+
+      echo "Discovered $count clustermesh clusters:"
+      echo "$clusters" | jq -r '.[] | " \(.role): \(.name) in \(.rg)"'
+
+      mkdir -p "$HOME/.kube"
+      echo "$clusters" > "$HOME/.kube/clustermesh-clusters.json"  # re-read by the validation and smoke steps below
+
+      echo "##vso[task.setvariable variable=CLUSTERMESH_COUNT]$count"
+    displayName: "Enumerate clustermesh clusters"
+
+  - script: |
+      set -euo pipefail
+      set -x
+
+      clusters=$(cat "$HOME/.kube/clustermesh-clusters.json")
+      expected_remote=$(( $(echo "$clusters" | jq 'length') - 1 ))  # every cluster should see all the others
+
+      failures=0
+      for row in $(echo "$clusters" | jq -c '.[]'); do
+        name=$(echo "$row" | jq -r '.name')
+        rg=$(echo "$row" | jq -r '.rg')
+        role=$(echo "$row" | jq -r '.role')
+
+        echo "===================================================================="
+        echo " Validating $role ($name)"
+        echo "===================================================================="
+
+        # Per-cluster kubeconfig file at $HOME/.kube/<role>.config — keeps each
+        # cluster's auth state isolated so concurrent kubectl calls don't race.
+        kubeconfig="$HOME/.kube/$role.config"
+        KUBECONFIG="$kubeconfig" az aks get-credentials \
+          --resource-group "$rg" --name "$name" --overwrite-existing --only-show-errors
+
+        export KUBECONFIG="$kubeconfig"
+
+        echo "--- nodes ---"
+        kubectl get nodes -o wide
+        kubectl wait --for=condition=Ready nodes --all --timeout=5m
+
+        echo "--- cilium agent pods ---"
+        kubectl -n kube-system get pods -l k8s-app=cilium -o wide
+        kubectl -n kube-system rollout status ds/cilium --timeout=5m
+
+        echo "--- clustermesh-apiserver pod ---"
+        kubectl -n kube-system get pods -l k8s-app=clustermesh-apiserver -o wide || true
+
+        # Surface the apiserver/kvstoremesh container ports so we can confirm
+        # the PodMonitor scrape targets (expected: apiserver=9963, kvstoremesh=9964)
+        # without needing to drop into a pod. Informational only — does not gate.
+        echo "--- clustermesh-apiserver exposed ports ---"
+        kubectl -n kube-system get pod -l k8s-app=clustermesh-apiserver \
+          -o jsonpath='{range .items[*].spec.containers[*]}{.name}:{range .ports[*]}{.name}={.containerPort} {end}{"\n"}{end}' \
+          2>/dev/null || true
+        echo
+
+        echo "--- cilium-dbg status (ClusterMesh section) ---"
+        # Mesh propagation can lag behind the return of
+        # `az fleet clustermeshprofile apply`, so this check is retried.
+        # We gate on `cilium-dbg status` (in-pod debug binary) rather than the
+        # external `cilium clustermesh status`; the runner-side CLI is used
+        # for diagnostics only. cilium-dbg status includes a "ClusterMesh:" block of
+        # the form:
+        #   ClusterMesh: 2/2 remote clusters ready, 0 global-services
+        #     mesh-2: ready, ...
+        # Retry up to ~10 minutes — the AKS-managed Cilium operator publishes
+        # the per-agent `cilium-clustermesh` Secret asynchronously after Fleet
+        # finishes profile apply, and the clustermesh-apiserver may be
+        # recreated mid-validation (cert/config rotation), bumping the wait
+        # another ~30s for agents to reload. Empirically 5 min was too tight
+        # for whichever cluster gets validated first; 10 min covers it with
+        # margin.
+        #
+        # Note: `cilium-dbg status` (in-pod, agent's local view) and
+        # `cilium clustermesh status` (CLI, queries clustermesh-apiserver) can
+        # disagree for several minutes during this window — the CLI flips to
+        # "configured/connected" first because it counts apiserver clients,
+        # while the in-pod view requires the Secret to be reloaded. We gate on
+        # the in-pod view because the data path needs the agent's local state.
+        connected=0
+        for i in $(seq 1 60); do
+          out=$(kubectl -n kube-system exec ds/cilium -- cilium-dbg status 2>&1 || true)
+          echo "$out"
+          # Parse "<ready>/<total> remote clusters ready" line.
+          ready=$(echo "$out" | sed -nE 's/.*ClusterMesh:[[:space:]]+([0-9]+)\/[0-9]+ remote clusters ready.*/\1/p' | head -1)
+          ready=${ready:-0}
+          if [ "$ready" -ge "$expected_remote" ]; then
+            connected=1
+            break
+          fi
+
+          # ============== DEBUG-DUMP-BEGIN (REMOVE BEFORE MERGE) ==============
+          # Every 6 iterations dump richer state: runner-side cilium-cli view of the
+          # mesh, clustermesh-apiserver pod state, and Fleet-side member status.
+          # These help diagnose why convergence is stalling. Strip before final
+          # PR review.
+          if [ "$((i % 6))" -eq 0 ]; then
+            echo "------- [debug] retry $i: cilium clustermesh status (runner cli) -------"
+            cilium clustermesh status --context "$(kubectl config current-context)" --wait=false 2>&1 || true
+
+            echo "------- [debug] retry $i: clustermesh-apiserver pods -------"
+            kubectl -n kube-system get pods -l k8s-app=clustermesh-apiserver -o wide 2>&1 || true
+            kubectl -n kube-system describe pods -l k8s-app=clustermesh-apiserver 2>&1 | tail -40 || true
+
+            echo "------- [debug] retry $i: clustermesh-apiserver service -------"
+            # Service of type LoadBalancer for the clustermesh-apiserver. If
+            # EXTERNAL-IP stays "<pending>", the AKS control-plane identity is
+            # missing Network Contributor on the VNet (cloud-controller-manager
+            # cannot provision the internal LB). Look in describe events for
+            # AuthorizationFailed / forbidden messages.
+            kubectl -n kube-system get svc clustermesh-apiserver -o wide 2>&1 || true
+            kubectl -n kube-system describe svc clustermesh-apiserver 2>&1 | tail -25 || true
+
+            echo "------- [debug] retry $i: cilium agent restarts / readiness -------"
+            kubectl -n kube-system get pods -l k8s-app=cilium -o wide 2>&1 || true
+
+            echo "------- [debug] retry $i: Fleet ClusterMeshProfile profile-level status -------"
+            # Profile-level mesh state (NotConnected/Connecting/Connected/Failed)
+            # plus the last operation error if any. This is the authoritative
+            # control-plane view of whether the mesh has converged.
+            az fleet clustermeshprofile show \
+              --resource-group "$rg" \
+              --fleet-name clustermesh-flt \
+              --name clustermesh-cmp \
+              --query "{state:properties.status.state, provisioningState:properties.provisioningState, lastError:properties.status.lastOperationError}" \
+              -o jsonc 2>&1 || true
+
+            echo "------- [debug] retry $i: Fleet ClusterMeshProfile members (connection state) -------"
+            # Per-member: provisioningState is just ARM-level (join accepted);
+            # meshProperties.status.state is the actual Cilium connection state.
+            az fleet clustermeshprofile list-members \
+              --resource-group "$rg" \
+              --fleet-name clustermesh-flt \
+              --name clustermesh-cmp \
+              --query "[].{name:name, provisioning:properties.provisioningState, mesh:properties.meshProperties.status.state, lastUpdated:properties.meshProperties.status.lastUpdatedAt, error:properties.meshProperties.status.error.message}" \
+              -o table 2>&1 || true
+          fi
+          # =============== DEBUG-DUMP-END (REMOVE BEFORE MERGE) ===============
+
+          echo " waiting for $expected_remote remote clusters to be ready (got $ready), retry $i/60..."
+          sleep 10  # pause between convergence attempts
+        done
+
+        if [ "$connected" -ne 1 ]; then
+          echo "##vso[task.logissue type=error;] $role: clustermesh not Connected to $expected_remote remote clusters"
+          failures=$((failures + 1))  # keep validating the remaining clusters; the step fails after the loop
+        fi
+
+        echo "--- cilium clustermesh status (runner-side, richer diagnostics) ---"
+        # Best-effort, informational only — failures here don't fail the step
+        # because the in-pod check above is authoritative. cilium-cli reports
+        # per-remote connection state, endpoint counts, and version info.
+        cilium clustermesh status --context "$(kubectl config current-context)" --wait=false || true
+      done
+
+      if [ "$failures" -gt 0 ]; then
+        echo "##vso[task.logissue type=error;] $failures cluster(s) failed mesh validation"
+        exit 1
+      fi
+    displayName: "Validate Cilium + ClusterMesh on every cluster"
+
+  - script: |
+      set -euo pipefail
+      set -x
+
+      # Cross-cluster data-path smoke: deploy a `global` service backed by an
+      # echo pod in the first cluster, deploy a curl client in the second
+      # cluster, and curl the service by name. If global service load-balancing
+      # works, the request resolves cross-cluster via the mesh data path.
+      #
+      # Per plan.md Phase 1 exit criteria, we don't ship a "green" Phase 1 that
+      # only validated control plane.
+
+      clusters=$(cat "$HOME/.kube/clustermesh-clusters.json")
+      first_role=$(echo "$clusters" | jq -r '.[0].role')
+      second_role=$(echo "$clusters" | jq -r '.[1].role')
+
+      kc_first="$HOME/.kube/$first_role.config"
+      kc_second="$HOME/.kube/$second_role.config"
+
+      ns="cm-smoke"  # must match the namespace hard-coded in the manifests below
+
+      cleanup() {
+        KUBECONFIG="$kc_first" kubectl delete ns "$ns" --ignore-not-found --wait=false || true
+        KUBECONFIG="$kc_second" kubectl delete ns "$ns" --ignore-not-found --wait=false || true
+      }
+      trap cleanup EXIT  # runs on success and on failure exits alike
+
+      cat <<'EOF' > /tmp/cm-smoke-server.yaml
+      apiVersion: v1
+      kind: Namespace
+      metadata:
+        name: cm-smoke
+        annotations:
+          # AKS managed Cilium gates clustermesh sync at the *namespace* level
+          # by default (CFP-39876, "managed Cilium" change). Without this,
+          # neither pod identities, endpoints, nor services in this namespace
+          # are synced across clusters — even with service.cilium.io/global on
+          # the Service. This is the load-bearing annotation here; the
+          # service-level one below is kept for explicitness.
+          clustermesh.cilium.io/global: "true"
+      ---
+      apiVersion: apps/v1
+      kind: Deployment
+      metadata:
+        name: echo
+        namespace: cm-smoke
+      spec:
+        replicas: 1
+        selector:
+          matchLabels: { app: echo }
+        template:
+          metadata:
+            labels: { app: echo }
+          spec:
+            containers:
+              - name: echo
+                image: registry.k8s.io/e2e-test-images/agnhost:2.47
+                args: ["netexec", "--http-port=8080"]
+                ports: [{ containerPort: 8080 }]
+      ---
+      apiVersion: v1
+      kind: Service
+      metadata:
+        name: echo
+        namespace: cm-smoke
+        annotations:
+          # The namespace annotation above is what actually gates sync in AKS
+          # managed Cilium; this service-level annotation is kept for explicit
+          # intent and forward-compatibility.
+          service.cilium.io/global: "true"
+      spec:
+        selector: { app: echo }
+        ports:
+          - port: 80
+            targetPort: 8080
+      EOF
+
+      cat <<'EOF' > /tmp/cm-smoke-client.yaml
+      apiVersion: v1
+      kind: Namespace
+      metadata:
+        name: cm-smoke
+        annotations:
+          clustermesh.cilium.io/global: "true"
+      ---
+      # Cilium global services require the same Service name to exist in every
+      # participating cluster. The Service in cluster 2 has no local backends;
+      # cross-cluster lookup resolves to cluster 1's pods via the mesh.
+      apiVersion: v1
+      kind: Service
+      metadata:
+        name: echo
+        namespace: cm-smoke
+        annotations:
+          service.cilium.io/global: "true"
+      spec:
+        selector: { app: echo }
+        ports:
+          - port: 80
+            targetPort: 8080
+      ---
+      apiVersion: v1
+      kind: Pod
+      metadata:
+        name: curl
+        namespace: cm-smoke
+        labels: { app: curl }
+      spec:
+        restartPolicy: Never
+        containers:
+          - name: curl
+            image: curlimages/curl:8.10.1
+            command: ["sleep", "600"]
+      EOF
+
+      KUBECONFIG="$kc_first" kubectl apply -f /tmp/cm-smoke-server.yaml
+      KUBECONFIG="$kc_second" kubectl apply -f /tmp/cm-smoke-client.yaml
+
+      KUBECONFIG="$kc_first" kubectl -n "$ns" rollout status deploy/echo --timeout=3m
+      KUBECONFIG="$kc_second" kubectl -n "$ns" wait --for=condition=Ready pod/curl --timeout=3m
+
+      # Give Cilium clustermesh a moment to sync the new global Service from
+      # cluster 1 → cluster 2 before the first curl attempt. Empirically this
+      # is sub-second once mesh is converged, but we've already paid the cost
+      # of waiting for rollouts above so a small settle here doesn't matter.
+      sleep 15
+
+      # Try 24 times (5s sleep + up to 5s curl timeout each, so ~2-4 minutes) —
+      # global service endpoints can take a few seconds to populate via the mesh.
+      ok=0
+      for i in $(seq 1 24); do
+        if KUBECONFIG="$kc_second" kubectl -n "$ns" exec curl -- \
+            curl -fsS -m 5 http://echo.cm-smoke.svc.cluster.local/hostname; then
+          ok=1
+          echo ""
+          echo "Cross-cluster curl succeeded on attempt $i"
+          break
+        fi
+        echo " attempt $i/24 failed, retrying in 5s..."
+        sleep 5
+      done
+
+      if [ "$ok" -ne 1 ]; then
+        # ============== SMOKE-FAILURE-DEBUG-DUMP (REMOVE BEFORE MERGE) ==============
+        # On failure, dump enough state to distinguish Cilium global-service
+        # sync issues from cross-VNet pod-IP routing issues. Specifically:
+        #   1. cilium clustermesh status — should show "Global services: 1" if sync OK
+        #   2. cilium service list (in-pod) — should have an entry for cm-smoke/echo
+        #      with remote-cluster backends in cluster 2
+        #   3. kubectl describe svc / get endpoints echo — k8s view (cluster 2 should
+        #      have NO local endpoints, that's expected)
+        #   4. From inside the curl pod: DNS resolve, then direct-IP curl to a
+        #      cluster-1 echo pod IP — bypasses ClusterIP, tests raw L3 across VNets
+        echo
+        echo "================ SMOKE FAILURE DIAG (cluster $first_role -- backend) ================"
+        KUBECONFIG="$kc_first" cilium clustermesh status --context "$(KUBECONFIG="$kc_first" kubectl config current-context)" --wait=false 2>&1 || true
+        KUBECONFIG="$kc_first" kubectl -n "$ns" describe svc echo 2>&1 || true
+        KUBECONFIG="$kc_first" kubectl -n "$ns" get endpoints echo -o wide 2>&1 || true
+        KUBECONFIG="$kc_first" kubectl -n "$ns" get pods -l app=echo -o wide 2>&1 || true
+        echo "------- $first_role: cilium-config (clustermesh-relevant flags) -------"
+        # Authoritative source for whether the cilium agent is configured to
+        # process global services. Look for: enable-cluster-mesh,
+        # cluster-mesh-shared-services, clustermesh-config, identity-allocation-mode,
+        # enable-services. AKS/ACNS may gate global services with a feature flag.
+        KUBECONFIG="$kc_first" kubectl -n kube-system get cm cilium-config -o yaml 2>&1 \
+          | grep -iE 'cluster-mesh|clustermesh|service|global|identity' || true
+        echo "------- $first_role: cilium service list (full, head 40) -------"
+        KUBECONFIG="$kc_first" kubectl -n kube-system exec ds/cilium -c cilium-agent -- cilium service list 2>&1 | head -40 || true
+        echo "------- $first_role: cilium-operator logs (tail 60) -------"
+        KUBECONFIG="$kc_first" kubectl -n kube-system logs -l io.cilium/app=operator --tail=60 2>&1 \
+          | grep -iE 'global|clustermesh|cluster-mesh|cm-smoke|service' || true
+
+        echo
+        echo "================ SMOKE FAILURE DIAG (cluster $second_role -- client) ================"
+        KUBECONFIG="$kc_second" cilium clustermesh status --context "$(KUBECONFIG="$kc_second" kubectl config current-context)" --wait=false 2>&1 || true
+        KUBECONFIG="$kc_second" kubectl -n "$ns" describe svc echo 2>&1 || true
+        KUBECONFIG="$kc_second" kubectl -n "$ns" get endpoints echo -o wide 2>&1 || true
+        echo "------- $second_role: cilium-config (clustermesh-relevant flags) -------"
+        KUBECONFIG="$kc_second" kubectl -n kube-system get cm cilium-config -o yaml 2>&1 \
+          | grep -iE 'cluster-mesh|clustermesh|service|global|identity' || true
+        echo "------- $second_role: cilium service list (full, head 40) -------"
+        KUBECONFIG="$kc_second" kubectl -n kube-system exec ds/cilium -c cilium-agent -- cilium service list 2>&1 | head -40 || true
+        echo "------- $second_role: cilium-operator logs (tail 60) -------"
+        KUBECONFIG="$kc_second" kubectl -n kube-system logs -l io.cilium/app=operator --tail=60 2>&1 \
+          | grep -iE 'global|clustermesh|cluster-mesh|cm-smoke|service' || true
+
+        echo
+        echo "------- DNS + direct-pod-IP probe from curl pod (bypass ClusterIP) -------"
+        # ClusterIP plumbing is a Cilium-clustermesh concern; direct pod-IP
+        # connectivity is a VNet-peering concern. Hitting a backend pod IP
+        # directly disambiguates the two failure modes.
+        KUBECONFIG="$kc_second" kubectl -n "$ns" exec curl -- nslookup echo.cm-smoke.svc.cluster.local 2>&1 || true
+        backend_ip=$(KUBECONFIG="$kc_first" kubectl -n "$ns" get pod -l app=echo -o jsonpath='{.items[0].status.podIP}' 2>/dev/null || true)
+        echo "first cluster's echo pod IP: ${backend_ip:-}"
+        if [ -n "${backend_ip:-}" ]; then
+          KUBECONFIG="$kc_second" kubectl -n "$ns" exec curl -- \
+            curl -fsS -m 5 "http://${backend_ip}:8080/hostname" 2>&1 || \
+            echo " direct pod-IP curl ALSO failed → cross-VNet routing issue (peering / pod-CIDR routes)"
+        fi
+        echo "============================ END SMOKE DIAG ============================"
+        # =========================== END SMOKE-FAILURE-DEBUG-DUMP ===========================
+
+        echo "##vso[task.logissue type=error;] Cross-cluster data-path smoke failed: $second_role could not reach service in $first_role"
+        exit 1
+      fi
+    displayName: "Cross-cluster data-path smoke (global service curl)"