diff --git a/jobs/competitive-test.yml b/jobs/competitive-test.yml
index 4f2c6a08f2..f97f937d63 100644
--- a/jobs/competitive-test.yml
+++ b/jobs/competitive-test.yml
@@ -48,6 +48,9 @@ parameters:
- name: ssh_key_enabled
type: boolean
default: true
+- name: skip_publish
+ type: boolean
+ default: false
jobs:
- job: ${{ parameters.cloud }}
@@ -89,14 +92,15 @@ jobs:
engine: ${{ parameters.engine }}
regions: ${{ parameters.regions }}
engine_input: ${{ parameters.engine_input }}
- - template: /steps/publish-results.yml
- parameters:
- cloud: ${{ parameters.cloud }}
- topology: ${{ parameters.topology }}
- engine: ${{ parameters.engine }}
- regions: ${{ parameters.regions }}
- engine_input: ${{ parameters.engine_input }}
- credential_type: ${{ parameters.credential_type }}
+ - ${{ if not(parameters.skip_publish) }}:
+ - template: /steps/publish-results.yml
+ parameters:
+ cloud: ${{ parameters.cloud }}
+ topology: ${{ parameters.topology }}
+ engine: ${{ parameters.engine }}
+ regions: ${{ parameters.regions }}
+ engine_input: ${{ parameters.engine_input }}
+ credential_type: ${{ parameters.credential_type }}
- template: /steps/cleanup-resources.yml
parameters:
cloud: ${{ parameters.cloud }}
diff --git a/modules/python/clusterloader2/clustermesh-scale/config/config.yaml b/modules/python/clusterloader2/clustermesh-scale/config/config.yaml
new file mode 100644
index 0000000000..6eace02220
--- /dev/null
+++ b/modules/python/clusterloader2/clustermesh-scale/config/config.yaml
@@ -0,0 +1,105 @@
+name: clustermesh-scale-test
+
+# Workload: deploy a small fixed number of pods on this cluster (no churn,
+# no traffic). Measurement modules under modules/measurements/ run the actual
+# scale-test instrumentation (cilium agent/operator CPU+memory, kube-apiserver
+# health, mesh-specific PromQL) so each per-cluster JSONL row carries the data
+# needed for cross-cluster comparison in Kusto. The workload is deliberately
+# trivial — fan-out, attribution, and metric coverage are what we're testing
+# in Phase 1; richer workloads land per scenario in Phase 2+.
+
+{{$namespaces := DefaultParam .CL2_NAMESPACES 1}}
+{{$deploymentsPerNamespace := DefaultParam .CL2_DEPLOYMENTS_PER_NAMESPACE 2}}
+{{$replicasPerDeployment := DefaultParam .CL2_REPLICAS_PER_DEPLOYMENT 2}}
+{{$operationTimeout := DefaultParam .CL2_OPERATION_TIMEOUT "15m"}}
+{{$apiServerCallsPerSecond := DefaultParam .CL2_API_SERVER_CALLS_PER_SECOND 5}}
+
+namespace:
+ number: {{$namespaces}}
+ prefix: clustermesh-scale
+ deleteStaleNamespaces: true
+ deleteAutomanagedNamespaces: true
+ enableExistingNamespaces: false
+ deleteNamespaceTimeout: 20m
+
+tuningSets:
+ - name: Sequence
+ parallelismLimitedLoad:
+ parallelismLimit: 1
+ - name: DeploymentCreateQps
+ qpsLoad:
+ qps: {{$apiServerCallsPerSecond}}
+
+steps:
+ # ----- Start measurements -----
+ # control-plane.yaml owns PodStartupLatency + APIResponsivenessPrometheus +
+ # apiserver CPU/mem queries; cilium.yaml owns cilium-agent + cilium-operator
+ # CPU/mem; clustermesh-metrics.yaml owns mesh-specific PromQL (remote-cluster
+ # connectivity, kvstore event rate, identity count, etc.). All three are
+ # gathered after the workload create and before its delete (see "Gather
+ # measurements" below), so the window covers creation but not teardown.
+ - module:
+ path: /modules/measurements/control-plane.yaml
+ params:
+ action: start
+ group: clustermesh-scale-test
+
+ - module:
+ path: /modules/measurements/cilium.yaml
+ params:
+ action: start
+
+ - module:
+ path: /modules/measurements/clustermesh-metrics.yaml
+ params:
+ action: start
+
+ - module:
+ path: /modules/clustermesh.yaml
+ params:
+ actionName: create
+ tuningSet: DeploymentCreateQps
+
+ - module:
+ path: /modules/scale-test.yaml
+ params:
+ actionName: create
+ namespaces: {{$namespaces}}
+ deploymentsPerNamespace: {{$deploymentsPerNamespace}}
+ replicasPerDeployment: {{$replicasPerDeployment}}
+ tuningSet: DeploymentCreateQps
+ operationTimeout: {{$operationTimeout}}
+
+ # ----- Gather measurements -----
+ # Mirror the start block above. Order matches network-scale convention.
+ - module:
+ path: /modules/measurements/control-plane.yaml
+ params:
+ action: gather
+ group: clustermesh-scale-test
+
+ - module:
+ path: /modules/measurements/cilium.yaml
+ params:
+ action: gather
+
+ - module:
+ path: /modules/measurements/clustermesh-metrics.yaml
+ params:
+ action: gather
+
+ - module:
+ path: /modules/scale-test.yaml
+ params:
+ actionName: delete
+ namespaces: {{$namespaces}}
+ deploymentsPerNamespace: {{$deploymentsPerNamespace}}
+ replicasPerDeployment: {{$replicasPerDeployment}}
+ tuningSet: DeploymentCreateQps
+ operationTimeout: {{$operationTimeout}}
+
+ - module:
+ path: /modules/clustermesh.yaml
+ params:
+ actionName: delete
+ tuningSet: DeploymentCreateQps
diff --git a/modules/python/clusterloader2/clustermesh-scale/config/event-throughput.yaml b/modules/python/clusterloader2/clustermesh-scale/config/event-throughput.yaml
new file mode 100644
index 0000000000..439fdc4e71
--- /dev/null
+++ b/modules/python/clusterloader2/clustermesh-scale/config/event-throughput.yaml
@@ -0,0 +1,166 @@
+name: clustermesh-event-throughput
+
+# Scale scenario #1: Cross-Cluster Event Throughput.
+#
+# Goal (scale testing.txt lines 42-54): determine max sustainable and burst
+# event rates for endpoints, services, and identities propagating across
+# the mesh; measure events/sec processed and a time-to-convergence proxy.
+#
+# Sequence (every cluster runs this in parallel; CL2 fan-out lives in
+# steps/engine/.../execute.yml):
+#
+# 1. Start measurements (control-plane, cilium, clustermesh-metrics +
+# scenario-specific clustermesh-throughput + etcd-metrics).
+# 2. Deploy PodMonitor scraping clustermesh-apiserver.
+# 3. Create Deployments + one global Service each at a controlled QPS.
+# 4. Warmup sleep — let initial create-flurry settle into steady state.
+# 5. Burst rolling-restart of every Deployment (closes the "burst"
+# coverage gap from scale testing.txt line 52).
+# 6. Settle sleep — let kvstore queues drain and propagation latency
+# histograms accumulate steady-state samples.
+# 7. Gather all measurements.
+# 8. Tear down the workload + PodMonitor.
+
+{{$namespaces := DefaultParam .CL2_NAMESPACES 5}}
+{{$deploymentsPerNamespace := DefaultParam .CL2_DEPLOYMENTS_PER_NAMESPACE 4}}
+{{$replicasPerDeployment := DefaultParam .CL2_REPLICAS_PER_DEPLOYMENT 10}}
+{{$operationTimeout := DefaultParam .CL2_OPERATION_TIMEOUT "20m"}}
+{{$apiServerCallsPerSecond := DefaultParam .CL2_API_SERVER_CALLS_PER_SECOND 20}}
+{{$warmupDuration := DefaultParam .CL2_WARMUP_DURATION "30s"}}
+{{$holdDuration := DefaultParam .CL2_HOLD_DURATION "2m"}}
+{{$restartGeneration := DefaultParam .CL2_RESTART_GENERATION 1}}
+
+namespace:
+ number: {{$namespaces}}
+ prefix: clustermesh-et
+ deleteStaleNamespaces: true
+ deleteAutomanagedNamespaces: true
+ enableExistingNamespaces: false
+ deleteNamespaceTimeout: 20m
+
+tuningSets:
+ - name: Sequence
+ parallelismLimitedLoad:
+ parallelismLimit: 1
+ - name: DeploymentCreateQps
+ qpsLoad:
+ qps: {{$apiServerCallsPerSecond}}
+
+steps:
+ # ----- Start measurements -----
+ - module:
+ path: /modules/measurements/control-plane.yaml
+ params:
+ action: start
+ group: clustermesh-event-throughput
+
+ - module:
+ path: /modules/measurements/cilium.yaml
+ params:
+ action: start
+
+ - module:
+ path: /modules/measurements/clustermesh-metrics.yaml
+ params:
+ action: start
+
+ - module:
+ path: /modules/measurements/clustermesh-throughput.yaml
+ params:
+ action: start
+
+ - module:
+ path: /modules/measurements/etcd-metrics.yaml
+ params:
+ action: start
+
+ - module:
+ path: /modules/clustermesh.yaml
+ params:
+ actionName: create
+ tuningSet: DeploymentCreateQps
+
+ # ----- Workload: create -----
+ - module:
+ path: /modules/event-throughput-workload.yaml
+ params:
+ actionName: create
+ generation: 0
+ namespaces: {{$namespaces}}
+ deploymentsPerNamespace: {{$deploymentsPerNamespace}}
+ replicasPerDeployment: {{$replicasPerDeployment}}
+ tuningSet: DeploymentCreateQps
+ operationTimeout: {{$operationTimeout}}
+
+ # ----- Warmup: let the create-flurry settle into steady state -----
+ - name: Warmup before burst
+ measurements:
+ - Identifier: WarmupSleep
+ Method: Sleep
+ Params:
+ duration: {{$warmupDuration}}
+
+ # ----- Burst: rolling-restart of every Deployment -----
+ - module:
+ path: /modules/event-throughput-workload.yaml
+ params:
+ actionName: restart
+ generation: {{$restartGeneration}}
+ namespaces: {{$namespaces}}
+ deploymentsPerNamespace: {{$deploymentsPerNamespace}}
+ replicasPerDeployment: {{$replicasPerDeployment}}
+ tuningSet: DeploymentCreateQps
+ operationTimeout: {{$operationTimeout}}
+
+ # ----- Settle: let kvstore queues drain post-burst -----
+ - name: Settle after burst
+ measurements:
+ - Identifier: SettleSleep
+ Method: Sleep
+ Params:
+ duration: {{$holdDuration}}
+
+ # ----- Gather measurements -----
+ - module:
+ path: /modules/measurements/control-plane.yaml
+ params:
+ action: gather
+ group: clustermesh-event-throughput
+
+ - module:
+ path: /modules/measurements/cilium.yaml
+ params:
+ action: gather
+
+ - module:
+ path: /modules/measurements/clustermesh-metrics.yaml
+ params:
+ action: gather
+
+ - module:
+ path: /modules/measurements/clustermesh-throughput.yaml
+ params:
+ action: gather
+
+ - module:
+ path: /modules/measurements/etcd-metrics.yaml
+ params:
+ action: gather
+
+ # ----- Workload: delete -----
+ - module:
+ path: /modules/event-throughput-workload.yaml
+ params:
+ actionName: delete
+ generation: {{$restartGeneration}}
+ namespaces: {{$namespaces}}
+ deploymentsPerNamespace: {{$deploymentsPerNamespace}}
+ replicasPerDeployment: {{$replicasPerDeployment}}
+ tuningSet: DeploymentCreateQps
+ operationTimeout: {{$operationTimeout}}
+
+ - module:
+ path: /modules/clustermesh.yaml
+ params:
+ actionName: delete
+ tuningSet: DeploymentCreateQps
diff --git a/modules/python/clusterloader2/clustermesh-scale/config/modules/clustermesh.yaml b/modules/python/clusterloader2/clustermesh-scale/config/modules/clustermesh.yaml
new file mode 100644
index 0000000000..175387b2ae
--- /dev/null
+++ b/modules/python/clusterloader2/clustermesh-scale/config/modules/clustermesh.yaml
@@ -0,0 +1,26 @@
+## ClusterMesh module: deploys a PodMonitor for clustermesh-apiserver so the
+## CL2-spawned Prometheus picks up at least one mesh-side metric per cluster.
+## Phase 1 exit criteria require this — see plan.md Phase 1 line 318.
+
+{{$tuningSet := DefaultParam .tuningSet "DeploymentCreateQps"}}
+{{$interval := DefaultParam .interval "15s"}}
+{{ $replicasPerNamespace := 1 }}
+
+{{if eq .actionName "create"}}
+ {{ $replicasPerNamespace = 1 }}
+{{else}}
+ {{ $replicasPerNamespace = 0 }}
+{{end}}
+
+steps:
+ - name: {{.actionName}} ClusterMesh Pod Monitor
+ phases:
+ - namespaceList:
+ - "monitoring"
+ replicasPerNamespace: {{$replicasPerNamespace}}
+ tuningSet: {{$tuningSet}}
+ objectBundle:
+ - objectTemplatePath: "modules/clustermesh/podmonitor.yaml"
+ basename: clustermesh-apiserver
+ templateFillMap:
+ Interval: {{$interval}}
diff --git a/modules/python/clusterloader2/clustermesh-scale/config/modules/clustermesh/podmonitor.yaml b/modules/python/clusterloader2/clustermesh-scale/config/modules/clustermesh/podmonitor.yaml
new file mode 100644
index 0000000000..f667f9e94a
--- /dev/null
+++ b/modules/python/clusterloader2/clustermesh-scale/config/modules/clustermesh/podmonitor.yaml
@@ -0,0 +1,35 @@
+apiVersion: monitoring.coreos.com/v1
+kind: PodMonitor
+metadata:
+ name: clustermesh-apiserver
+ namespace: monitoring
+spec:
+ # With metrics enabled, clustermesh-apiserver pods are scraped on the ports
+ # set by the __address__ relabel rules below: 9963 and 9964 (kvstoremesh
+ # sidecar). NOTE(review): upstream Helm defaults are 9962 apiserver / 9963
+ # etcd / 9964 kvstoremesh; confirm the AKS managed layout and adjust below.
+ selector:
+ matchLabels:
+ k8s-app: clustermesh-apiserver
+ namespaceSelector:
+ matchNames:
+ - kube-system
+ podMetricsEndpoints:
+ - interval: {{.Interval}}
+ honorLabels: true
+ path: /metrics
+ relabelings:
+ - sourceLabels: [__address__]
+ action: replace
+ targetLabel: __address__
+ regex: (.+?)(\:\d+)?
+ replacement: $1:9963
+ - interval: {{.Interval}}
+ honorLabels: true
+ path: /metrics
+ relabelings:
+ - sourceLabels: [__address__]
+ action: replace
+ targetLabel: __address__
+ regex: (.+?)(\:\d+)?
+ replacement: $1:9964
diff --git a/modules/python/clusterloader2/clustermesh-scale/config/modules/event-throughput-deployment.yaml b/modules/python/clusterloader2/clustermesh-scale/config/modules/event-throughput-deployment.yaml
new file mode 100644
index 0000000000..06d677b1b0
--- /dev/null
+++ b/modules/python/clusterloader2/clustermesh-scale/config/modules/event-throughput-deployment.yaml
@@ -0,0 +1,42 @@
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+ name: {{.Name}}
+ labels:
+ group: {{.Group}}
+ app: {{.Name}}
+spec:
+ replicas: {{.Replicas}}
+ selector:
+ matchLabels:
+ name: {{.Name}}
+ template:
+ metadata:
+ labels:
+ name: {{.Name}}
+ group: {{.Group}}
+ app: {{.Name}}
+ annotations:
+ # Bumping RestartGeneration in the pod template forces a rolling
+ # restart on the next CL2 apply — the canonical Kubernetes pattern
+ # for triggering deployment rollouts without changing image. This
+ # drives the burst event flurry for scale-scenario #1.
+ restart-generation: "{{.RestartGeneration}}"
+ spec:
+ containers:
+ - name: pause
+ image: mcr.microsoft.com/oss/kubernetes/pause:3.6
+ # pause:3.6 is the Kubernetes pause container — it literally sleeps
+ # forever and consumes single-digit CPU shares + ~few MB. The
+ # earlier 50m CPU / 50Mi memory limits caused per-node CPU
+ # overcommit (~160% of allocatable on Standard_D4s_v4) at
+ # 100 pods/node, which starves the kubelet+CNI sandbox setup and
+ # leaves a few stragglers stuck Pending → CL2 timeout. Tighter
+ # limits here mirror what real pause-pod e2e fixtures use.
+ resources:
+ requests:
+ cpu: 1m
+ memory: 5Mi
+ limits:
+ cpu: 5m
+ memory: 20Mi
diff --git a/modules/python/clusterloader2/clustermesh-scale/config/modules/event-throughput-service.yaml b/modules/python/clusterloader2/clustermesh-scale/config/modules/event-throughput-service.yaml
new file mode 100644
index 0000000000..7c795f65c3
--- /dev/null
+++ b/modules/python/clusterloader2/clustermesh-scale/config/modules/event-throughput-service.yaml
@@ -0,0 +1,27 @@
+apiVersion: v1
+kind: Service
+metadata:
+ name: {{.Name}}
+ labels:
+ group: {{.Group}}
+ app: {{.Name}}
+ annotations:
+ # Modern annotation (Cilium >= 1.13). The clustermesh-apiserver fans
+ # this service's endpoints out to all peer clusters, exercising the
+ # service-propagation path that scale-scenario #1 measures.
+ service.cilium.io/global: "true"
+ # Legacy annotation (pre-1.13). Applied defensively because the AKS
+ # managed Cilium build version is not yet verified by us. Cilium
+ # ignores annotations it does not understand, so carrying both is safe.
+ io.cilium/global-service: "true"
+spec:
+ selector:
+ name: {{.Name}}
+ ports:
+ - port: 80
+ targetPort: 80
+ protocol: TCP
+ # Headless: backends are advertised across the mesh by clustermesh-apiserver
+ # rather than routed through a per-cluster ClusterIP. Reduces noise from
+ # ClusterIP allocation under high churn.
+ clusterIP: None
diff --git a/modules/python/clusterloader2/clustermesh-scale/config/modules/event-throughput-workload.yaml b/modules/python/clusterloader2/clustermesh-scale/config/modules/event-throughput-workload.yaml
new file mode 100644
index 0000000000..0e0a3e36bd
--- /dev/null
+++ b/modules/python/clusterloader2/clustermesh-scale/config/modules/event-throughput-workload.yaml
@@ -0,0 +1,73 @@
+name: clustermesh-event-throughput-workload
+
+# Workload module for scale-scenario #1: Cross-Cluster Event Throughput.
+#
+# Exercises three flavors of cross-cluster events on every cluster in parallel:
+#
+# create — bring N pods + N global Services up at a controlled QPS.
+# Drives endpoint+identity creation events into the local
+# clustermesh-apiserver, which fans out N*(M-1) writes across
+# the mesh on every other peer's etcd.
+# restart — bump a pod-template annotation so the Deployment triggers a
+# rolling restart. Closes the "burst creation/deletion" gap from
+# scale testing.txt line 52 — measures peak event-flurry capacity
+# when an entire cluster's pods churn over within seconds.
+# delete — set replicasPerNamespace to 0; drives the symmetric delete-event
+# throughput number.
+
+{{$actionName := .actionName}}
+{{$generation := DefaultParam .generation 0}}
+{{$namespaces := .namespaces}}
+{{$deploymentsPerNamespace := .deploymentsPerNamespace}}
+{{$replicasPerDeployment := .replicasPerDeployment}}
+{{$tuningSet := .tuningSet}}
+{{$operationTimeout := .operationTimeout}}
+
+# delete = bring object count to 0; create/restart keep configured count.
+{{$replicasInPhase := $deploymentsPerNamespace}}
+{{if eq $actionName "delete"}}{{$replicasInPhase = 0}}{{end}}
+
+steps:
+ # Per-action WaitForControlledPodsRunning lifecycle: start (registers
+ # watcher with apiVersion+kind so CL2 knows which controllers to track),
+ # then create/restart/delete the workload, then gather. Using a per-action
+ # Identifier keeps the create/restart/delete invocations from clobbering
+ # each other's metric state across the three module calls in
+ # event-throughput.yaml.
+ - name: Start tracking event-throughput pods to be {{$actionName}}d
+ measurements:
+ - Identifier: WaitForControlledPodsRunning-{{$actionName}}
+ Method: WaitForControlledPodsRunning
+ Params:
+ action: start
+ apiVersion: apps/v1
+ kind: Deployment
+ checkIfPodsAreUpdated: true
+ labelSelector: group = clustermesh-event-throughput
+ operationTimeout: {{$operationTimeout}}
+
+ - name: {{$actionName}} event-throughput workload
+ phases:
+ - namespaceRange:
+ min: 1
+ max: {{$namespaces}}
+ replicasPerNamespace: {{$replicasInPhase}}
+ tuningSet: {{$tuningSet}}
+ objectBundle:
+ - basename: et
+ objectTemplatePath: /modules/event-throughput-deployment.yaml
+ templateFillMap:
+ Replicas: {{$replicasPerDeployment}}
+ Group: clustermesh-event-throughput
+ RestartGeneration: {{$generation}}
+ - basename: et
+ objectTemplatePath: /modules/event-throughput-service.yaml
+ templateFillMap:
+ Group: clustermesh-event-throughput
+
+ - name: Wait for event-throughput pods to be {{$actionName}}d
+ measurements:
+ - Identifier: WaitForControlledPodsRunning-{{$actionName}}
+ Method: WaitForControlledPodsRunning
+ Params:
+ action: gather
diff --git a/modules/python/clusterloader2/clustermesh-scale/config/modules/measurements/cilium.yaml b/modules/python/clusterloader2/clustermesh-scale/config/modules/measurements/cilium.yaml
new file mode 100644
index 0000000000..4d27607347
--- /dev/null
+++ b/modules/python/clusterloader2/clustermesh-scale/config/modules/measurements/cilium.yaml
@@ -0,0 +1,226 @@
+{{$action := .action}} # start, gather
+
+{{$suffix := DefaultParam .suffix ""}}
+
+steps:
+ - name: {{$action}} Additional Cilium Measurements
+ measurements:
+ - Identifier: CiliumAvgCPUUsage{{$suffix}}
+ Method: GenericPrometheusQuery
+ Params:
+ action: {{$action}}
+ metricName: Cilium Average CPU Usage {{$suffix}}
+ metricVersion: v1
+ unit: cpu
+ enableViolations: true
+ queries:
+ - name: Perc99
+ query: quantile(0.99, avg_over_time(rate(cilium_process_cpu_seconds_total[1m])[%v:]))
+ - name: Perc90
+ query: quantile(0.90, avg_over_time(rate(cilium_process_cpu_seconds_total[1m])[%v:]))
+ - name: Perc50
+ query: quantile(0.50, avg_over_time(rate(cilium_process_cpu_seconds_total[1m])[%v:]))
+ - Identifier: CiliumMaxCPUUsage{{$suffix}}
+ Method: GenericPrometheusQuery
+ Params:
+ action: {{$action}}
+ metricName: Cilium Max CPU Usage {{$suffix}}
+ metricVersion: v1
+ unit: cpu
+ enableViolations: true
+ queries:
+ - name: Perc99
+ query: quantile(0.99, max_over_time(rate(cilium_process_cpu_seconds_total[1m])[%v:]))
+ - name: Perc90
+ query: quantile(0.90, max_over_time(rate(cilium_process_cpu_seconds_total[1m])[%v:]))
+ - name: Perc50
+ query: quantile(0.50, max_over_time(rate(cilium_process_cpu_seconds_total[1m])[%v:]))
+ - Identifier: CiliumAvgMemUsage{{$suffix}}
+ Method: GenericPrometheusQuery
+ Params:
+ action: {{$action}}
+ metricName: Cilium Avg Memory Usage {{$suffix}}
+ metricVersion: v1
+ unit: MB
+ enableViolations: true
+ queries:
+ - name: Perc99
+ query: quantile(0.99, avg_over_time(cilium_process_resident_memory_bytes[%v:]) / 1024 / 1024)
+ - name: Perc90
+ query: quantile(0.90, avg_over_time(cilium_process_resident_memory_bytes[%v:]) / 1024 / 1024)
+ - name: Perc50
+ query: quantile(0.5, avg_over_time(cilium_process_resident_memory_bytes[%v:]) / 1024 / 1024)
+ - Identifier: CiliumMaxMemUsage{{$suffix}}
+ Method: GenericPrometheusQuery
+ Params:
+ action: {{$action}}
+ metricName: Cilium Max Memory Usage {{$suffix}}
+ metricVersion: v1
+ unit: MB
+ enableViolations: true
+ queries:
+ - name: Perc99
+ query: quantile(0.99, max_over_time(cilium_process_resident_memory_bytes[%v:]) / 1024 / 1024)
+ - name: Perc90
+ query: quantile(0.90, max_over_time(cilium_process_resident_memory_bytes[%v:]) / 1024 / 1024)
+ - name: Perc50
+ query: quantile(0.5, max_over_time(cilium_process_resident_memory_bytes[%v:]) / 1024 / 1024)
+ - Identifier: CiliumOperatorAvgCPUUsage{{$suffix}}
+ Method: GenericPrometheusQuery
+ Params:
+ action: {{$action}}
+ metricName: Cilium Operator Avg CPU Usage {{$suffix}}
+ metricVersion: v1
+ unit: cpu
+ enableViolations: true
+ queries:
+ - name: Perc99
+ query: quantile(0.99, avg_over_time(rate(cilium_operator_process_cpu_seconds_total[1m])[%v:]))
+ - name: Perc90
+ query: quantile(0.90, avg_over_time(rate(cilium_operator_process_cpu_seconds_total[1m])[%v:]))
+ - name: Perc50
+ query: quantile(0.50, avg_over_time(rate(cilium_operator_process_cpu_seconds_total[1m])[%v:]))
+ - Identifier: CiliumOperatorMaxCPUUsage{{$suffix}}
+ Method: GenericPrometheusQuery
+ Params:
+ action: {{$action}}
+ metricName: Cilium Operator Max CPU Usage {{$suffix}}
+ metricVersion: v1
+ unit: cpu
+ enableViolations: true
+ queries:
+ - name: Perc99
+ query: quantile(0.99, max_over_time(rate(cilium_operator_process_cpu_seconds_total[1m])[%v:]))
+ - name: Perc90
+ query: quantile(0.90, max_over_time(rate(cilium_operator_process_cpu_seconds_total[1m])[%v:]))
+ - name: Perc50
+ query: quantile(0.50, max_over_time(rate(cilium_operator_process_cpu_seconds_total[1m])[%v:]))
+ - Identifier: CiliumOperatorMaxMemUsage{{$suffix}}
+ Method: GenericPrometheusQuery
+ Params:
+ action: {{$action}}
+ metricName: Cilium Operator Max Memory Usage {{$suffix}}
+ metricVersion: v1
+ unit: MB
+ enableViolations: true
+ queries:
+ - name: Perc99
+ query: quantile(0.99, max_over_time(cilium_operator_process_resident_memory_bytes[%v:]) / 1024 / 1024)
+ - name: Perc90
+ query: quantile(0.90, max_over_time(cilium_operator_process_resident_memory_bytes[%v:]) / 1024 / 1024)
+ - name: Perc50
+ query: quantile(0.5, max_over_time(cilium_operator_process_resident_memory_bytes[%v:]) / 1024 / 1024)
+ - Identifier: CiliumOperatorAvgMemUsage{{$suffix}}
+ Method: GenericPrometheusQuery
+ Params:
+ action: {{$action}}
+ metricName: Cilium Operator Avg Memory Usage {{$suffix}}
+ metricVersion: v1
+ unit: MB
+ enableViolations: true
+ queries:
+ - name: Perc99
+ query: quantile(0.99, avg_over_time(cilium_operator_process_resident_memory_bytes[%v:]) / 1024 / 1024)
+ - name: Perc90
+ query: quantile(0.90, avg_over_time(cilium_operator_process_resident_memory_bytes[%v:]) / 1024 / 1024)
+ - name: Perc50
+ query: quantile(0.5, avg_over_time(cilium_operator_process_resident_memory_bytes[%v:]) / 1024 / 1024)
+ - Identifier: CiliumContainerFsAvgWrittenBytes{{$suffix}}
+ Method: GenericPrometheusQuery
+ Params:
+ action: {{$action}}
+ metricName: Cilium Container FS Average Written Bytes {{$suffix}}
+ metricVersion: v1
+ unit: bytes/s
+ enableViolations: true
+ queries:
+ - name: Perc99
+ query: quantile(0.99, avg_over_time(rate(container_fs_writes_bytes_total{container="cilium-agent"}[1m])[%v:]))
+ - name: Perc90
+ query: quantile(0.90, avg_over_time(rate(container_fs_writes_bytes_total{container="cilium-agent"}[1m])[%v:]))
+ - name: Perc50
+ query: quantile(0.50, avg_over_time(rate(container_fs_writes_bytes_total{container="cilium-agent"}[1m])[%v:]))
+ - Identifier: CiliumContainerFsMaxWrittenBytes{{$suffix}}
+ Method: GenericPrometheusQuery
+ Params:
+ action: {{$action}}
+ metricName: Cilium Container FS Max Written Bytes {{$suffix}}
+ metricVersion: v1
+ unit: bytes/s
+ enableViolations: true
+ queries:
+ - name: Perc99
+ query: quantile(0.99, max_over_time(rate(container_fs_writes_bytes_total{container="cilium-agent"}[1m])[%v:]))
+ - name: Perc90
+ query: quantile(0.90, max_over_time(rate(container_fs_writes_bytes_total{container="cilium-agent"}[1m])[%v:]))
+ - name: Perc50
+ query: quantile(0.50, max_over_time(rate(container_fs_writes_bytes_total{container="cilium-agent"}[1m])[%v:]))
+ # NOTE: FS write latency (avg/max) was intentionally dropped from this
+ # scenario. The query (rate(container_fs_write_seconds_total) / rate(
+ # container_fs_writes_total) for container="cilium-agent") returns no
+ # samples here because cilium-agent in the clustermesh scenario does
+ # almost all I/O via in-kernel bpf maps, not container fs — the write-op
+ # rate is genuinely ~0, so the division yields no result. Written-bytes
+ # rates (above) still produce useful data and remain the FS signal.
+
+ # ---------------------------------------------------------------------
+ # Network usage (spec line 38, 134: "CPU/memory/network per
+ # component"). cAdvisor reports container_network_*_bytes_total at the
+ # pod-sandbox level (container="POD"), so a container="cilium-agent"
+ # filter would return empty; we match pod=~"cilium-.*" in kube-system
+ # instead (a regex: every pod with that prefix is included). Note Perc99
+ # aggregates per-pod maxima while Perc50 aggregates per-pod averages.
+ # ---------------------------------------------------------------------
+ - Identifier: CiliumContainerNetworkTransmitBytes{{$suffix}}
+ Method: GenericPrometheusQuery
+ Params:
+ action: {{$action}}
+ metricName: Cilium Network Transmit Bytes {{$suffix}}
+ metricVersion: v1
+ unit: bytes/s
+ enableViolations: false
+ queries:
+ - name: Perc99
+ query: quantile(0.99, max_over_time(rate(container_network_transmit_bytes_total{pod=~"cilium-.*",namespace="kube-system"}[1m])[%v:]))
+ - name: Perc50
+ query: quantile(0.50, avg_over_time(rate(container_network_transmit_bytes_total{pod=~"cilium-.*",namespace="kube-system"}[1m])[%v:]))
+ - Identifier: CiliumContainerNetworkReceiveBytes{{$suffix}}
+ Method: GenericPrometheusQuery
+ Params:
+ action: {{$action}}
+ metricName: Cilium Network Receive Bytes {{$suffix}}
+ metricVersion: v1
+ unit: bytes/s
+ enableViolations: false
+ queries:
+ - name: Perc99
+ query: quantile(0.99, max_over_time(rate(container_network_receive_bytes_total{pod=~"cilium-.*",namespace="kube-system"}[1m])[%v:]))
+ - name: Perc50
+ query: quantile(0.50, avg_over_time(rate(container_network_receive_bytes_total{pod=~"cilium-.*",namespace="kube-system"}[1m])[%v:]))
+
+ - Identifier: CiliumContainerRestarts{{$suffix}}
+ Method: GenericPrometheusQuery
+ Params:
+ action: {{$action}}
+ metricName: Cilium Container Restarts {{$suffix}}
+ metricVersion: v1
+ unit: "#"
+ enableViolations: true
+ queries:
+ - name: Perc99
+ query: quantile(0.99, max_over_time(increase(kube_pod_container_status_restarts_total{container="cilium-agent"}[%v])[%v:]))
+ - name: Perc90
+ query: quantile(0.90, max_over_time(increase(kube_pod_container_status_restarts_total{container="cilium-agent"}[%v])[%v:]))
+ - name: Perc50
+ query: quantile(0.50, max_over_time(increase(kube_pod_container_status_restarts_total{container="cilium-agent"}[%v])[%v:]))
+ # - Identifier: AvgCiliumHubbleMetricsCardinality{{$suffix}}
+ # Method: GenericPrometheusQuery
+ # Params:
+ # action: {{$action}}
+ # metricName: Average Cilium Hubble Metrics Cardinality {{$suffix}}
+ # metricVersion: v1
+ # unit: "#"
+ # enableViolations: true
+ # queries:
+ # - name: Avg
+ # query: count({__name__=~"hubble_.*"})
diff --git a/modules/python/clusterloader2/clustermesh-scale/config/modules/measurements/clustermesh-metrics.yaml b/modules/python/clusterloader2/clustermesh-scale/config/modules/measurements/clustermesh-metrics.yaml
new file mode 100644
index 0000000000..18d0a2a85c
--- /dev/null
+++ b/modules/python/clusterloader2/clustermesh-scale/config/modules/measurements/clustermesh-metrics.yaml
@@ -0,0 +1,192 @@
+{{$action := .action}} # start, gather
+
+{{$suffix := DefaultParam .suffix ""}}
+
+# ClusterMesh-specific Prometheus measurements.
+#
+# All metrics here are upstream Cilium clustermesh-apiserver / cilium-agent
+# metrics, scraped via the PodMonitor deployed by config/modules/clustermesh.yaml.
+# If AKS managed Cilium does not expose a given metric, GenericPrometheusQuery
+# returns empty data items (CL2 logs a warning, the run continues) — refine
+# query strings once we have a live mesh to inspect.
+
+steps:
+ - name: {{$action}} ClusterMesh Measurements
+ measurements:
+ # ---------------------------------------------------------------------
+ # Mesh health: how many remote clusters are connected from this cluster's
+ # perspective. In an N-cluster mesh, this gauge should reach (N-1) on every
+ # cluster. Capturing percentile shape across the run window flags drops.
+ # ---------------------------------------------------------------------
+ - Identifier: ClusterMeshRemoteClustersConnected{{$suffix}}
+ Method: GenericPrometheusQuery
+ Params:
+ action: {{$action}}
+ metricName: ClusterMesh Remote Clusters Connected {{$suffix}}
+ metricVersion: v1
+ unit: "#"
+ enableViolations: false
+ queries:
+ - name: Perc99
+ query: quantile(0.99, avg_over_time(cilium_clustermesh_remote_clusters[%v:]))
+ - name: Perc50
+ query: quantile(0.50, avg_over_time(cilium_clustermesh_remote_clusters[%v:]))
+ - name: Min
+ query: min_over_time(min(cilium_clustermesh_remote_clusters)[%v:])
+
+      # ---------------------------------------------------------------------
+      # Mesh failure counter: cumulative remote-cluster connection failures.
+      # Healthy runs should keep this at 0. The delta (max - min over the run)
+      # is taken per series first; max() then surfaces the worst flapping link.
+      # ---------------------------------------------------------------------
+      - Identifier: ClusterMeshRemoteClusterFailures{{$suffix}}
+        Method: GenericPrometheusQuery
+        Params:
+          action: {{$action}}
+          metricName: ClusterMesh Remote Cluster Failures {{$suffix}}
+          metricVersion: v1
+          unit: "#"
+          enableViolations: false
+          queries:
+            - name: MaxIncrease
+              query: max(max_over_time(cilium_clustermesh_remote_cluster_failures[%v:]) - min_over_time(cilium_clustermesh_remote_cluster_failures[%v:]))
+
+ # ---------------------------------------------------------------------
+ # Cross-cluster event throughput — the headline metric for scale scenario
+ # #1 (Cross-Cluster Event Throughput) and #2 (Pod Churn). Rate of kvstore
+ # events queued per second on this cluster.
+ # ---------------------------------------------------------------------
+ - Identifier: ClusterMeshKvstoreEventsRate{{$suffix}}
+ Method: GenericPrometheusQuery
+ Params:
+ action: {{$action}}
+ metricName: ClusterMesh Kvstore Events Rate {{$suffix}}
+ metricVersion: v1
+ unit: events/s
+ enableViolations: false
+ queries:
+ - name: Perc99
+ query: quantile(0.99, max_over_time(rate(cilium_kvstoremesh_kvstore_events_queue_seconds_count[1m])[%v:]))
+ - name: Perc90
+ query: quantile(0.90, max_over_time(rate(cilium_kvstoremesh_kvstore_events_queue_seconds_count[1m])[%v:]))
+ - name: Perc50
+ query: quantile(0.50, avg_over_time(rate(cilium_kvstoremesh_kvstore_events_queue_seconds_count[1m])[%v:]))
+
+ # ---------------------------------------------------------------------
+ # Per-type event rate breakdown (spec line 131: "Event rate (per
+ # type)"). The kvstoremesh kvstore-events histogram carries a
+ # `scope` label tagging which kvstore key family the event touched.
+ # We split into the three families spec line 5 calls out: endpoints,
+ # services, identities. Cilium 1.18 uses these scope values:
+ # identities/v1 — security identities
+ # services/v1 — global Service objects
+ # ip/v1 — endpoint IP-to-identity mappings (endpoints)
+ # nodes/v1 — node tunnel / IPAM advertisements
+ # serviceexports/v1 — MCS-API ServiceExport objects
+ # lease — leader election
+ # cilium/.heartbeat — kvstore liveness heartbeat
+ # cilium/syncedcanaries — initial-sync barrier markers
+ # ---------------------------------------------------------------------
+ - Identifier: ClusterMeshKvstoreEventsRateIdentities{{$suffix}}
+ Method: GenericPrometheusQuery
+ Params:
+ action: {{$action}}
+ metricName: ClusterMesh Kvstore Events Rate Identities {{$suffix}}
+ metricVersion: v1
+ unit: events/s
+ enableViolations: false
+ queries:
+ - name: Perc99
+ query: quantile(0.99, max_over_time(sum(rate(cilium_kvstoremesh_kvstore_events_queue_seconds_count{scope="identities/v1"}[1m]))[%v:]))
+ - name: Perc50
+ query: quantile(0.50, avg_over_time(sum(rate(cilium_kvstoremesh_kvstore_events_queue_seconds_count{scope="identities/v1"}[1m]))[%v:]))
+ - Identifier: ClusterMeshKvstoreEventsRateServices{{$suffix}}
+ Method: GenericPrometheusQuery
+ Params:
+ action: {{$action}}
+ metricName: ClusterMesh Kvstore Events Rate Services {{$suffix}}
+ metricVersion: v1
+ unit: events/s
+ enableViolations: false
+ queries:
+ - name: Perc99
+ query: quantile(0.99, max_over_time(sum(rate(cilium_kvstoremesh_kvstore_events_queue_seconds_count{scope="services/v1"}[1m]))[%v:]))
+ - name: Perc50
+ query: quantile(0.50, avg_over_time(sum(rate(cilium_kvstoremesh_kvstore_events_queue_seconds_count{scope="services/v1"}[1m]))[%v:]))
+ - Identifier: ClusterMeshKvstoreEventsRateEndpoints{{$suffix}}
+ Method: GenericPrometheusQuery
+ Params:
+ action: {{$action}}
+ metricName: ClusterMesh Kvstore Events Rate Endpoints {{$suffix}}
+ metricVersion: v1
+ unit: events/s
+ enableViolations: false
+ queries:
+ - name: Perc99
+ query: quantile(0.99, max_over_time(sum(rate(cilium_kvstoremesh_kvstore_events_queue_seconds_count{scope="ip/v1"}[1m]))[%v:]))
+ - name: Perc50
+ query: quantile(0.50, avg_over_time(sum(rate(cilium_kvstoremesh_kvstore_events_queue_seconds_count{scope="ip/v1"}[1m]))[%v:]))
+
+      # ---------------------------------------------------------------------
+      # Cross-cluster propagation latency proxy (closest upstream signal to
+      # "change in cluster A is visible in cluster B" without synthetic probes):
+      # kvstore operation duration percentiles. rate() spans the whole window
+      # ([%v]) since a [1m] range at gather time sees only the final minute.
+      # ---------------------------------------------------------------------
+      - Identifier: ClusterMeshKvstoreOperationDuration{{$suffix}}
+        Method: GenericPrometheusQuery
+        Params:
+          action: {{$action}}
+          metricName: ClusterMesh Kvstore Operation Duration {{$suffix}}
+          metricVersion: v1
+          unit: s
+          enableViolations: false
+          queries:
+            - name: Perc99
+              query: histogram_quantile(0.99, sum(rate(cilium_kvstoremesh_kvstore_operations_duration_seconds_bucket[%v])) by (le))
+            - name: Perc90
+              query: histogram_quantile(0.90, sum(rate(cilium_kvstoremesh_kvstore_operations_duration_seconds_bucket[%v])) by (le))
+            - name: Perc50
+              query: histogram_quantile(0.50, sum(rate(cilium_kvstoremesh_kvstore_operations_duration_seconds_bucket[%v])) by (le))
+
+ # ---------------------------------------------------------------------
+ # Watch queue depth (saturation signal — spec line 37 "Key signals:
+ # ... Watch queue depth"). cilium_kvstoremesh_kvstore_sync_queue_size
+ # is a gauge: number of items currently waiting to be processed by
+ # the kvstoremesh sync loop. A persistently positive or growing value
+ # is the saturation indicator (event ingest > drain rate).
+ # ---------------------------------------------------------------------
+ - Identifier: ClusterMeshKvstoreSyncQueueSize{{$suffix}}
+ Method: GenericPrometheusQuery
+ Params:
+ action: {{$action}}
+ metricName: ClusterMesh Kvstore Sync Queue Size {{$suffix}}
+ metricVersion: v1
+ unit: "#"
+ enableViolations: false
+ queries:
+ - name: Max
+ query: max(max_over_time(cilium_kvstoremesh_kvstore_sync_queue_size[%v:]))
+ - name: Perc99
+ query: quantile(0.99, max_over_time(cilium_kvstoremesh_kvstore_sync_queue_size[%v:]))
+ - name: Perc50
+ query: quantile(0.50, avg_over_time(cilium_kvstoremesh_kvstore_sync_queue_size[%v:]))
+
+ # ---------------------------------------------------------------------
+ # Identity propagation: cilium identity count. Under cross-cluster pod
+ # churn (scenarios #1, #2, #3), this should track the global identity
+ # set converging across clusters. Divergence flags propagation lag.
+ # ---------------------------------------------------------------------
+ - Identifier: ClusterMeshIdentityCount{{$suffix}}
+ Method: GenericPrometheusQuery
+ Params:
+ action: {{$action}}
+ metricName: ClusterMesh Identity Count {{$suffix}}
+ metricVersion: v1
+ unit: "#"
+ enableViolations: false
+ queries:
+ - name: Perc99
+ query: quantile(0.99, max_over_time(cilium_identity[%v:]))
+ - name: Perc50
+ query: quantile(0.50, avg_over_time(cilium_identity[%v:]))
diff --git a/modules/python/clusterloader2/clustermesh-scale/config/modules/measurements/clustermesh-throughput.yaml b/modules/python/clusterloader2/clustermesh-scale/config/modules/measurements/clustermesh-throughput.yaml
new file mode 100644
index 0000000000..c0dd5f92c6
--- /dev/null
+++ b/modules/python/clusterloader2/clustermesh-scale/config/modules/measurements/clustermesh-throughput.yaml
@@ -0,0 +1,78 @@
+{{$action := .action}} # start, gather
+
+{{$suffix := DefaultParam .suffix ""}}
+
+# Scenario #1 (Cross-Cluster Event Throughput) — extra measurements layered
+# on top of the always-on clustermesh-metrics.yaml. These are specifically
+# tuned to the event-throughput workload's create/restart/delete sequence,
+# and are scoped to this scenario because they only make sense when the
+# workload is actively churning kvstore writes.
+
+steps:
+ - name: {{$action}} ClusterMesh Event Throughput Measurements
+ measurements:
+      # ---------------------------------------------------------------------
+      # Backlog detection: the headline saturation signal. If the rate of
+      # events queued exceeds the rate at which the local agent drains them,
+      # the system is over-saturated. A sustained positive value over the
+      # measurement window is the failure mode scale testing.txt line 14
+      # ("upper bounds — effective QPS limit") is asking us to find.
+      # ---------------------------------------------------------------------
+      - Identifier: ClusterMeshEventBacklog{{$suffix}}
+        Method: GenericPrometheusQuery
+        Params:
+          action: {{$action}}
+          metricName: ClusterMesh Event Backlog Rate {{$suffix}}
+          metricVersion: v1
+          unit: events/s
+          enableViolations: false
+          queries:
+            # Wrap each side in sum() to drop labels — the two metrics carry
+            # non-identical label sets, and PromQL's binary `-` returns an empty
+            # vector when operand label sets don't align. A labelled counter has
+            # no series until its first increment, so `or vector(0)` keeps the
+            # subtraction defined on error-free runs.
+            # NOTE(review): subtrahend is an error rate, not a drain rate — confirm.
+            - name: Perc99
+              query: quantile(0.99, max_over_time((sum(rate(cilium_kvstoremesh_kvstore_events_queue_seconds_count[1m])) - (sum(rate(cilium_kvstoremesh_kvstore_sync_errors_total[1m])) or vector(0)))[%v:]))
+            - name: MaxBurst
+              query: max(max_over_time(rate(cilium_kvstoremesh_kvstore_events_queue_seconds_count[30s])[%v:]))
+
+ # ---------------------------------------------------------------------
+ # Global services gauge: one row per cluster of how many global services
+ # this cluster's clustermesh-apiserver has accepted. With the workload
+ # creating N global Services per cluster across M clusters, every cluster
+ # should observe roughly N*M global services. Divergence flags either
+ # scrape failures or service-propagation lag.
+ # ---------------------------------------------------------------------
+ - Identifier: ClusterMeshGlobalServices{{$suffix}}
+ Method: GenericPrometheusQuery
+ Params:
+ action: {{$action}}
+ metricName: ClusterMesh Global Services {{$suffix}}
+ metricVersion: v1
+ unit: "#"
+ enableViolations: false
+ queries:
+ - name: Max
+ query: max(max_over_time(cilium_clustermesh_global_services[%v:]))
+ - name: Perc50
+ query: quantile(0.50, avg_over_time(cilium_clustermesh_global_services[%v:]))
+
+      # ---------------------------------------------------------------------
+      # Explicit p95 split for kvstore operation latency. clustermesh-metrics.yaml
+      # already emits p50/p90/p99; scenario #1 also surfaces p95 so the latency-
+      # vs-cluster-count dashboard has a smoother percentile gradient. rate() uses
+      # the full window ([%v]); [1m] at gather time covers only the last minute.
+      # ---------------------------------------------------------------------
+      - Identifier: ClusterMeshKvstoreOperationDurationP95{{$suffix}}
+        Method: GenericPrometheusQuery
+        Params:
+          action: {{$action}}
+          metricName: ClusterMesh Kvstore Operation Duration P95 {{$suffix}}
+          metricVersion: v1
+          unit: s
+          enableViolations: false
+          queries:
+            - name: Perc95
+              query: histogram_quantile(0.95, sum(rate(cilium_kvstoremesh_kvstore_operations_duration_seconds_bucket[%v])) by (le))
diff --git a/modules/python/clusterloader2/clustermesh-scale/config/modules/measurements/control-plane.yaml b/modules/python/clusterloader2/clustermesh-scale/config/modules/measurements/control-plane.yaml
new file mode 100644
index 0000000000..47504cbf89
--- /dev/null
+++ b/modules/python/clusterloader2/clustermesh-scale/config/modules/measurements/control-plane.yaml
@@ -0,0 +1,86 @@
+{{$action := .action}} # start, gather
+
+# Feature gates
+{{$podStartupLatencyThreshold := DefaultParam .CL2_POD_STARTUP_LATENCY_THRESHOLD "15s"}}
+{{$ENABLE_VIOLATIONS_FOR_API_CALL_PROMETHEUS_SIMPLE := DefaultParam .CL2_ENABLE_VIOLATIONS_FOR_API_CALL_PROMETHEUS_SIMPLE true}}
+{{$PROMETHEUS_SCRAPE_KUBE_PROXY := DefaultParam .PROMETHEUS_SCRAPE_KUBE_PROXY true}}
+{{$NETWORK_LATENCY_THRESHOLD := DefaultParam .CL2_NETWORK_LATENCY_THRESHOLD "0s"}}
+{{$ENABLE_IN_CLUSTER_NETWORK_LATENCY := DefaultParam .CL2_ENABLE_IN_CLUSTER_NETWORK_LATENCY true}}
+
+{{$suffix := DefaultParam .suffix ""}}
+
+steps:
+ - name: {{$action}} Additional Measurements
+ measurements:
+ - Identifier: APIResponsivenessPrometheus{{$suffix}}
+ Method: APIResponsivenessPrometheus
+ Params:
+ action: {{$action}}
+ enableViolations: {{$ENABLE_VIOLATIONS_FOR_API_CALL_PROMETHEUS_SIMPLE}}
+ useSimpleLatencyQuery: true
+ - Identifier: PodStartupLatency{{$suffix}}
+ Method: PodStartupLatency
+ Params:
+ action: {{$action}}
+ labelSelector: group = {{.group}}
+ threshold: {{$podStartupLatencyThreshold}}
+ - Identifier: ApiserverAvgCPUUsage{{$suffix}}
+ Method: GenericPrometheusQuery
+ Params:
+ action: {{$action}}
+ metricName: Apiserver Average CPU Usage {{$suffix}}
+ metricVersion: v1
+ unit: cpu
+ enableViolations: true
+ queries:
+ - name: Perc99
+ query: quantile(0.99, avg_over_time(rate(process_cpu_seconds_total{endpoint="apiserver"}[1m])[%v:]))
+ - name: Perc90
+ query: quantile(0.90, avg_over_time(rate(process_cpu_seconds_total{endpoint="apiserver"}[1m])[%v:]))
+ - name: Perc50
+ query: quantile(0.50, avg_over_time(rate(process_cpu_seconds_total{endpoint="apiserver"}[1m])[%v:]))
+ - Identifier: ApiserverMaxCPUUsage{{$suffix}}
+ Method: GenericPrometheusQuery
+ Params:
+ action: {{$action}}
+ metricName: Apiserver Max CPU Usage {{$suffix}}
+ metricVersion: v1
+ unit: cpu
+ enableViolations: true
+ queries:
+ - name: Perc99
+ query: quantile(0.99, max_over_time(rate(process_cpu_seconds_total{endpoint="apiserver"}[1m])[%v:]))
+ - name: Perc90
+ query: quantile(0.90, max_over_time(rate(process_cpu_seconds_total{endpoint="apiserver"}[1m])[%v:]))
+ - name: Perc50
+ query: quantile(0.50, max_over_time(rate(process_cpu_seconds_total{endpoint="apiserver"}[1m])[%v:]))
+ - Identifier: ApiserverAvgMemUsage{{$suffix}}
+ Method: GenericPrometheusQuery
+ Params:
+ action: {{$action}}
+ metricName: Apiserver Average Memory Usage {{$suffix}}
+ metricVersion: v1
+ unit: MB
+ enableViolations: true
+ queries:
+ - name: Perc99
+ query: quantile(0.99, avg_over_time(process_resident_memory_bytes{endpoint="apiserver"}[%v:]) / 1024 / 1024)
+ - name: Perc90
+ query: quantile(0.90, avg_over_time(process_resident_memory_bytes{endpoint="apiserver"}[%v:]) / 1024 / 1024)
+ - name: Perc50
+ query: quantile(0.5, avg_over_time(process_resident_memory_bytes{endpoint="apiserver"}[%v:]) / 1024 / 1024)
+ - Identifier: ApiserverMaxMemUsage{{$suffix}}
+ Method: GenericPrometheusQuery
+ Params:
+ action: {{$action}}
+ metricName: Apiserver Max Memory Usage {{$suffix}}
+ metricVersion: v1
+ unit: MB
+ enableViolations: true
+ queries:
+ - name: Perc99
+ query: quantile(0.99, max_over_time(process_resident_memory_bytes{endpoint="apiserver"}[%v:]) / 1024 / 1024)
+ - name: Perc90
+ query: quantile(0.90, max_over_time(process_resident_memory_bytes{endpoint="apiserver"}[%v:]) / 1024 / 1024)
+ - name: Perc50
+ query: quantile(0.5, max_over_time(process_resident_memory_bytes{endpoint="apiserver"}[%v:]) / 1024 / 1024)
diff --git a/modules/python/clusterloader2/clustermesh-scale/config/modules/measurements/etcd-metrics.yaml b/modules/python/clusterloader2/clustermesh-scale/config/modules/measurements/etcd-metrics.yaml
new file mode 100644
index 0000000000..129891204d
--- /dev/null
+++ b/modules/python/clusterloader2/clustermesh-scale/config/modules/measurements/etcd-metrics.yaml
@@ -0,0 +1,158 @@
+{{$action := .action}} # start, gather
+
+{{$suffix := DefaultParam .suffix ""}}
+
+# Etcd-internal measurements for the embedded etcd inside each cluster's
+# clustermesh-apiserver pod.
+#
+# Spec coverage (scale testing.txt):
+# - line 34: "Metrics: Cilium, clustermesh-apiserver, etcd"
+# - line 134: "etcd metrics (watch count, compactions, latency)"
+#
+# Source: the etcd container in the clustermesh-apiserver pod is launched
+# with `--listen-metrics-urls=http://0.0.0.0:9963` and `--metrics=basic`.
+# Our PodMonitor (modules/clustermesh/podmonitor.yaml, port 9963 endpoint)
+# already scrapes that target — we just hadn't been querying the metrics.
+#
+# `--metrics=basic` only emits the etcd_debugging_* family (despite the
+# name, these ARE the basic-tier metrics; the "extensive" tier adds
+# etcd_disk_wal_fsync_*, etcd_network_peer_*, etcd_mvcc_db_total_size_in_bytes,
+# etc., which AKS-managed Cilium does not enable). Queries below pick the
+# best basic-tier proxies for each spec-required signal.
+
+steps:
+ - name: {{$action}} ClusterMesh Etcd Measurements
+ measurements:
+ # ---------------------------------------------------------------------
+ # Watch count (spec line 134 "watch count"). Total watchers currently
+ # registered against this cluster's clustermesh-apiserver etcd. Each
+ # remote cluster's kvstoremesh maintains watchers for endpoints,
+ # services, and identities, so this scales with mesh size and traffic.
+ # Slow-watcher count is the back-pressure signal: a non-zero value
+ # means watchers can't keep up with the event stream.
+ # ---------------------------------------------------------------------
+ - Identifier: ClusterMeshEtcdWatchCount{{$suffix}}
+ Method: GenericPrometheusQuery
+ Params:
+ action: {{$action}}
+ metricName: ClusterMesh Etcd Watch Count {{$suffix}}
+ metricVersion: v1
+ unit: "#"
+ enableViolations: false
+ queries:
+ - name: Max
+ query: max(max_over_time(etcd_debugging_mvcc_watcher_total[%v:]))
+ - name: Perc99
+ query: quantile(0.99, max_over_time(etcd_debugging_mvcc_watcher_total[%v:]))
+ - name: Perc50
+ query: quantile(0.50, avg_over_time(etcd_debugging_mvcc_watcher_total[%v:]))
+
+ - Identifier: ClusterMeshEtcdSlowWatchers{{$suffix}}
+ Method: GenericPrometheusQuery
+ Params:
+ action: {{$action}}
+ metricName: ClusterMesh Etcd Slow Watchers {{$suffix}}
+ metricVersion: v1
+ unit: "#"
+ enableViolations: false
+ queries:
+ - name: Max
+ query: max(max_over_time(etcd_debugging_mvcc_slow_watcher_total[%v:]))
+ - name: Perc99
+ query: quantile(0.99, max_over_time(etcd_debugging_mvcc_slow_watcher_total[%v:]))
+
+ # ---------------------------------------------------------------------
+ # Pending events: events queued for delivery to watchers but not yet
+ # consumed. A growing value over the run window is the etcd-side
+ # equivalent of the kvstoremesh sync queue depth — back-pressure from
+ # the consumer side.
+ # ---------------------------------------------------------------------
+ - Identifier: ClusterMeshEtcdPendingEvents{{$suffix}}
+ Method: GenericPrometheusQuery
+ Params:
+ action: {{$action}}
+ metricName: ClusterMesh Etcd Pending Events {{$suffix}}
+ metricVersion: v1
+ unit: "#"
+ enableViolations: false
+ queries:
+ - name: Max
+ query: max(max_over_time(etcd_debugging_mvcc_pending_events_total[%v:]))
+ - name: Perc99
+ query: quantile(0.99, max_over_time(etcd_debugging_mvcc_pending_events_total[%v:]))
+
+ # ---------------------------------------------------------------------
+ # Compactions (spec line 134 "compactions"). Auto-compaction is
+ # enabled with `--auto-compaction-retention=1` (1-hour retention). Two
+ # signals: how long a compaction takes (latency) and how many keys
+ # were removed (work done).
+ # ---------------------------------------------------------------------
+ - Identifier: ClusterMeshEtcdCompactionDuration{{$suffix}}
+ Method: GenericPrometheusQuery
+ Params:
+ action: {{$action}}
+ metricName: ClusterMesh Etcd Compaction Duration {{$suffix}}
+ metricVersion: v1
+ unit: ms
+ enableViolations: false
+ queries:
+ - name: Perc99
+ query: histogram_quantile(0.99, sum(rate(etcd_debugging_mvcc_db_compaction_total_duration_milliseconds_bucket[%v])) by (le))
+ - name: Perc50
+ query: histogram_quantile(0.50, sum(rate(etcd_debugging_mvcc_db_compaction_total_duration_milliseconds_bucket[%v])) by (le))
+
+      - Identifier: ClusterMeshEtcdCompactionKeys{{$suffix}}
+        Method: GenericPrometheusQuery
+        Params:
+          action: {{$action}}
+          metricName: ClusterMesh Etcd Compacted Keys {{$suffix}}
+          metricVersion: v1
+          unit: "#"
+          enableViolations: false
+          queries:
+            - name: TotalIncrease  # per-series delta over the window, then summed across series
+              query: sum(max_over_time(etcd_debugging_mvcc_db_compaction_keys_total[%v:]) - min_over_time(etcd_debugging_mvcc_db_compaction_keys_total[%v:]))
+
+      # ---------------------------------------------------------------------
+      # Disk-write latency (spec line 134 "latency"). With --metrics=basic
+      # we don't have etcd_disk_wal_fsync_duration_seconds; the closest
+      # available proxy is etcd_debugging_disk_backend_commit_write_duration
+      # (how long it takes to commit a write txn to the bbolt backend).
+      # rate() spans the whole measurement window ([%v]); a fixed [1m] range
+      # evaluated at gather time would only see the final minute of the run.
+      # ---------------------------------------------------------------------
+      - Identifier: ClusterMeshEtcdBackendWriteDuration{{$suffix}}
+        Method: GenericPrometheusQuery
+        Params:
+          action: {{$action}}
+          metricName: ClusterMesh Etcd Backend Write Duration {{$suffix}}
+          metricVersion: v1
+          unit: s
+          enableViolations: false
+          queries:
+            - name: Perc99
+              query: histogram_quantile(0.99, sum(rate(etcd_debugging_disk_backend_commit_write_duration_seconds_bucket[%v])) by (le))
+            - name: Perc90
+              query: histogram_quantile(0.90, sum(rate(etcd_debugging_disk_backend_commit_write_duration_seconds_bucket[%v])) by (le))
+            - name: Perc50
+              query: histogram_quantile(0.50, sum(rate(etcd_debugging_disk_backend_commit_write_duration_seconds_bucket[%v])) by (le))
+
+ # ---------------------------------------------------------------------
+      # MVCC store size proxy. With --metrics=basic we don't get
+      # etcd_mvcc_db_total_size_in_bytes. etcd_debugging_mvcc_keys_total (key
+      # count) and etcd_debugging_mvcc_total_put_size_in_bytes (cumulative bytes
+      # written) together bound the working set; only the key count is queried here.
+ # ---------------------------------------------------------------------
+ - Identifier: ClusterMeshEtcdMvccKeys{{$suffix}}
+ Method: GenericPrometheusQuery
+ Params:
+ action: {{$action}}
+ metricName: ClusterMesh Etcd MVCC Keys {{$suffix}}
+ metricVersion: v1
+ unit: "#"
+ enableViolations: false
+ queries:
+ - name: Max
+ query: max(max_over_time(etcd_debugging_mvcc_keys_total[%v:]))
+ - name: Perc50
+ query: quantile(0.50, avg_over_time(etcd_debugging_mvcc_keys_total[%v:]))
diff --git a/modules/python/clusterloader2/clustermesh-scale/config/modules/scale-test-deployment.yaml b/modules/python/clusterloader2/clustermesh-scale/config/modules/scale-test-deployment.yaml
new file mode 100644
index 0000000000..9ceffc8595
--- /dev/null
+++ b/modules/python/clusterloader2/clustermesh-scale/config/modules/scale-test-deployment.yaml
@@ -0,0 +1,27 @@
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+ name: {{.Name}}
+ labels:
+ group: {{.Group}}
+spec:
+ replicas: {{.Replicas}}
+ selector:
+ matchLabels:
+ name: {{.Name}}
+ template:
+ metadata:
+ labels:
+ name: {{.Name}}
+ group: {{.Group}}
+ spec:
+ containers:
+ - name: pause
+ image: mcr.microsoft.com/oss/kubernetes/pause:3.6
+ resources:
+ requests:
+ cpu: 1m
+ memory: 5Mi
+ limits:
+ cpu: 5m
+ memory: 20Mi
diff --git a/modules/python/clusterloader2/clustermesh-scale/config/modules/scale-test.yaml b/modules/python/clusterloader2/clustermesh-scale/config/modules/scale-test.yaml
new file mode 100644
index 0000000000..5fd806c60b
--- /dev/null
+++ b/modules/python/clusterloader2/clustermesh-scale/config/modules/scale-test.yaml
@@ -0,0 +1,57 @@
+name: clustermesh-scale-test-module
+
+# Trivial pod deployment module: creates or deletes
+# namespaces x deploymentsPerNamespace x replicasPerDeployment
+# pause-image pods on the target cluster. No traffic, no churn, no policies.
+
+{{$actionName := .actionName}}
+{{$namespaces := .namespaces}}
+{{$deploymentsPerNamespace := .deploymentsPerNamespace}}
+{{$replicasPerDeployment := .replicasPerDeployment}}
+{{$tuningSet := .tuningSet}}
+{{$operationTimeout := .operationTimeout}}
+
+{{$totalDeployments := MultiplyInt $namespaces $deploymentsPerNamespace}}
+
+steps:
+ # Register a fresh WaitForControlledPodsRunning watcher BEFORE the
+ # create/delete phase. Without this, the second invocation of this module
+ # (actionName=delete) errors with "metric WaitForControlledPodsRunning has
+ # not been started" — CL2 closes the metric after the first `gather`, so
+ # each invocation needs its own start. We use a per-action Identifier
+ # ("...-create" / "...-delete") so the start and gather pair cleanly even
+ # if the runtime ever caches metrics by Identifier across invocations.
+ - name: Start tracking pods to be {{$actionName}}d
+ measurements:
+ - Identifier: WaitForControlledPodsRunning-{{$actionName}}
+ Method: WaitForControlledPodsRunning
+ Params:
+ action: start
+ # CL2 needs apiVersion+kind to know which controllers to track on
+ # start; we deploy Deployment objects (see scale-test-deployment.yaml).
+ apiVersion: apps/v1
+ kind: Deployment
+ checkIfPodsAreUpdated: true
+ labelSelector: group = clustermesh-scale-test
+ operationTimeout: {{$operationTimeout}}
+
+ - name: {{$actionName}} deployments
+ phases:
+ - namespaceRange:
+ min: 1
+ max: {{$namespaces}}
+ replicasPerNamespace: {{$deploymentsPerNamespace}}
+ tuningSet: {{$tuningSet}}
+ objectBundle:
+ - basename: scale-test
+ objectTemplatePath: /modules/scale-test-deployment.yaml
+ templateFillMap:
+ Replicas: {{$replicasPerDeployment}}
+ Group: clustermesh-scale-test
+
+ - name: Wait for deployments to be {{$actionName}}d
+ measurements:
+ - Identifier: WaitForControlledPodsRunning-{{$actionName}}
+ Method: WaitForControlledPodsRunning
+ Params:
+ action: gather
diff --git a/modules/python/clusterloader2/clustermesh-scale/scale.py b/modules/python/clusterloader2/clustermesh-scale/scale.py
new file mode 100644
index 0000000000..35047f122a
--- /dev/null
+++ b/modules/python/clusterloader2/clustermesh-scale/scale.py
@@ -0,0 +1,258 @@
+"""
+ClusterMesh scale-test harness.
+
+Single-cluster invocation. The Telescope pipeline fans out by calling this
+script once per fleet member (driven by `az fleet clustermeshprofile list-members`
+in steps/topology/clustermesh-scale/execute-clusterloader2.yml). Each invocation
+emits one JSONL with a `cluster` attribution column so concatenated results from
+N clusters are queryable per-cluster downstream.
+
+Phase 1 is intentionally trivial: deploy a small fixed number of pods, no churn,
+no fortio, no network policies. The goal of Phase 1 is to prove the multi-cluster
+harness + topology + aggregation works end-to-end. Real measurements
+(cross-cluster event throughput, identity propagation, etc.) come in plan.md
+Phase 2 by adding measurement modules to config/modules/measurements/ and new
+parameters to configure/collect.
+"""
+import argparse
+import json
+import os
+from datetime import datetime, timezone
+
+from clusterloader2.utils import parse_xml_to_json, run_cl2_command, process_cl2_reports
+
+
+def configure_clusterloader2(
+    namespaces,
+    deployments_per_namespace,
+    replicas_per_deployment,
+    operation_timeout,
+    override_file,
+):
+    """Generate the CL2 overrides file for one cluster and echo it to stdout.
+
+    Args:
+        namespaces: Emitted as CL2_NAMESPACES.
+        deployments_per_namespace: Emitted as CL2_DEPLOYMENTS_PER_NAMESPACE.
+        replicas_per_deployment: Emitted as CL2_REPLICAS_PER_DEPLOYMENT.
+        operation_timeout: Emitted as CL2_OPERATION_TIMEOUT (e.g. "15m").
+        override_file: Destination path; the file is created or truncated.
+    """
+    override_lines = [
+        # Prometheus stack. The Cilium scrape flags stay ON so the
+        # cilium/control-plane/clustermesh measurement modules have data to
+        # query. Only the memory LIMIT is written here: it is honored as an
+        # overrides key and must be >= the request to satisfy k8s admission.
+        # The base memory REQUEST is set via the --prometheus-memory-request
+        # CLI flag in execute_clusterloader2, because the
+        # CL2_PROMETHEUS_MEMORY_REQUEST overrides key is not honored by this
+        # CL2 image.
+        "CL2_PROMETHEUS_TOLERATE_MASTER: true",
+        "CL2_PROMETHEUS_MEMORY_LIMIT: 2Gi",
+        # Pin Prometheus to the dedicated `prompool` node (label
+        # prometheus=true is set in azure-2.tfvars extra_node_pool). Otherwise
+        # prometheus-k8s lands on the default workload pool and competes with
+        # the 200 event-throughput pods for CPU/memory, causing per-node
+        # overcommit and Pending workload pods.
+        'CL2_PROMETHEUS_NODE_SELECTOR: "prometheus: \\"true\\""',
+        "CL2_PROMETHEUS_SCRAPE_CILIUM_AGENT: true",
+        "CL2_PROMETHEUS_SCRAPE_CILIUM_OPERATOR: true",
+        "CL2_POD_STARTUP_LATENCY_THRESHOLD: 3m",
+        # The APIResponsivenessPrometheus default SLO (perc99 <= 1s) is tuned
+        # for production-scale clusters in steady state; on Phase-1 dev
+        # clusters the kube-apiserver hits multi-second perc99 during the
+        # Prometheus stack bring-up (mutatingwebhookconfigurations APPLY,
+        # customresourcedefinitions POST/PUT). The metric is still recorded;
+        # this only stops CL2 from failing the test on threshold breaches.
+        "CL2_ENABLE_VIOLATIONS_FOR_API_CALL_PROMETHEUS_SIMPLE: false",
+        # Topology knobs: trivial defaults for the Phase 1 vertical slice.
+        f"CL2_NAMESPACES: {namespaces}",
+        f"CL2_DEPLOYMENTS_PER_NAMESPACE: {deployments_per_namespace}",
+        f"CL2_REPLICAS_PER_DEPLOYMENT: {replicas_per_deployment}",
+        f"CL2_OPERATION_TIMEOUT: {operation_timeout}",
+    ]
+
+    with open(override_file, "w", encoding="utf-8") as sink:
+        sink.writelines(f"{line}\n" for line in override_lines)
+
+    # Echo the rendered overrides so they show up in the job log.
+    with open(override_file, "r", encoding="utf-8") as written:
+        print(f"Content of file {override_file}:\n{written.read()}")
+
+
+def execute_clusterloader2(
+    cl2_image,
+    cl2_config_dir,
+    cl2_report_dir,
+    cl2_config_file,
+    kubeconfig,
+    provider,
+):
+    """Run CL2 against a single cluster with the Prometheus stack enabled.
+
+    All arguments are forwarded to run_cl2_command; the overrides file is
+    always requested and Prometheus is left running after the test.
+
+    Args:
+        cl2_image: CL2 container image reference.
+        cl2_config_dir: Directory holding the CL2 config.
+        cl2_report_dir: Directory CL2 writes its reports to.
+        cl2_config_file: Config file name passed as `cl2_config_file`.
+        kubeconfig: Kubeconfig path for the target cluster.
+        provider: CL2 provider name.
+    """
+    prometheus_options = {
+        "enable_prometheus": True,
+        "tear_down_prometheus": False,
+        "scrape_kubelets": True,
+        "scrape_ksm": True,
+        "scrape_metrics_server": True,
+        # The CL2 default request is 10Gi, which doesn't fit a
+        # Standard_D4s_v4 / 16GB node after k8s + Cilium overhead. It is set
+        # through the CLI flag rather than the `CL2_PROMETHEUS_MEMORY_REQUEST`
+        # overrides.yaml key, which this CL2 image does not honor (verified
+        # via prometheus-operator log showing PrometheusMemoryRequest:10Gi at
+        # runtime). Pair this with CL2_PROMETHEUS_MEMORY_LIMIT in the
+        # overrides file so request <= limit.
+        "prometheus_memory_request": "1Gi",
+    }
+    run_cl2_command(
+        kubeconfig,
+        cl2_image,
+        cl2_config_dir,
+        cl2_report_dir,
+        provider,
+        cl2_config_file=cl2_config_file,
+        overrides=True,
+        **prometheus_options,
+    )
+
+
+def collect_clusterloader2(
+    cl2_report_dir,
+    cloud_info,
+    run_id,
+    run_url,
+    result_file,
+    test_type,
+    start_timestamp,
+    cluster_name,
+    cluster_count,
+    mesh_size,
+    namespaces,
+    deployments_per_namespace,
+    replicas_per_deployment,
+    trigger_reason="",
+):
+    """Convert one cluster's CL2 report directory into a result file.
+
+    Parses `junit.xml` under `cl2_report_dir`, derives the run status from the
+    first testsuite's `failures` count, builds a row template carrying the
+    run-level and per-cluster attribution fields, and writes whatever
+    process_cl2_reports returns for that template to `result_file`.
+
+    Args:
+        cl2_report_dir: Directory containing junit.xml and the CL2 reports.
+        cloud_info: Stored unchanged under the "cloud_info" key.
+        run_id: Stored unchanged under "run_id".
+        run_url: Stored unchanged under "run_url".
+        result_file: Output path. Its parent directory is created when the
+            path has one; a bare filename is written to the current directory.
+        test_type: Stored unchanged under "test_type".
+        start_timestamp: Stored unchanged under "start_timestamp".
+        cluster_name: Cluster attribution, stored at top level and in
+            test_details.
+        cluster_count: Number of clusters discovered at run time.
+        mesh_size: Configured target cluster count.
+        namespaces: Namespace count used for the workload.
+        deployments_per_namespace: Deployments per namespace.
+        replicas_per_deployment: Replicas per deployment.
+        trigger_reason: Stored under test_details["trigger_reason"].
+
+    Raises:
+        Exception: If the parsed junit report contains no testsuites.
+    """
+    details = parse_xml_to_json(os.path.join(cl2_report_dir, "junit.xml"), indent=2)
+    json_data = json.loads(details)
+    testsuites = json_data["testsuites"]
+
+    if testsuites:
+        status = "success" if testsuites[0]["failures"] == 0 else "failure"
+    else:
+        raise Exception(f"No testsuites found in the report! Raw data: {details}")
+
+    template = {
+        "timestamp": datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ"),
+        "status": status,
+        "group": None,
+        "measurement": None,
+        "result": None,
+        "test_details": {
+            "trigger_reason": trigger_reason,
+            # Cluster attribution — every row emitted for this run is tagged
+            # with the cluster it came from, so downstream Kusto queries can
+            # group/filter by cluster across an N-cluster mesh test.
+            "cluster": cluster_name,
+            # mesh_size is the configured target N (from pipeline matrix);
+            # cluster_count is what was actually discovered at run time. Querying
+            # `mesh_size != cluster_count` in Kusto surfaces partial-mesh runs
+            # (e.g., a Fleet member that failed to join) without needing a join
+            # to control-plane logs.
+            "mesh_size": mesh_size,
+            "cluster_count": cluster_count,
+            "namespaces": namespaces,
+            "deployments_per_namespace": deployments_per_namespace,
+            "replicas_per_deployment": replicas_per_deployment,
+            "pods_per_cluster": namespaces * deployments_per_namespace * replicas_per_deployment,
+            # Failure payload of the first testcase of the first testsuite,
+            # or None when the suite has no testcases.
+            "details": (
+                testsuites[0]["testcases"][0].get("failure", None)
+                if testsuites[0].get("testcases")
+                else None
+            ),
+        },
+        "cloud_info": cloud_info,
+        "run_id": run_id,
+        "run_url": run_url,
+        "test_type": test_type,
+        "start_timestamp": start_timestamp,
+        # parameters (top-level for Kusto column convenience)
+        "cluster": cluster_name,
+        "mesh_size": mesh_size,
+        "cluster_count": cluster_count,
+        "namespaces": namespaces,
+        "deployments_per_namespace": deployments_per_namespace,
+        "replicas_per_deployment": replicas_per_deployment,
+    }
+    content = process_cl2_reports(cl2_report_dir, template)
+
+    # os.path.dirname() returns "" for a bare filename and os.makedirs("")
+    # raises FileNotFoundError, so only create the parent when there is one.
+    result_dir = os.path.dirname(result_file)
+    if result_dir:
+        os.makedirs(result_dir, exist_ok=True)
+    with open(result_file, "w", encoding="utf-8") as f:
+        f.write(content)
+
+
+def main():
+    """CLI entry point: parse argv and dispatch to the chosen subcommand.
+
+    Subcommands are `configure`, `execute` and `collect`; each forwards its
+    parsed options positionally to the matching *_clusterloader2 function.
+    When no subcommand is given, the parser help is printed instead.
+    """
+    parser = argparse.ArgumentParser(description="ClusterMesh scale-test harness.")
+    subparsers = parser.add_subparsers(dest="command")
+
+    # configure: topology knobs plus the overrides file destination.
+    configure_parser = subparsers.add_parser("configure", help="Write CL2 overrides file")
+    configure_parser.add_argument("--namespaces", type=int, required=True)
+    configure_parser.add_argument("--deployments-per-namespace", type=int, required=True)
+    configure_parser.add_argument("--replicas-per-deployment", type=int, required=True)
+    configure_parser.add_argument("--operation-timeout", type=str, default="15m")
+    configure_parser.add_argument("--cl2_override_file", type=str, required=True,
+                                  help="Path to the overrides of CL2 config file")
+
+    # execute: everything needed to run CL2 against one cluster.
+    execute_parser = subparsers.add_parser("execute", help="Run CL2 against a single cluster")
+    execute_parser.add_argument("--cl2-image", type=str, required=True)
+    execute_parser.add_argument("--cl2-config-dir", type=str, required=True)
+    execute_parser.add_argument("--cl2-report-dir", type=str, required=True)
+    execute_parser.add_argument("--cl2-config-file", type=str, required=True)
+    execute_parser.add_argument("--kubeconfig", type=str, required=True)
+    execute_parser.add_argument("--provider", type=str, required=True)
+
+    # collect: run-level metadata plus per-cluster attribution.
+    collect_parser = subparsers.add_parser("collect", help="Collect results for one cluster")
+    collect_parser.add_argument("--cl2_report_dir", type=str, required=True)
+    collect_parser.add_argument("--cloud_info", type=str, default="")
+    collect_parser.add_argument("--run_id", type=str, required=True)
+    collect_parser.add_argument("--run_url", type=str, default="")
+    collect_parser.add_argument("--result_file", type=str, required=True)
+    collect_parser.add_argument("--test_type", type=str, default="default-config")
+    collect_parser.add_argument("--start_timestamp", type=str, required=True)
+    collect_parser.add_argument("--cluster-name", type=str, required=True,
+                                help="Fleet member / AKS cluster identity for attribution")
+    collect_parser.add_argument("--cluster-count", type=int, required=True,
+                                help="Total clusters in the mesh for this run (N)")
+    collect_parser.add_argument("--mesh-size", type=int, required=True,
+                                help="Configured target cluster count from the pipeline matrix; "
+                                     "compared against --cluster-count to detect partial-mesh runs")
+    collect_parser.add_argument("--namespaces", type=int, required=True)
+    collect_parser.add_argument("--deployments-per-namespace", type=int, required=True)
+    collect_parser.add_argument("--replicas-per-deployment", type=int, required=True)
+    collect_parser.add_argument("--trigger_reason", type=str, default="")
+
+    args = parser.parse_args()
+
+    # The lambdas defer both attribute access and the module-level function
+    # lookup until the selected handler actually runs.
+    handlers = {
+        "configure": lambda: configure_clusterloader2(
+            args.namespaces,
+            args.deployments_per_namespace,
+            args.replicas_per_deployment,
+            args.operation_timeout,
+            args.cl2_override_file,
+        ),
+        "execute": lambda: execute_clusterloader2(
+            args.cl2_image,
+            args.cl2_config_dir,
+            args.cl2_report_dir,
+            args.cl2_config_file,
+            args.kubeconfig,
+            args.provider,
+        ),
+        "collect": lambda: collect_clusterloader2(
+            args.cl2_report_dir,
+            args.cloud_info,
+            args.run_id,
+            args.run_url,
+            args.result_file,
+            args.test_type,
+            args.start_timestamp,
+            args.cluster_name,
+            args.cluster_count,
+            args.mesh_size,
+            args.namespaces,
+            args.deployments_per_namespace,
+            args.replicas_per_deployment,
+            args.trigger_reason,
+        ),
+    }
+
+    handler = handlers.get(args.command)
+    if handler is None:
+        # No subcommand supplied.
+        parser.print_help()
+    else:
+        handler()
+
+
+if __name__ == "__main__":
+ main()
diff --git a/modules/python/clusterloader2/utils.py b/modules/python/clusterloader2/utils.py
index 50deb2ed85..f0cec83046 100644
--- a/modules/python/clusterloader2/utils.py
+++ b/modules/python/clusterloader2/utils.py
@@ -25,7 +25,8 @@
def run_cl2_command(kubeconfig, cl2_image, cl2_config_dir, cl2_report_dir, provider, cl2_config_file="config.yaml", overrides=False, enable_prometheus=False, tear_down_prometheus=True,
enable_exec_service=False, scrape_kubelets=False,
- scrape_containerd=False, scrape_ksm=False, scrape_metrics_server=False):
+ scrape_containerd=False, scrape_ksm=False, scrape_metrics_server=False,
+ prometheus_memory_request=None):
docker_client = DockerClient()
command = f"""--provider={provider} --v=2
@@ -42,6 +43,14 @@ def run_cl2_command(kubeconfig, cl2_image, cl2_config_dir, cl2_report_dir, provi
if scrape_containerd:
command += f" --prometheus-scrape-containerd={scrape_containerd}"
+ if prometheus_memory_request:
+ # CL2 default is 10Gi. Smaller-than-default node SKUs (e.g. AKS
+ # Standard_D4s_v4 with 16GB) can't schedule the pod with the default
+ # request, and the resource-quota / limit ratio in the bundled
+ # prometheus manifests is rejected by k8s admission. Optional
+ # parameter — None preserves CL2 default for existing callers.
+ command += f" --prometheus-memory-request={prometheus_memory_request}"
+
if overrides:
command += " --testoverrides=/root/perf-tests/clusterloader2/config/overrides.yaml"
diff --git a/modules/python/tests/mock_data/clustermesh-scale/report/mesh-1/GenericPrometheusQuery_PodStartupLatency_clustermesh-scale-test_2026-04-28T15:00:00Z.json b/modules/python/tests/mock_data/clustermesh-scale/report/mesh-1/GenericPrometheusQuery_PodStartupLatency_clustermesh-scale-test_2026-04-28T15:00:00Z.json
new file mode 100644
index 0000000000..3100934955
--- /dev/null
+++ b/modules/python/tests/mock_data/clustermesh-scale/report/mesh-1/GenericPrometheusQuery_PodStartupLatency_clustermesh-scale-test_2026-04-28T15:00:00Z.json
@@ -0,0 +1,29 @@
+{
+ "version": "v1",
+ "dataItems": [
+ {
+ "labels": {
+ "Metric": "Perc99"
+ },
+ "data": {
+ "value": 1.2
+ }
+ },
+ {
+ "labels": {
+ "Metric": "Perc90"
+ },
+ "data": {
+ "value": 0.8
+ }
+ },
+ {
+ "labels": {
+ "Metric": "Perc50"
+ },
+ "data": {
+ "value": 0.4
+ }
+ }
+ ]
+}
diff --git a/modules/python/tests/mock_data/clustermesh-scale/report/mesh-1/junit.xml b/modules/python/tests/mock_data/clustermesh-scale/report/mesh-1/junit.xml
new file mode 100644
index 0000000000..34a14e3425
--- /dev/null
+++ b/modules/python/tests/mock_data/clustermesh-scale/report/mesh-1/junit.xml
@@ -0,0 +1,9 @@
+
+
+
+
+
+
+
+
+
diff --git a/modules/python/tests/mock_data/clustermesh-scale/report/mesh-2/GenericPrometheusQuery_PodStartupLatency_clustermesh-scale-test_2026-04-28T15:00:30Z.json b/modules/python/tests/mock_data/clustermesh-scale/report/mesh-2/GenericPrometheusQuery_PodStartupLatency_clustermesh-scale-test_2026-04-28T15:00:30Z.json
new file mode 100644
index 0000000000..dbfb9aacc8
--- /dev/null
+++ b/modules/python/tests/mock_data/clustermesh-scale/report/mesh-2/GenericPrometheusQuery_PodStartupLatency_clustermesh-scale-test_2026-04-28T15:00:30Z.json
@@ -0,0 +1,29 @@
+{
+ "version": "v1",
+ "dataItems": [
+ {
+ "labels": {
+ "Metric": "Perc99"
+ },
+ "data": {
+ "value": 1.5
+ }
+ },
+ {
+ "labels": {
+ "Metric": "Perc90"
+ },
+ "data": {
+ "value": 1.0
+ }
+ },
+ {
+ "labels": {
+ "Metric": "Perc50"
+ },
+ "data": {
+ "value": 0.5
+ }
+ }
+ ]
+}
diff --git a/modules/python/tests/mock_data/clustermesh-scale/report/mesh-2/junit.xml b/modules/python/tests/mock_data/clustermesh-scale/report/mesh-2/junit.xml
new file mode 100644
index 0000000000..ee983d20bc
--- /dev/null
+++ b/modules/python/tests/mock_data/clustermesh-scale/report/mesh-2/junit.xml
@@ -0,0 +1,9 @@
+
+
+
+
+
+
+
+
+
diff --git a/modules/python/tests/mock_data/clustermesh-scale/report/mesh-fail/GenericPrometheusQuery_PodStartupLatency_clustermesh-scale-test_2026-04-28T15:01:00Z.json b/modules/python/tests/mock_data/clustermesh-scale/report/mesh-fail/GenericPrometheusQuery_PodStartupLatency_clustermesh-scale-test_2026-04-28T15:01:00Z.json
new file mode 100644
index 0000000000..868c276002
--- /dev/null
+++ b/modules/python/tests/mock_data/clustermesh-scale/report/mesh-fail/GenericPrometheusQuery_PodStartupLatency_clustermesh-scale-test_2026-04-28T15:01:00Z.json
@@ -0,0 +1,13 @@
+{
+ "version": "v1",
+ "dataItems": [
+ {
+ "labels": {
+ "Metric": "Perc99"
+ },
+ "data": {
+ "value": 99.9
+ }
+ }
+ ]
+}
diff --git a/modules/python/tests/mock_data/clustermesh-scale/report/mesh-fail/junit.xml b/modules/python/tests/mock_data/clustermesh-scale/report/mesh-fail/junit.xml
new file mode 100644
index 0000000000..a9eb1b2c7f
--- /dev/null
+++ b/modules/python/tests/mock_data/clustermesh-scale/report/mesh-fail/junit.xml
@@ -0,0 +1,8 @@
+
+
+
+ timeout waiting for deployments to become ready in cluster mesh-fail
+
+
+
+
diff --git a/modules/python/tests/test_clustermesh_scale.py b/modules/python/tests/test_clustermesh_scale.py
new file mode 100644
index 0000000000..0b9dd7510e
--- /dev/null
+++ b/modules/python/tests/test_clustermesh_scale.py
@@ -0,0 +1,410 @@
+"""Unit tests for the clustermesh-scale CL2 harness.
+
+Target module: modules/python/clusterloader2/clustermesh-scale/scale.py.
+Mirrors tests/test_network_scale.py — the module is loaded via importlib because
+the ``clustermesh-scale`` directory contains a hyphen and is not a valid Python
+package name.
+
+The key invariant under test is multi-cluster attribution: when collect_clusterloader2
+is called once per cluster (as the pipeline's collect.yml does), the resulting JSONL
+rows must each carry distinct cluster identity while sharing run-level fields. Without
+this, downstream Kusto queries cannot group/filter by cluster across the mesh.
+"""
+import importlib.util
+import json
+import os
+import sys
+import tempfile
+import unittest
+from pathlib import Path
+from unittest.mock import patch
+
+# Path to the harness under test, resolved relative to this file:
+# <parent of this file's directory>/clusterloader2/clustermesh-scale/scale.py.
+MODULE_PATH = (
+    Path(__file__).resolve().parents[1]
+    / "clusterloader2"
+    / "clustermesh-scale"
+    / "scale.py"
+)
+# Load the module straight from its file path under an importable alias; the
+# hyphenated directory name rules out a regular `import` statement.
+MODULE_SPEC = importlib.util.spec_from_file_location(
+    "clusterloader2_clustermesh_scale", MODULE_PATH
+)
+# Fail at import time with a clear message if no spec/loader could be built.
+if MODULE_SPEC is None or MODULE_SPEC.loader is None:
+    raise ImportError(f"Unable to load module from {MODULE_PATH}")
+clustermesh_scale_module = importlib.util.module_from_spec(MODULE_SPEC)
+MODULE_SPEC.loader.exec_module(clustermesh_scale_module)
+
+# Short aliases for the functions exercised by the tests below.
+configure_clusterloader2 = clustermesh_scale_module.configure_clusterloader2
+collect_clusterloader2 = clustermesh_scale_module.collect_clusterloader2
+main = clustermesh_scale_module.main
+
+# Root of the per-cluster mock CL2 report directories
+# (mock_data/clustermesh-scale/report next to this file).
+MOCK_REPORT_ROOT = os.path.join(
+    os.path.dirname(__file__), "mock_data", "clustermesh-scale", "report"
+)
+
+
+class TestConfigureClustermeshScale(unittest.TestCase):
+    """configure_clusterloader2 writes the CL2 overrides file the pipeline expects."""
+
+    def _render_overrides(self, **topology):
+        """Run configure_clusterloader2 against a throwaway file and return its text.
+
+        `topology` carries every keyword argument except `override_file`. The
+        temporary file is removed whether or not the call succeeds.
+        """
+        with tempfile.NamedTemporaryFile(
+            delete=False, mode="w+", encoding="utf-8"
+        ) as handle:
+            overrides_path = handle.name
+
+        try:
+            configure_clusterloader2(override_file=overrides_path, **topology)
+            with open(overrides_path, "r", encoding="utf-8") as written:
+                return written.read()
+        finally:
+            os.remove(overrides_path)
+
+    def test_overrides_file_contents(self):
+        """Every CL2_* knob the config template reads must appear in the overrides file."""
+        content = self._render_overrides(
+            namespaces=2,
+            deployments_per_namespace=3,
+            replicas_per_deployment=4,
+            operation_timeout="20m",
+        )
+
+        # Prometheus knobs: scrape Cilium agent/operator so measurement
+        # modules have data. The memory LIMIT is honored via overrides; the
+        # REQUEST is set via the --prometheus-memory-request CLI flag in
+        # execute_clusterloader2 (CL2_PROMETHEUS_MEMORY_REQUEST is not a real
+        # overrides key for this CL2 image). NODE_SELECTOR pins the Prometheus
+        # pod to the dedicated `prompool` node defined in azure-2.tfvars
+        # (label prometheus=true).
+        expected_prometheus = (
+            "CL2_PROMETHEUS_TOLERATE_MASTER: true",
+            "CL2_PROMETHEUS_MEMORY_LIMIT: 2Gi",
+            'CL2_PROMETHEUS_NODE_SELECTOR: "prometheus: \\"true\\""',
+            "CL2_PROMETHEUS_SCRAPE_CILIUM_AGENT: true",
+            "CL2_PROMETHEUS_SCRAPE_CILIUM_OPERATOR: true",
+            "CL2_POD_STARTUP_LATENCY_THRESHOLD: 3m",
+            "CL2_ENABLE_VIOLATIONS_FOR_API_CALL_PROMETHEUS_SIMPLE: false",
+        )
+        for line in expected_prometheus:
+            self.assertIn(line, content)
+
+        # Keys that must never be emitted.
+        forbidden_keys = (
+            "CL2_PROMETHEUS_MEMORY_REQUEST",
+            "CL2_PROMETHEUS_MEMORY_LIMIT_FACTOR",
+            "CL2_PROMETHEUS_MEMORY_SCALE_FACTOR",
+            "CL2_PROMETHEUS_CPU_SCALE_FACTOR",
+        )
+        for key in forbidden_keys:
+            self.assertNotIn(key, content)
+
+        # Topology knobs round-tripped from arguments.
+        expected_topology = (
+            "CL2_NAMESPACES: 2",
+            "CL2_DEPLOYMENTS_PER_NAMESPACE: 3",
+            "CL2_REPLICAS_PER_DEPLOYMENT: 4",
+            "CL2_OPERATION_TIMEOUT: 20m",
+        )
+        for line in expected_topology:
+            self.assertIn(line, content)
+
+    def test_overrides_file_timeout_passthrough(self):
+        """Caller-provided operation_timeout flows through unchanged (no clamping)."""
+        content = self._render_overrides(
+            namespaces=1,
+            deployments_per_namespace=1,
+            replicas_per_deployment=1,
+            operation_timeout="45m",
+        )
+        self.assertIn("CL2_OPERATION_TIMEOUT: 45m", content)
+
+
+class TestCollectSingleCluster(unittest.TestCase):
+    """collect_clusterloader2 emits one JSONL row per call, tagged with cluster identity."""
+
+    def _collect(self, *, cluster_name, cluster_count=2, mesh_size=2,
+                 test_type="unit-test", report_subdir="mesh-1"):
+        """Run collect_clusterloader2 on a mock report and return the result path.
+
+        The result file lives in a private temporary directory that is removed
+        through addCleanup, so callers need no try/finally of their own.
+        """
+        # tempfile.mktemp() is deprecated: it only returns a name, which
+        # another process can claim before the file is created. A private
+        # directory avoids that race and still lets the test prove that
+        # collect_clusterloader2 itself created the file.
+        scratch = tempfile.TemporaryDirectory()
+        self.addCleanup(scratch.cleanup)
+        result_file = os.path.join(scratch.name, "results.jsonl")
+        collect_clusterloader2(
+            cl2_report_dir=os.path.join(MOCK_REPORT_ROOT, report_subdir),
+            cloud_info=json.dumps({"cloud": "azure", "region": "eastus2"}),
+            run_id="test-run-123",
+            run_url="http://example.com/run123",
+            result_file=result_file,
+            test_type=test_type,
+            start_timestamp="2026-04-28T15:00:00Z",
+            cluster_name=cluster_name,
+            cluster_count=cluster_count,
+            mesh_size=mesh_size,
+            namespaces=2,
+            deployments_per_namespace=3,
+            replicas_per_deployment=4,
+            trigger_reason="Manual",
+        )
+        return result_file
+
+    @staticmethod
+    def _first_row(result_file):
+        """Parse and return the first JSONL row of `result_file`."""
+        with open(result_file, "r", encoding="utf-8") as f:
+            return json.loads(f.read().strip().split("\n")[0])
+
+    def test_collect_creates_result_file(self):
+        """collect_clusterloader2 writes a non-empty JSONL with run-level fields."""
+        result_file = self._collect(cluster_name="mesh-1")
+        self.assertTrue(os.path.exists(result_file))
+        with open(result_file, "r", encoding="utf-8") as f:
+            content = f.read()
+        self.assertGreater(len(content), 0)
+        lines = content.strip().split("\n")
+        self.assertGreaterEqual(len(lines), 1)
+        row = json.loads(lines[0])
+        self.assertEqual(row["status"], "success")
+        self.assertEqual(row["run_id"], "test-run-123")
+        self.assertEqual(row["test_type"], "unit-test")
+        self.assertEqual(row["start_timestamp"], "2026-04-28T15:00:00Z")
+
+    def test_collect_attributes_cluster_identity(self):
+        """Cluster identity is propagated to BOTH top-level and test_details, per Kusto schema."""
+        row = self._first_row(self._collect(cluster_name="mesh-1", cluster_count=2))
+        self.assertEqual(row["cluster"], "mesh-1")
+        self.assertEqual(row["cluster_count"], 2)
+        self.assertEqual(row["test_details"]["cluster"], "mesh-1")
+        self.assertEqual(row["test_details"]["cluster_count"], 2)
+
+    def test_collect_computes_pods_per_cluster(self):
+        """pods_per_cluster = namespaces * deployments * replicas (2 * 3 * 4 = 24)."""
+        row = self._first_row(self._collect(cluster_name="mesh-1"))
+        self.assertEqual(row["test_details"]["pods_per_cluster"], 24)
+        self.assertEqual(row["namespaces"], 2)
+        self.assertEqual(row["deployments_per_namespace"], 3)
+        self.assertEqual(row["replicas_per_deployment"], 4)
+
+    def test_collect_emits_mesh_size_independent_of_cluster_count(self):
+        """mesh_size (configured target) and cluster_count (observed) must be distinct fields.
+
+        Querying ``mesh_size != cluster_count`` in Kusto is how we surface
+        partial-mesh runs — a Fleet member that failed to join would manifest
+        as a smaller observed cluster_count than the configured mesh_size.
+        Both fields must be present at top level AND in test_details.
+        """
+        row = self._first_row(
+            self._collect(cluster_name="mesh-1", cluster_count=4, mesh_size=5)
+        )
+        self.assertEqual(row["mesh_size"], 5)
+        self.assertEqual(row["cluster_count"], 4)
+        self.assertEqual(row["test_details"]["mesh_size"], 5)
+        self.assertEqual(row["test_details"]["cluster_count"], 4)
+        self.assertNotEqual(row["mesh_size"], row["cluster_count"])
+
+    def test_collect_propagates_test_type(self):
+        """test_type tags every JSONL row so Kusto can filter scenario flavors.
+
+        Scale-scenario #1 (event-throughput) and the default-config Phase-1
+        smoke run share one results table; downstream dashboards filter on
+        ``test_type == 'event-throughput'`` to scope the scaling-curve view
+        to the right workload. Regression-guards that the field flows through
+        unmodified.
+        """
+        row = self._first_row(
+            self._collect(cluster_name="mesh-1", test_type="event-throughput")
+        )
+        self.assertEqual(row["test_type"], "event-throughput")
+
+
+class TestCollectMultiCluster(unittest.TestCase):
+    """The multi-cluster aggregation invariant — the reason this scenario exists.
+
+    collect.yml calls scale.py once per cluster and concatenates per-cluster JSONL
+    files into a single TEST_RESULTS_FILE. The resulting stream MUST have:
+      * one logical row per cluster
+      * each row's `cluster` field distinct
+      * `cluster_count` consistent across rows
+      * `run_id` consistent across rows (same pipeline run)
+    Without this, downstream Kusto cannot group/filter by cluster.
+    """
+
+    def _collect(self, *, cluster_name, report_subdir):
+        """Collect one cluster's mock report; return the per-cluster JSONL path.
+
+        Each call gets its own temporary directory, removed through addCleanup.
+        """
+        # tempfile.mktemp() is deprecated and race-prone (it returns a name
+        # without creating anything), so write into a private directory.
+        scratch = tempfile.TemporaryDirectory()
+        self.addCleanup(scratch.cleanup)
+        result_file = os.path.join(scratch.name, f"{cluster_name}.jsonl")
+        collect_clusterloader2(
+            cl2_report_dir=os.path.join(MOCK_REPORT_ROOT, report_subdir),
+            cloud_info=json.dumps({"cloud": "azure"}),
+            run_id="multi-cluster-run",
+            run_url="http://example.com/multi",
+            result_file=result_file,
+            test_type="unit-test",
+            start_timestamp="2026-04-28T15:00:00Z",
+            cluster_name=cluster_name,
+            cluster_count=2,
+            mesh_size=2,
+            namespaces=1,
+            deployments_per_namespace=1,
+            replicas_per_deployment=1,
+            trigger_reason="",
+        )
+        return result_file
+
+    def test_two_clusters_aggregate_with_distinct_attribution(self):
+        """Aggregating per-cluster JSONLs yields rows with distinct cluster identity."""
+        f1 = self._collect(cluster_name="mesh-1", report_subdir="mesh-1")
+        f2 = self._collect(cluster_name="mesh-2", report_subdir="mesh-2")
+
+        # Mirror what collect.yml does: cat per-cluster files into one stream.
+        aggregated = ""
+        for path in (f1, f2):
+            with open(path, "r", encoding="utf-8") as f:
+                aggregated += f.read()
+
+        rows = [json.loads(line) for line in aggregated.strip().split("\n") if line]
+        # Each per-cluster collect emits at least one row (overall testsuite line).
+        self.assertGreaterEqual(len(rows), 2)
+
+        clusters_seen = {row["cluster"] for row in rows}
+        self.assertEqual(clusters_seen, {"mesh-1", "mesh-2"})
+
+        # Run-level fields must be identical across all rows.
+        run_ids = {row["run_id"] for row in rows}
+        cluster_counts = {row["cluster_count"] for row in rows}
+        mesh_sizes = {row["mesh_size"] for row in rows}
+        self.assertEqual(run_ids, {"multi-cluster-run"})
+        self.assertEqual(cluster_counts, {2})
+        # mesh_size is a run-level constant — it must be identical across
+        # every per-cluster row in the aggregated stream.
+        self.assertEqual(mesh_sizes, {2})
+
+
+class TestCollectFailureStatus(unittest.TestCase):
+    """A junit.xml with failures>0 must produce status=failure (no silent green)."""
+
+    def test_failure_in_junit_propagates_to_status(self):
+        """A junit testsuite with failures>0 must surface as status=failure in the JSONL."""
+        # A private temporary directory replaces the deprecated, race-prone
+        # tempfile.mktemp(); it is removed when the `with` block exits.
+        with tempfile.TemporaryDirectory() as scratch:
+            result_file = os.path.join(scratch, "results.jsonl")
+            collect_clusterloader2(
+                cl2_report_dir=os.path.join(MOCK_REPORT_ROOT, "mesh-fail"),
+                cloud_info="",
+                run_id="fail-run",
+                run_url="",
+                result_file=result_file,
+                test_type="unit-test",
+                start_timestamp="2026-04-28T15:00:00Z",
+                cluster_name="mesh-fail",
+                cluster_count=2,
+                mesh_size=2,
+                namespaces=1,
+                deployments_per_namespace=1,
+                replicas_per_deployment=1,
+                trigger_reason="",
+            )
+            with open(result_file, "r", encoding="utf-8") as f:
+                row = json.loads(f.read().strip().split("\n")[0])
+
+        self.assertEqual(row["status"], "failure")
+        self.assertEqual(row["cluster"], "mesh-fail")
+        details = row["test_details"]["details"]
+        self.assertIsNotNone(details)
+        # The failure text from the mock junit.xml must reach the row.
+        self.assertIn("timeout", json.dumps(details).lower())
+
+
+class TestMainArgumentParsing(unittest.TestCase):
+    """main() dispatches subcommands to the right function with the right args."""
+
+    def _run_main(self, target, cli_args):
+        """Replace `target` on the harness module, run main(), return the mock.
+
+        `cli_args` is everything after the program name; sys.argv is patched
+        only for the duration of the main() call.
+        """
+        argv = ["clustermesh-scale/scale.py", *cli_args]
+        with patch.object(clustermesh_scale_module, target) as handler:
+            with patch.object(sys, "argv", argv):
+                main()
+        return handler
+
+    def test_configure_command_parsing(self):
+        """`configure` subcommand wires CLI args through to configure_clusterloader2."""
+        mock_configure = self._run_main("configure_clusterloader2", [
+            "configure",
+            "--namespaces", "2",
+            "--deployments-per-namespace", "3",
+            "--replicas-per-deployment", "4",
+            "--operation-timeout", "20m",
+            "--cl2_override_file", "/tmp/overrides.yaml",
+        ])
+        mock_configure.assert_called_once_with(2, 3, 4, "20m", "/tmp/overrides.yaml")
+
+    def test_execute_command_parsing(self):
+        """`execute` subcommand wires CLI args through to execute_clusterloader2."""
+        mock_execute = self._run_main("execute_clusterloader2", [
+            "execute",
+            "--cl2-image", "ghcr.io/azure/clusterloader2:v20250513",
+            "--cl2-config-dir", "/path/to/config",
+            "--cl2-report-dir", "/path/to/report",
+            "--cl2-config-file", "config.yaml",
+            "--kubeconfig", "/path/to/kubeconfig",
+            "--provider", "aks",
+        ])
+        mock_execute.assert_called_once_with(
+            "ghcr.io/azure/clusterloader2:v20250513",
+            "/path/to/config",
+            "/path/to/report",
+            "config.yaml",
+            "/path/to/kubeconfig",
+            "aks",
+        )
+
+    def test_collect_command_parsing(self):
+        """`collect` subcommand wires CLI args through to collect_clusterloader2."""
+        mock_collect = self._run_main("collect_clusterloader2", [
+            "collect",
+            "--cl2_report_dir", "/path/to/report",
+            "--cloud_info", "{}",
+            "--run_id", "abc",
+            "--run_url", "http://example.com",
+            "--result_file", "/tmp/results.jsonl",
+            "--test_type", "default-config",
+            "--start_timestamp", "2026-04-28T15:00:00Z",
+            "--cluster-name", "mesh-1",
+            "--cluster-count", "2",
+            "--mesh-size", "2",
+            "--namespaces", "1",
+            "--deployments-per-namespace", "1",
+            "--replicas-per-deployment", "1",
+            "--trigger_reason", "Manual",
+        ])
+        # Integer options arrive parsed as int; the rest stay strings.
+        mock_collect.assert_called_once_with(
+            "/path/to/report",
+            "{}",
+            "abc",
+            "http://example.com",
+            "/tmp/results.jsonl",
+            "default-config",
+            "2026-04-28T15:00:00Z",
+            "mesh-1",
+            2,
+            2,
+            1,
+            1,
+            1,
+            "Manual",
+        )
+
+
+if __name__ == "__main__":
+ unittest.main()
diff --git a/modules/terraform/azure/aks-cli/main.tf b/modules/terraform/azure/aks-cli/main.tf
index 47395fcab6..687ca04e5b 100644
--- a/modules/terraform/azure/aks-cli/main.tf
+++ b/modules/terraform/azure/aks-cli/main.tf
@@ -53,6 +53,12 @@ locals {
try(var.subnets_map[var.aks_cli_config.subnet_name], null)
)
+ pod_subnet_id = (
+ try(var.aks_cli_config.pod_subnet_name, null) == null ?
+ null :
+ try(var.subnets_map[var.aks_cli_config.pod_subnet_name], null)
+ )
+
api_server_subnet_id = (
var.aks_cli_config.api_server_subnet_name == null ?
null :
@@ -118,6 +124,14 @@ locals {
)
)
+ pod_subnet_id_parameter = (local.pod_subnet_id == null ?
+ "" :
+ format(
+ "%s %s",
+ "--pod-subnet-id", local.pod_subnet_id,
+ )
+ )
+
managed_identity_parameter = (var.aks_cli_config.managed_identity_name == null ?
"--enable-managed-identity" :
format(
@@ -193,6 +207,7 @@ locals {
local.kms_parameters,
local.disk_encryption_parameters,
local.subnet_id_parameter,
+ local.pod_subnet_id_parameter,
local.managed_identity_parameter,
local.kubelet_identity_parameter,
local.api_server_vnet_integration_parameter,
diff --git a/modules/terraform/azure/fleet/main.tf b/modules/terraform/azure/fleet/main.tf
new file mode 100644
index 0000000000..559050996e
--- /dev/null
+++ b/modules/terraform/azure/fleet/main.tf
@@ -0,0 +1,336 @@
+# =============================================================================
+# Fleet + ClusterMesh Profile submodule
+#
+# Mirrors Steps 4-6 of fleet-setup-script.sh:
+# Step 4: az fleet create
+# Step 5: az fleet member create --labels mesh=true (per cluster)
+# Step 6: az fleet clustermeshprofile create --selector mesh=true
+# az fleet clustermeshprofile apply
+#
+# Design decisions:
+# - Fleet resource: azapi_resource. There is no stable azurerm resource that
+# covers managed Fleet with the shape we need, and the clustermeshprofile
+# lives under the same ARM parent, so keeping Fleet in azapi keeps the
+# parent_id references simple.
+# - Fleet members: terraform_data + local-exec wrapping
+# `az fleet member create --labels`. Member labels (needed by the
+# clustermeshprofile selector) are first-class in the Fleet ARM API but
+# the azapi resource body shape is currently rejected for this field;
+# az CLI is the supported surface today.
+# - ClusterMeshProfile create/apply: terraform_data + local-exec, wrapping
+# `az fleet clustermeshprofile create` and `apply`. The ARM resource type
+# is still private-preview — az CLI (v2.0.4+ private .whl) is currently
+# the only path. Create and destroy commands are stored inside
+# terraform_data.input so the destroy-time provisioner can reference
+# self.input. (destroy-time provisioners can't read vars/locals).
+# Same pattern as modules/terraform/azure/aks-cli/main.tf:271-318.
+# =============================================================================
+
+locals {
+  # Single switch gating every resource in this module.
+  fleet_enabled = var.fleet_enabled
+
+  # Member objects indexed by their Fleet-side member name.
+  members_by_name = { for member in var.members : member.member_name => member }
+
+  # AKS resource IDs, keyed by member name, assembled from known inputs because
+  # aks-cli does not emit outputs. The depends_on chain on the fleet module
+  # instance ensures AKS exists before the member create call references
+  # these IDs.
+  aks_resource_id = {
+    for member in var.members :
+    member.member_name => "/subscriptions/${var.subscription_id}/resourceGroups/${var.resource_group_name}/providers/Microsoft.ContainerService/managedClusters/${member.aks_name}"
+  }
+}
+
+# -----------------------------------------------------------------------------
+# Step 4: Fleet resource
+# -----------------------------------------------------------------------------
+resource "azapi_resource" "fleet" {
+  # Zero instances unless the module is enabled.
+  count = local.fleet_enabled ? 1 : 0
+
+  type     = "Microsoft.ContainerService/fleets@2025-03-01"
+  name     = var.fleet_name
+  location = var.location
+  tags     = var.tags
+
+  # The Fleet is created directly under the resource group.
+  parent_id = format(
+    "/subscriptions/%s/resourceGroups/%s",
+    var.subscription_id,
+    var.resource_group_name,
+  )
+
+  # No Fleet properties are set: the request body carries an empty
+  # `properties` object.
+  body = {
+    properties = {}
+  }
+}
+
+# -----------------------------------------------------------------------------
+# Step 5: Fleet members (one per AKS cluster), labeled for the mesh selector.
+#
+# Implemented via local-exec for two reasons:
+# 1. Mirrors the source script exactly (`az fleet member create --labels mesh=true`).
+# 2. The Fleet member ARM API rejects azapi-style bodies for the `labels` field;
+# az CLI is the supported surface for this resource shape today.
+#
+# Same pattern as the clustermeshprofile below: command stored in
+# terraform_data.input so destroy-time provisioner can reference self.input.*.
+# -----------------------------------------------------------------------------
+locals {
+  # Flags shared by every `az fleet member` call: which subscription, resource
+  # group and fleet the member lives in.
+  member_scope_args = [
+    "--subscription", var.subscription_id,
+    "--resource-group", var.resource_group_name,
+    "--fleet-name", var.fleet_name,
+  ]
+
+  # Per-member create command; attaches the selector label at creation.
+  member_create_command = {
+    for member in var.members : member.member_name => join(" ", concat(
+      ["az fleet member create"],
+      local.member_scope_args,
+      [
+        "--name", member.member_name,
+        "--member-cluster-id", local.aks_resource_id[member.member_name],
+        "--labels", "${var.member_label_key}=${var.member_label_value}",
+        "--output", "none",
+      ],
+    ))
+  }
+
+  # Per-member delete command (non-interactive via --yes).
+  member_destroy_command = {
+    for member in var.members : member.member_name => join(" ", concat(
+      ["az fleet member delete"],
+      local.member_scope_args,
+      [
+        "--name", member.member_name,
+        "--yes",
+        "--output", "none",
+      ],
+    ))
+  }
+
+  # Per-member relabel command used during destroy so the clustermeshprofile's
+  # `${member_label_key}=${member_label_value}` selector stops matching. This
+  # breaks the Fleet API's chicken-and-egg between `member delete` (rejected
+  # with MemberBelongsToClusterMesh while attached) and `clustermeshprofile
+  # delete` (rejected with CannotDeleteClusterMeshProfileWithMembers while
+  # members exist). The value `detaching` is intentionally non-matching;
+  # `az fleet member update --labels` REPLACES the labels map (it is not
+  # additive), so this also drops the original mesh=true label.
+  member_relabel_command = {
+    for member in var.members : member.member_name => join(" ", concat(
+      ["az fleet member update"],
+      local.member_scope_args,
+      [
+        "--name", member.member_name,
+        "--labels", "${var.member_label_key}=detaching",
+        "--output", "none",
+      ],
+    ))
+  }
+}
+
+# One instance per entry in local.members_by_name when the module is enabled
+# (none otherwise). Creation runs the member's `az fleet member create`
+# command with retries; destruction runs its `az fleet member delete` command.
+resource "terraform_data" "member" {
+ for_each = local.fleet_enabled ? local.members_by_name : {}
+
+ depends_on = [azapi_resource.fleet]
+
+ # Both commands are stored in `input` so the provisioners below can read
+ # them through self.input.
+ input = {
+ create_command = local.member_create_command[each.value.member_name]
+ destroy_command = local.member_destroy_command[each.value.member_name]
+ }
+
+ # Bash retry loop. The Fleet RP can lag behind the AKS RP by 30-60s after
+ # a fresh AKS create; without retry, `az fleet member create` returns
+ # DependentResourceNotFound. Additionally, the AKS cluster can be in
+ # `Updating` state for several minutes after the Network Contributor role
+ # assignment on the VNet (granted in modules/terraform/azure/main.tf for the
+ # clustermesh-apiserver internal LB) — `az fleet member create` rejects
+ # with `ManagedClusterNotInExpectedState` until reconciliation finishes.
+ # 60 x 20s = 20 min covers slow Azure days; the happy path exits on the
+ # first attempt (~5s).
+ #
+ # The loop retries on any non-zero exit of the command, not only on the two
+ # errors named above. `$${delay}` is Terraform's escape for a literal
+ # `${delay}`, so bash (not Terraform) expands it.
+ provisioner "local-exec" {
+ interpreter = ["bash", "-c"]
+ command = <<-EOT
+ set -euo pipefail
+ cmd='${self.input.create_command}'
+ max=60
+ delay=20
+ for i in $(seq 1 $max); do
+ echo "[$i/$max] $cmd"
+ if eval "$cmd"; then
+ exit 0
+ fi
+ if [ "$i" -lt "$max" ]; then
+ echo "Fleet RP not ready yet, retrying in $${delay}s..."
+ sleep "$delay"
+ fi
+ done
+ echo "az fleet member create failed after $max attempts" >&2
+ exit 1
+ EOT
+ }
+
+ # Best-effort delete: `|| true` makes the provisioner succeed even when the
+ # delete command fails, so a failed member delete never blocks the destroy.
+ provisioner "local-exec" {
+ when = destroy
+ interpreter = ["bash", "-c"]
+ command = "${self.input.destroy_command} || true"
+ }
+}
+
+# -----------------------------------------------------------------------------
+# Step 6: ClusterMesh profile (create + apply) via local-exec.
+#
+# Both the create and the destroy commands are stored inside
+# terraform_data.input so the destroy provisioner can reference self.input.*
+# (destroy-time provisioners cannot reference var.* or local.*).
+#
+# Destroy ordering: this resource depends on every fleet member, so on destroy
+# Terraform tears down the profile BEFORE the members (and before the AKS
+# clusters downstream). That matches the source-of-truth teardown: detach the
+# mesh before the clusters disappear, else extension reconciliation hangs.
+# -----------------------------------------------------------------------------
+locals {
+  # Flags shared by every `az fleet clustermeshprofile` call below: they
+  # identify the subscription, resource group, fleet and profile.
+  cmp_scope_args = [
+    "--subscription", var.subscription_id,
+    "--resource-group", var.resource_group_name,
+    "--fleet-name", var.fleet_name,
+    "--name", var.cmp_name,
+  ]
+
+  # Each command falls back to a shell no-op ("true" / "echo 0") when the
+  # module is disabled.
+
+  # Creates the profile with a selector matching the member label.
+  cmp_create_command = local.fleet_enabled ? join(" ", concat(
+    ["az fleet clustermeshprofile create"],
+    local.cmp_scope_args,
+    [
+      "--selector", "${var.member_label_key}=${var.member_label_value}",
+      "--output", "none",
+    ],
+  )) : "true"
+
+  # Applies (reconciles) the profile.
+  cmp_apply_command = local.fleet_enabled ? join(" ", concat(
+    ["az fleet clustermeshprofile apply"],
+    local.cmp_scope_args,
+    ["--output", "none"],
+  )) : "true"
+
+  # Deletes the profile without prompting.
+  cmp_destroy_command = local.fleet_enabled ? join(" ", concat(
+    ["az fleet clustermeshprofile delete"],
+    local.cmp_scope_args,
+    [
+      "--yes",
+      "--output", "none",
+    ],
+  )) : "true"
+
+  # Prints the count of fleet members CURRENTLY APPLIED to the profile (i.e.
+  # in the profile's reconciled member set, not just selector-matched). The
+  # destroy provisioner polls this to wait for relabel+apply to drain the set
+  # before attempting the profile delete.
+  cmp_list_applied_count_command = local.fleet_enabled ? join(" ", concat(
+    ["az fleet clustermeshprofile list-members"],
+    local.cmp_scope_args,
+    [
+      "--query", "'length(@)'",
+      "--output", "tsv",
+    ],
+  )) : "echo 0"
+}
+
+# Single instance (when the module is enabled) that owns the ClusterMesh
+# profile lifecycle: create + apply on creation, and the staged
+# relabel -> apply -> poll -> delete teardown on destroy.
+resource "terraform_data" "clustermeshprofile" {
+ count = local.fleet_enabled ? 1 : 0
+
+ depends_on = [
+ terraform_data.member,
+ ]
+
+ input = {
+ create_command = local.cmp_create_command
+ apply_command = local.cmp_apply_command
+ delete_command = local.cmp_destroy_command
+ # `list-members` (default mode) returns members APPLIED to the profile —
+ # the same set the profile-delete API checks. We poll its count to know
+ # when the relabel+apply reconcile has actually drained membership.
+ list_applied_count_command = local.cmp_list_applied_count_command
+ # Pre-built per-member `az fleet member update --labels` commands. Joined
+ # with newlines and embedded in self.input because destroy provisioners
+ # can only access self.input.* (not var.* / local.*).
+ member_relabel_commands = local.fleet_enabled ? join("\n", values(local.member_relabel_command)) : ""
+ }
+
+ # create + apply are two separate az calls. Use bash with `set -euo pipefail`
+ # so any failure aborts the chain.
+ provisioner "local-exec" {
+ interpreter = ["bash", "-c"]
+ command = "set -euo pipefail; ${self.input.create_command}; ${self.input.apply_command}"
+ }
+
+ # Destroy-time: Fleet's API has a chicken-and-egg between member-delete
+ # and clustermeshprofile-delete:
+ # - `az fleet member delete` rejects with `MemberBelongsToClusterMesh`
+ # while the member is still selected by any clustermeshprofile.
+ # - `az fleet clustermeshprofile delete` rejects with
+ # `CannotDeleteClusterMeshProfileWithMembers` while any member is
+ # still in the profile.
+ # The az fleet 2.0.4 extension exposes no first-class detach/remove-member
+ # command. The way out is to UPDATE each member's labels to a value that
+ # the profile selector no longer matches (the profile selects on
+ # `${var.member_label_key}=${var.member_label_value}` from create-time),
+ # then re-`apply` the profile so it reconciles to an empty member set,
+ # then delete the profile. After that the per-member destroy provisioner
+ # on terraform_data.member runs successfully (members are no longer
+ # attached to any profile).
+ #
+ # All steps are best-effort (`|| true` / `exit 0` at the end) so a
+ # partial-state teardown still progresses to RG cleanup.
+ #
+ # The script uses `set -uo pipefail` without `-e` on purpose: each step
+ # handles its own failure, and both exit paths return 0, so this provisioner
+ # never fails the destroy. Worst case it spends the full poll budget
+ # (120 x 5s) plus the delete retry budget (30 x 5s) before giving up.
+ provisioner "local-exec" {
+ when = destroy
+ interpreter = ["bash", "-c"]
+ command = <<-EOT
+ set -uo pipefail
+ # 1. Relabel every member off the profile's selector. After this, a
+ # subsequent `apply` will reconcile the profile's member set to empty.
+ printf '%s\n' "${self.input.member_relabel_commands}" | while IFS= read -r cmd; do
+ [ -n "$cmd" ] || continue
+ echo "[relabel-member] $cmd"
+ eval "$cmd" || true
+ done
+
+ # 2. Issue an apply to start the reconcile. apply is async on the Fleet
+ # RP — `az fleet clustermeshprofile apply` returns when the LRO is
+ # accepted, but membership reconciliation (including draining the old
+ # applied set) can lag behind by several minutes.
+ echo "[apply-profile] ${self.input.apply_command}"
+ eval "${self.input.apply_command}" || true
+
+ # 3. Poll the profile's APPLIED member count until it reaches 0. Re-issue
+ # `apply` periodically as a nudge in case the first one was a no-op
+ # (e.g. Fleet RP hadn't yet observed the relabeled members).
+ # Budget: 120 x 5s = 10 min.
+ drained=false
+ for i in $(seq 1 120); do
+ count=$(eval "${self.input.list_applied_count_command}" 2>/dev/null | tr -d '[:space:]')
+ echo "[poll-members] attempt $i/120: applied count='$count'"
+ if [ "$count" = "0" ]; then
+ drained=true
+ break
+ fi
+ # Re-apply every minute (every 12 polls) to push Fleet RP if the
+ # initial apply didn't pick up the relabel.
+ if [ "$i" -gt 1 ] && [ $((i % 12)) -eq 0 ]; then
+ echo "[apply-profile] (nudge) ${self.input.apply_command}"
+ eval "${self.input.apply_command}" || true
+ fi
+ sleep 5
+ done
+ if [ "$drained" != "true" ]; then
+ echo "[poll-members] timed out waiting for applied set to drain; will still attempt delete"
+ fi
+
+ # 4. Delete the profile. Brief retry as a backstop in case there's still
+ # propagation lag between list-members showing 0 and delete being allowed.
+ echo "[delete-profile] ${self.input.delete_command}"
+ for i in $(seq 1 30); do
+ if eval "${self.input.delete_command}"; then
+ echo "[delete-profile] succeeded on attempt $i"
+ exit 0
+ fi
+ if [ "$i" -lt 30 ]; then
+ echo "[delete-profile] retry $i/30 in 5s"
+ sleep 5
+ fi
+ done
+ echo "[delete-profile] gave up after 30 attempts; downstream cleanup will proceed"
+ exit 0
+ EOT
+ }
+}
diff --git a/modules/terraform/azure/fleet/outputs.tf b/modules/terraform/azure/fleet/outputs.tf
new file mode 100644
index 0000000000..04c5ff508e
--- /dev/null
+++ b/modules/terraform/azure/fleet/outputs.tf
@@ -0,0 +1,14 @@
+# All three outputs collapse to an empty value when the module is disabled.
+
+output "fleet_name" {
+  value       = !var.fleet_enabled ? "" : var.fleet_name
+  description = "Name of the Fleet resource (empty when fleet_enabled=false)."
+}
+
+output "cmp_name" {
+  value       = !var.fleet_enabled ? "" : var.cmp_name
+  description = "Name of the ClusterMesh profile (empty when fleet_enabled=false)."
+}
+
+output "member_names" {
+  value       = !var.fleet_enabled ? [] : [for member in var.members : member.member_name]
+  description = "List of fleet member names created."
+}
diff --git a/modules/terraform/azure/fleet/variables.tf b/modules/terraform/azure/fleet/variables.tf
new file mode 100644
index 0000000000..ee4820e779
--- /dev/null
+++ b/modules/terraform/azure/fleet/variables.tf
@@ -0,0 +1,57 @@
+# Inputs of the fleet submodule. resource_group_name, location,
+# subscription_id, fleet_name and cmp_name have no default and must always be
+# supplied; fleet_enabled, the member label key/value, members and tags are
+# optional.
+variable "fleet_enabled" {
+ description = "Whether to create the Fleet, members, and clustermeshprofile."
+ type = bool
+ default = false
+}
+
+variable "resource_group_name" {
+ description = "Resource group that contains the Fleet and the member AKS clusters."
+ type = string
+}
+
+variable "location" {
+ description = "Azure region for the Fleet resource."
+ type = string
+}
+
+variable "subscription_id" {
+ description = "Azure subscription GUID (used to construct AKS resource IDs and CLI calls)."
+ type = string
+}
+
+variable "fleet_name" {
+ description = "Name of the Azure Fleet Manager resource."
+ type = string
+}
+
+variable "cmp_name" {
+ description = "Name of the Fleet ClusterMesh Profile."
+ type = string
+}
+
+# member_label_key and member_label_value are combined as `key=value`, both
+# for the label put on each member and for the profile's selector.
+variable "member_label_key" {
+ description = "Label key set on fleet members and used as the clustermeshprofile selector."
+ type = string
+ default = "mesh"
+}
+
+variable "member_label_value" {
+ description = "Label value set on fleet members and used as the clustermeshprofile selector."
+ type = string
+ default = "true"
+}
+
+# member_name is used as a map key in main.tf, so it must be unique across
+# the list.
+variable "members" {
+ description = "List of fleet members. aks_name identifies the AKS cluster in the same resource group; member_name is the Fleet-side name (intentionally may differ from aks_name)."
+ type = list(object({
+ member_name = string
+ aks_name = string
+ }))
+ default = []
+}
+
+variable "tags" {
+ description = "Tags applied to the Fleet resource."
+ type = map(string)
+ default = {}
+}
diff --git a/modules/terraform/azure/fleet/versions.tf b/modules/terraform/azure/fleet/versions.tf
new file mode 100644
index 0000000000..71a8e66c18
--- /dev/null
+++ b/modules/terraform/azure/fleet/versions.tf
@@ -0,0 +1,9 @@
+# Toolchain requirements of the fleet submodule.
+terraform {
+ required_version = ">=1.5.6"
+ required_providers {
+ # Only azapi is declared: the module's terraform_data resources come from
+ # Terraform's built-in provider and need no entry here. The azapi version
+ # is an exact pin, not a range.
+ azapi = {
+ source = "Azure/azapi"
+ version = "2.8.0"
+ }
+ }
+}
diff --git a/modules/terraform/azure/main.tf b/modules/terraform/azure/main.tf
index ea48654f41..2d04ad1bf4 100644
--- a/modules/terraform/azure/main.tf
+++ b/modules/terraform/azure/main.tf
@@ -320,3 +320,94 @@ module "virtual_machine" {
# Ensure AKS cluster is created before VM tries to look it up for RBAC
depends_on = [module.aks, module.aks-cli, module.azapi]
}
+
+# =============================================================================
+# ClusterMesh add-ons (vnet-peering + fleet + clustermeshprofile).
+#
+# Both are no-ops unless explicitly enabled in their *_config variable. Used
+# today only by the clustermesh-scale scenario.
+# =============================================================================
+
+data "azurerm_client_config" "current" {}
+
+# Pairwise VNet peering; does nothing unless vnet_peering_config.enabled is
+# true (a missing value is treated as false).
+module "vnet_peering" {
+ source = "./vnet-peering"
+
+ peering_enabled = try(var.vnet_peering_config.enabled, false)
+ resource_group_name = local.run_id
+ # Both maps are keyed by network role and cover every entry of
+ # local.network_config_map: IDs come from the virtual_network module
+ # instances, names from the network config itself.
+ vnet_role_to_id = { for role in keys(local.network_config_map) : role => module.virtual_network[role].vnet_id }
+ vnet_role_to_name = { for role, nw in local.network_config_map : role => nw.vnet_name }
+
+ depends_on = [module.virtual_network]
+}
+
+# -----------------------------------------------------------------------------
+# Network Contributor on each member's VNet for the AKS control-plane identity.
+#
+# Required so AKS cloud-controller-manager can provision the
+# clustermesh-apiserver internal LoadBalancer Service. `az aks create`
+# auto-grants the cluster identity Network Contributor on the *node subnet*,
+# but LB provisioning on that subnet additionally needs VNet-level read.
+# Without this grant the Service stays at EXTERNAL-IP=<pending>, the
+# `cilium clustermesh status` CLI fails with "unable to derive service IPs
+# automatically", and the per-agent `cilium-clustermesh` secret is never
+# populated → cilium-dbg reports "ClusterMesh: 0/0 remote clusters ready".
+#
+# Mirrors fleet-setup-script.sh Step 3 (the reference manual setup script).
+# Gated on fleet_config.enabled so non-clustermesh scenarios are unaffected.
+# -----------------------------------------------------------------------------
+locals {
+  # AKS roles of the clusters that join the mesh, as an identity map
+  # (role => role) suitable for for_each. Empty unless fleet_config.enabled
+  # is true.
+  clustermesh_member_roles = (
+    try(var.fleet_config.enabled, false)
+    ? { for member in try(var.fleet_config.members, []) : member.aks_role => member.aks_role }
+    : {}
+  )
+}
+
+# Looks up, for every mesh member role, the AKS cluster whose name is
+# configured for that role in local.aks_cli_config_map, inside the run's
+# resource group.
+data "azurerm_kubernetes_cluster" "clustermesh_member" {
+ for_each = local.clustermesh_member_roles
+
+ name = local.aks_cli_config_map[each.key].aks_name
+ resource_group_name = local.run_id
+
+ # aks-cli creates the cluster via local-exec; depends_on defers the data
+ # read until apply time when the cluster actually exists.
+ depends_on = [module.aks-cli]
+}
+
+# Grants Network Contributor on each mesh member's VNet to the identity
+# reported by the matching AKS cluster lookup.
+resource "azurerm_role_assignment" "clustermesh_vnet_contributor" {
+ for_each = local.clustermesh_member_roles
+
+ # each.key is the member's AKS role and is used here to index
+ # module.virtual_network, so every mesh member's AKS role must also exist
+ # as a network role.
+ scope = module.virtual_network[each.key].vnet_id
+ role_definition_name = "Network Contributor"
+ # NOTE(review): reads the first `identity` block of the cluster. Presumably
+ # this is the system-assigned control-plane identity — confirm it is still
+ # the right principal if a member is created with a user-assigned identity.
+ principal_id = data.azurerm_kubernetes_cluster.clustermesh_member[each.key].identity[0].principal_id
+}
+
+# Fleet + members + ClusterMesh profile. Every fleet_config attribute is read
+# through try() with a fallback, so a missing attribute falls back to
+# disabled / empty names / the mesh=true label / no members.
+module "fleet" {
+ source = "./fleet"
+
+ fleet_enabled = try(var.fleet_config.enabled, false)
+ resource_group_name = local.run_id
+ location = local.region
+ subscription_id = data.azurerm_client_config.current.subscription_id
+ fleet_name = try(var.fleet_config.fleet_name, "")
+ cmp_name = try(var.fleet_config.cmp_name, "")
+ member_label_key = try(var.fleet_config.member_label_key, "mesh")
+ member_label_value = try(var.fleet_config.member_label_value, "true")
+ # Translate each configured member's aks_role into the concrete AKS cluster
+ # name from local.aks_cli_config_map; the submodule works with names only.
+ members = [
+ for m in try(var.fleet_config.members, []) : {
+ member_name = m.member_name
+ aks_name = local.aks_cli_config_map[m.aks_role].aks_name
+ }
+ ]
+ tags = local.tags
+
+ # AKS clusters must exist before we join them as fleet members and apply the
+ # mesh profile. Peering must exist too — apply reaches the mesh-apiserver LB
+ # endpoints cross-cluster, which requires peering (separate-VNet mode).
+ # Network Contributor on each VNet must exist before clustermeshprofile apply
+ # so cloud-controller-manager can provision the apiserver internal LB.
+ depends_on = [
+ module.aks-cli,
+ module.vnet_peering,
+ azurerm_role_assignment.clustermesh_vnet_contributor,
+ ]
+}
diff --git a/modules/terraform/azure/variables.tf b/modules/terraform/azure/variables.tf
index 0c57fc6869..deb028690d 100644
--- a/modules/terraform/azure/variables.tf
+++ b/modules/terraform/azure/variables.tf
@@ -472,6 +472,7 @@ variable "aks_cli_config_list" {
managed_identity_name = optional(string, null)
subnet_name = optional(string, null)
+ pod_subnet_name = optional(string, null)
kubernetes_version = optional(string, null)
aks_custom_headers = optional(list(string), [])
use_custom_configurations = optional(bool, false)
@@ -586,3 +587,32 @@ variable "disk_encryption_set_config_list" {
}
}
+
+# =============================================================================
+# ClusterMesh additions (optional; used by the clustermesh-scale scenario).
+# Both default to disabled so existing scenarios are unaffected.
+# =============================================================================
+
+# Toggle for pairwise VNet peering (implemented in ./vnet-peering). The
+# description states the peering key format exactly as the module builds it:
+# "src_role->dst_role".
+variable "vnet_peering_config" {
+  description = "Pairwise VNet peering across all VNets in network_config_list. Keys are stable src_role->dst_role so adding a cluster does not churn existing peerings."
+  type = object({
+    enabled = optional(bool, false)
+  })
+  default = {}
+}
+
+# Fleet + ClusterMesh profile settings. Disabled by default. When enabled,
+# fleet_name and cmp_name are required, which the validation block enforces at
+# plan time rather than letting the run fail later during apply.
+variable "fleet_config" {
+  description = "Azure Fleet + ClusterMesh profile. When enabled, provisions a Fleet resource, one member per entry in members (labeled member_label_key=member_label_value), and creates+applies a clustermeshprofile via local-exec against the private-preview az fleet CLI (see modules/terraform/azure/fleet/)."
+  type = object({
+    enabled            = optional(bool, false)
+    fleet_name         = optional(string, "")
+    cmp_name           = optional(string, "")
+    member_label_key   = optional(string, "mesh")
+    member_label_value = optional(string, "true")
+    members = optional(list(object({
+      member_name = string
+      aks_role    = string
+    })), [])
+  })
+  default = {}
+
+  # The name defaults are empty strings, which are not usable resource names.
+  # Reject "enabled without names" up front.
+  validation {
+    condition = (
+      !try(var.fleet_config.enabled, false) ||
+      (try(var.fleet_config.fleet_name, "") != "" && try(var.fleet_config.cmp_name, "") != "")
+    )
+    error_message = "fleet_config.fleet_name and fleet_config.cmp_name must be non-empty when fleet_config.enabled is true."
+  }
+}
diff --git a/modules/terraform/azure/vnet-peering/main.tf b/modules/terraform/azure/vnet-peering/main.tf
new file mode 100644
index 0000000000..20ffa88fbf
--- /dev/null
+++ b/modules/terraform/azure/vnet-peering/main.tf
@@ -0,0 +1,40 @@
+# =============================================================================
+# VNet peering submodule — pairwise mesh
+#
+# Mirrors Step 3b in fleet-setup-script.sh (SHARED_VNET=false mode): the
+# equivalent of running `az network vnet peering create` in both directions,
+# i.e. one peering for every ordered pair (src, dst) with src != dst, over the VNets in var.vnet_role_to_id.
+#
+# for_each keys are the stable string "${src_role}->${dst_role}", so adding a
+# new cluster role does NOT churn peerings that already exist between other pairs.
+# =============================================================================
+
+locals {
+  # Role names of every VNet taking part in the mesh.
+  peering_roles = keys(var.vnet_role_to_id)
+
+  # One entry per ordered pair of distinct roles, keyed "src->dst". Each entry
+  # carries both roles, both VNet IDs and the source VNet's name. Empty when
+  # peering is disabled.
+  peering_pairs = var.peering_enabled ? {
+    for combo in setproduct(local.peering_roles, local.peering_roles) :
+    "${combo[0]}->${combo[1]}" => {
+      key      = "${combo[0]}->${combo[1]}"
+      src_role = combo[0]
+      dst_role = combo[1]
+      src_id   = var.vnet_role_to_id[combo[0]]
+      dst_id   = var.vnet_role_to_id[combo[1]]
+      src_name = var.vnet_role_to_name[combo[0]]
+    }
+    if combo[0] != combo[1]
+  } : {}
+}
+
+# One peering per entry of local.peering_pairs, created on the source VNet and
+# pointing at the destination VNet's ID.
+resource "azurerm_virtual_network_peering" "peering" {
+ for_each = local.peering_pairs
+
+ # The name combines the source VNet *name* with the destination *role*.
+ name = "${each.value.src_name}-to-${each.value.dst_role}"
+ resource_group_name = var.resource_group_name
+ virtual_network_name = each.value.src_name
+ remote_virtual_network_id = each.value.dst_id
+ # Only plain VNet-to-VNet access is enabled; forwarded traffic and gateway
+ # sharing are switched off.
+ allow_virtual_network_access = true
+ allow_forwarded_traffic = false
+ allow_gateway_transit = false
+ use_remote_gateways = false
+}
diff --git a/modules/terraform/azure/vnet-peering/outputs.tf b/modules/terraform/azure/vnet-peering/outputs.tf
new file mode 100644
index 0000000000..d8f9d9f69e
--- /dev/null
+++ b/modules/terraform/azure/vnet-peering/outputs.tf
@@ -0,0 +1,4 @@
+# Exposes the for_each keys of the peering resource; the list is empty when no
+# peerings were created.
+output "peering_keys" {
+  value       = keys(azurerm_virtual_network_peering.peering)
+  description = "List of peering keys (src_role->dst_role) that were created."
+}
diff --git a/modules/terraform/azure/vnet-peering/variables.tf b/modules/terraform/azure/vnet-peering/variables.tf
new file mode 100644
index 0000000000..7aabadcf7b
--- /dev/null
+++ b/modules/terraform/azure/vnet-peering/variables.tf
@@ -0,0 +1,22 @@
+# Inputs of the vnet-peering submodule. Only resource_group_name is required.
+variable "peering_enabled" {
+ description = "Whether to create pairwise VNet peerings between all VNets in vnet_role_to_id."
+ type = bool
+ default = false
+}
+
+variable "vnet_role_to_id" {
+ description = "Map of network role => VNet resource ID. Every pair (a, b) with a != b gets two peerings (a->b and b->a)."
+ type = map(string)
+ default = {}
+}
+
+# main.tf looks up the source role of every peering pair in this map, so once
+# two or more roles are peered it must contain every key of vnet_role_to_id.
+variable "vnet_role_to_name" {
+ description = "Map of network role => VNet name. Used to name the peering resource on the source VNet."
+ type = map(string)
+ default = {}
+}
+
+variable "resource_group_name" {
+ description = "Resource group containing all VNets."
+ type = string
+}
diff --git a/pipelines/perf-eval/Network Benchmark/clustermesh-scale.yml b/pipelines/perf-eval/Network Benchmark/clustermesh-scale.yml
new file mode 100644
index 0000000000..caaedc0ea0
--- /dev/null
+++ b/pipelines/perf-eval/Network Benchmark/clustermesh-scale.yml
@@ -0,0 +1,69 @@
+# Scheduled perf-eval pipeline for the clustermesh-scale scenario. It has no CI
+# trigger and runs one stage (azure / eastus2euap) through the shared
+# competitive-test job template.
+trigger: none
+
+pool: AKS-Telescope-Airlock
+
+schedules:
+ # Azure Pipelines evaluates cron schedules in UTC, so this is Sunday 04:00 UTC.
+ - cron: "0 4 * * 0"
+ displayName: Weekly Sunday 4am clustermesh scale test
+ branches:
+ include:
+ - main
+ # With always: false the scheduled run is skipped when nothing changed on
+ # the branch since the last successful scheduled run.
+ always: false
+
+variables:
+ SCENARIO_TYPE: perf-eval
+ SCENARIO_NAME: clustermesh-scale
+ OWNER: aks
+
+stages:
+ - stage: azure_eastus2euap
+ dependsOn: []
+ jobs:
+ - template: /jobs/competitive-test.yml
+ parameters:
+ cloud: azure
+ regions:
+ - eastus2euap
+ engine: clusterloader2
+ engine_input:
+ image: "ghcr.io/azure/clusterloader2:v20250513"
+ install: false
+ operation_timeout: 15m
+ topology: clustermesh-scale
+ terraform_input_file_mapping:
+ - eastus2euap: "scenarios/perf-eval/clustermesh-scale/terraform-inputs/azure-2.tfvars"
+ # Both matrix entries run against a 2-cluster mesh (cluster_count 2,
+ # mesh_size 2). They differ in the CL2 config file, test_type, workload
+ # size, durations, restart count and API call rate.
+ matrix:
+ n2:
+ cluster_count: 2
+ mesh_size: 2
+ cl2_config_file: config.yaml
+ test_type: default-config
+ namespaces: 1
+ deployments_per_namespace: 2
+ replicas_per_deployment: 2
+ hold_duration: 30s
+ warmup_duration: 10s
+ restart_count: 0
+ api_server_calls_per_second: 5
+ trigger_reason: ${{ variables['Build.Reason'] }}
+ n2_event_throughput:
+ cluster_count: 2
+ mesh_size: 2
+ cl2_config_file: event-throughput.yaml
+ test_type: event-throughput
+ namespaces: 5
+ deployments_per_namespace: 4
+ replicas_per_deployment: 10
+ hold_duration: 2m
+ warmup_duration: 30s
+ restart_count: 1
+ api_server_calls_per_second: 20
+ trigger_reason: ${{ variables['Build.Reason'] }}
+ # max_parallel: 1 limits the matrix to one entry at a time.
+ max_parallel: 1
+ timeout_in_minutes: 120
+ credential_type: service_connection
+ ssh_key_enabled: false
+ # Iteration-only: skip uploading results to the telescope blob while
+ # we're still stabilizing the clustermesh-scale pipeline. Flip to
+ # false (or remove) once results are meaningful.
+ skip_publish: true
diff --git a/pipelines/system/new-pipeline-test.yml b/pipelines/system/new-pipeline-test.yml
index 63d55f02d9..38ea068658 100644
--- a/pipelines/system/new-pipeline-test.yml
+++ b/pipelines/system/new-pipeline-test.yml
@@ -1,25 +1,66 @@
trigger: none
+pool: AKS-Telescope-Airlock
+
+# Intentionally no `schedules:` block. This file is the dev/test pipeline and
+# is queued manually (trigger: none). The weekly cron for this scenario lives
+# in pipelines/perf-eval/Network Benchmark/clustermesh-scale.yml; repeating
+# the same cron here would start a second full Fleet/AKS run against main
+# every Sunday.
+
variables:
- SCENARIO_TYPE:
- SCENARIO_NAME:
+ SCENARIO_TYPE: perf-eval
+ SCENARIO_NAME: clustermesh-scale
+ OWNER: aks
stages:
- - stage: # format: [_]+ (e.g. azure_eastus2, aws_eastus_westus)
+ - stage: azure_eastus2euap
dependsOn: []
jobs:
- - template: /jobs/competitive-test.yml # must keep as is
+ - template: /jobs/competitive-test.yml
parameters:
- cloud: # e.g. azure, aws
- regions: # list of regions
- - region1 # e.g. eastus2
- topology: # e.g. cluster-autoscaler
- engine: # e.g. clusterloader2
- matrix: # list of test parameters to customize the provisioned resources
- :
- :
- :
- max_parallel: # required
- credential_type: service_connection # required
+ cloud: azure
+ regions:
+ - eastus2euap
+ engine: clusterloader2
+ engine_input:
+ image: "ghcr.io/azure/clusterloader2:v20250513"
+ install: false
+ operation_timeout: 15m
+ topology: clustermesh-scale
+ terraform_input_file_mapping:
+ - eastus2euap: "scenarios/perf-eval/clustermesh-scale/terraform-inputs/azure-2.tfvars"
+ matrix:
+ # Mirror pipelines/perf-eval/Network Benchmark/clustermesh-scale.yml
+ # so dev runs use the same matrix-var plumbing as production.
+ # Auto-exported as uppercase env vars (NAMESPACES, MESH_SIZE, etc.)
+ # by AzDO and consumed in steps/engine/clusterloader2/clustermesh-scale/execute.yml.
+ #
+ # Production clustermesh-scale.yml also has an `n2` trivial-vertical-slice
+ # entry. We don't run it in dev — n2_event_throughput already exercises
+ # the full plumbing and per-run cost (full Fleet/AKS lifecycle ~15-20 min)
+ # makes a second axis expensive during iteration.
+ n2_event_throughput:
+ cluster_count: 2
+ mesh_size: 2
+ cl2_config_file: event-throughput.yaml
+ test_type: event-throughput
+ namespaces: 5
+ deployments_per_namespace: 4
+ replicas_per_deployment: 10
+ hold_duration: 2m
+ warmup_duration: 30s
+ restart_count: 1
+ api_server_calls_per_second: 20
+ trigger_reason: ${{ variables['Build.Reason'] }}
+ max_parallel: 1
+ timeout_in_minutes: 120
+ credential_type: service_connection
ssh_key_enabled: false
- timeout_in_minutes: 60 # if not specified, default is 60
+ # Iteration-only: skip uploading results to the telescope blob while
+ # we're still stabilizing the clustermesh-scale pipeline. Mirrors the
+ # same flag in pipelines/perf-eval/Network Benchmark/clustermesh-scale.yml.
+ # Flip to false (or remove) once results are meaningful.
+ skip_publish: true
diff --git a/scenarios/perf-eval/clustermesh-scale/terraform-inputs/azure-2.tfvars b/scenarios/perf-eval/clustermesh-scale/terraform-inputs/azure-2.tfvars
new file mode 100644
index 0000000000..535bdba5a7
--- /dev/null
+++ b/scenarios/perf-eval/clustermesh-scale/terraform-inputs/azure-2.tfvars
@@ -0,0 +1,179 @@
+scenario_type = "perf-eval"
+scenario_name = "clustermesh-scale"
+deletion_delay = "4h"
+owner = "aks"
+
+# =============================================================================
+# ClusterMesh Scale Test — 2 cluster tier
+#
+# Mirrors fleet-setup-script.sh with SHARED_VNET=false (separate VNets + peering).
+# - 2 VNets (one per cluster) at 10.N.0.0/16, where N is the cluster index (1, 2)
+# - Per-cluster node subnet (10.N.0.0/24, 254 IPs) + pod subnet (10.N.4.0/22, 1022 IPs)
+# - 2 AKS clusters with Cilium + ACNS, Azure CNI w/ pod subnet (not overlay)
+# - Pairwise VNet peering between the two VNets (both directions)
+# - Fleet + 2 fleet members (label mesh=true) + clustermeshprofile
+#
+# Pod subnet sizing: /22 (1022 IPs) is the floor for any Phase 2 scenario in
+# this tier. Math: ~70 baseline pods (kube-system + AKS add-ons across 2 nodes)
+# + 200 workload pods (event-throughput n2 tier: 5 ns x 4 dep x 10 replicas)
+# = ~270 pods/cluster, plus headroom for future churn-stress / HA scenarios
+# without re-touching the network plan. /24 (254 IPs) was insufficient.
+# Larger tiers (n5/n10/n20 in Phase 3) will get their own tfvars files with
+# subnets sized for their cluster + pod counts.
+#
+# Naming:
+# VNet role : mesh-1, mesh-2 (one VNet per role)
+# AKS role : mesh-1, mesh-2 (one AKS per role)
+# AKS cluster name : clustermesh-1, clustermesh-2
+# Fleet member name : mesh-1, mesh-2 (intentionally != cluster name)
+# Fleet name : clustermesh-flt
+# Profile name : clustermesh-cmp
+# =============================================================================
+
+network_config_list = [
+ {
+ role = "mesh-1"
+ vnet_name = "clustermesh-1-vnet"
+ vnet_address_space = "10.1.0.0/16"
+ subnet = [
+ {
+ name = "clustermesh-1-node"
+ address_prefix = "10.1.0.0/24"
+ },
+ {
+ name = "clustermesh-1-pod"
+ address_prefix = "10.1.4.0/22"
+ }
+ ]
+ network_security_group_name = ""
+ nic_public_ip_associations = []
+ nsr_rules = []
+ },
+ {
+ role = "mesh-2"
+ vnet_name = "clustermesh-2-vnet"
+ vnet_address_space = "10.2.0.0/16"
+ subnet = [
+ {
+ name = "clustermesh-2-node"
+ address_prefix = "10.2.0.0/24"
+ },
+ {
+ name = "clustermesh-2-pod"
+ address_prefix = "10.2.4.0/22"
+ }
+ ]
+ network_security_group_name = ""
+ nic_public_ip_associations = []
+ nsr_rules = []
+ }
+]
+
+aks_cli_config_list = [
+ {
+ role = "mesh-1"
+ aks_name = "clustermesh-1"
+ sku_tier = "Standard"
+ subnet_name = "clustermesh-1-node"
+ pod_subnet_name = "clustermesh-1-pod"
+ use_aks_preview_cli_extension = true
+
+ optional_parameters = [
+ { name = "generate-ssh-keys", value = "" },
+ { name = "network-plugin", value = "azure" },
+ { name = "network-dataplane", value = "cilium" },
+ { name = "enable-acns", value = "" },
+ # AKS default is 30 pods/node. Phase-2 event-throughput workload runs
+ # 5ns x 4dep x 10 replicas = 200 pods per cluster; with 2 default-pool
+ # nodes that's 100/node, so we need ≥110 to leave headroom for Cilium
+ # agent, ACNS daemons, monitoring stack, and kube-system pods. Azure
+ # CNI with pod subnet supports up to 250.
+ { name = "max-pods", value = "110" },
+ ]
+
+ # Default pool sizing: D4s_v5 (4 vCPU / 16GB) is enough for the workload
+ # pods alone. Prometheus is pinned to prompool below — without that
+ # split, Prometheus's 1Gi+ memory request co-tenanting on default-pool
+ # nodes caused per-node CPU overcommit (~160% allocatable) and left
+ # workload pods stuck Pending.
+ default_node_pool = {
+ name = "default"
+ node_count = 2
+ auto_scaling_enabled = false
+ vm_size = "Standard_D4s_v5"
+ }
+ # Dedicated Prometheus node, labeled `prometheus=true`. CL2 is
+ # configured (in modules/python/clusterloader2/clustermesh-scale/scale.py
+ # via CL2_PROMETHEUS_NODE_SELECTOR) to schedule the prometheus-k8s pod
+ # only on this label, so it doesn't compete with workload pods. Mirrors
+ # the `prompool` pattern from
+ # scenarios/perf-eval/cnl-azurecni-overlay-cilium/terraform-inputs/azure.tfvars.
+ # D8s_v3 (8 vCPU / 32GB) is sized for our 1Gi-request Prometheus with
+ # ample headroom — much smaller than #1053's D32s_v5 because our
+ # workload spec is also much smaller.
+ extra_node_pool = [
+ {
+ name = "prompool"
+ node_count = 1
+ auto_scaling_enabled = false
+ vm_size = "Standard_D8s_v3"
+ optional_parameters = [
+ { name = "labels", value = "prometheus=true" },
+ ]
+ },
+ ]
+ },
+ {
+ role = "mesh-2"
+ aks_name = "clustermesh-2"
+ sku_tier = "Standard"
+ subnet_name = "clustermesh-2-node"
+ pod_subnet_name = "clustermesh-2-pod"
+ use_aks_preview_cli_extension = true
+
+ optional_parameters = [
+ { name = "generate-ssh-keys", value = "" },
+ { name = "network-plugin", value = "azure" },
+ { name = "network-dataplane", value = "cilium" },
+ { name = "enable-acns", value = "" },
+ { name = "max-pods", value = "110" },
+ ]
+
+ default_node_pool = {
+ name = "default"
+ node_count = 2
+ auto_scaling_enabled = false
+ vm_size = "Standard_D4s_v5"
+ }
+ extra_node_pool = [
+ {
+ name = "prompool"
+ node_count = 1
+ auto_scaling_enabled = false
+ vm_size = "Standard_D8s_v3"
+ optional_parameters = [
+ { name = "labels", value = "prometheus=true" },
+ ]
+ },
+ ]
+ }
+]
+
+# =============================================================================
+# Fleet + ClusterMesh (new vars in this scenario)
+# =============================================================================
+vnet_peering_config = {
+ enabled = true
+}
+
+fleet_config = {
+ enabled = true
+ fleet_name = "clustermesh-flt"
+ cmp_name = "clustermesh-cmp"
+ member_label_key = "mesh"
+ member_label_value = "true"
+ members = [
+ { member_name = "mesh-1", aks_role = "mesh-1" },
+ { member_name = "mesh-2", aks_role = "mesh-2" }
+ ]
+}
diff --git a/scenarios/perf-eval/clustermesh-scale/terraform-test-inputs/azure-2.json b/scenarios/perf-eval/clustermesh-scale/terraform-test-inputs/azure-2.json
new file mode 100644
index 0000000000..b2a8243a56
--- /dev/null
+++ b/scenarios/perf-eval/clustermesh-scale/terraform-test-inputs/azure-2.json
@@ -0,0 +1,4 @@
+{
+ "run_id": "cmesh2test",
+ "region": "westus2"
+}
diff --git a/scenarios/perf-eval/clustermesh-scale/vendor/fleet-2.0.4-py3-none-any.whl b/scenarios/perf-eval/clustermesh-scale/vendor/fleet-2.0.4-py3-none-any.whl
new file mode 100644
index 0000000000..68bf9f5746
Binary files /dev/null and b/scenarios/perf-eval/clustermesh-scale/vendor/fleet-2.0.4-py3-none-any.whl differ
diff --git a/steps/engine/clusterloader2/clustermesh-scale/collect.yml b/steps/engine/clusterloader2/clustermesh-scale/collect.yml
new file mode 100644
index 0000000000..6a879a2c58
--- /dev/null
+++ b/steps/engine/clusterloader2/clustermesh-scale/collect.yml
@@ -0,0 +1,88 @@
+parameters:
+ - name: cloud
+ type: string
+ default: ""
+ - name: engine_input
+ type: object
+ default: {}
+ - name: region
+ type: string
+
+steps:
+ - template: /steps/cloud/${{ parameters.cloud }}/collect-cloud-info.yml
+ parameters:
+ region: ${{ parameters.region }}
+
+ - script: |
+ set -eo pipefail
+ set -x
+
+ # Re-export matrix vars under CL2_*/MESH_SIZE/TEST_TYPE names that scale.py
+ # collect expects. Same workaround as execute.yml — matrix-var `$()`
+ # macros don't expand reliably in `env:` blocks.
+ export CL2_NAMESPACES="$NAMESPACES"
+ export CL2_DEPLOYMENTS_PER_NAMESPACE="$DEPLOYMENTS_PER_NAMESPACE"
+ export CL2_REPLICAS_PER_DEPLOYMENT="$REPLICAS_PER_DEPLOYMENT"
+ export MESH_SIZE="${MESH_SIZE:-$CLUSTERMESH_COUNT}"
+ export TEST_TYPE="${TEST_TYPE:-default-config}"
+ export TRIGGER_REASON="${TRIGGER_REASON:-$BUILD_REASON}"
+
+ clusters=$(cat "$HOME/.kube/clustermesh-clusters.json")
+ cluster_count=$(echo "$clusters" | jq 'length')
+
+ # Aggregate every per-cluster JSONL into a single TEST_RESULTS_FILE.
+ # Each cluster's rows are tagged via `--cluster-name "$role"` so downstream
+ # Kusto queries can group/filter by cluster across the mesh.
+ mkdir -p "$(dirname "$TEST_RESULTS_FILE")"
+ : > "$TEST_RESULTS_FILE"
+ aggregated=0  # clusters whose rows were actually appended (skips excluded)
+ for row in $(echo "$clusters" | jq -c '.[]'); do
+ role=$(echo "$row" | jq -r '.role')
+ report_dir="${CL2_REPORT_DIR}/${role}"
+
+ if [ ! -d "$report_dir" ]; then
+ echo "##vso[task.logissue type=warning;] $role: missing report dir $report_dir, skipping"
+ continue
+ fi
+
+ # If CL2 errored out before producing junit.xml (e.g. prometheus stack
+ # setup timeout), skip aggregation for this cluster — scale.py collect
+ # would crash on the missing file. The execute step already logged a
+ # warning per-cluster; we don't want to also abort the whole pipeline
+ # at collect time when partial data may be useful.
+ if [ ! -f "$report_dir/junit.xml" ]; then
+ echo "##vso[task.logissue type=warning;] $role: $report_dir/junit.xml not found (CL2 likely failed); skipping collect for this cluster"
+ continue
+ fi
+
+ per_cluster_result="${TEST_RESULTS_FILE%.*}.${role}.${TEST_RESULTS_FILE##*.}"
+
+ PYTHONPATH=$PYTHONPATH:$(pwd) python3 $PYTHON_SCRIPT_FILE collect \
+ --cl2_report_dir "$report_dir" \
+ --cloud_info "${CLOUD_INFO:-}" \
+ --run_id "$RUN_ID" \
+ --run_url "$RUN_URL" \
+ --result_file "$per_cluster_result" \
+ --start_timestamp "$START_TIME" \
+ --cluster-name "$role" \
+ --cluster-count "$cluster_count" \
+ --mesh-size "$MESH_SIZE" \
+ --test_type "$TEST_TYPE" \
+ --namespaces "$CL2_NAMESPACES" \
+ --deployments-per-namespace "$CL2_DEPLOYMENTS_PER_NAMESPACE" \
+ --replicas-per-deployment "$CL2_REPLICAS_PER_DEPLOYMENT" \
+ --trigger_reason "${TRIGGER_REASON:-}"
+ cat "$per_cluster_result" >> "$TEST_RESULTS_FILE"
+ aggregated=$((aggregated + 1))
+ done
+
+ echo "Aggregated results from $aggregated of $cluster_count clusters into $TEST_RESULTS_FILE"
+ wc -l "$TEST_RESULTS_FILE" || true
+ workingDirectory: modules/python
+ env:
+ CLOUD: ${{ parameters.cloud }}
+ RUN_URL: $(RUN_URL)
+ PYTHON_SCRIPT_FILE: $(Pipeline.Workspace)/s/modules/python/clusterloader2/clustermesh-scale/scale.py
+ CL2_REPORT_DIR: $(Pipeline.Workspace)/s/modules/python/clusterloader2/clustermesh-scale/results
+ BUILD_REASON: $(Build.Reason)
+ displayName: "Collect + aggregate results across clustermesh clusters"
diff --git a/steps/engine/clusterloader2/clustermesh-scale/execute.yml b/steps/engine/clusterloader2/clustermesh-scale/execute.yml
new file mode 100644
index 0000000000..cd82bc2d70
--- /dev/null
+++ b/steps/engine/clusterloader2/clustermesh-scale/execute.yml
@@ -0,0 +1,206 @@
+parameters:
+ - name: cloud
+ type: string
+ default: ""
+ - name: engine_input
+ type: object
+ default: {}
+ - name: region
+ type: string
+
+steps:
+ - script: |
+ echo "Set the start time for test execution"
+ startTimestamp=$(date -u +"%Y-%m-%dT%H:%M:%SZ")
+ echo "Start: $startTimestamp"
+ echo "##vso[task.setvariable variable=START_TIME]$startTimestamp"
+ displayName: set up timestamp variable
+
+ - script: |
+ set -eo pipefail
+ set -x
+
+ # Matrix variables (namespaces, mesh_size, deployments_per_namespace,
+ # replicas_per_deployment, hold_duration, warmup_duration, restart_count,
+ # api_server_calls_per_second, test_type) are auto-exported by AzDO to
+ # the script as UPPERCASE env vars (e.g. NAMESPACES, MESH_SIZE). Re-export
+ # them under the CL2_* names that scale.py and the CL2 yaml templates
+ # (config.yaml / event-throughput.yaml) consume.
+ #
+ # Why this re-export rather than `env: CL2_NAMESPACES: $(namespaces)` in
+ # the YAML: AzDO's `$()` runtime macro does not expand matrix variables
+ # in `env:` block values (see prior failed run with literal '$(namespaces)'
+ # reaching python). Same pattern as
+ # steps/engine/clusterloader2/network-scale/execute.yml which references
+ # the auto-exported names directly.
+ export CL2_NAMESPACES="$NAMESPACES"
+ export CL2_DEPLOYMENTS_PER_NAMESPACE="$DEPLOYMENTS_PER_NAMESPACE"
+ export CL2_REPLICAS_PER_DEPLOYMENT="$REPLICAS_PER_DEPLOYMENT"
+ export CL2_API_SERVER_CALLS_PER_SECOND="$API_SERVER_CALLS_PER_SECOND"
+ export CL2_HOLD_DURATION="$HOLD_DURATION"
+ export CL2_WARMUP_DURATION="$WARMUP_DURATION"
+ export CL2_RESTART_GENERATION="$RESTART_COUNT"
+
+ # Same discovery pattern as topology/clustermesh-scale/validate-resources.yml.
+ # We re-run it here rather than relying on a step variable so this engine
+ # file can be invoked independently.
+ clusters=$(az resource list \
+ --resource-type Microsoft.ContainerService/managedClusters \
+ --location "$REGION" \
+ --query "[?tags.run_id=='${RUN_ID}' && starts_with(tags.role, 'mesh-')].{name:name, rg:resourceGroup, role:tags.role}" \
+ -o json)
+
+ cluster_count=$(echo "$clusters" | jq 'length')
+ if [ "$cluster_count" -lt 2 ]; then
+ echo "##vso[task.logissue type=error;] Expected >=2 clustermesh clusters, found $cluster_count"
+ exit 1
+ fi
+
+ echo "Running CL2 across $cluster_count clusters"
+ mkdir -p "$HOME/.kube"
+ echo "$clusters" > "$HOME/.kube/clustermesh-clusters.json"
+ echo "##vso[task.setvariable variable=CLUSTERMESH_COUNT]$cluster_count"
+
+ # CL2 overrides are written once — params are identical for every cluster
+ # in this run (the per-cluster variation is which kubeconfig CL2 hits).
+ PYTHONPATH=$PYTHONPATH:$(pwd) python3 $PYTHON_SCRIPT_FILE configure \
+ --namespaces "$CL2_NAMESPACES" \
+ --deployments-per-namespace "$CL2_DEPLOYMENTS_PER_NAMESPACE" \
+ --replicas-per-deployment "$CL2_REPLICAS_PER_DEPLOYMENT" \
+ --operation-timeout "${CL2_OPERATION_TIMEOUT:-15m}" \
+ --cl2_override_file "${CL2_CONFIG_DIR}/overrides.yaml"
+
+ # Per-cluster CL2 fan-out — sequential. Each invocation writes its own
+ # report dir at ${CL2_REPORT_DIR}/${role}/, so collect.yml can iterate the
+ # same way and tag results with --cluster-name.
+ failures=0
+ for row in $(echo "$clusters" | jq -c '.[]'); do
+ name=$(echo "$row" | jq -r '.name')
+ rg=$(echo "$row" | jq -r '.rg')
+ role=$(echo "$row" | jq -r '.role')
+
+ echo "===================================================================="
+ echo " Running CL2 on $role ($name)"
+ echo "===================================================================="
+
+ kubeconfig="$HOME/.kube/$role.config"
+ KUBECONFIG="$kubeconfig" az aks get-credentials \
+ --resource-group "$rg" --name "$name" --overwrite-existing --only-show-errors
+
+ report_dir="${CL2_REPORT_DIR}/${role}"
+ mkdir -p "$report_dir"
+
+ cl2_passed=0
+ # Run CL2; collect outcome WITHOUT failing the bash script (so we can
+ # also inspect junit.xml for internal test failures even when CL2 exits
+ # 0). Treat as "passed" only if BOTH:
+ # (a) junit.xml exists (CL2 actually completed and wrote a report)
+ # (b) the first failures="N" and errors="N" attributes in junit.xml are both 0
+ # Without (b) we'd silently green-light runs where measurements failed
+ # — e.g. PodMonitor template substitution producing "", which
+ # k8s admission rejects but CL2 still writes junit with failure tags.
+ PYTHONPATH=$PYTHONPATH:$(pwd) python3 $PYTHON_SCRIPT_FILE execute \
+ --cl2-image "${CL2_IMAGE}" \
+ --cl2-config-dir "${CL2_CONFIG_DIR}" \
+ --cl2-report-dir "$report_dir" \
+ --cl2-config-file "${CL2_CONFIG_FILE}" \
+ --kubeconfig "$kubeconfig" \
+ --provider "${CLOUD}" \
+ || true
+ if [ -f "$report_dir/junit.xml" ]; then
+ # Read the first failures="N" / errors="N" attributes found in junit.xml.
+ junit_failures=$(grep -oE 'failures="[0-9]+"' "$report_dir/junit.xml" | head -1 | grep -oE '[0-9]+' || echo 0)
+ junit_errors=$(grep -oE 'errors="[0-9]+"' "$report_dir/junit.xml" | head -1 | grep -oE '[0-9]+' || echo 0)
+ junit_failures=${junit_failures:-0}
+ junit_errors=${junit_errors:-0}
+ if [ "$junit_failures" -eq 0 ] && [ "$junit_errors" -eq 0 ]; then
+ cl2_passed=1
+ else
+ echo "##vso[task.logissue type=warning;] $role: junit.xml reports failures=$junit_failures errors=$junit_errors"
+ fi
+ fi
+
+ if [ "$cl2_passed" -eq 1 ]; then
+ echo " $role: CL2 run succeeded"
+ fi
+
+ # Always-on log capture (spec line 35: "Logs: clustermesh-apiserver,
+ # agent watchers"). Files land in $report_dir/logs/ so they are
+ # uploaded alongside junit.xml + measurement results when the
+ # publish step runs. The same files double as immediate
+ # diagnostics for failed runs (see FAILURE DIAG block below).
+ log_dir="$report_dir/logs"
+ mkdir -p "$log_dir"
+ echo "------- $role: capturing pod logs to $log_dir -------"
+ # clustermesh-apiserver: all three containers (apiserver / etcd /
+ # kvstoremesh) — bounded tail, single pod expected.
+ for c in apiserver etcd kvstoremesh; do
+ KUBECONFIG="$kubeconfig" kubectl -n kube-system logs \
+ -l k8s-app=clustermesh-apiserver -c "$c" --tail=4000 \
+ > "$log_dir/clustermesh-apiserver-$c.log" 2>&1 || true
+ done
+ # cilium-agent: one pod per node — keep tail small to bound size.
+ KUBECONFIG="$kubeconfig" kubectl -n kube-system logs \
+ -l k8s-app=cilium --tail=1000 --prefix=true \
+ > "$log_dir/cilium-agent.log" 2>&1 || true
+ # cilium-operator: low-volume control plane.
+ KUBECONFIG="$kubeconfig" kubectl -n kube-system logs \
+ -l io.cilium/app=operator --tail=2000 --prefix=true \
+ > "$log_dir/cilium-operator.log" 2>&1 || true
+
+ if [ "$cl2_passed" -ne 1 ]; then
+ # Dump enough state to distinguish prometheus-stack scheduling
+ # failures from CL2 logic failures. Prometheus is the most common
+ # culprit here — its pod requests 10Gi by default, doesn't fit on
+ # Standard_D4s_v5. If the pod is Pending with FailedScheduling, the
+ # describe events make that obvious.
+ #
+ # Note: scale.py passes tear_down_prometheus=False so the stack
+ # survives this dump (otherwise CL2 would clean up before we look).
+ echo "------- $role: CL2 FAILURE DIAG -------"
+ echo "------- node allocatable / requested capacity -------"
+ KUBECONFIG="$kubeconfig" kubectl get nodes -o wide 2>&1 || true
+ KUBECONFIG="$kubeconfig" kubectl describe nodes 2>&1 | grep -A 4 "Allocatable\|Allocated resources" | head -40 || true
+
+ echo "------- monitoring/* pods -------"
+ KUBECONFIG="$kubeconfig" kubectl -n monitoring get pods -o wide 2>&1 || true
+
+ echo "------- monitoring statefulsets -------"
+ KUBECONFIG="$kubeconfig" kubectl -n monitoring get statefulset -o wide 2>&1 || true
+
+ echo "------- Prometheus CR (operator input) -------"
+ KUBECONFIG="$kubeconfig" kubectl -n monitoring get prometheus -o yaml 2>&1 | head -80 || true
+
+ echo "------- prometheus-k8s pod describe -------"
+ KUBECONFIG="$kubeconfig" kubectl -n monitoring describe pod -l app.kubernetes.io/name=prometheus 2>&1 | tail -60 || true
+
+ echo "------- prometheus-operator logs (tail 60) -------"
+ KUBECONFIG="$kubeconfig" kubectl -n monitoring logs -l app.kubernetes.io/name=prometheus-operator --tail=60 2>&1 || true
+
+ echo "------- monitoring namespace events (recent) -------"
+ KUBECONFIG="$kubeconfig" kubectl -n monitoring get events --sort-by='.lastTimestamp' 2>&1 | tail -30 || true
+ echo "------- end CL2 FAILURE DIAG -------"
+
+ echo "##vso[task.logissue type=warning;] $role: CL2 run failed (junit missing or has failures/errors at $report_dir/junit.xml; continuing other clusters)"
+ failures=$((failures + 1))
+ fi
+ done
+
+ if [ "$failures" -gt 0 ]; then
+ echo "##vso[task.logissue type=error;] CL2 failed on $failures cluster(s)"
+ exit 1
+ fi
+ workingDirectory: modules/python
+ env:
+ ${{ if eq(parameters.cloud, 'azure') }}:
+ CLOUD: aks
+ ${{ else }}:
+ CLOUD: ${{ parameters.cloud }}
+ REGION: ${{ parameters.region }}
+ PYTHON_SCRIPT_FILE: $(Pipeline.Workspace)/s/modules/python/clusterloader2/clustermesh-scale/scale.py
+ CL2_IMAGE: ${{ parameters.engine_input.image }}
+ CL2_CONFIG_DIR: $(Pipeline.Workspace)/s/modules/python/clusterloader2/clustermesh-scale/config
+ CL2_CONFIG_FILE: $(cl2_config_file)
+ CL2_REPORT_DIR: $(Pipeline.Workspace)/s/modules/python/clusterloader2/clustermesh-scale/results
+ CL2_OPERATION_TIMEOUT: ${{ parameters.engine_input.operation_timeout }}
+ displayName: "Run CL2 across all clustermesh clusters"
diff --git a/steps/setup-tests.yml b/steps/setup-tests.yml
index d790917dca..ed7840dc4c 100644
--- a/steps/setup-tests.yml
+++ b/steps/setup-tests.yml
@@ -72,6 +72,45 @@ steps:
region: ${{ parameters.region }}
credential_type: ${{ parameters.credential_type }}
+ - script: |
+ # Install the Azure Fleet preview CLI extension required by the
+ # clustermesh-scale scenario. The Fleet ClusterMeshProfile API surface
+ # is private-preview and only the bundled wheel exposes the
+ # `az fleet clustermeshprofile` and `az fleet member create --labels`
+ # commands invoked by terraform local-exec at provision time.
+ #
+ # The wheel is vendored in-repo at scenarios/perf-eval/clustermesh-scale/vendor/.
+ set -euo pipefail
+ whl="$(Pipeline.Workspace)/s/scenarios/perf-eval/$(SCENARIO_NAME)/vendor/fleet-2.0.4-py3-none-any.whl"
+ if [ ! -f "$whl" ]; then
+ echo "##vso[task.logissue type=error;] Vendored fleet wheel not found at $whl"
+ exit 1
+ fi
+ az extension remove --name fleet --only-show-errors 2>/dev/null || true
+ az extension add --source "$whl" --yes --only-show-errors
+ az fleet --help >/dev/null
+ az fleet clustermeshprofile --help >/dev/null
+ echo "Fleet preview CLI installed from $whl"
+ displayName: "Install Fleet preview CLI (clustermesh scenarios)"
+ condition: startsWith(variables['SCENARIO_NAME'], 'clustermesh')
+
+ - script: |
+ # Install cilium-cli on the runner: `cilium clustermesh status --context CONTEXT`
+ # reports per-remote-cluster connection state, endpoint counts, and version
+ # skew that in-pod `cilium-dbg status` doesn't expose (used by topology
+ # validate-resources.yml). The tarball is sha256-verified before install.
+ set -euo pipefail
+ CILIUM_CLI_VERSION=v0.16.20
+ CLI_ARCH=amd64
+ curl -sSL --fail --remote-name-all \
+ "https://github.com/cilium/cilium-cli/releases/download/${CILIUM_CLI_VERSION}/cilium-linux-${CLI_ARCH}.tar.gz"{,.sha256sum}
+ sha256sum --check "cilium-linux-${CLI_ARCH}.tar.gz.sha256sum"
+ sudo tar xzvfC "cilium-linux-${CLI_ARCH}.tar.gz" /usr/local/bin
+ rm "cilium-linux-${CLI_ARCH}.tar.gz"{,.sha256sum}
+ cilium version --client
+ displayName: "Install cilium-cli (clustermesh scenarios)"
+ condition: startsWith(variables['SCENARIO_NAME'], 'clustermesh')
+
- script: |
if [ -n "${TEST_MODULES_DIR}" ]; then
test_modules_directory=$(Pipeline.Workspace)/s/${TEST_MODULES_DIR}
diff --git a/steps/topology/clustermesh-scale/collect-clusterloader2.yml b/steps/topology/clustermesh-scale/collect-clusterloader2.yml
new file mode 100644
index 0000000000..29f6c86b38
--- /dev/null
+++ b/steps/topology/clustermesh-scale/collect-clusterloader2.yml
@@ -0,0 +1,18 @@
+parameters:
+ - name: cloud
+ type: string
+ default: ""
+ - name: engine_input
+ type: object
+ default: {}
+ - name: regions
+ type: object
+ default: {}
+
+steps:
+ - template: /steps/set-run-id.yml
+ - template: /steps/engine/clusterloader2/clustermesh-scale/collect.yml
+ parameters:
+ cloud: ${{ parameters.cloud }}
+ engine_input: ${{ parameters.engine_input }}
+ region: ${{ parameters.regions[0] }}
diff --git a/steps/topology/clustermesh-scale/execute-clusterloader2.yml b/steps/topology/clustermesh-scale/execute-clusterloader2.yml
new file mode 100644
index 0000000000..eb1f53f7a4
--- /dev/null
+++ b/steps/topology/clustermesh-scale/execute-clusterloader2.yml
@@ -0,0 +1,17 @@
+parameters:
+ - name: cloud
+ type: string
+ default: ""
+ - name: engine_input
+ type: object
+ default: {}
+ - name: regions
+ type: object
+ default: {}
+
+steps:
+ - template: /steps/engine/clusterloader2/clustermesh-scale/execute.yml
+ parameters:
+ cloud: ${{ parameters.cloud }}
+ engine_input: ${{ parameters.engine_input }}
+ region: ${{ parameters.regions[0] }}
diff --git a/steps/topology/clustermesh-scale/validate-resources.yml b/steps/topology/clustermesh-scale/validate-resources.yml
new file mode 100644
index 0000000000..bfd47a11c6
--- /dev/null
+++ b/steps/topology/clustermesh-scale/validate-resources.yml
@@ -0,0 +1,402 @@
+parameters:
+ - name: cloud
+ type: string
+ - name: engine
+ type: string
+ - name: regions
+ type: object
+
+steps:
+ # -----------------------------------------------------------------------------
+ # Per-cluster validation: enumerate every fleet member, fetch its kubeconfig,
+ # assert nodes are Ready, cilium agent is Running, and the cluster reports
+ # mesh state Connected to all (N-1) remote clusters.
+ #
+ # Cluster discovery uses the same tag-based pattern as
+ # /steps/cloud/azure/update-kubeconfig.yml — clusters are tagged
+ # role=mesh-N at terraform-apply time.
+ # -----------------------------------------------------------------------------
+ - script: |
+ set -euo pipefail
+ set -x
+
+ region=${{ parameters.regions[0] }}
+
+ # JSON list of {name, rg, role} for every clustermesh AKS cluster in this run.
+ clusters=$(az resource list \
+ --resource-type Microsoft.ContainerService/managedClusters \
+ --location "$region" \
+ --query "[?tags.run_id=='${RUN_ID}' && starts_with(tags.role, 'mesh-')].{name:name, rg:resourceGroup, role:tags.role}" \
+ -o json)
+
+ count=$(echo "$clusters" | jq 'length')
+ if [ "$count" -lt 2 ]; then
+ echo "##vso[task.logissue type=error;] Expected >=2 clustermesh AKS clusters tagged run_id=${RUN_ID}, found $count"
+ exit 1
+ fi
+
+ echo "Discovered $count clustermesh clusters:"
+ echo "$clusters" | jq -r '.[] | " \(.role): \(.name) in \(.rg)"'
+
+ mkdir -p "$HOME/.kube"
+ echo "$clusters" > "$HOME/.kube/clustermesh-clusters.json"
+
+ echo "##vso[task.setvariable variable=CLUSTERMESH_COUNT]$count"
+ displayName: "Enumerate clustermesh clusters"
+
+ - script: |
+ set -euo pipefail
+ set -x
+
+ clusters=$(cat "$HOME/.kube/clustermesh-clusters.json")
+ expected_remote=$(( $(echo "$clusters" | jq 'length') - 1 ))
+
+ failures=0
+ for row in $(echo "$clusters" | jq -c '.[]'); do
+ name=$(echo "$row" | jq -r '.name')
+ rg=$(echo "$row" | jq -r '.rg')
+ role=$(echo "$row" | jq -r '.role')
+
+ echo "===================================================================="
+ echo " Validating $role ($name)"
+ echo "===================================================================="
+
+ # Per-cluster kubeconfig file at $HOME/.kube/$role.config — keeps each
+ # cluster's auth state isolated so concurrent kubectl calls don't race.
+ kubeconfig="$HOME/.kube/$role.config"
+ KUBECONFIG="$kubeconfig" az aks get-credentials \
+ --resource-group "$rg" --name "$name" --overwrite-existing --only-show-errors
+
+ export KUBECONFIG="$kubeconfig"
+
+ echo "--- nodes ---"
+ kubectl get nodes -o wide
+ kubectl wait --for=condition=Ready nodes --all --timeout=5m
+
+ echo "--- cilium agent pods ---"
+ kubectl -n kube-system get pods -l k8s-app=cilium -o wide
+ kubectl -n kube-system rollout status ds/cilium --timeout=5m
+
+ echo "--- clustermesh-apiserver pod ---"
+ kubectl -n kube-system get pods -l k8s-app=clustermesh-apiserver -o wide || true
+
+ # Surface the apiserver/kvstoremesh container ports so we can confirm
+ # the PodMonitor scrape targets (expected: apiserver=9963, kvstoremesh=9964)
+ # without needing to drop into a pod. Informational only — does not gate.
+ echo "--- clustermesh-apiserver exposed ports ---"
+ kubectl -n kube-system get pod -l k8s-app=clustermesh-apiserver \
+ -o jsonpath='{range .items[*].spec.containers[*]}{.name}:{range .ports[*]}{.name}={.containerPort} {end}{"\n"}{end}' \
+ 2>/dev/null || true
+ echo
+
+ echo "--- cilium-dbg status (ClusterMesh section) ---"
+ # Mesh propagation can lag behind the return of
+ # `az fleet clustermeshprofile apply`, so the check below is retried.
+ # We use `cilium-dbg status` (in-pod debug binary) rather than the
+ # external `cilium clustermesh status` so we don't require cilium-cli
+ # on the agent. cilium-dbg status includes a "ClusterMesh:" block of
+ # the form:
+ # ClusterMesh: 2/2 remote clusters ready, 0 global-services
+ # mesh-2: ready, ...
+ # Retry up to ~10 minutes — the AKS-managed Cilium operator publishes
+ # the per-agent `cilium-clustermesh` Secret asynchronously after Fleet
+ # finishes profile apply, and the clustermesh-apiserver may be
+ # recreated mid-validation (cert/config rotation), bumping the wait
+ # another ~30s for agents to reload. Empirically 5 min was too tight
+ # for whichever cluster gets validated first; 10 min covers it with
+ # margin.
+ #
+ # Note: `cilium-dbg status` (in-pod, agent's local view) and
+ # `cilium clustermesh status` (CLI, queries clustermesh-apiserver) can
+ # disagree for several minutes during this window — the CLI flips to
+ # "configured/connected" first because it counts apiserver clients,
+ # while the in-pod view requires the Secret to be reloaded. We gate on
+ # the in-pod view because the data path needs the agent's local state.
+ connected=0
+ for i in $(seq 1 60); do
+ out=$(kubectl -n kube-system exec ds/cilium -- cilium-dbg status 2>&1 || true)
+ echo "$out"
+ # Parse the "ClusterMesh: N/M remote clusters ready" line; N is the ready count.
+ ready=$(echo "$out" | sed -nE 's/.*ClusterMesh:[[:space:]]+([0-9]+)\/[0-9]+ remote clusters ready.*/\1/p' | head -1)
+ ready=${ready:-0}
+ if [ "$ready" -ge "$expected_remote" ]; then
+ connected=1
+ break
+ fi
+
+ # ============== DEBUG-DUMP-BEGIN (REMOVE BEFORE MERGE) ==============
+ # Every 6 iterations dump richer state: runner-side cilium-cli view of the
+ # mesh, clustermesh-apiserver pod state, and Fleet-side member status.
+ # These help diagnose why convergence is stalling. Strip before final
+ # PR review.
+ if [ "$((i % 6))" -eq 0 ]; then
+ echo "------- [debug] retry $i: cilium clustermesh status (runner cli) -------"
+ cilium clustermesh status --context "$(kubectl config current-context)" --wait=false 2>&1 || true
+
+ echo "------- [debug] retry $i: clustermesh-apiserver pods -------"
+ kubectl -n kube-system get pods -l k8s-app=clustermesh-apiserver -o wide 2>&1 || true
+ kubectl -n kube-system describe pods -l k8s-app=clustermesh-apiserver 2>&1 | tail -40 || true
+
+ echo "------- [debug] retry $i: clustermesh-apiserver service -------"
+ # Service of type LoadBalancer for the clustermesh-apiserver. If
+ # EXTERNAL-IP stays pending, the AKS control-plane identity is
+ # missing Network Contributor on the VNet (cloud-controller-manager
+ # cannot provision the internal LB). Look in describe events for
+ # AuthorizationFailed / forbidden messages.
+ kubectl -n kube-system get svc clustermesh-apiserver -o wide 2>&1 || true
+ kubectl -n kube-system describe svc clustermesh-apiserver 2>&1 | tail -25 || true
+
+ echo "------- [debug] retry $i: cilium agent restarts / readiness -------"
+ kubectl -n kube-system get pods -l k8s-app=cilium -o wide 2>&1 || true
+
+ echo "------- [debug] retry $i: Fleet ClusterMeshProfile profile-level status -------"
+ # Profile-level mesh state (NotConnected/Connecting/Connected/Failed)
+ # plus the last operation error if any. This is the authoritative
+ # control-plane view of whether the mesh has converged.
+ az fleet clustermeshprofile show \
+ --resource-group "$rg" \
+ --fleet-name clustermesh-flt \
+ --name clustermesh-cmp \
+ --query "{state:properties.status.state, provisioningState:properties.provisioningState, lastError:properties.status.lastOperationError}" \
+ -o jsonc 2>&1 || true
+
+ echo "------- [debug] retry $i: Fleet ClusterMeshProfile members (connection state) -------"
+ # Per-member: provisioningState is just ARM-level (join accepted);
+ # meshProperties.status.state is the actual Cilium connection state.
+ az fleet clustermeshprofile list-members \
+ --resource-group "$rg" \
+ --fleet-name clustermesh-flt \
+ --name clustermesh-cmp \
+ --query "[].{name:name, provisioning:properties.provisioningState, mesh:properties.meshProperties.status.state, lastUpdated:properties.meshProperties.status.lastUpdatedAt, error:properties.meshProperties.status.error.message}" \
+ -o table 2>&1 || true
+ fi
+ # =============== DEBUG-DUMP-END (REMOVE BEFORE MERGE) ===============
+
+ echo " waiting for $expected_remote remote clusters to be ready (got $ready), retry $i/60..."
+ sleep 10
+ done
+
+ if [ "$connected" -ne 1 ]; then
+ echo "##vso[task.logissue type=error;] $role: clustermesh not Connected to $expected_remote remote clusters"
+ failures=$((failures + 1))
+ fi
+
+ echo "--- cilium clustermesh status (runner-side, richer diagnostics) ---"
+ # Best-effort, informational only — failures here don't fail the step
+ # because the in-pod check above is authoritative. cilium-cli reports
+ # per-remote connection state, endpoint counts, and version info.
+ cilium clustermesh status --context "$(kubectl config current-context)" --wait=false || true
+ done
+
+ if [ "$failures" -gt 0 ]; then
+ echo "##vso[task.logissue type=error;] $failures cluster(s) failed mesh validation"
+ exit 1
+ fi
+ displayName: "Validate Cilium + ClusterMesh on every cluster"
+
+ - script: |
+ set -euo pipefail
+ set -x
+
+ # Cross-cluster data-path smoke: deploy a `global` service backed by an
+ # echo pod in the first cluster, deploy a curl client in the second
+ # cluster, and curl the service by name. If global service load-balancing
+ # works, the request resolves cross-cluster via the mesh data path.
+ #
+ # Per plan.md Phase 1 exit criteria, we don't ship a "green" Phase 1 that
+ # only validated control plane.
+
+ clusters=$(cat "$HOME/.kube/clustermesh-clusters.json")
+ first_role=$(echo "$clusters" | jq -r '.[0].role')
+ second_role=$(echo "$clusters" | jq -r '.[1].role')
+
+ kc_first="$HOME/.kube/$first_role.config"
+ kc_second="$HOME/.kube/$second_role.config"
+
+ ns="cm-smoke"
+
+ # Best-effort teardown of the smoke namespace in both clusters. It runs on
+ # every exit path via the EXIT trap; deletions are fire-and-forget
+ # (--wait=false) and never fail the step.
+ cleanup() {
+   local target_kubeconfig
+   for target_kubeconfig in "$kc_first" "$kc_second"; do
+     KUBECONFIG="$target_kubeconfig" kubectl delete ns "$ns" --ignore-not-found --wait=false || true
+   done
+ }
+ trap cleanup EXIT
+
+ cat <<'EOF' > /tmp/cm-smoke-server.yaml
+ apiVersion: v1
+ kind: Namespace
+ metadata:
+ name: cm-smoke
+ annotations:
+ # AKS managed Cilium gates clustermesh sync at the *namespace* level
+ # by default (CFP-39876, "managed Cilium" change). Without this,
+ # neither pod identities, endpoints, nor services in this namespace
+ # are synced across clusters — even with service.cilium.io/global on
+ # the Service. This is the load-bearing annotation here; the
+ # service-level one below is kept for explicitness.
+ clustermesh.cilium.io/global: "true"
+ ---
+ apiVersion: apps/v1
+ kind: Deployment
+ metadata:
+ name: echo
+ namespace: cm-smoke
+ spec:
+ replicas: 1
+ selector:
+ matchLabels: { app: echo }
+ template:
+ metadata:
+ labels: { app: echo }
+ spec:
+ containers:
+ - name: echo
+ image: registry.k8s.io/e2e-test-images/agnhost:2.47
+ args: ["netexec", "--http-port=8080"]
+ ports: [{ containerPort: 8080 }]
+ ---
+ apiVersion: v1
+ kind: Service
+ metadata:
+ name: echo
+ namespace: cm-smoke
+ annotations:
+ # The namespace annotation above is what actually gates sync in AKS
+ # managed Cilium; this service-level annotation is kept for explicit
+ # intent and forward-compatibility.
+ service.cilium.io/global: "true"
+ spec:
+ selector: { app: echo }
+ ports:
+ - port: 80
+ targetPort: 8080
+ EOF
+
+ cat <<'EOF' > /tmp/cm-smoke-client.yaml
+ apiVersion: v1
+ kind: Namespace
+ metadata:
+ name: cm-smoke
+ annotations:
+ clustermesh.cilium.io/global: "true"
+ ---
+ # Cilium global services require the same Service name to exist in every
+ # participating cluster. The Service in cluster 2 has no local backends;
+ # cross-cluster lookup resolves to cluster 1's pods via the mesh.
+ apiVersion: v1
+ kind: Service
+ metadata:
+ name: echo
+ namespace: cm-smoke
+ annotations:
+ service.cilium.io/global: "true"
+ spec:
+ selector: { app: echo }
+ ports:
+ - port: 80
+ targetPort: 8080
+ ---
+ apiVersion: v1
+ kind: Pod
+ metadata:
+ name: curl
+ namespace: cm-smoke
+ labels: { app: curl }
+ spec:
+ restartPolicy: Never
+ containers:
+ - name: curl
+ image: curlimages/curl:8.10.1
+ command: ["sleep", "600"]
+ EOF
+
+ # Backend resources go to the first cluster, client resources to the second.
+ KUBECONFIG="$kc_first" kubectl apply -f /tmp/cm-smoke-server.yaml
+ KUBECONFIG="$kc_second" kubectl apply -f /tmp/cm-smoke-client.yaml
+
+ # Block (up to 3 minutes each) until the echo Deployment has rolled out
+ # and the curl pod reports Ready. Either timeout aborts the step because
+ # of `set -e`.
+ KUBECONFIG="$kc_first" kubectl -n "$ns" rollout status deploy/echo --timeout=3m
+ KUBECONFIG="$kc_second" kubectl -n "$ns" wait --for=condition=Ready pod/curl --timeout=3m
+
+ # Fixed settle time so clustermesh can propagate the new global Service
+ # from the first cluster to the second before the first probe. Sync is
+ # expected to be fast once the mesh has converged; the pause is cheap
+ # compared with the rollout waits above.
+ sleep 15
+
+ # Probe loop: at most 24 attempts with a 5 s pause between them. Each curl
+ # is itself capped at 5 s (-m 5), so the worst case is roughly 4 minutes.
+ # `ok` stays 0 unless one attempt succeeds; the failure handling that
+ # follows keys off it.
+ ok=0
+ i=1
+ while [ "$i" -le 24 ]; do
+   if KUBECONFIG="$kc_second" kubectl -n "$ns" exec curl -- curl -fsS -m 5 http://echo.cm-smoke.svc.cluster.local/hostname; then
+     ok=1
+     echo ""
+     echo "Cross-cluster curl succeeded on attempt $i"
+     break
+   fi
+   echo " attempt $i/24 failed, retrying in 5s..."
+   sleep 5
+   i=$((i + 1))
+ done
+
+ if [ "$ok" -ne 1 ]; then
+ # ============== SMOKE-FAILURE-DEBUG-DUMP (REMOVE BEFORE MERGE) ==============
+ # On failure, dump enough state to distinguish Cilium global-service
+ # sync issues from cross-VNet pod-IP routing issues. Specifically:
+ # 1. cilium clustermesh status — should show "Global services: 1" if sync OK
+ # 2. cilium service list (in-pod) — should have an entry for cm-smoke/echo
+ # with remote-cluster backends in cluster 2
+ # 3. kubectl describe svc / get endpoints echo — k8s view (cluster 2 should
+ # have NO local endpoints, that's expected)
+ # 4. From inside the curl pod: DNS resolve, then direct-IP curl to a
+ # cluster-1 echo pod IP — bypasses ClusterIP, tests raw L3 across VNets
+ echo
+ echo "================ SMOKE FAILURE DIAG (cluster $first_role -- backend) ================"
+ KUBECONFIG="$kc_first" cilium clustermesh status --context "$(KUBECONFIG="$kc_first" kubectl config current-context)" --wait=false 2>&1 || true
+ KUBECONFIG="$kc_first" kubectl -n "$ns" describe svc echo 2>&1 || true
+ KUBECONFIG="$kc_first" kubectl -n "$ns" get endpoints echo -o wide 2>&1 || true
+ KUBECONFIG="$kc_first" kubectl -n "$ns" get pods -l app=echo -o wide 2>&1 || true
+ echo "------- $first_role: cilium-config (clustermesh-relevant flags) -------"
+ # Authoritative source for whether the cilium agent is configured to
+ # process global services. Look for: enable-cluster-mesh,
+ # cluster-mesh-shared-services, clustermesh-config, identity-allocation-mode,
+ # enable-services. AKS/ACNS may gate global services with a feature flag.
+ KUBECONFIG="$kc_first" kubectl -n kube-system get cm cilium-config -o yaml 2>&1 \
+ | grep -iE 'cluster-mesh|clustermesh|service|global|identity' || true
+ echo "------- $first_role: cilium service list (full, head 40) -------"
+ KUBECONFIG="$kc_first" kubectl -n kube-system exec ds/cilium -c cilium-agent -- cilium service list 2>&1 | head -40 || true
+ echo "------- $first_role: cilium-operator logs (tail 60) -------"
+ KUBECONFIG="$kc_first" kubectl -n kube-system logs -l io.cilium/app=operator --tail=60 2>&1 \
+ | grep -iE 'global|clustermesh|cluster-mesh|cm-smoke|service' || true
+
+ echo
+ echo "================ SMOKE FAILURE DIAG (cluster $second_role -- client) ================"
+ KUBECONFIG="$kc_second" cilium clustermesh status --context "$(KUBECONFIG="$kc_second" kubectl config current-context)" --wait=false 2>&1 || true
+ KUBECONFIG="$kc_second" kubectl -n "$ns" describe svc echo 2>&1 || true
+ KUBECONFIG="$kc_second" kubectl -n "$ns" get endpoints echo -o wide 2>&1 || true
+ echo "------- $second_role: cilium-config (clustermesh-relevant flags) -------"
+ KUBECONFIG="$kc_second" kubectl -n kube-system get cm cilium-config -o yaml 2>&1 \
+ | grep -iE 'cluster-mesh|clustermesh|service|global|identity' || true
+ echo "------- $second_role: cilium service list (full, head 40) -------"
+ KUBECONFIG="$kc_second" kubectl -n kube-system exec ds/cilium -c cilium-agent -- cilium service list 2>&1 | head -40 || true
+ echo "------- $second_role: cilium-operator logs (tail 60) -------"
+ KUBECONFIG="$kc_second" kubectl -n kube-system logs -l io.cilium/app=operator --tail=60 2>&1 \
+ | grep -iE 'global|clustermesh|cluster-mesh|cm-smoke|service' || true
+
+ echo
+ echo "------- DNS + direct-pod-IP probe from curl pod (bypass ClusterIP) -------"
+ # ClusterIP plumbing is a Cilium-clustermesh concern; direct pod-IP
+ # connectivity is a VNet-peering concern. Hitting a backend pod IP
+ # directly disambiguates the two failure modes.
+ KUBECONFIG="$kc_second" kubectl -n "$ns" exec curl -- nslookup echo.cm-smoke.svc.cluster.local 2>&1 || true
+ backend_ip=$(KUBECONFIG="$kc_first" kubectl -n "$ns" get pod -l app=echo -o jsonpath='{.items[0].status.podIP}' 2>/dev/null || true)
+ echo "first cluster's echo pod IP: ${backend_ip:-}"
+ if [ -n "${backend_ip:-}" ]; then
+ KUBECONFIG="$kc_second" kubectl -n "$ns" exec curl -- \
+ curl -fsS -m 5 "http://${backend_ip}:8080/hostname" 2>&1 || \
+ echo " direct pod-IP curl ALSO failed → cross-VNet routing issue (peering / pod-CIDR routes)"
+ fi
+ echo "============================ END SMOKE DIAG ============================"
+ # =========================== END SMOKE-FAILURE-DEBUG-DUMP ===========================
+
+ echo "##vso[task.logissue type=error;] Cross-cluster data-path smoke failed: $second_role could not reach service in $first_role"
+ exit 1
+ fi
+ displayName: "Cross-cluster data-path smoke (global service curl)"