From 6f8f45c717de1804cf8b77347cd1a43d044ee764 Mon Sep 17 00:00:00 2001 From: Scot Wells Date: Fri, 26 Jun 2026 07:09:59 -0500 Subject: [PATCH] feat: add DLQ & Policy Health dashboard MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Single-pane triage dashboard for on-call engineers responding to DLQ and ActivityPolicy alerts. Surfaces the failing policy, error type, and resource kind in one view, distinguishes growing backlog from retry churn, and links directly to Loki logs filtered to the clicked policy. Key features: - Row 1 stat row: DLQ backlog, backlog age (stream sequence gap), publish rate, retry resolve rate, net drain (growing vs draining), retry success rate, publish errors — the rate-based stats append `or vector(0)` so they render 0 instead of "No data" when no series exist - Row 2 top-N table: topk(25) by (policy_name, api_group, kind, error_type) matching the ActivityPolicyDLQErrors alert tuple; each row links to Loki Explore pre-filtered to that policy - Row 3 trend series: DLQ rate by error_type, policy, and kind - Row 4 retry recovery: outcomes by result, still-failing re-eval table (dlq_retry_failed_total), poison events (high_retry_total), batch duration p99 - Row 5 publish-path: errors by phase, latency p99/p95/p50 - Row 6 logs: JSON-parsed DLQ lines filtered by the $policy_name and $error_type variables, plus a catch-all regex fallback panel All metric label constraints are respected per §6: retry_attempts and publish_errors are not filtered by policy_name/error_type (those labels don't exist on those metrics). Dashboard ships via the existing Flux observability OCI sync — no infra change required. 
--- ...ty-dlq-policy-health-grafanadashboard.yaml | 16 + .../generated/activity-dlq-policy-health.json | 1150 +++++++++++++++++ .../dashboards/kustomization.yaml | 9 + .../activity-dlq-policy-health.jsonnet | 549 ++++++++ 4 files changed, 1724 insertions(+) create mode 100644 config/components/observability/dashboards/activity-dlq-policy-health-grafanadashboard.yaml create mode 100644 config/components/observability/dashboards/generated/activity-dlq-policy-health.json create mode 100644 observability/dashboards/activity-dlq-policy-health.jsonnet diff --git a/config/components/observability/dashboards/activity-dlq-policy-health-grafanadashboard.yaml b/config/components/observability/dashboards/activity-dlq-policy-health-grafanadashboard.yaml new file mode 100644 index 00000000..0d27a05d --- /dev/null +++ b/config/components/observability/dashboards/activity-dlq-policy-health-grafanadashboard.yaml @@ -0,0 +1,16 @@ +apiVersion: grafana.integreatly.org/v1beta1 +kind: GrafanaDashboard +metadata: + name: activity-dlq-policy-health-dashboard + labels: + dashboards: grafana +spec: + folder: "Platform / Activity" + allowCrossNamespaceImport: true + instanceSelector: + matchLabels: + dashboards: grafana + resyncPeriod: 30s + configMapRef: + name: activity-dlq-policy-health-dashboard + key: activity-dlq-policy-health.json diff --git a/config/components/observability/dashboards/generated/activity-dlq-policy-health.json b/config/components/observability/dashboards/generated/activity-dlq-policy-health.json new file mode 100644 index 00000000..f63dd61e --- /dev/null +++ b/config/components/observability/dashboards/generated/activity-dlq-policy-health.json @@ -0,0 +1,1150 @@ +{ + "description": "Single-pane triage dashboard for DLQ backlog, failing policies, retry recovery, and processor logs", + "editable": true, + "graphTooltip": 1, + "links": [ + { + "icon": "external link", + "targetBlank": true, + "title": "DLQ Runbooks", + "type": "link", + "url": 
"https://github.com/milo-os/activity/tree/main/docs/runbooks/dlq/" + } + ], + "panels": [ + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 0, + "x": 0, + "y": 0 + }, + "id": 1, + "panels": [ ], + "title": "At-a-Glance", + "type": "row" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "description": "Current number of events stuck in the DLQ", + "fieldConfig": { + "defaults": { + "thresholds": { + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "yellow", + "value": 100 + }, + { + "color": "red", + "value": 1000 + } + ] + }, + "unit": "short" + } + }, + "gridPos": { + "h": 5, + "w": 6, + "x": 0, + "y": 1 + }, + "id": 2, + "options": { + "colorMode": "background", + "graphMode": "none", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ] + } + }, + "pluginVersion": "v11.4.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "max(nats_stream_total_messages{stream_name=\"ACTIVITY_DEAD_LETTER\", cluster=~\"$cluster\"})", + "legendFormat": "Messages" + } + ], + "title": "DLQ Backlog", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "description": "Sequence gap in DLQ stream — proxy for backlog age", + "fieldConfig": { + "defaults": { + "thresholds": { + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "yellow", + "value": 1000 + }, + { + "color": "red", + "value": 10000 + } + ] + }, + "unit": "short" + } + }, + "gridPos": { + "h": 5, + "w": 6, + "x": 6, + "y": 1 + }, + "id": 3, + "options": { + "colorMode": "value", + "graphMode": "none", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ] + } + }, + "pluginVersion": "v11.4.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "max(nats_stream_last_seq{stream_name=\"ACTIVITY_DEAD_LETTER\", cluster=~\"$cluster\"} - nats_stream_first_seq{stream_name=\"ACTIVITY_DEAD_LETTER\", 
cluster=~\"$cluster\"})", + "legendFormat": "Seq gap" + } + ], + "title": "Backlog Age (oldest)", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "description": "Rate of new events being published to the DLQ", + "fieldConfig": { + "defaults": { + "thresholds": { + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "yellow", + "value": 0.10000000000000001 + }, + { + "color": "red", + "value": 1 + } + ] + }, + "unit": "ops" + } + }, + "gridPos": { + "h": 5, + "w": 6, + "x": 12, + "y": 1 + }, + "id": 4, + "options": { + "colorMode": "background", + "graphMode": "area", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ] + } + }, + "pluginVersion": "v11.4.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "sum(rate(activity_processor_dlq_events_published_total{cluster=~\"$cluster\", api_group=~\"$api_group\", kind=~\"$kind\", policy_name=~\"$policy_name\", error_type=~\"$error_type\"}[5m])) or vector(0)", + "legendFormat": "Events/s" + } + ], + "title": "DLQ Publish Rate", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "description": "Rate at which retries are clearing DLQ events", + "fieldConfig": { + "defaults": { + "unit": "ops" + } + }, + "gridPos": { + "h": 5, + "w": 6, + "x": 18, + "y": 1 + }, + "id": 5, + "options": { + "colorMode": "value", + "graphMode": "area", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ] + } + }, + "pluginVersion": "v11.4.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "sum(rate(activity_processor_dlq_retry_attempts_total{result=~\"succeeded|republished\", cluster=~\"$cluster\", api_group=~\"$api_group\", kind=~\"$kind\"}[5m])) or vector(0)", + "legendFormat": "Resolved/s" + } + ], + "title": "Retry Resolve Rate", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + 
}, + "description": "Publish rate minus resolve rate — positive means backlog is growing", + "fieldConfig": { + "defaults": { + "thresholds": { + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "yellow", + "value": 0 + }, + { + "color": "red", + "value": 0.01 + } + ] + }, + "unit": "ops" + } + }, + "gridPos": { + "h": 5, + "w": 6, + "x": 0, + "y": 6 + }, + "id": 6, + "options": { + "colorMode": "background", + "graphMode": "none", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ] + } + }, + "pluginVersion": "v11.4.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "(sum(rate(activity_processor_dlq_events_published_total{cluster=~\"$cluster\", api_group=~\"$api_group\", kind=~\"$kind\", policy_name=~\"$policy_name\", error_type=~\"$error_type\"}[5m])) or vector(0)) - (sum(rate(activity_processor_dlq_retry_attempts_total{result=~\"succeeded|republished\", cluster=~\"$cluster\", api_group=~\"$api_group\", kind=~\"$kind\"}[5m])) or vector(0))", + "legendFormat": "Net drain" + } + ], + "title": "Net Drain", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "description": "Fraction of DLQ retry attempts that succeeded", + "fieldConfig": { + "defaults": { + "thresholds": { + "steps": [ + { + "color": "red", + "value": null + }, + { + "color": "yellow", + "value": 0.80000000000000004 + }, + { + "color": "green", + "value": 0.94999999999999996 + } + ] + }, + "unit": "percentunit" + } + }, + "gridPos": { + "h": 5, + "w": 6, + "x": 6, + "y": 6 + }, + "id": 7, + "options": { + "colorMode": "background", + "graphMode": "area", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ] + } + }, + "pluginVersion": "v11.4.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "(sum(rate(activity_processor_dlq_retry_attempts_total{result=\"succeeded\", cluster=~\"$cluster\"}[5m])) or vector(0)) / 
(sum(rate(activity_processor_dlq_retry_attempts_total{cluster=~\"$cluster\"}[5m])) > 0)", + "legendFormat": "Success rate" + } + ], + "title": "Retry Success Rate", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "description": "Rate of errors when publishing to DLQ — non-zero means events are being lost", + "fieldConfig": { + "defaults": { + "thresholds": { + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 0.01 + } + ] + }, + "unit": "ops" + } + }, + "gridPos": { + "h": 5, + "w": 6, + "x": 12, + "y": 6 + }, + "id": 8, + "options": { + "colorMode": "background", + "graphMode": "area", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ] + } + }, + "pluginVersion": "v11.4.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "sum(rate(activity_processor_dlq_publish_errors_total{cluster=~\"$cluster\"}[5m])) or vector(0)", + "legendFormat": "Errors/s" + } + ], + "title": "DLQ Publish Errors", + "type": "stat" + }, + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 0, + "x": 18, + "y": 11 + }, + "id": 9, + "panels": [ ], + "title": "What is broken NOW", + "type": "row" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "description": "Top 25 policies currently publishing to DLQ — the primary triage view for ActivityPolicyDLQErrors", + "fieldConfig": { + "defaults": { + "links": [ + { + "targetBlank": true, + "title": "View in Loki", + "url": "/explore?orgId=1&left={\"datasource\":\"loki\",\"queries\":[{\"expr\":\"{namespace=\\\"activity-system\\\", container=\\\"processor\\\"} | json | policy=\\\"${__data.fields.policy_name}\\\" | errorType=~\\\".+\\\"\",\"refId\":\"A\"}],\"range\":{\"from\":\"${__from}\",\"to\":\"${__to}\"}}" + } + ], + "unit": "ops" + } + }, + "gridPos": { + "h": 10, + "w": 24, + "x": 0, + "y": 12 + }, + "id": 10, + "options": { + "showHeader": true, + "sortBy": [ + { 
+ "desc": true, + "displayName": "Value" + } + ] + }, + "pluginVersion": "v11.4.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "topk(25, sum by (policy_name, api_group, kind, error_type) (rate(activity_processor_dlq_events_published_total{policy_name!=\"\", cluster=~\"$cluster\", api_group=~\"$api_group\", kind=~\"$kind\", policy_name=~\"$policy_name\", error_type=~\"$error_type\"}[10m])))", + "format": "table", "instant": true, + "legendFormat": "{{policy_name}}" + } + ], + "title": "Top Failing Policies", + "type": "table" + }, + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 0, + "x": 24, + "y": 22 + }, + "id": 11, + "panels": [ ], + "title": "Trends", + "type": "row" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "description": "DLQ rate by failure class — identifies dominant error mode", + "fieldConfig": { + "defaults": { + "custom": { + "fillOpacity": 30, + "showPoints": "never", + "stacking": { + "mode": "normal" + } + }, + "unit": "ops" + } + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 23 + }, + "id": 12, + "options": { + "legend": { + "calcs": [ + "lastNotNull", + "mean" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + } + }, + "pluginVersion": "v11.4.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "sum by (error_type) (rate(activity_processor_dlq_events_published_total{cluster=~\"$cluster\", api_group=~\"$api_group\", kind=~\"$kind\", policy_name=~\"$policy_name\", error_type=~\"$error_type\"}[5m]))", + "legendFormat": "{{error_type}}" + } + ], + "title": "DLQ Rate by error_type", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "description": "DLQ rate by policy — identifies persistent per-policy failures (DLQSlowLeak)", + "fieldConfig": { + "defaults": { + "custom": { + "fillOpacity": 30, + "showPoints": "never", + 
"stacking": { + "mode": "normal" + } + }, + "links": [ + { + "targetBlank": true, + "title": "View in Loki", + "url": "/explore?orgId=1&left={\"datasource\":\"loki\",\"queries\":[{\"expr\":\"{namespace=\\\"activity-system\\\", container=\\\"processor\\\"} | json | policy=\\\"${__field.labels.policy_name}\\\"\",\"refId\":\"A\"}],\"range\":{\"from\":\"${__from}\",\"to\":\"${__to}\"}}" + } + ], + "unit": "ops" + } + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 23 + }, + "id": 13, + "options": { + "legend": { + "calcs": [ + "lastNotNull", + "mean" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + } + }, + "pluginVersion": "v11.4.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "label_replace(sum by (policy_name) (rate(activity_processor_dlq_events_published_total{cluster=~\"$cluster\", api_group=~\"$api_group\", kind=~\"$kind\", policy_name=~\"$policy_name\", error_type=~\"$error_type\"}[5m])), \"policy_name\", \"(no policy)\", \"policy_name\", \"^$\")", + "legendFormat": "{{policy_name}}" + } + ], + "title": "DLQ Rate by policy", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "description": "DLQ rate by resource kind — identifies affected resource types", + "fieldConfig": { + "defaults": { + "custom": { + "fillOpacity": 30, + "showPoints": "never", + "stacking": { + "mode": "normal" + } + }, + "unit": "ops" + } + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 31 + }, + "id": 14, + "options": { + "legend": { + "calcs": [ + "lastNotNull", + "mean" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + } + }, + "pluginVersion": "v11.4.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "sum by (api_group, kind) (rate(activity_processor_dlq_events_published_total{cluster=~\"$cluster\", api_group=~\"$api_group\", kind=~\"$kind\", 
policy_name=~\"$policy_name\", error_type=~\"$error_type\"}[5m]))", + "legendFormat": "{{api_group}}/{{kind}}" + } + ], + "title": "DLQ Rate by kind", + "type": "timeseries" + }, + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 0, + "x": 12, + "y": 39 + }, + "id": 15, + "panels": [ ], + "title": "Retry & Recovery", + "type": "row" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "description": "Retry attempt outcomes over time — succeeded vs republished vs failed", + "fieldConfig": { + "defaults": { + "custom": { + "fillOpacity": 30, + "showPoints": "never", + "stacking": { + "mode": "normal" + } + }, + "unit": "ops" + } + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 40 + }, + "id": 16, + "options": { + "legend": { + "calcs": [ + "lastNotNull", + "mean" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + } + }, + "pluginVersion": "v11.4.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "sum by (result) (rate(activity_processor_dlq_retry_attempts_total{cluster=~\"$cluster\", api_group=~\"$api_group\", kind=~\"$kind\"}[5m]))", + "legendFormat": "{{result}}" + } + ], + "title": "Retry outcomes", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "description": "Policies NOT recovering after retry — triage for DLQRetryIneffective", + "fieldConfig": { + "defaults": { + "unit": "ops" + } + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 40 + }, + "id": 17, + "options": { + "showHeader": true, + "sortBy": [ + { + "desc": true, + "displayName": "Value" + } + ] + }, + "pluginVersion": "v11.4.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "topk(25, sum by (policy_name, error_type) (rate(activity_processor_dlq_retry_failed_total{cluster=~\"$cluster\", api_group=~\"$api_group\", kind=~\"$kind\", policy_name=~\"$policy_name\", 
error_type=~\"$error_type\"}[10m])))", + "format": "table", "instant": true, + "legendFormat": "{{policy_name}}" + } + ], + "title": "Still-failing re-eval by policy", + "type": "table" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "description": "Events exceeding retry threshold by policy — identifies poison events (DLQHighRetryCount)", + "fieldConfig": { + "defaults": { + "custom": { + "fillOpacity": 10, + "showPoints": "never" + }, + "unit": "short" + } + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 48 + }, + "id": 18, + "options": { + "legend": { + "calcs": [ + "lastNotNull", + "mean" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + } + }, + "pluginVersion": "v11.4.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "sum by (policy_name, api_group, kind) (increase(activity_processor_dlq_retry_events_high_retry_total{cluster=~\"$cluster\", api_group=~\"$api_group\", kind=~\"$kind\", policy_name=~\"$policy_name\"}[1h]))", + "legendFormat": "{{policy_name}}" + } + ], + "title": "High-retry (poison) events by policy", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "description": "Retry batch processing duration — high values indicate retry path stalling", + "fieldConfig": { + "defaults": { + "custom": { + "fillOpacity": 10, + "showPoints": "never" + }, + "unit": "s" + } + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 48 + }, + "id": 19, + "options": { + "legend": { + "calcs": [ + "lastNotNull", + "mean", + "max" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + } + }, + "pluginVersion": "v11.4.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "histogram_quantile(0.99, sum by (le, trigger) (rate(activity_processor_dlq_retry_batch_duration_seconds_bucket{cluster=~\"$cluster\"}[5m])))", + "legendFormat": 
"{{trigger}} p99" + } + ], + "title": "Retry batch duration p99", + "type": "timeseries" + }, + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 0, + "x": 24, + "y": 56 + }, + "id": 20, + "panels": [ ], + "title": "Publish-Path Health", + "type": "row" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "description": "DLQ publish errors by phase (marshal/publish) — non-zero is data loss risk", + "fieldConfig": { + "defaults": { + "custom": { + "fillOpacity": 30, + "showPoints": "never", + "stacking": { + "mode": "normal" + } + }, + "unit": "ops" + } + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 57 + }, + "id": 21, + "options": { + "legend": { + "calcs": [ + "lastNotNull", + "mean" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + } + }, + "pluginVersion": "v11.4.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "sum by (error_phase) (rate(activity_processor_dlq_publish_errors_total{cluster=~\"$cluster\"}[5m]))", + "legendFormat": "{{error_phase}}" + } + ], + "title": "Publish errors by phase", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "description": "DLQ publish write path latency distribution", + "fieldConfig": { + "defaults": { + "custom": { + "fillOpacity": 10, + "showPoints": "never" + }, + "unit": "s" + } + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 57 + }, + "id": 22, + "options": { + "legend": { + "calcs": [ + "lastNotNull", + "mean", + "max" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + } + }, + "pluginVersion": "v11.4.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "histogram_quantile(0.99, sum by (le) (rate(activity_processor_dlq_publish_latency_seconds_bucket{cluster=~\"$cluster\"}[5m])))", + "legendFormat": "p99" + }, + { + "datasource": { + "type": "prometheus", 
+ "uid": "$datasource" + }, + "expr": "histogram_quantile(0.95, sum by (le) (rate(activity_processor_dlq_publish_latency_seconds_bucket{cluster=~\"$cluster\"}[5m])))", + "legendFormat": "p95" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "histogram_quantile(0.50, sum by (le) (rate(activity_processor_dlq_publish_latency_seconds_bucket{cluster=~\"$cluster\"}[5m])))", + "legendFormat": "p50" + } + ], + "title": "DLQ publish latency", + "type": "timeseries" + }, + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 0, + "x": 24, + "y": 65 + }, + "id": 23, + "panels": [ ], + "title": "Processor Logs", + "type": "row" + }, + { + "datasource": { + "type": "loki", + "uid": "$loki_datasource" + }, + "description": "DLQ processor logs filtered by selected policy and error type — shows Published event to DLQ lines with policy and errorType fields", + "gridPos": { + "h": 10, + "w": 24, + "x": 0, + "y": 66 + }, + "id": 24, + "options": { + "displayedFields": [ + "policy", + "errorType", + "msg" + ], + "enableLogDetails": true, + "showTime": true, + "sortOrder": "Descending", + "wrapLogMessage": false + }, + "pluginVersion": "v11.4.0", + "targets": [ + { + "datasource": { + "type": "loki", + "uid": "$loki_datasource" + }, + "expr": "{namespace=\"activity-system\", container=\"processor\"} | json | errorType != \"\" | policy=~\"${policy_name:regex}\" | errorType=~\"${error_type:regex}\"", + "refId": "A" + } + ], + "title": "DLQ Events — Processor Logs", + "type": "logs" + }, + { + "datasource": { + "type": "loki", + "uid": "$loki_datasource" + }, + "description": "Catch-all DLQ log filter — includes non-JSON error lines and all dlq/dead-letter references", + "gridPos": { + "h": 10, + "w": 24, + "x": 0, + "y": 76 + }, + "id": 25, + "options": { + "enableLogDetails": true, + "showTime": true, + "sortOrder": "Descending", + "wrapLogMessage": false + }, + "pluginVersion": "v11.4.0", + "targets": [ + { + "datasource": { + "type": "loki", 
+ "uid": "$loki_datasource" + }, + "expr": "{namespace=\"activity-system\", container=\"processor\"} |~ \"(?i)dlq|dead.letter|failed to evaluate|failed to republish|Published event to DLQ\"", + "refId": "A" + } + ], + "title": "DLQ/Policy Errors (raw)", + "type": "logs" + } + ], + "refresh": "30s", + "schemaVersion": 39, + "tags": [ + "activity", + "dlq", + "policy", + "health", + "on-call" + ], + "templating": { + "list": [ + { + "label": "Prometheus Datasource", + "name": "datasource", + "query": "prometheus", + "regex": "", + "type": "datasource" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "includeAll": true, + "label": "Cluster", + "multi": true, + "name": "cluster", + "query": "label_values(activity_processor_dlq_events_published_total, cluster)", + "refresh": 2, + "type": "query" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "includeAll": true, + "label": "API Group", + "multi": true, + "name": "api_group", + "query": "label_values(activity_processor_dlq_events_published_total{cluster=~\"$cluster\"}, api_group)", + "refresh": 2, + "type": "query" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "includeAll": true, + "label": "Kind", + "multi": true, + "name": "kind", + "query": "label_values(activity_processor_dlq_events_published_total{cluster=~\"$cluster\",api_group=~\"$api_group\"}, kind)", + "refresh": 2, + "type": "query" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "includeAll": true, + "label": "Policy", + "multi": true, + "name": "policy_name", + "query": "label_values(activity_processor_dlq_events_published_total{cluster=~\"$cluster\",api_group=~\"$api_group\",kind=~\"$kind\"}, policy_name)", + "refresh": 2, + "type": "query" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "includeAll": true, + "label": "Error Type", + "multi": true, + "name": "error_type", + "query": 
"label_values(activity_processor_dlq_events_published_total{cluster=~\"$cluster\"}, error_type)", + "refresh": 2, + "type": "query" + }, + { + "hide": 2, + "label": "Loki Datasource", + "name": "loki_datasource", + "query": "loki", + "type": "datasource" + } + ] + }, + "time": { + "from": "now-6h", + "to": "now" + }, + "timezone": "utc", + "title": "Activity — DLQ & Policy Health", + "uid": "activity-dlq-policy-health" +} diff --git a/config/components/observability/dashboards/kustomization.yaml b/config/components/observability/dashboards/kustomization.yaml index 228429c2..6bafb08d 100644 --- a/config/components/observability/dashboards/kustomization.yaml +++ b/config/components/observability/dashboards/kustomization.yaml @@ -52,6 +52,14 @@ configMapGenerator: grafana_dashboard: "1" disableNameSuffixHash: true + - name: activity-dlq-policy-health-dashboard + files: + - generated/activity-dlq-policy-health.json + options: + labels: + grafana_dashboard: "1" + disableNameSuffixHash: true + resources: - audit-pipeline-grafanadashboard.yaml - activity-apiserver-grafanadashboard.yaml @@ -59,3 +67,4 @@ resources: - events-pipeline-grafanadashboard.yaml - activity-system-overview-grafanadashboard.yaml - activity-slo-grafanadashboard.yaml + - activity-dlq-policy-health-grafanadashboard.yaml diff --git a/observability/dashboards/activity-dlq-policy-health.jsonnet b/observability/dashboards/activity-dlq-policy-health.jsonnet new file mode 100644 index 00000000..f6487d2b --- /dev/null +++ b/observability/dashboards/activity-dlq-policy-health.jsonnet @@ -0,0 +1,549 @@ +// Activity DLQ & Policy Health Grafana Dashboard +// Generated using Grafonnet v11.4.0 +// To build: jsonnet -J vendor dashboards/activity-dlq-policy-health.jsonnet > ../config/components/observability/dashboards/generated/activity-dlq-policy-health.json + +local g = import 'grafonnet-v11.4.0/main.libsonnet'; +local config = import '../config.libsonnet'; + +local dashboard = g.dashboard; +local panel = g.panel; 
+local stat = panel.stat; +local timeSeries = panel.timeSeries; +local tablePanel = panel.table; +local logsPanel = panel.logs; +local textPanel = panel.text; +local row = panel.row; +local prometheus = g.query.prometheus; +local loki = g.query.loki; +local util = g.util; + +local datasource = config.dashboards.datasource.name; +local datasourceRegex = config.dashboards.datasource.regex; +local refresh = config.dashboards.refresh; + +local statHeight = 5; +local statWidth = 6; +local timeSeriesHeight = 8; +local timeSeriesHalfWidth = 12; +local tableHeight = 10; +local tableFullWidth = 24; + +local SEL = 'cluster=~"$cluster", api_group=~"$api_group", kind=~"$kind", policy_name=~"$policy_name", error_type=~"$error_type"'; +local SEL_retry_attempts = 'cluster=~"$cluster", api_group=~"$api_group", kind=~"$kind"'; +local SEL_retry_failed = 'cluster=~"$cluster", api_group=~"$api_group", kind=~"$kind", policy_name=~"$policy_name", error_type=~"$error_type"'; +local SEL_high_retry = 'cluster=~"$cluster", api_group=~"$api_group", kind=~"$kind", policy_name=~"$policy_name"'; +local SEL_global = 'cluster=~"$cluster"'; + +local allPanels = util.grid.wrapPanels([ + row.new('At-a-Glance') + + row.withCollapsed(false), + + stat.new('DLQ Backlog') + + stat.options.withColorMode('background') + + stat.options.withGraphMode('none') + + stat.options.reduceOptions.withCalcs(['lastNotNull']) + + stat.standardOptions.withUnit('short') + + stat.datasource.withType('prometheus') + + stat.datasource.withUid(datasource) + + stat.queryOptions.withTargets([ + prometheus.new( + datasource, + 'max(nats_stream_total_messages{stream_name="ACTIVITY_DEAD_LETTER", ' + SEL_global + '})' + ) + + prometheus.withLegendFormat('Messages'), + ]) + + stat.standardOptions.thresholds.withSteps([ + { color: 'green', value: null }, + { color: 'yellow', value: 100 }, + { color: 'red', value: 1000 }, + ]) + + stat.panelOptions.withDescription('Current number of events stuck in the DLQ') + + 
stat.gridPos.withW(statWidth) + + stat.gridPos.withH(statHeight), + + stat.new('Backlog Age (oldest)') + + stat.options.withColorMode('value') + + stat.options.withGraphMode('none') + + stat.options.reduceOptions.withCalcs(['lastNotNull']) + + stat.standardOptions.withUnit('short') + + stat.datasource.withType('prometheus') + + stat.datasource.withUid(datasource) + + stat.queryOptions.withTargets([ + prometheus.new( + datasource, + /* Default one-to-one matching pairs each last_seq series with its own first_seq series, then max() picks the widest gap. An explicit on() would put every series in a single match group and make the query error as soon as more than one cluster or server matches. Assumes both gauges carry the same label set; TODO confirm against the NATS exporter. */ 'max(nats_stream_last_seq{stream_name="ACTIVITY_DEAD_LETTER", ' + SEL_global + '} - nats_stream_first_seq{stream_name="ACTIVITY_DEAD_LETTER", ' + SEL_global + '})' + ) + + prometheus.withLegendFormat('Seq gap'), + ]) + + stat.standardOptions.thresholds.withSteps([ + { color: 'green', value: null }, + { color: 'yellow', value: 1000 }, + { color: 'red', value: 10000 }, + ]) + + stat.panelOptions.withDescription('Sequence gap in DLQ stream — proxy for backlog age') + + stat.gridPos.withW(statWidth) + + stat.gridPos.withH(statHeight), + + stat.new('DLQ Publish Rate') + + stat.options.withColorMode('background') + + stat.options.withGraphMode('area') + + stat.options.reduceOptions.withCalcs(['lastNotNull']) + + stat.standardOptions.withUnit('ops') + + stat.datasource.withType('prometheus') + + stat.datasource.withUid(datasource) + + stat.queryOptions.withTargets([ + prometheus.new( + datasource, + 'sum(rate(activity_processor_dlq_events_published_total{' + SEL + '}[5m])) or vector(0)' + ) + + prometheus.withLegendFormat('Events/s'), + ]) + + stat.standardOptions.thresholds.withSteps([ + { color: 'green', value: null }, + { color: 'yellow', value: 0.1 }, + { color: 'red', value: 1 }, + ]) + + stat.panelOptions.withDescription('Rate of new events being published to the DLQ') + + stat.gridPos.withW(statWidth) + + stat.gridPos.withH(statHeight), + + stat.new('Retry Resolve Rate') + + stat.options.withColorMode('value') + + stat.options.withGraphMode('area') + + stat.options.reduceOptions.withCalcs(['lastNotNull']) + + 
stat.standardOptions.withUnit('ops')
+  + stat.datasource.withType('prometheus')
+  + stat.datasource.withUid(datasource)
+  + stat.queryOptions.withTargets([
+    prometheus.new(
+      datasource,
+      'sum(rate(activity_processor_dlq_retry_attempts_total{result=~"succeeded|republished", ' + SEL_retry_attempts + '}[5m])) or vector(0)'
+    )
+    + prometheus.withLegendFormat('Resolved/s'),
+  ])
+  + stat.panelOptions.withDescription('Rate at which retries are clearing DLQ events')
+  + stat.gridPos.withW(statWidth)
+  + stat.gridPos.withH(statHeight),
+
+  // Net Drain: DLQ publish rate minus the rate of retries with result
+  // succeeded|republished. Each side falls back to vector(0) so the
+  // subtraction still yields a value when one side has no series.
+  // NOTE(review): with steps at 0 (yellow) and 0.01 (red), a value of exactly
+  // 0 lands in the yellow step — confirm that an idle queue should be yellow.
+  stat.new('Net Drain')
+  + stat.options.withColorMode('background')
+  + stat.options.withGraphMode('none')
+  + stat.options.reduceOptions.withCalcs(['lastNotNull'])
+  + stat.standardOptions.withUnit('ops')
+  + stat.datasource.withType('prometheus')
+  + stat.datasource.withUid(datasource)
+  + stat.queryOptions.withTargets([
+    prometheus.new(
+      datasource,
+      '(sum(rate(activity_processor_dlq_events_published_total{' + SEL + '}[5m])) or vector(0)) - (sum(rate(activity_processor_dlq_retry_attempts_total{result=~"succeeded|republished", ' + SEL_retry_attempts + '}[5m])) or vector(0))'
+    )
+    + prometheus.withLegendFormat('Net drain'),
+  ])
+  + stat.standardOptions.thresholds.withSteps([
+    { color: 'green', value: null },
+    { color: 'yellow', value: 0 },
+    { color: 'red', value: 0.01 },
+  ])
+  + stat.panelOptions.withDescription('Publish rate minus resolve rate — positive means backlog is growing')
+  + stat.gridPos.withW(statWidth)
+  + stat.gridPos.withH(statHeight),
+
+  // Retry Success Rate: succeeded retries divided by all retry attempts,
+  // shown as a 0..1 fraction (unit percentunit).
+  // The denominator is a per-second rate, so it must only be guarded against
+  // zero. Flooring it at 1 (the previous clamp_min(..., 1)) divided every
+  // workload slower than one retry per second by 1 and reported
+  // "succeeded per second" instead of a ratio. `> 0` drops the sample when
+  // there were no attempts; the outer `or vector(0)` then reports 0.
+  stat.new('Retry Success Rate')
+  + stat.options.withColorMode('background')
+  + stat.options.withGraphMode('area')
+  + stat.options.reduceOptions.withCalcs(['lastNotNull'])
+  + stat.standardOptions.withUnit('percentunit')
+  + stat.datasource.withType('prometheus')
+  + stat.datasource.withUid(datasource)
+  + stat.queryOptions.withTargets([
+    prometheus.new(
+      datasource,
+      '((sum(rate(activity_processor_dlq_retry_attempts_total{result="succeeded", ' + SEL_global + '}[5m])) or vector(0)) / (sum(rate(activity_processor_dlq_retry_attempts_total{' + SEL_global + '}[5m])) > 0)) or vector(0)'
+    )
+    + prometheus.withLegendFormat('Success rate'),
+  ])
+  + stat.standardOptions.thresholds.withSteps([
+    { color: 'red', value: null },
+    { color: 'yellow', value: 0.8 },
+    { color: 'green', value: 0.95 },
+  ])
+  + stat.panelOptions.withDescription('Fraction of DLQ retry attempts that succeeded')
+  + stat.gridPos.withW(statWidth)
+  + stat.gridPos.withH(statHeight),
+
+  // DLQ Publish Errors: rate of activity_processor_dlq_publish_errors_total,
+  // with a vector(0) fallback; turns red from 0.01/s.
+  stat.new('DLQ Publish Errors')
+  + stat.options.withColorMode('background')
+  + stat.options.withGraphMode('area')
+  + stat.options.reduceOptions.withCalcs(['lastNotNull'])
+  + stat.standardOptions.withUnit('ops')
+  + stat.datasource.withType('prometheus')
+  + stat.datasource.withUid(datasource)
+  + stat.queryOptions.withTargets([
+    prometheus.new(
+      datasource,
+      'sum(rate(activity_processor_dlq_publish_errors_total{' + SEL_global + '}[5m])) or vector(0)'
+    )
+    + prometheus.withLegendFormat('Errors/s'),
+  ])
+  + stat.standardOptions.thresholds.withSteps([
+    { color: 'green', value: null },
+    { color: 'red', value: 0.01 },
+  ])
+  + stat.panelOptions.withDescription('Rate of errors when publishing to DLQ — non-zero means events are being lost')
+  + stat.gridPos.withW(statWidth)
+  + stat.gridPos.withH(statHeight),
+
+  row.new('What is broken NOW')
+  + row.withCollapsed(false),
+
+  // Top Failing Policies: instant topk(25) of the 10m DLQ publish rate per
+  // (policy_name, api_group, kind, error_type), skipping empty policy names.
+  tablePanel.new('Top Failing Policies')
+  + tablePanel.datasource.withType('prometheus')
+  + tablePanel.datasource.withUid(datasource)
+  + tablePanel.options.withShowHeader(true)
+  + tablePanel.options.withSortBy([
+    { displayName: 'Value', desc: true },
+  ])
+  + tablePanel.queryOptions.withTargets([
+    prometheus.new(
+      datasource,
+      'topk(25, sum by (policy_name, api_group, kind, error_type) (rate(activity_processor_dlq_events_published_total{policy_name!="", ' + SEL + '}[10m])))'
+    )
+    + prometheus.withInstant(true)
+    // Table format returns the grouping labels as columns; this panel's
+    // data link reads ${__data.fields.policy_name}, which needs that column.
+    + prometheus.withFormat('table')
+    + prometheus.withLegendFormat('{{policy_name}}'),
+  ])
+  +
tablePanel.standardOptions.withUnit('ops')
+  + tablePanel.standardOptions.withLinks([
+    {  // Data link: opens Explore with a Loki query for the clicked row's policy_name; the datasource name "loki" is hard-coded in the URL.
+      title: 'View in Loki',
+      url: '/explore?orgId=1&left={"datasource":"loki","queries":[{"expr":"{namespace=\\"activity-system\\", container=\\"processor\\"} | json | policy=\\"${__data.fields.policy_name}\\" | errorType=~\\".+\\"","refId":"A"}],"range":{"from":"${__from}","to":"${__to}"}}',
+      targetBlank: true,
+    },
+  ])
+  + tablePanel.panelOptions.withDescription('Top 25 policies currently publishing to DLQ — the primary triage view for ActivityPolicyDLQErrors')
+  + tablePanel.gridPos.withW(tableFullWidth)
+  + tablePanel.gridPos.withH(tableHeight),
+  // Row header for the DLQ-rate trend panels that follow.
+  row.new('Trends')
+  + row.withCollapsed(false),
+  // Stacked 5m DLQ publish rate, one series per error_type.
+  timeSeries.new('DLQ Rate by error_type')
+  + timeSeries.options.legend.withDisplayMode('table')
+  + timeSeries.options.legend.withPlacement('bottom')
+  + timeSeries.options.legend.withShowLegend(true)
+  + timeSeries.options.legend.withCalcs(['lastNotNull', 'mean'])
+  + timeSeries.standardOptions.withUnit('ops')
+  + timeSeries.fieldConfig.defaults.custom.withFillOpacity(30)
+  + timeSeries.fieldConfig.defaults.custom.withShowPoints('never')
+  + timeSeries.fieldConfig.defaults.custom.stacking.withMode('normal')
+  + timeSeries.datasource.withType('prometheus')
+  + timeSeries.datasource.withUid(datasource)
+  + timeSeries.queryOptions.withTargets([
+    prometheus.new(
+      datasource,
+      'sum by (error_type) (rate(activity_processor_dlq_events_published_total{' + SEL + '}[5m]))'
+    )
+    + prometheus.withLegendFormat('{{error_type}}'),
+  ])
+  + timeSeries.panelOptions.withDescription('DLQ rate by failure class — identifies dominant error mode')
+  + timeSeries.gridPos.withW(timeSeriesHalfWidth)
+  + timeSeries.gridPos.withH(timeSeriesHeight),
+  // Per-policy DLQ rate panel; its query and data link follow the legend options below.
+  timeSeries.new('DLQ Rate by policy')
+  + timeSeries.options.legend.withDisplayMode('table')
+  + timeSeries.options.legend.withPlacement('bottom')
+  + timeSeries.options.legend.withShowLegend(true)
+  +
timeSeries.options.legend.withCalcs(['lastNotNull', 'mean'])
+  + timeSeries.standardOptions.withUnit('ops')
+  + timeSeries.fieldConfig.defaults.custom.withFillOpacity(30)
+  + timeSeries.fieldConfig.defaults.custom.withShowPoints('never')
+  + timeSeries.fieldConfig.defaults.custom.stacking.withMode('normal')
+  + timeSeries.datasource.withType('prometheus')
+  + timeSeries.datasource.withUid(datasource)
+  + timeSeries.queryOptions.withTargets([
+    prometheus.new(
+      datasource,
+      // label_replace rewrites an empty policy_name to "(no policy)" so the
+      // series still gets a readable legend entry.
+      'label_replace(sum by (policy_name) (rate(activity_processor_dlq_events_published_total{' + SEL + '}[5m])), "policy_name", "(no policy)", "policy_name", "^$")'
+    )
+    + prometheus.withLegendFormat('{{policy_name}}'),
+  ])
+  + timeSeries.standardOptions.withLinks([
+    {
+      // Opens Explore with a Loki query for the clicked series' policy_name
+      // label; the datasource name "loki" is hard-coded in the URL.
+      title: 'View in Loki',
+      url: '/explore?orgId=1&left={"datasource":"loki","queries":[{"expr":"{namespace=\\"activity-system\\", container=\\"processor\\"} | json | policy=\\"${__field.labels.policy_name}\\"","refId":"A"}],"range":{"from":"${__from}","to":"${__to}"}}',
+      targetBlank: true,
+    },
+  ])
+  + timeSeries.panelOptions.withDescription('DLQ rate by policy — identifies persistent per-policy failures (DLQSlowLeak)')
+  + timeSeries.gridPos.withW(timeSeriesHalfWidth)
+  + timeSeries.gridPos.withH(timeSeriesHeight),
+
+  // Stacked 5m DLQ publish rate, one series per (api_group, kind).
+  timeSeries.new('DLQ Rate by kind')
+  + timeSeries.options.legend.withDisplayMode('table')
+  + timeSeries.options.legend.withPlacement('bottom')
+  + timeSeries.options.legend.withShowLegend(true)
+  + timeSeries.options.legend.withCalcs(['lastNotNull', 'mean'])
+  + timeSeries.standardOptions.withUnit('ops')
+  + timeSeries.fieldConfig.defaults.custom.withFillOpacity(30)
+  + timeSeries.fieldConfig.defaults.custom.withShowPoints('never')
+  + timeSeries.fieldConfig.defaults.custom.stacking.withMode('normal')
+  + timeSeries.datasource.withType('prometheus')
+  + timeSeries.datasource.withUid(datasource)
+  + timeSeries.queryOptions.withTargets([
+    prometheus.new(
+      datasource,
+      'sum by (api_group, kind) (rate(activity_processor_dlq_events_published_total{' + SEL + '}[5m]))'
+    )
+    + prometheus.withLegendFormat('{{api_group}}/{{kind}}'),
+  ])
+  + timeSeries.panelOptions.withDescription('DLQ rate by resource kind — identifies affected resource types')
+  + timeSeries.gridPos.withW(timeSeriesHalfWidth)
+  + timeSeries.gridPos.withH(timeSeriesHeight),
+
+  row.new('Retry & Recovery')
+  + row.withCollapsed(false),
+
+  // Stacked 5m retry-attempt rate, one series per `result` label value.
+  timeSeries.new('Retry outcomes')
+  + timeSeries.options.legend.withDisplayMode('table')
+  + timeSeries.options.legend.withPlacement('bottom')
+  + timeSeries.options.legend.withShowLegend(true)
+  + timeSeries.options.legend.withCalcs(['lastNotNull', 'mean'])
+  + timeSeries.standardOptions.withUnit('ops')
+  + timeSeries.fieldConfig.defaults.custom.withFillOpacity(30)
+  + timeSeries.fieldConfig.defaults.custom.withShowPoints('never')
+  + timeSeries.fieldConfig.defaults.custom.stacking.withMode('normal')
+  + timeSeries.datasource.withType('prometheus')
+  + timeSeries.datasource.withUid(datasource)
+  + timeSeries.queryOptions.withTargets([
+    prometheus.new(
+      datasource,
+      'sum by (result) (rate(activity_processor_dlq_retry_attempts_total{' + SEL_retry_attempts + '}[5m]))'
+    )
+    + prometheus.withLegendFormat('{{result}}'),
+  ])
+  + timeSeries.panelOptions.withDescription('Retry attempt outcomes over time — succeeded vs republished vs failed')
+  + timeSeries.gridPos.withW(timeSeriesHalfWidth)
+  + timeSeries.gridPos.withH(timeSeriesHeight),
+
+  // Still-failing table: instant topk(25) of the 10m rate of
+  // activity_processor_dlq_retry_failed_total per (policy_name, error_type).
+  tablePanel.new('Still-failing re-eval by policy')
+  + tablePanel.datasource.withType('prometheus')
+  + tablePanel.datasource.withUid(datasource)
+  + tablePanel.options.withShowHeader(true)
+  + tablePanel.options.withSortBy([
+    { displayName: 'Value', desc: true },
+  ])
+  + tablePanel.queryOptions.withTargets([
+    prometheus.new(
+      datasource,
+      'topk(25, sum by (policy_name, error_type) (rate(activity_processor_dlq_retry_failed_total{' + SEL_retry_failed + '}[10m])))'
+    )
+    + prometheus.withInstant(true)
+    // Table format returns policy_name and error_type as columns next to
+    // Value, so all top-N rows land in one sortable table.
+    + prometheus.withFormat('table')
+    +
prometheus.withLegendFormat('{{policy_name}}'),
+  ])
+  + tablePanel.standardOptions.withUnit('ops')
+  + tablePanel.panelOptions.withDescription('Policies NOT recovering after retry — triage for DLQRetryIneffective')
+  + tablePanel.gridPos.withW(timeSeriesHalfWidth)
+  + tablePanel.gridPos.withH(timeSeriesHeight),
+  // 1h increase of the high-retry counter per (policy_name, api_group, kind). NOTE(review): the legend shows only policy_name, so series differing only by api_group/kind share a name — confirm intended.
+  timeSeries.new('High-retry (poison) events by policy')
+  + timeSeries.options.legend.withDisplayMode('table')
+  + timeSeries.options.legend.withPlacement('bottom')
+  + timeSeries.options.legend.withShowLegend(true)
+  + timeSeries.options.legend.withCalcs(['lastNotNull', 'mean'])
+  + timeSeries.standardOptions.withUnit('short')
+  + timeSeries.fieldConfig.defaults.custom.withFillOpacity(10)
+  + timeSeries.fieldConfig.defaults.custom.withShowPoints('never')
+  + timeSeries.datasource.withType('prometheus')
+  + timeSeries.datasource.withUid(datasource)
+  + timeSeries.queryOptions.withTargets([
+    prometheus.new(
+      datasource,
+      'sum by (policy_name, api_group, kind) (increase(activity_processor_dlq_retry_events_high_retry_total{' + SEL_high_retry + '}[1h]))'
+    )
+    + prometheus.withLegendFormat('{{policy_name}}'),
+  ])
+  + timeSeries.panelOptions.withDescription('Events exceeding retry threshold by policy — identifies poison events (DLQHighRetryCount)')
+  + timeSeries.gridPos.withW(timeSeriesHalfWidth)
+  + timeSeries.gridPos.withH(timeSeriesHeight),
+  // 0.99 quantile of the retry batch duration histogram, one series per `trigger` label.
+  timeSeries.new('Retry batch duration p99')
+  + timeSeries.options.legend.withDisplayMode('table')
+  + timeSeries.options.legend.withPlacement('bottom')
+  + timeSeries.options.legend.withShowLegend(true)
+  + timeSeries.options.legend.withCalcs(['lastNotNull', 'mean', 'max'])
+  + timeSeries.standardOptions.withUnit('s')
+  + timeSeries.fieldConfig.defaults.custom.withFillOpacity(10)
+  + timeSeries.fieldConfig.defaults.custom.withShowPoints('never')
+  + timeSeries.datasource.withType('prometheus')
+  + timeSeries.datasource.withUid(datasource)
+  + timeSeries.queryOptions.withTargets([
+    prometheus.new(
+      datasource,
+      'histogram_quantile(0.99, sum by (le, trigger) (rate(activity_processor_dlq_retry_batch_duration_seconds_bucket{' + SEL_global + '}[5m])))'
+    )
+    + prometheus.withLegendFormat('{{trigger}} p99'),
+  ])
+  + timeSeries.panelOptions.withDescription('Retry batch processing duration — high values indicate retry path stalling')
+  + timeSeries.gridPos.withW(timeSeriesHalfWidth)
+  + timeSeries.gridPos.withH(timeSeriesHeight),
+  // Row header for the panels about the DLQ publish (write) path.
+  row.new('Publish-Path Health')
+  + row.withCollapsed(false),
+  // Stacked 5m rate of DLQ publish errors, one series per error_phase.
+  timeSeries.new('Publish errors by phase')
+  + timeSeries.options.legend.withDisplayMode('table')
+  + timeSeries.options.legend.withPlacement('bottom')
+  + timeSeries.options.legend.withShowLegend(true)
+  + timeSeries.options.legend.withCalcs(['lastNotNull', 'mean'])
+  + timeSeries.standardOptions.withUnit('ops')
+  + timeSeries.fieldConfig.defaults.custom.withFillOpacity(30)
+  + timeSeries.fieldConfig.defaults.custom.withShowPoints('never')
+  + timeSeries.fieldConfig.defaults.custom.stacking.withMode('normal')
+  + timeSeries.datasource.withType('prometheus')
+  + timeSeries.datasource.withUid(datasource)
+  + timeSeries.queryOptions.withTargets([
+    prometheus.new(
+      datasource,
+      'sum by (error_phase) (rate(activity_processor_dlq_publish_errors_total{' + SEL_global + '}[5m]))'
+    )
+    + prometheus.withLegendFormat('{{error_phase}}'),
+  ])
+  + timeSeries.panelOptions.withDescription('DLQ publish errors by phase (marshal/publish) — non-zero is data loss risk')
+  + timeSeries.gridPos.withW(timeSeriesHalfWidth)
+  + timeSeries.gridPos.withH(timeSeriesHeight),
+  // Three targets over the same publish-latency histogram: quantiles 0.99, 0.95 and 0.50.
+  timeSeries.new('DLQ publish latency')
+  + timeSeries.options.legend.withDisplayMode('table')
+  + timeSeries.options.legend.withPlacement('bottom')
+  + timeSeries.options.legend.withShowLegend(true)
+  + timeSeries.options.legend.withCalcs(['lastNotNull', 'mean', 'max'])
+  + timeSeries.standardOptions.withUnit('s')
+  + timeSeries.fieldConfig.defaults.custom.withFillOpacity(10)
+  + timeSeries.fieldConfig.defaults.custom.withShowPoints('never')
+  + timeSeries.datasource.withType('prometheus')
+  + timeSeries.datasource.withUid(datasource)
+  + timeSeries.queryOptions.withTargets([
+    prometheus.new(
+      datasource,
+      'histogram_quantile(0.99, sum by (le) (rate(activity_processor_dlq_publish_latency_seconds_bucket{' + SEL_global + '}[5m])))'
+    )
+    + prometheus.withLegendFormat('p99'),
+    prometheus.new(
+      datasource,
+      'histogram_quantile(0.95, sum by (le) (rate(activity_processor_dlq_publish_latency_seconds_bucket{' + SEL_global + '}[5m])))'
+    )
+    + prometheus.withLegendFormat('p95'),
+    prometheus.new(
+      datasource,
+      'histogram_quantile(0.50, sum by (le) (rate(activity_processor_dlq_publish_latency_seconds_bucket{' + SEL_global + '}[5m])))'
+    )
+    + prometheus.withLegendFormat('p50'),
+  ])
+  + timeSeries.panelOptions.withDescription('DLQ publish write path latency distribution')
+  + timeSeries.gridPos.withW(timeSeriesHalfWidth)
+  + timeSeries.gridPos.withH(timeSeriesHeight),
+  // Row header for the Loki log panels.
+  row.new('Processor Logs')
+  + row.withCollapsed(false),
+  // Loki logs panel: parses lines as JSON, keeps those with a non-empty errorType, then filters by the policy_name and error_type dashboard variables.
+  logsPanel.new('DLQ Events — Processor Logs')
+  + logsPanel.datasource.withType('loki')
+  + logsPanel.datasource.withUid('$loki_datasource')
+  + logsPanel.options.withShowTime(true)
+  + logsPanel.options.withSortOrder('Descending')
+  + logsPanel.options.withWrapLogMessage(false)
+  + logsPanel.options.withEnableLogDetails(true)
+  + logsPanel.options.withDisplayedFields(['policy', 'errorType', 'msg'])
+  + logsPanel.queryOptions.withTargets([
+    loki.new(
+      '$loki_datasource',
+      '{namespace="activity-system", container="processor"} | json | errorType != "" | policy=~"${policy_name:regex}" | errorType=~"${error_type:regex}"'
+    )
+    + loki.withRefId('A'),
+  ])
+  + logsPanel.panelOptions.withDescription('DLQ processor logs filtered by selected policy and error type — shows Published event to DLQ lines with policy and errorType fields')
+  + logsPanel.gridPos.withW(tableFullWidth)
+  + logsPanel.gridPos.withH(tableHeight),
+
+
logsPanel.new('DLQ/Policy Errors (raw)')
+  // Catch-all Loki panel: case-insensitive line filter, no JSON parsing and
+  // no dashboard-variable filters.
+  + logsPanel.datasource.withType('loki')
+  + logsPanel.datasource.withUid('$loki_datasource')
+  + logsPanel.options.withShowTime(true)
+  + logsPanel.options.withSortOrder('Descending')
+  + logsPanel.options.withWrapLogMessage(false)
+  + logsPanel.options.withEnableLogDetails(true)
+  + logsPanel.queryOptions.withTargets([
+    loki.new(
+      '$loki_datasource',
+      '{namespace="activity-system", container="processor"} |~ "(?i)dlq|dead.letter|failed to evaluate|failed to republish|Published event to DLQ"'
+    )
+    + loki.withRefId('A'),
+  ])
+  + logsPanel.panelOptions.withDescription('Catch-all DLQ log filter — includes non-JSON error lines and all dlq/dead-letter references')
+  + logsPanel.gridPos.withW(tableFullWidth)
+  + logsPanel.gridPos.withH(tableHeight),
+], panelWidth=statWidth, panelHeight=statHeight);
+
+// Short aliases for the two Grafonnet variable builders used below.
+local queryVar = g.dashboard.variable.query;
+local datasourceVar = g.dashboard.variable.datasource;
+
+// Builds one multi-select query variable with an "All" option, bound to the
+// Prometheus datasource variable and using the onTime refresh setting.
+//   name:  variable name referenced in queries as $name
+//   label: label shown in the dashboard variable bar
+//   query: label_values(...) expression that lists the options
+local multiSelectVar(name, label, query) =
+  queryVar.new(name, query)
+  + queryVar.withDatasource('prometheus', datasource)
+  + queryVar.generalOptions.withLabel(label)
+  + queryVar.selectionOptions.withMulti()
+  + queryVar.selectionOptions.withIncludeAll()
+  + queryVar.refresh.onTime();
+
+// Link shown in the dashboard header.
+local runbookLink = {
+  title: 'DLQ Runbooks',
+  type: 'link',
+  url: 'https://github.com/milo-os/activity/tree/main/docs/runbooks/dlq/',
+  icon: 'external link',
+  targetBlank: true,
+};
+
+// Template variables in display order. api_group, kind and policy_name each
+// narrow their options by the variables declared before them; error_type is
+// narrowed by cluster only.
+local templateVariables = [
+  datasourceVar.new('datasource', 'prometheus')
+  + datasourceVar.generalOptions.withLabel('Prometheus Datasource')
+  + datasourceVar.withRegex(datasourceRegex),
+
+  multiSelectVar(
+    'cluster',
+    'Cluster',
+    'label_values(activity_processor_dlq_events_published_total, cluster)'
+  ),
+  multiSelectVar(
+    'api_group',
+    'API Group',
+    'label_values(activity_processor_dlq_events_published_total{cluster=~"$cluster"}, api_group)'
+  ),
+  multiSelectVar(
+    'kind',
+    'Kind',
+    'label_values(activity_processor_dlq_events_published_total{cluster=~"$cluster",api_group=~"$api_group"}, kind)'
+  ),
+  multiSelectVar(
+    'policy_name',
+    'Policy',
+    'label_values(activity_processor_dlq_events_published_total{cluster=~"$cluster",api_group=~"$api_group",kind=~"$kind"}, policy_name)'
+  ),
+  multiSelectVar(
+    'error_type',
+    'Error Type',
+    'label_values(activity_processor_dlq_events_published_total{cluster=~"$cluster"}, error_type)'
+  ),
+
+  // Loki datasource variable; not shown in the variable bar.
+  datasourceVar.new('loki_datasource', 'loki')
+  + datasourceVar.generalOptions.withLabel('Loki Datasource')
+  + datasourceVar.generalOptions.showOnDashboard.withNothing(),
+];
+
+// Final dashboard object: metadata, default time range, header link,
+// template variables, then the panels assembled above.
+dashboard.new('Activity — DLQ & Policy Health')
++ dashboard.withDescription('Single-pane triage dashboard for DLQ backlog, failing policies, retry recovery, and processor logs')
++ dashboard.withTags(['activity', 'dlq', 'policy', 'health', 'on-call'])
++ dashboard.withUid('activity-dlq-policy-health')
++ dashboard.time.withFrom('now-6h')
++ dashboard.time.withTo('now')
++ dashboard.withTimezone(config.dashboards.timezone)
++ dashboard.withRefresh(refresh)
++ dashboard.withEditable(true)
++ dashboard.graphTooltip.withSharedCrosshair()
++ dashboard.withLinks([runbookLink])
++ dashboard.withVariablesMixin(templateVariables)
++ dashboard.withPanels(allPanels)