diff --git a/config/components/observability/dashboards/activity-dlq-policy-health-grafanadashboard.yaml b/config/components/observability/dashboards/activity-dlq-policy-health-grafanadashboard.yaml new file mode 100644 index 00000000..0d27a05d --- /dev/null +++ b/config/components/observability/dashboards/activity-dlq-policy-health-grafanadashboard.yaml @@ -0,0 +1,16 @@ +apiVersion: grafana.integreatly.org/v1beta1 +kind: GrafanaDashboard +metadata: + name: activity-dlq-policy-health-dashboard + labels: + dashboards: grafana +spec: + folder: "Platform / Activity" + allowCrossNamespaceImport: true + instanceSelector: + matchLabels: + dashboards: grafana + resyncPeriod: 30s + configMapRef: + name: activity-dlq-policy-health-dashboard + key: activity-dlq-policy-health.json diff --git a/config/components/observability/dashboards/generated/activity-dlq-policy-health.json b/config/components/observability/dashboards/generated/activity-dlq-policy-health.json new file mode 100644 index 00000000..f63dd61e --- /dev/null +++ b/config/components/observability/dashboards/generated/activity-dlq-policy-health.json @@ -0,0 +1,1150 @@ +{ + "description": "Single-pane triage dashboard for DLQ backlog, failing policies, retry recovery, and processor logs", + "editable": true, + "graphTooltip": 1, + "links": [ + { + "icon": "external link", + "targetBlank": true, + "title": "DLQ Runbooks", + "type": "link", + "url": "https://github.com/milo-os/activity/tree/main/docs/runbooks/dlq/" + } + ], + "panels": [ + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 0, + "x": 0, + "y": 0 + }, + "id": 1, + "panels": [ ], + "title": "At-a-Glance", + "type": "row" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "description": "Current number of events stuck in the DLQ", + "fieldConfig": { + "defaults": { + "thresholds": { + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "yellow", + "value": 100 + }, + { + "color": "red", + "value": 1000 
+ } + ] + }, + "unit": "short" + } + }, + "gridPos": { + "h": 5, + "w": 6, + "x": 0, + "y": 1 + }, + "id": 2, + "options": { + "colorMode": "background", + "graphMode": "none", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ] + } + }, + "pluginVersion": "v11.4.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "max(nats_stream_total_messages{stream_name=\"ACTIVITY_DEAD_LETTER\", cluster=~\"$cluster\"})", + "legendFormat": "Messages" + } + ], + "title": "DLQ Backlog", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "description": "Sequence gap in DLQ stream — proxy for backlog age", + "fieldConfig": { + "defaults": { + "thresholds": { + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "yellow", + "value": 1000 + }, + { + "color": "red", + "value": 10000 + } + ] + }, + "unit": "short" + } + }, + "gridPos": { + "h": 5, + "w": 6, + "x": 6, + "y": 1 + }, + "id": 3, + "options": { + "colorMode": "value", + "graphMode": "none", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ] + } + }, + "pluginVersion": "v11.4.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "max(nats_stream_last_seq{stream_name=\"ACTIVITY_DEAD_LETTER\", cluster=~\"$cluster\"} - on() nats_stream_first_seq{stream_name=\"ACTIVITY_DEAD_LETTER\", cluster=~\"$cluster\"})", + "legendFormat": "Seq gap" + } + ], + "title": "Backlog Age (oldest)", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "description": "Rate of new events being published to the DLQ", + "fieldConfig": { + "defaults": { + "thresholds": { + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "yellow", + "value": 0.10000000000000001 + }, + { + "color": "red", + "value": 1 + } + ] + }, + "unit": "ops" + } + }, + "gridPos": { + "h": 5, + "w": 6, + "x": 12, + "y": 1 + }, + "id": 4, + "options": 
{ + "colorMode": "background", + "graphMode": "area", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ] + } + }, + "pluginVersion": "v11.4.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "sum(rate(activity_processor_dlq_events_published_total{cluster=~\"$cluster\", api_group=~\"$api_group\", kind=~\"$kind\", policy_name=~\"$policy_name\", error_type=~\"$error_type\"}[5m])) or vector(0)", + "legendFormat": "Events/s" + } + ], + "title": "DLQ Publish Rate", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "description": "Rate at which retries are clearing DLQ events", + "fieldConfig": { + "defaults": { + "unit": "ops" + } + }, + "gridPos": { + "h": 5, + "w": 6, + "x": 18, + "y": 1 + }, + "id": 5, + "options": { + "colorMode": "value", + "graphMode": "area", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ] + } + }, + "pluginVersion": "v11.4.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "sum(rate(activity_processor_dlq_retry_attempts_total{result=~\"succeeded|republished\", cluster=~\"$cluster\", api_group=~\"$api_group\", kind=~\"$kind\"}[5m])) or vector(0)", + "legendFormat": "Resolved/s" + } + ], + "title": "Retry Resolve Rate", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "description": "Publish rate minus resolve rate — positive means backlog is growing", + "fieldConfig": { + "defaults": { + "thresholds": { + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "yellow", + "value": 0 + }, + { + "color": "red", + "value": 0.01 + } + ] + }, + "unit": "ops" + } + }, + "gridPos": { + "h": 5, + "w": 6, + "x": 0, + "y": 6 + }, + "id": 6, + "options": { + "colorMode": "background", + "graphMode": "none", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ] + } + }, + "pluginVersion": "v11.4.0", + "targets": [ + { + "datasource": { + 
"type": "prometheus", + "uid": "$datasource" + }, + "expr": "(sum(rate(activity_processor_dlq_events_published_total{cluster=~\"$cluster\", api_group=~\"$api_group\", kind=~\"$kind\", policy_name=~\"$policy_name\", error_type=~\"$error_type\"}[5m])) or vector(0)) - (sum(rate(activity_processor_dlq_retry_attempts_total{result=~\"succeeded|republished\", cluster=~\"$cluster\", api_group=~\"$api_group\", kind=~\"$kind\"}[5m])) or vector(0))", + "legendFormat": "Net drain" + } + ], + "title": "Net Drain", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "description": "Fraction of DLQ retry attempts that succeeded — no data while there are no retry attempts", + "fieldConfig": { + "defaults": { + "thresholds": { + "steps": [ + { + "color": "red", + "value": null + }, + { + "color": "yellow", + "value": 0.80000000000000004 + }, + { + "color": "green", + "value": 0.94999999999999996 + } + ] + }, + "unit": "percentunit" + } + }, + "gridPos": { + "h": 5, + "w": 6, + "x": 6, + "y": 6 + }, + "id": 7, + "options": { + "colorMode": "background", + "graphMode": "area", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ] + } + }, + "pluginVersion": "v11.4.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "(sum(rate(activity_processor_dlq_retry_attempts_total{result=\"succeeded\", cluster=~\"$cluster\"}[5m])) or vector(0)) / (sum(rate(activity_processor_dlq_retry_attempts_total{cluster=~\"$cluster\"}[5m])) > 0)", + "legendFormat": "Success rate" + } + ], + "title": "Retry Success Rate", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "description": "Rate of errors when publishing to DLQ — non-zero means events are being lost", + "fieldConfig": { + "defaults": { + "thresholds": { + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 0.01 + } + ] + }, + "unit": "ops" + } + }, + "gridPos": { + "h": 5, + "w": 6, + "x": 12, + "y": 6 
+ }, + "id": 8, + "options": { + "colorMode": "background", + "graphMode": "area", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ] + } + }, + "pluginVersion": "v11.4.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "sum(rate(activity_processor_dlq_publish_errors_total{cluster=~\"$cluster\"}[5m])) or vector(0)", + "legendFormat": "Errors/s" + } + ], + "title": "DLQ Publish Errors", + "type": "stat" + }, + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 0, + "x": 18, + "y": 11 + }, + "id": 9, + "panels": [ ], + "title": "What is broken NOW", + "type": "row" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "description": "Top 25 policies currently publishing to DLQ — the primary triage view for ActivityPolicyDLQErrors", + "fieldConfig": { + "defaults": { + "links": [ + { + "targetBlank": true, + "title": "View in Loki", + "url": "/explore?orgId=1&left={\"datasource\":\"loki\",\"queries\":[{\"expr\":\"{namespace=\\\"activity-system\\\", container=\\\"processor\\\"} | json | policy=\\\"${__data.fields.policy_name}\\\" | errorType=~\\\".+\\\"\",\"refId\":\"A\"}],\"range\":{\"from\":\"${__from}\",\"to\":\"${__to}\"}}" + } + ], + "unit": "ops" + } + }, + "gridPos": { + "h": 10, + "w": 24, + "x": 0, + "y": 12 + }, + "id": 10, + "options": { + "showHeader": true, + "sortBy": [ + { + "desc": true, + "displayName": "Value" + } + ] + }, + "pluginVersion": "v11.4.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "topk(25, sum by (policy_name, api_group, kind, error_type) (rate(activity_processor_dlq_events_published_total{policy_name!=\"\", cluster=~\"$cluster\", api_group=~\"$api_group\", kind=~\"$kind\", policy_name=~\"$policy_name\", error_type=~\"$error_type\"}[10m])))", + "instant": true, + "legendFormat": "{{policy_name}}" + } + ], + "title": "Top Failing Policies", + "type": "table" + }, + { + "collapsed": false, + 
"gridPos": { + "h": 1, + "w": 0, + "x": 24, + "y": 22 + }, + "id": 11, + "panels": [ ], + "title": "Trends", + "type": "row" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "description": "DLQ rate by failure class — identifies dominant error mode", + "fieldConfig": { + "defaults": { + "custom": { + "fillOpacity": 30, + "showPoints": "never", + "stacking": { + "mode": "normal" + } + }, + "unit": "ops" + } + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 23 + }, + "id": 12, + "options": { + "legend": { + "calcs": [ + "lastNotNull", + "mean" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + } + }, + "pluginVersion": "v11.4.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "sum by (error_type) (rate(activity_processor_dlq_events_published_total{cluster=~\"$cluster\", api_group=~\"$api_group\", kind=~\"$kind\", policy_name=~\"$policy_name\", error_type=~\"$error_type\"}[5m]))", + "legendFormat": "{{error_type}}" + } + ], + "title": "DLQ Rate by error_type", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "description": "DLQ rate by policy — identifies persistent per-policy failures (DLQSlowLeak)", + "fieldConfig": { + "defaults": { + "custom": { + "fillOpacity": 30, + "showPoints": "never", + "stacking": { + "mode": "normal" + } + }, + "links": [ + { + "targetBlank": true, + "title": "View in Loki", + "url": "/explore?orgId=1&left={\"datasource\":\"loki\",\"queries\":[{\"expr\":\"{namespace=\\\"activity-system\\\", container=\\\"processor\\\"} | json | policy=\\\"${__field.labels.policy_name}\\\"\",\"refId\":\"A\"}],\"range\":{\"from\":\"${__from}\",\"to\":\"${__to}\"}}" + } + ], + "unit": "ops" + } + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 23 + }, + "id": 13, + "options": { + "legend": { + "calcs": [ + "lastNotNull", + "mean" + ], + "displayMode": "table", + "placement": 
"bottom", + "showLegend": true + } + }, + "pluginVersion": "v11.4.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "label_replace(sum by (policy_name) (rate(activity_processor_dlq_events_published_total{cluster=~\"$cluster\", api_group=~\"$api_group\", kind=~\"$kind\", policy_name=~\"$policy_name\", error_type=~\"$error_type\"}[5m])), \"policy_name\", \"(no policy)\", \"policy_name\", \"^$\")", + "legendFormat": "{{policy_name}}" + } + ], + "title": "DLQ Rate by policy", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "description": "DLQ rate by resource kind — identifies affected resource types", + "fieldConfig": { + "defaults": { + "custom": { + "fillOpacity": 30, + "showPoints": "never", + "stacking": { + "mode": "normal" + } + }, + "unit": "ops" + } + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 31 + }, + "id": 14, + "options": { + "legend": { + "calcs": [ + "lastNotNull", + "mean" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + } + }, + "pluginVersion": "v11.4.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "sum by (api_group, kind) (rate(activity_processor_dlq_events_published_total{cluster=~\"$cluster\", api_group=~\"$api_group\", kind=~\"$kind\", policy_name=~\"$policy_name\", error_type=~\"$error_type\"}[5m]))", + "legendFormat": "{{api_group}}/{{kind}}" + } + ], + "title": "DLQ Rate by kind", + "type": "timeseries" + }, + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 0, + "x": 12, + "y": 39 + }, + "id": 15, + "panels": [ ], + "title": "Retry & Recovery", + "type": "row" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "description": "Retry attempt outcomes over time — succeeded vs republished vs failed", + "fieldConfig": { + "defaults": { + "custom": { + "fillOpacity": 30, + "showPoints": "never", + "stacking": 
{ + "mode": "normal" + } + }, + "unit": "ops" + } + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 40 + }, + "id": 16, + "options": { + "legend": { + "calcs": [ + "lastNotNull", + "mean" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + } + }, + "pluginVersion": "v11.4.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "sum by (result) (rate(activity_processor_dlq_retry_attempts_total{cluster=~\"$cluster\", api_group=~\"$api_group\", kind=~\"$kind\"}[5m]))", + "legendFormat": "{{result}}" + } + ], + "title": "Retry outcomes", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "description": "Policies NOT recovering after retry — triage for DLQRetryIneffective", + "fieldConfig": { + "defaults": { + "unit": "ops" + } + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 40 + }, + "id": 17, + "options": { + "showHeader": true, + "sortBy": [ + { + "desc": true, + "displayName": "Value" + } + ] + }, + "pluginVersion": "v11.4.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "topk(25, sum by (policy_name, error_type) (rate(activity_processor_dlq_retry_failed_total{cluster=~\"$cluster\", api_group=~\"$api_group\", kind=~\"$kind\", policy_name=~\"$policy_name\", error_type=~\"$error_type\"}[10m])))", + "instant": true, + "legendFormat": "{{policy_name}}" + } + ], + "title": "Still-failing re-eval by policy", + "type": "table" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "description": "Events exceeding retry threshold by policy — identifies poison events (DLQHighRetryCount)", + "fieldConfig": { + "defaults": { + "custom": { + "fillOpacity": 10, + "showPoints": "never" + }, + "unit": "short" + } + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 48 + }, + "id": 18, + "options": { + "legend": { + "calcs": [ + "lastNotNull", + "mean" + 
], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + } + }, + "pluginVersion": "v11.4.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "sum by (policy_name, api_group, kind) (increase(activity_processor_dlq_retry_events_high_retry_total{cluster=~\"$cluster\", api_group=~\"$api_group\", kind=~\"$kind\", policy_name=~\"$policy_name\"}[1h]))", + "legendFormat": "{{policy_name}}" + } + ], + "title": "High-retry (poison) events by policy", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "description": "Retry batch processing duration — high values indicate retry path stalling", + "fieldConfig": { + "defaults": { + "custom": { + "fillOpacity": 10, + "showPoints": "never" + }, + "unit": "s" + } + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 48 + }, + "id": 19, + "options": { + "legend": { + "calcs": [ + "lastNotNull", + "mean", + "max" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + } + }, + "pluginVersion": "v11.4.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "histogram_quantile(0.99, sum by (le, trigger) (rate(activity_processor_dlq_retry_batch_duration_seconds_bucket{cluster=~\"$cluster\"}[5m])))", + "legendFormat": "{{trigger}} p99" + } + ], + "title": "Retry batch duration p99", + "type": "timeseries" + }, + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 0, + "x": 24, + "y": 56 + }, + "id": 20, + "panels": [ ], + "title": "Publish-Path Health", + "type": "row" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "description": "DLQ publish errors by phase (marshal/publish) — non-zero is data loss risk", + "fieldConfig": { + "defaults": { + "custom": { + "fillOpacity": 30, + "showPoints": "never", + "stacking": { + "mode": "normal" + } + }, + "unit": "ops" + } + }, + "gridPos": { + "h": 8, + "w": 12, + 
"x": 0, + "y": 57 + }, + "id": 21, + "options": { + "legend": { + "calcs": [ + "lastNotNull", + "mean" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + } + }, + "pluginVersion": "v11.4.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "sum by (error_phase) (rate(activity_processor_dlq_publish_errors_total{cluster=~\"$cluster\"}[5m]))", + "legendFormat": "{{error_phase}}" + } + ], + "title": "Publish errors by phase", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "description": "DLQ publish write path latency distribution", + "fieldConfig": { + "defaults": { + "custom": { + "fillOpacity": 10, + "showPoints": "never" + }, + "unit": "s" + } + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 57 + }, + "id": 22, + "options": { + "legend": { + "calcs": [ + "lastNotNull", + "mean", + "max" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + } + }, + "pluginVersion": "v11.4.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "histogram_quantile(0.99, sum by (le) (rate(activity_processor_dlq_publish_latency_seconds_bucket{cluster=~\"$cluster\"}[5m])))", + "legendFormat": "p99" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "histogram_quantile(0.95, sum by (le) (rate(activity_processor_dlq_publish_latency_seconds_bucket{cluster=~\"$cluster\"}[5m])))", + "legendFormat": "p95" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "histogram_quantile(0.50, sum by (le) (rate(activity_processor_dlq_publish_latency_seconds_bucket{cluster=~\"$cluster\"}[5m])))", + "legendFormat": "p50" + } + ], + "title": "DLQ publish latency", + "type": "timeseries" + }, + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 0, + "x": 24, + "y": 65 + }, + "id": 23, + "panels": [ ], + "title": 
"Processor Logs", + "type": "row" + }, + { + "datasource": { + "type": "loki", + "uid": "$loki_datasource" + }, + "description": "DLQ processor logs filtered by selected policy and error type — shows Published event to DLQ lines with policy and errorType fields", + "gridPos": { + "h": 10, + "w": 24, + "x": 0, + "y": 66 + }, + "id": 24, + "options": { + "displayedFields": [ + "policy", + "errorType", + "msg" + ], + "enableLogDetails": true, + "showTime": true, + "sortOrder": "Descending", + "wrapLogMessage": false + }, + "pluginVersion": "v11.4.0", + "targets": [ + { + "datasource": { + "type": "loki", + "uid": "$loki_datasource" + }, + "expr": "{namespace=\"activity-system\", container=\"processor\"} | json | errorType != \"\" | policy=~\"${policy_name:regex}\" | errorType=~\"${error_type:regex}\"", + "refId": "A" + } + ], + "title": "DLQ Events — Processor Logs", + "type": "logs" + }, + { + "datasource": { + "type": "loki", + "uid": "$loki_datasource" + }, + "description": "Catch-all DLQ log filter — includes non-JSON error lines and all dlq/dead-letter references", + "gridPos": { + "h": 10, + "w": 24, + "x": 0, + "y": 76 + }, + "id": 25, + "options": { + "enableLogDetails": true, + "showTime": true, + "sortOrder": "Descending", + "wrapLogMessage": false + }, + "pluginVersion": "v11.4.0", + "targets": [ + { + "datasource": { + "type": "loki", + "uid": "$loki_datasource" + }, + "expr": "{namespace=\"activity-system\", container=\"processor\"} |~ \"(?i)dlq|dead.letter|failed to evaluate|failed to republish|Published event to DLQ\"", + "refId": "A" + } + ], + "title": "DLQ/Policy Errors (raw)", + "type": "logs" + } + ], + "refresh": "30s", + "schemaVersion": 39, + "tags": [ + "activity", + "dlq", + "policy", + "health", + "on-call" + ], + "templating": { + "list": [ + { + "label": "Prometheus Datasource", + "name": "datasource", + "query": "prometheus", + "regex": "", + "type": "datasource" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + 
}, + "includeAll": true, + "label": "Cluster", + "multi": true, + "name": "cluster", + "query": "label_values(activity_processor_dlq_events_published_total, cluster)", + "refresh": 2, + "type": "query" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "includeAll": true, + "label": "API Group", + "multi": true, + "name": "api_group", + "query": "label_values(activity_processor_dlq_events_published_total{cluster=~\"$cluster\"}, api_group)", + "refresh": 2, + "type": "query" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "includeAll": true, + "label": "Kind", + "multi": true, + "name": "kind", + "query": "label_values(activity_processor_dlq_events_published_total{cluster=~\"$cluster\",api_group=~\"$api_group\"}, kind)", + "refresh": 2, + "type": "query" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "includeAll": true, + "label": "Policy", + "multi": true, + "name": "policy_name", + "query": "label_values(activity_processor_dlq_events_published_total{cluster=~\"$cluster\",api_group=~\"$api_group\",kind=~\"$kind\"}, policy_name)", + "refresh": 2, + "type": "query" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "includeAll": true, + "label": "Error Type", + "multi": true, + "name": "error_type", + "query": "label_values(activity_processor_dlq_events_published_total{cluster=~\"$cluster\"}, error_type)", + "refresh": 2, + "type": "query" + }, + { + "hide": 2, + "label": "Loki Datasource", + "name": "loki_datasource", + "query": "loki", + "type": "datasource" + } + ] + }, + "time": { + "from": "now-6h", + "to": "now" + }, + "timezone": "utc", + "title": "Activity — DLQ & Policy Health", + "uid": "activity-dlq-policy-health" +} diff --git a/config/components/observability/dashboards/kustomization.yaml b/config/components/observability/dashboards/kustomization.yaml index 228429c2..6bafb08d 100644 --- 
a/config/components/observability/dashboards/kustomization.yaml +++ b/config/components/observability/dashboards/kustomization.yaml @@ -52,6 +52,14 @@ configMapGenerator: grafana_dashboard: "1" disableNameSuffixHash: true + - name: activity-dlq-policy-health-dashboard + files: + - generated/activity-dlq-policy-health.json + options: + labels: + grafana_dashboard: "1" + disableNameSuffixHash: true + resources: - audit-pipeline-grafanadashboard.yaml - activity-apiserver-grafanadashboard.yaml @@ -59,3 +67,4 @@ resources: - events-pipeline-grafanadashboard.yaml - activity-system-overview-grafanadashboard.yaml - activity-slo-grafanadashboard.yaml + - activity-dlq-policy-health-grafanadashboard.yaml diff --git a/observability/dashboards/activity-dlq-policy-health.jsonnet b/observability/dashboards/activity-dlq-policy-health.jsonnet new file mode 100644 index 00000000..f6487d2b --- /dev/null +++ b/observability/dashboards/activity-dlq-policy-health.jsonnet @@ -0,0 +1,549 @@ +// Activity DLQ & Policy Health Grafana Dashboard +// Generated using Grafonnet v11.4.0 +// To build: jsonnet -J vendor dashboards/activity-dlq-policy-health.jsonnet > ../config/components/observability/dashboards/generated/activity-dlq-policy-health.json + +local g = import 'grafonnet-v11.4.0/main.libsonnet'; +local config = import '../config.libsonnet'; + +local dashboard = g.dashboard; +local panel = g.panel; +local stat = panel.stat; +local timeSeries = panel.timeSeries; +local tablePanel = panel.table; +local logsPanel = panel.logs; +local textPanel = panel.text; +local row = panel.row; +local prometheus = g.query.prometheus; +local loki = g.query.loki; +local util = g.util; + +local datasource = config.dashboards.datasource.name; +local datasourceRegex = config.dashboards.datasource.regex; +local refresh = config.dashboards.refresh; + +local statHeight = 5; +local statWidth = 6; +local timeSeriesHeight = 8; +local timeSeriesHalfWidth = 12; +local tableHeight = 10; +local tableFullWidth = 24; + 
+local SEL = 'cluster=~"$cluster", api_group=~"$api_group", kind=~"$kind", policy_name=~"$policy_name", error_type=~"$error_type"'; +local SEL_retry_attempts = 'cluster=~"$cluster", api_group=~"$api_group", kind=~"$kind"'; +local SEL_retry_failed = 'cluster=~"$cluster", api_group=~"$api_group", kind=~"$kind", policy_name=~"$policy_name", error_type=~"$error_type"'; +local SEL_high_retry = 'cluster=~"$cluster", api_group=~"$api_group", kind=~"$kind", policy_name=~"$policy_name"'; +local SEL_global = 'cluster=~"$cluster"'; + +local allPanels = util.grid.wrapPanels([ + row.new('At-a-Glance') + + row.withCollapsed(false), + + stat.new('DLQ Backlog') + + stat.options.withColorMode('background') + + stat.options.withGraphMode('none') + + stat.options.reduceOptions.withCalcs(['lastNotNull']) + + stat.standardOptions.withUnit('short') + + stat.datasource.withType('prometheus') + + stat.datasource.withUid(datasource) + + stat.queryOptions.withTargets([ + prometheus.new( + datasource, + 'max(nats_stream_total_messages{stream_name="ACTIVITY_DEAD_LETTER", ' + SEL_global + '})' + ) + + prometheus.withLegendFormat('Messages'), + ]) + + stat.standardOptions.thresholds.withSteps([ + { color: 'green', value: null }, + { color: 'yellow', value: 100 }, + { color: 'red', value: 1000 }, + ]) + + stat.panelOptions.withDescription('Current number of events stuck in the DLQ') + + stat.gridPos.withW(statWidth) + + stat.gridPos.withH(statHeight), + + stat.new('Backlog Age (oldest)') + + stat.options.withColorMode('value') + + stat.options.withGraphMode('none') + + stat.options.reduceOptions.withCalcs(['lastNotNull']) + + stat.standardOptions.withUnit('short') + + stat.datasource.withType('prometheus') + + stat.datasource.withUid(datasource) + + stat.queryOptions.withTargets([ + prometheus.new( + datasource, + 'max(nats_stream_last_seq{stream_name="ACTIVITY_DEAD_LETTER", ' + SEL_global + '} - on() nats_stream_first_seq{stream_name="ACTIVITY_DEAD_LETTER", ' + SEL_global + '})' + ) + + 
prometheus.withLegendFormat('Seq gap'), + ]) + + stat.standardOptions.thresholds.withSteps([ + { color: 'green', value: null }, + { color: 'yellow', value: 1000 }, + { color: 'red', value: 10000 }, + ]) + + stat.panelOptions.withDescription('Sequence gap in DLQ stream — proxy for backlog age') + + stat.gridPos.withW(statWidth) + + stat.gridPos.withH(statHeight), + + stat.new('DLQ Publish Rate') + + stat.options.withColorMode('background') + + stat.options.withGraphMode('area') + + stat.options.reduceOptions.withCalcs(['lastNotNull']) + + stat.standardOptions.withUnit('ops') + + stat.datasource.withType('prometheus') + + stat.datasource.withUid(datasource) + + stat.queryOptions.withTargets([ + prometheus.new( + datasource, + 'sum(rate(activity_processor_dlq_events_published_total{' + SEL + '}[5m])) or vector(0)' + ) + + prometheus.withLegendFormat('Events/s'), + ]) + + stat.standardOptions.thresholds.withSteps([ + { color: 'green', value: null }, + { color: 'yellow', value: 0.1 }, + { color: 'red', value: 1 }, + ]) + + stat.panelOptions.withDescription('Rate of new events being published to the DLQ') + + stat.gridPos.withW(statWidth) + + stat.gridPos.withH(statHeight), + + stat.new('Retry Resolve Rate') + + stat.options.withColorMode('value') + + stat.options.withGraphMode('area') + + stat.options.reduceOptions.withCalcs(['lastNotNull']) + + stat.standardOptions.withUnit('ops') + + stat.datasource.withType('prometheus') + + stat.datasource.withUid(datasource) + + stat.queryOptions.withTargets([ + prometheus.new( + datasource, + 'sum(rate(activity_processor_dlq_retry_attempts_total{result=~"succeeded|republished", ' + SEL_retry_attempts + '}[5m])) or vector(0)' + ) + + prometheus.withLegendFormat('Resolved/s'), + ]) + + stat.panelOptions.withDescription('Rate at which retries are clearing DLQ events') + + stat.gridPos.withW(statWidth) + + stat.gridPos.withH(statHeight), + + stat.new('Net Drain') + + stat.options.withColorMode('background') + + 
stat.options.withGraphMode('none') + + stat.options.reduceOptions.withCalcs(['lastNotNull']) + + stat.standardOptions.withUnit('ops') + + stat.datasource.withType('prometheus') + + stat.datasource.withUid(datasource) + + stat.queryOptions.withTargets([ + prometheus.new( + datasource, + '(sum(rate(activity_processor_dlq_events_published_total{' + SEL + '}[5m])) or vector(0)) - (sum(rate(activity_processor_dlq_retry_attempts_total{result=~"succeeded|republished", ' + SEL_retry_attempts + '}[5m])) or vector(0))' + ) + + prometheus.withLegendFormat('Net drain'), + ]) + + stat.standardOptions.thresholds.withSteps([ + { color: 'green', value: null }, + { color: 'yellow', value: 0 }, + { color: 'red', value: 0.01 }, + ]) + + stat.panelOptions.withDescription('Publish rate minus resolve rate — positive means backlog is growing') + + stat.gridPos.withW(statWidth) + + stat.gridPos.withH(statHeight), + + stat.new('Retry Success Rate') + + stat.options.withColorMode('background') + + stat.options.withGraphMode('area') + + stat.options.reduceOptions.withCalcs(['lastNotNull']) + + stat.standardOptions.withUnit('percentunit') + + stat.datasource.withType('prometheus') + + stat.datasource.withUid(datasource) + + stat.queryOptions.withTargets([ + prometheus.new( + datasource, + /* Divide by the real attempt rate: the "> 0" filter drops the sample while no retries run, so the panel shows no data instead of dividing by zero or flooring the denominator. */ '(sum(rate(activity_processor_dlq_retry_attempts_total{result="succeeded", ' + SEL_global + '}[5m])) or vector(0)) / (sum(rate(activity_processor_dlq_retry_attempts_total{' + SEL_global + '}[5m])) > 0)' + ) + + prometheus.withLegendFormat('Success rate'), + ]) + + stat.standardOptions.thresholds.withSteps([ + { color: 'red', value: null }, + { color: 'yellow', value: 0.8 }, + { color: 'green', value: 0.95 }, + ]) + + stat.panelOptions.withDescription('Fraction of DLQ retry attempts that succeeded — no data while there are no retry attempts') + + stat.gridPos.withW(statWidth) + + stat.gridPos.withH(statHeight), + + stat.new('DLQ Publish Errors') + + stat.options.withColorMode('background') + + 
stat.options.reduceOptions.withCalcs(['lastNotNull']) + + stat.standardOptions.withUnit('ops') + + stat.datasource.withType('prometheus') + + stat.datasource.withUid(datasource) + + stat.queryOptions.withTargets([ + prometheus.new( + datasource, + 'sum(rate(activity_processor_dlq_publish_errors_total{' + SEL_global + '}[5m])) or vector(0)' + ) + + prometheus.withLegendFormat('Errors/s'), + ]) + + stat.standardOptions.thresholds.withSteps([ + { color: 'green', value: null }, + { color: 'red', value: 0.01 }, + ]) + + stat.panelOptions.withDescription('Rate of errors when publishing to DLQ — non-zero means events are being lost') + + stat.gridPos.withW(statWidth) + + stat.gridPos.withH(statHeight), + + row.new('What is broken NOW') + + row.withCollapsed(false), + + tablePanel.new('Top Failing Policies') + + tablePanel.datasource.withType('prometheus') + + tablePanel.datasource.withUid(datasource) + + tablePanel.options.withShowHeader(true) + + tablePanel.options.withSortBy([ + { displayName: 'Value', desc: true }, + ]) + + tablePanel.queryOptions.withTargets([ + prometheus.new( + datasource, + 'topk(25, sum by (policy_name, api_group, kind, error_type) (rate(activity_processor_dlq_events_published_total{policy_name!="", ' + SEL + '}[10m])))' + ) + + prometheus.withInstant(true) + + prometheus.withLegendFormat('{{policy_name}}'), + ]) + + tablePanel.standardOptions.withUnit('ops') + + tablePanel.standardOptions.withLinks([ + { + title: 'View in Loki', + url: '/explore?orgId=1&left={"datasource":"loki","queries":[{"expr":"{namespace=\\"activity-system\\", container=\\"processor\\"} | json | policy=\\"${__data.fields.policy_name}\\" | errorType=~\\".+\\"","refId":"A"}],"range":{"from":"${__from}","to":"${__to}"}}', + targetBlank: true, + }, + ]) + + tablePanel.panelOptions.withDescription('Top 25 policies currently publishing to DLQ — the primary triage view for ActivityPolicyDLQErrors') + + tablePanel.gridPos.withW(tableFullWidth) + + 
tablePanel.gridPos.withH(tableHeight), + + row.new('Trends') + + row.withCollapsed(false), + + timeSeries.new('DLQ Rate by error_type') + + timeSeries.options.legend.withDisplayMode('table') + + timeSeries.options.legend.withPlacement('bottom') + + timeSeries.options.legend.withShowLegend(true) + + timeSeries.options.legend.withCalcs(['lastNotNull', 'mean']) + + timeSeries.standardOptions.withUnit('ops') + + timeSeries.fieldConfig.defaults.custom.withFillOpacity(30) + + timeSeries.fieldConfig.defaults.custom.withShowPoints('never') + + timeSeries.fieldConfig.defaults.custom.stacking.withMode('normal') + + timeSeries.datasource.withType('prometheus') + + timeSeries.datasource.withUid(datasource) + + timeSeries.queryOptions.withTargets([ + prometheus.new( + datasource, + 'sum by (error_type) (rate(activity_processor_dlq_events_published_total{' + SEL + '}[5m]))' + ) + + prometheus.withLegendFormat('{{error_type}}'), + ]) + + timeSeries.panelOptions.withDescription('DLQ rate by failure class — identifies dominant error mode') + + timeSeries.gridPos.withW(timeSeriesHalfWidth) + + timeSeries.gridPos.withH(timeSeriesHeight), + + timeSeries.new('DLQ Rate by policy') + + timeSeries.options.legend.withDisplayMode('table') + + timeSeries.options.legend.withPlacement('bottom') + + timeSeries.options.legend.withShowLegend(true) + + timeSeries.options.legend.withCalcs(['lastNotNull', 'mean']) + + timeSeries.standardOptions.withUnit('ops') + + timeSeries.fieldConfig.defaults.custom.withFillOpacity(30) + + timeSeries.fieldConfig.defaults.custom.withShowPoints('never') + + timeSeries.fieldConfig.defaults.custom.stacking.withMode('normal') + + timeSeries.datasource.withType('prometheus') + + timeSeries.datasource.withUid(datasource) + + timeSeries.queryOptions.withTargets([ + prometheus.new( + datasource, + 'label_replace(sum by (policy_name) (rate(activity_processor_dlq_events_published_total{' + SEL + '}[5m])), "policy_name", "(no policy)", "policy_name", "^$")' + ) + + 
prometheus.withLegendFormat('{{policy_name}}'), + ]) + + timeSeries.standardOptions.withLinks([ + { + title: 'View in Loki', + url: '/explore?orgId=1&left={"datasource":"loki","queries":[{"expr":"{namespace=\\"activity-system\\", container=\\"processor\\"} | json | policy=\\"${__field.labels.policy_name}\\"","refId":"A"}],"range":{"from":"${__from}","to":"${__to}"}}', + targetBlank: true, + }, + ]) + + timeSeries.panelOptions.withDescription('DLQ rate by policy — identifies persistent per-policy failures (DLQSlowLeak)') + + timeSeries.gridPos.withW(timeSeriesHalfWidth) + + timeSeries.gridPos.withH(timeSeriesHeight), + + timeSeries.new('DLQ Rate by kind') + + timeSeries.options.legend.withDisplayMode('table') + + timeSeries.options.legend.withPlacement('bottom') + + timeSeries.options.legend.withShowLegend(true) + + timeSeries.options.legend.withCalcs(['lastNotNull', 'mean']) + + timeSeries.standardOptions.withUnit('ops') + + timeSeries.fieldConfig.defaults.custom.withFillOpacity(30) + + timeSeries.fieldConfig.defaults.custom.withShowPoints('never') + + timeSeries.fieldConfig.defaults.custom.stacking.withMode('normal') + + timeSeries.datasource.withType('prometheus') + + timeSeries.datasource.withUid(datasource) + + timeSeries.queryOptions.withTargets([ + prometheus.new( + datasource, + 'sum by (api_group, kind) (rate(activity_processor_dlq_events_published_total{' + SEL + '}[5m]))' + ) + + prometheus.withLegendFormat('{{api_group}}/{{kind}}'), + ]) + + timeSeries.panelOptions.withDescription('DLQ rate by resource kind — identifies affected resource types') + + timeSeries.gridPos.withW(timeSeriesHalfWidth) + + timeSeries.gridPos.withH(timeSeriesHeight), + + row.new('Retry & Recovery') + + row.withCollapsed(false), + + timeSeries.new('Retry outcomes') + + timeSeries.options.legend.withDisplayMode('table') + + timeSeries.options.legend.withPlacement('bottom') + + timeSeries.options.legend.withShowLegend(true) + + timeSeries.options.legend.withCalcs(['lastNotNull', 
'mean']) + + timeSeries.standardOptions.withUnit('ops') + + timeSeries.fieldConfig.defaults.custom.withFillOpacity(30) + + timeSeries.fieldConfig.defaults.custom.withShowPoints('never') + + timeSeries.fieldConfig.defaults.custom.stacking.withMode('normal') + + timeSeries.datasource.withType('prometheus') + + timeSeries.datasource.withUid(datasource) + + timeSeries.queryOptions.withTargets([ + prometheus.new( + datasource, + 'sum by (result) (rate(activity_processor_dlq_retry_attempts_total{' + SEL_retry_attempts + '}[5m]))' + ) + + prometheus.withLegendFormat('{{result}}'), + ]) + + timeSeries.panelOptions.withDescription('Retry attempt outcomes over time — succeeded vs republished vs failed') + + timeSeries.gridPos.withW(timeSeriesHalfWidth) + + timeSeries.gridPos.withH(timeSeriesHeight), + + tablePanel.new('Still-failing re-eval by policy') + + tablePanel.datasource.withType('prometheus') + + tablePanel.datasource.withUid(datasource) + + tablePanel.options.withShowHeader(true) + + tablePanel.options.withSortBy([ + { displayName: 'Value', desc: true }, + ]) + + tablePanel.queryOptions.withTargets([ + prometheus.new( + datasource, + 'topk(25, sum by (policy_name, error_type) (rate(activity_processor_dlq_retry_failed_total{' + SEL_retry_failed + '}[10m])))' + ) + + prometheus.withInstant(true) + + prometheus.withLegendFormat('{{policy_name}}'), + ]) + + tablePanel.standardOptions.withUnit('ops') + + tablePanel.panelOptions.withDescription('Policies NOT recovering after retry — triage for DLQRetryIneffective') + + tablePanel.gridPos.withW(timeSeriesHalfWidth) + + tablePanel.gridPos.withH(timeSeriesHeight), + + timeSeries.new('High-retry (poison) events by policy') + + timeSeries.options.legend.withDisplayMode('table') + + timeSeries.options.legend.withPlacement('bottom') + + timeSeries.options.legend.withShowLegend(true) + + timeSeries.options.legend.withCalcs(['lastNotNull', 'mean']) + + timeSeries.standardOptions.withUnit('short') + + 
timeSeries.fieldConfig.defaults.custom.withFillOpacity(10) + + timeSeries.fieldConfig.defaults.custom.withShowPoints('never') + + timeSeries.datasource.withType('prometheus') + + timeSeries.datasource.withUid(datasource) + + timeSeries.queryOptions.withTargets([ + prometheus.new( + datasource, + 'sum by (policy_name, api_group, kind) (increase(activity_processor_dlq_retry_events_high_retry_total{' + SEL_high_retry + '}[1h]))' + ) + + /* The query groups by policy_name, api_group and kind, so the legend names all three; with policy_name alone, series that differ only by group or kind would share one legend entry. */ prometheus.withLegendFormat('{{policy_name}} ({{api_group}}/{{kind}})'), + ]) + + timeSeries.panelOptions.withDescription('Events exceeding retry threshold by policy — identifies poison events (DLQHighRetryCount)') + + timeSeries.gridPos.withW(timeSeriesHalfWidth) + + timeSeries.gridPos.withH(timeSeriesHeight), + + timeSeries.new('Retry batch duration p99') + + timeSeries.options.legend.withDisplayMode('table') + + timeSeries.options.legend.withPlacement('bottom') + + timeSeries.options.legend.withShowLegend(true) + + timeSeries.options.legend.withCalcs(['lastNotNull', 'mean', 'max']) + + timeSeries.standardOptions.withUnit('s') + + timeSeries.fieldConfig.defaults.custom.withFillOpacity(10) + + timeSeries.fieldConfig.defaults.custom.withShowPoints('never') + + timeSeries.datasource.withType('prometheus') + + timeSeries.datasource.withUid(datasource) + + timeSeries.queryOptions.withTargets([ + prometheus.new( + datasource, + 'histogram_quantile(0.99, sum by (le, trigger) (rate(activity_processor_dlq_retry_batch_duration_seconds_bucket{' + SEL_global + '}[5m])))' + ) + + prometheus.withLegendFormat('{{trigger}} p99'), + ]) + + timeSeries.panelOptions.withDescription('Retry batch processing duration — high values indicate retry path stalling') + + timeSeries.gridPos.withW(timeSeriesHalfWidth) + + timeSeries.gridPos.withH(timeSeriesHeight), + + row.new('Publish-Path Health') + + row.withCollapsed(false), + + timeSeries.new('Publish errors by phase') + + timeSeries.options.legend.withDisplayMode('table') + + timeSeries.options.legend.withPlacement('bottom') + + 
timeSeries.options.legend.withShowLegend(true) + + timeSeries.options.legend.withCalcs(['lastNotNull', 'mean']) + + timeSeries.standardOptions.withUnit('ops') + + timeSeries.fieldConfig.defaults.custom.withFillOpacity(30) + + timeSeries.fieldConfig.defaults.custom.withShowPoints('never') + + timeSeries.fieldConfig.defaults.custom.stacking.withMode('normal') + + timeSeries.datasource.withType('prometheus') + + timeSeries.datasource.withUid(datasource) + + timeSeries.queryOptions.withTargets([ + prometheus.new( + datasource, + 'sum by (error_phase) (rate(activity_processor_dlq_publish_errors_total{' + SEL_global + '}[5m]))' + ) + + prometheus.withLegendFormat('{{error_phase}}'), + ]) + + timeSeries.panelOptions.withDescription('DLQ publish errors by phase (marshal/publish) — non-zero is data loss risk') + + timeSeries.gridPos.withW(timeSeriesHalfWidth) + + timeSeries.gridPos.withH(timeSeriesHeight), + + timeSeries.new('DLQ publish latency') + + timeSeries.options.legend.withDisplayMode('table') + + timeSeries.options.legend.withPlacement('bottom') + + timeSeries.options.legend.withShowLegend(true) + + timeSeries.options.legend.withCalcs(['lastNotNull', 'mean', 'max']) + + timeSeries.standardOptions.withUnit('s') + + timeSeries.fieldConfig.defaults.custom.withFillOpacity(10) + + timeSeries.fieldConfig.defaults.custom.withShowPoints('never') + + timeSeries.datasource.withType('prometheus') + + timeSeries.datasource.withUid(datasource) + + timeSeries.queryOptions.withTargets([ + prometheus.new( + datasource, + 'histogram_quantile(0.99, sum by (le) (rate(activity_processor_dlq_publish_latency_seconds_bucket{' + SEL_global + '}[5m])))' + ) + + prometheus.withLegendFormat('p99'), + prometheus.new( + datasource, + 'histogram_quantile(0.95, sum by (le) (rate(activity_processor_dlq_publish_latency_seconds_bucket{' + SEL_global + '}[5m])))' + ) + + prometheus.withLegendFormat('p95'), + prometheus.new( + datasource, + 'histogram_quantile(0.50, sum by (le) 
(rate(activity_processor_dlq_publish_latency_seconds_bucket{' + SEL_global + '}[5m])))' + ) + + prometheus.withLegendFormat('p50'), + ]) + + timeSeries.panelOptions.withDescription('DLQ publish write path latency distribution') + + timeSeries.gridPos.withW(timeSeriesHalfWidth) + + timeSeries.gridPos.withH(timeSeriesHeight), + + row.new('Processor Logs') + + row.withCollapsed(false), + + logsPanel.new('DLQ Events — Processor Logs') + + logsPanel.datasource.withType('loki') + + logsPanel.datasource.withUid('$loki_datasource') + + logsPanel.options.withShowTime(true) + + logsPanel.options.withSortOrder('Descending') + + logsPanel.options.withWrapLogMessage(false) + + logsPanel.options.withEnableLogDetails(true) + + logsPanel.options.withDisplayedFields(['policy', 'errorType', 'msg']) + + logsPanel.queryOptions.withTargets([ + loki.new( + '$loki_datasource', + '{namespace="activity-system", container="processor"} | json | errorType != "" | policy=~"${policy_name:regex}" | errorType=~"${error_type:regex}"' + ) + + loki.withRefId('A'), + ]) + + logsPanel.panelOptions.withDescription('DLQ processor logs filtered by selected policy and error type — shows Published event to DLQ lines with policy and errorType fields') + + logsPanel.gridPos.withW(tableFullWidth) + + logsPanel.gridPos.withH(tableHeight), + + logsPanel.new('DLQ/Policy Errors (raw)') + + logsPanel.datasource.withType('loki') + + logsPanel.datasource.withUid('$loki_datasource') + + logsPanel.options.withShowTime(true) + + logsPanel.options.withSortOrder('Descending') + + logsPanel.options.withWrapLogMessage(false) + + logsPanel.options.withEnableLogDetails(true) + + logsPanel.queryOptions.withTargets([ + loki.new( + '$loki_datasource', + '{namespace="activity-system", container="processor"} |~ "(?i)dlq|dead.letter|failed to evaluate|failed to republish|Published event to DLQ"' + ) + + loki.withRefId('A'), + ]) + + logsPanel.panelOptions.withDescription('Catch-all DLQ log filter — includes non-JSON error lines 
and all dlq/dead-letter references') + + logsPanel.gridPos.withW(tableFullWidth) + + logsPanel.gridPos.withH(tableHeight), +], panelWidth=statWidth, panelHeight=statHeight); + +/* Top-level dashboard object: fixed uid activity-dlq-policy-health, default time window now-6h to now, editable, shared-crosshair tooltip. The timezone and refresh values come from config.dashboards.timezone and refresh, both defined outside this view. */ dashboard.new('Activity — DLQ & Policy Health') ++ dashboard.withDescription('Single-pane triage dashboard for DLQ backlog, failing policies, retry recovery, and processor logs') ++ dashboard.withTags(['activity', 'dlq', 'policy', 'health', 'on-call']) ++ dashboard.withUid('activity-dlq-policy-health') ++ dashboard.time.withFrom('now-6h') ++ dashboard.time.withTo('now') ++ dashboard.withTimezone(config.dashboards.timezone) ++ dashboard.withRefresh(refresh) ++ dashboard.withEditable(true) ++ dashboard.graphTooltip.withSharedCrosshair() ++ /* Dashboard-level link to the DLQ runbooks, opened in a new tab (targetBlank). */ dashboard.withLinks([ + { + title: 'DLQ Runbooks', + url: 'https://github.com/milo-os/activity/tree/main/docs/runbooks/dlq/', + type: 'link', + targetBlank: true, + icon: 'external link', + }, +]) ++ /* Template variables. cluster, api_group, kind and policy_name are chained: each label_values query is filtered by the variables declared before it. error_type is filtered by cluster only. All five query variables are multi-select with an All option. */ dashboard.withVariablesMixin([ + g.dashboard.variable.datasource.new('datasource', 'prometheus') + + g.dashboard.variable.datasource.generalOptions.withLabel('Prometheus Datasource') + + g.dashboard.variable.datasource.withRegex(datasourceRegex), + + g.dashboard.variable.query.new('cluster', 'label_values(activity_processor_dlq_events_published_total, cluster)') + + g.dashboard.variable.query.withDatasource('prometheus', datasource) + + g.dashboard.variable.query.generalOptions.withLabel('Cluster') + + g.dashboard.variable.query.selectionOptions.withMulti() + + g.dashboard.variable.query.selectionOptions.withIncludeAll() + + g.dashboard.variable.query.refresh.onTime(), + + g.dashboard.variable.query.new('api_group', 'label_values(activity_processor_dlq_events_published_total{cluster=~"$cluster"}, api_group)') + + g.dashboard.variable.query.withDatasource('prometheus', datasource) + + g.dashboard.variable.query.generalOptions.withLabel('API Group') + + g.dashboard.variable.query.selectionOptions.withMulti() + + 
g.dashboard.variable.query.selectionOptions.withIncludeAll() + + g.dashboard.variable.query.refresh.onTime(), + + g.dashboard.variable.query.new('kind', 'label_values(activity_processor_dlq_events_published_total{cluster=~"$cluster",api_group=~"$api_group"}, kind)') + + g.dashboard.variable.query.withDatasource('prometheus', datasource) + + g.dashboard.variable.query.generalOptions.withLabel('Kind') + + g.dashboard.variable.query.selectionOptions.withMulti() + + g.dashboard.variable.query.selectionOptions.withIncludeAll() + + g.dashboard.variable.query.refresh.onTime(), + + g.dashboard.variable.query.new('policy_name', 'label_values(activity_processor_dlq_events_published_total{cluster=~"$cluster",api_group=~"$api_group",kind=~"$kind"}, policy_name)') + + g.dashboard.variable.query.withDatasource('prometheus', datasource) + + g.dashboard.variable.query.generalOptions.withLabel('Policy') + + g.dashboard.variable.query.selectionOptions.withMulti() + + g.dashboard.variable.query.selectionOptions.withIncludeAll() + + g.dashboard.variable.query.refresh.onTime(), + + g.dashboard.variable.query.new('error_type', 'label_values(activity_processor_dlq_events_published_total{cluster=~"$cluster"}, error_type)') + + g.dashboard.variable.query.withDatasource('prometheus', datasource) + + g.dashboard.variable.query.generalOptions.withLabel('Error Type') + + g.dashboard.variable.query.selectionOptions.withMulti() + + g.dashboard.variable.query.selectionOptions.withIncludeAll() + + g.dashboard.variable.query.refresh.onTime(), + + /* Loki datasource variable, referenced as $loki_datasource by the log panels. NOTE(review): showOnDashboard.withNothing() presumably hides it from the variable bar; confirm against the Grafonnet docs. */ g.dashboard.variable.datasource.new('loki_datasource', 'loki') + + g.dashboard.variable.datasource.generalOptions.withLabel('Loki Datasource') + + g.dashboard.variable.datasource.generalOptions.showOnDashboard.withNothing(), +]) ++ /* allPanels is bound outside this view; presumably it is the panel list whose closing bracket appears just before dashboard.new. */ dashboard.withPanels(allPanels)